# Description

It combines the coefficient values of all correlation methods (Clustermatch, Pearson and Spearman) for one dataset and gene selection strategy (see `Settings` below) into a single dataframe for easier processing later.

# Modules loading

In [1]:
import numpy as np
import pandas as pd

from clustermatch import conf

# Settings

In [2]:
# Dataset-specific configuration taken from the project's `conf` module; it
# provides the directories and filename template used in the Paths section.
DATASET_CONFIG = conf.RECOUNT2FULL
# GTEX_TISSUE = None
# Gene selection strategy; it fills the `{gene_sel_strategy}` placeholder of the
# similarity matrix filename template.
GENE_SEL_STRATEGY = "var_pc_log2"

# Paths

In [3]:
# Gene expression file of this dataset. Its loading is commented out in the
# "Gene expression" section, so here it is only checked for existence.
INPUT_GENE_EXPR_DATA_FILE = DATASET_CONFIG["DATA_DIR"] / "recount2_rpkm.pkl"
display(INPUT_GENE_EXPR_DATA_FILE)

# Fail early if the expected input file is missing.
assert INPUT_GENE_EXPR_DATA_FILE.exists()

PosixPath('/opt/data/data/recount2full/recount2_rpkm.pkl')

In [4]:
# Path template for the per-method similarity matrices. The filename carries
# `{gene_sel_strategy}` and `{corr_method}` placeholders, which are filled in
# with `str.format` wherever a concrete file is needed.
INPUT_CORR_FILE_TEMPLATE = (
    DATASET_CONFIG["SIMILARITY_MATRICES_DIR"]
    / DATASET_CONFIG["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
)
display(INPUT_CORR_FILE_TEMPLATE)

PosixPath('/opt/data/results/recount2full/similarity_matrices/recount2_rpkm-{gene_sel_strategy}-{corr_method}.pkl')

In [5]:
# Output path for the merged coefficients: same directory and naming scheme as
# the per-method matrices, with "all" in place of a single method name.
#
# INPUT_CORR_FILE_TEMPLATE already includes the similarity matrices directory,
# so it must not be joined with that directory again: doing so only worked
# because pathlib discards the left operand when the right one is absolute, and
# would duplicate the directory for a relative path. Only the filename is
# formatted, so braces in the directory part cannot break `str.format`.
OUTPUT_FILE = INPUT_CORR_FILE_TEMPLATE.with_name(
    INPUT_CORR_FILE_TEMPLATE.name.format(
        gene_sel_strategy=GENE_SEL_STRATEGY,
        corr_method="all",
    )
)
display(OUTPUT_FILE)

PosixPath('/opt/data/results/recount2full/similarity_matrices/recount2_rpkm-var_pc_log2-all.pkl')

# Load data

## Gene Ensembl ID -> Symbol mapping

In [6]:
# gene_map = pd.read_pickle(
#     DATASET_CONFIG["DATA_DIR"] / "gtex_gene_id_symbol_mappings.pkl"
# )

In [7]:
# gene_map = gene_map.set_index("gene_ens_id")["gene_symbol"].to_dict()

In [8]:
# assert gene_map["ENSG00000145309.5"] == "CABS1"

## Gene expression

In [9]:
# data = pd.read_pickle(INPUT_GENE_EXPR_DATA_FILE)

In [10]:
# data.shape

(57096, 37032)

In [11]:
# data.head()

Unnamed: 0,SRP000599.SRR013549,SRP000599.SRR013550,SRP000599.SRR013551,SRP000599.SRR013552,SRP000599.SRR013553,SRP000599.SRR013554,SRP000599.SRR013555,SRP000599.SRR013556,SRP000599.SRR013557,SRP000599.SRR013558,...,SRP035599.SRR1139372,SRP035599.SRR1139393,SRP035599.SRR1139388,SRP035599.SRR1139378,SRP035599.SRR1139399,SRP035599.SRR1139386,SRP035599.SRR1139375,SRP035599.SRR1139382,SRP035599.SRR1139356,SRP035599.SRR1139370
ENSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.929351,1.175312,0.459859,1.254482,...,0.0179,0.029771,0.081792,0.0,0.064336,0.408565,0.0,0.010008,0.0,0.02744
ENSG00000000005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.011364,0.0,0.0,0.0,0.0,0.0
ENSG00000000419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.561367,2.945875,...,29.761841,37.385839,20.649367,30.926951,23.025321,24.162726,11.302146,30.588071,20.268225,14.817692
ENSG00000000457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.262267,...,3.886637,4.945604,2.421947,4.12707,2.59754,2.692441,3.784122,2.715137,3.255437,2.210011
ENSG00000000460,0.084565,0.0,0.497083,0.0,0.0,0.0,1.177199,0.0,0.0,0.0,...,2.585671,3.58065,1.800513,3.013335,2.352776,1.680907,2.41318,2.202502,1.720274,2.199518


## Clustermatch

In [12]:
# Load the pairwise gene similarity matrix computed with clustermatch for the
# configured gene selection strategy.
clustermatch_df = pd.read_pickle(
    str(INPUT_CORR_FILE_TEMPLATE).format(
        gene_sel_strategy=GENE_SEL_STRATEGY,
        corr_method="clustermatch",
    )
)

In [13]:
# A pairwise similarity matrix is expected to be square: (n_genes, n_genes).
clustermatch_df.shape

(5000, 5000)

In [14]:
# Preview the matrix; rows and columns are both labelled with Ensembl gene IDs.
clustermatch_df.head()

Unnamed: 0,ENSG00000283293,ENSG00000202198,ENSG00000277027,ENSG00000269900,ENSG00000274012,ENSG00000276168,ENSG00000222328,ENSG00000277209,ENSG00000199153,ENSG00000198975,...,ENSG00000091140,ENSG00000271029,ENSG00000053372,ENSG00000113845,ENSG00000231864,ENSG00000141574,ENSG00000169750,ENSG00000129559,ENSG00000237441,ENSG00000204103
ENSG00000283293,1.0,0.997409,0.388822,0.388148,0.307624,0.309664,0.188582,0.362346,0.016348,0.019026,...,0.04656,0.032001,0.052698,0.054409,0.070712,0.073847,0.082497,0.057451,0.08066,0.07111
ENSG00000202198,0.997409,1.0,0.388687,0.388014,0.307145,0.309304,0.189155,0.362346,0.016569,0.019276,...,0.046592,0.032045,0.052699,0.054446,0.070764,0.073885,0.082517,0.057564,0.080703,0.071115
ENSG00000277027,0.388822,0.388687,1.0,0.997248,0.312436,0.323767,0.253694,0.442272,0.04458,0.050314,...,0.055954,0.055008,0.052847,0.068255,0.144014,0.12681,0.13399,0.060372,0.097734,0.106721
ENSG00000269900,0.388148,0.388014,0.997248,1.0,0.312436,0.323521,0.254239,0.44299,0.045083,0.050946,...,0.056069,0.055043,0.0529,0.068313,0.144032,0.126865,0.134004,0.060549,0.097829,0.106795
ENSG00000274012,0.307624,0.307145,0.312436,0.312436,1.0,0.813849,0.167205,0.301306,0.02338,0.024478,...,0.036689,0.020403,0.03192,0.040169,0.051892,0.052831,0.055428,0.043394,0.060808,0.054081


## Pearson

In [16]:
# Load the pairwise gene similarity matrix computed with Pearson for the
# configured gene selection strategy.
pearson_df = pd.read_pickle(
    str(INPUT_CORR_FILE_TEMPLATE).format(
        gene_sel_strategy=GENE_SEL_STRATEGY,
        corr_method="pearson",
    )
)

In [17]:
# Should match the clustermatch matrix shape so methods can be compared pair by pair.
pearson_df.shape

(5000, 5000)

In [18]:
# Preview the matrix; Pearson coefficients are signed at this point (the Merge
# section takes their absolute value).
pearson_df.head()

Unnamed: 0,ENSG00000283293,ENSG00000202198,ENSG00000277027,ENSG00000269900,ENSG00000274012,ENSG00000276168,ENSG00000222328,ENSG00000277209,ENSG00000199153,ENSG00000198975,...,ENSG00000091140,ENSG00000271029,ENSG00000053372,ENSG00000113845,ENSG00000231864,ENSG00000141574,ENSG00000169750,ENSG00000129559,ENSG00000237441,ENSG00000204103
ENSG00000283293,1.0,0.999999,0.515202,0.514507,0.6533,0.649241,0.166681,0.518923,-0.040069,-0.043148,...,-0.064204,-0.058628,-0.04691,-0.052973,-0.025782,-0.046511,-0.024716,-0.062468,-0.071035,-0.037985
ENSG00000202198,0.999999,1.0,0.515256,0.514562,0.65325,0.649193,0.166957,0.518955,-0.040066,-0.043145,...,-0.064222,-0.058622,-0.046917,-0.052972,-0.025804,-0.046519,-0.024727,-0.062482,-0.071055,-0.037999
ENSG00000277027,0.515202,0.515256,1.0,0.999971,0.440194,0.465146,0.147535,0.461809,-0.023337,-0.022233,...,-0.062248,-0.032562,-0.020159,-0.045516,0.019484,-0.038532,0.000263,-0.050129,-0.050948,-0.036801
ENSG00000269900,0.514507,0.514562,0.999971,1.0,0.439583,0.464473,0.148328,0.463137,-0.023019,-0.021799,...,-0.062479,-0.032622,-0.020378,-0.045693,0.019221,-0.038585,0.000123,-0.050248,-0.051162,-0.036874
ENSG00000274012,0.6533,0.65325,0.440194,0.439583,1.0,0.959574,0.113967,0.599963,-0.037321,-0.040465,...,-0.072874,-0.052946,-0.033689,-0.044979,-0.011856,-0.043838,-0.012399,-0.057019,-0.073321,-0.037971


In [19]:
# Both matrices must list the same genes in the same order on rows AND columns:
# the upper-triangle extraction in the Merge section is positional, so a
# mismatch on either axis would pair different genes across methods.
assert clustermatch_df.index.equals(pearson_df.index)
assert clustermatch_df.columns.equals(pearson_df.columns)

## Spearman

In [20]:
# Load the pairwise gene similarity matrix computed with Spearman for the
# configured gene selection strategy.
spearman_df = pd.read_pickle(
    str(INPUT_CORR_FILE_TEMPLATE).format(
        gene_sel_strategy=GENE_SEL_STRATEGY,
        corr_method="spearman",
    )
)

In [21]:
# Should match the clustermatch matrix shape so methods can be compared pair by pair.
spearman_df.shape

(5000, 5000)

In [22]:
# Preview the matrix; Spearman coefficients are signed at this point (the Merge
# section takes their absolute value).
spearman_df.head()

Unnamed: 0,ENSG00000283293,ENSG00000202198,ENSG00000277027,ENSG00000269900,ENSG00000274012,ENSG00000276168,ENSG00000222328,ENSG00000277209,ENSG00000199153,ENSG00000198975,...,ENSG00000091140,ENSG00000271029,ENSG00000053372,ENSG00000113845,ENSG00000231864,ENSG00000141574,ENSG00000169750,ENSG00000129559,ENSG00000237441,ENSG00000204103
ENSG00000283293,1.0,0.999995,0.775942,0.775672,0.746934,0.742736,0.597757,0.754145,0.184315,0.18393,...,-0.065746,-0.000966,0.04388,-0.056071,0.163562,0.167481,0.201552,-0.104922,0.120857,0.186545
ENSG00000202198,0.999995,1.0,0.776191,0.775924,0.746909,0.74266,0.598109,0.754395,0.185232,0.184681,...,-0.066264,-0.001348,0.043305,-0.056482,0.16285,0.166897,0.201062,-0.105502,0.120152,0.186211
ENSG00000277027,0.775942,0.776191,1.0,0.99999,0.745275,0.738482,0.660559,0.815825,0.334711,0.326562,...,-0.117128,0.033452,0.049611,-0.086283,0.188504,0.142549,0.18812,-0.154408,0.048148,0.130833
ENSG00000269900,0.775672,0.775924,0.99999,1.0,0.745103,0.738208,0.660972,0.816017,0.336193,0.327756,...,-0.117902,0.032839,0.048756,-0.087053,0.187588,0.141678,0.187281,-0.155177,0.047189,0.130198
ENSG00000274012,0.746934,0.746909,0.745275,0.745103,1.0,0.970719,0.607713,0.721553,0.217818,0.215162,...,-0.119052,0.01708,0.023516,-0.08992,0.211201,0.193238,0.214187,-0.132113,0.129397,0.199827


In [23]:
# Both matrices must list the same genes in the same order on rows AND columns:
# the upper-triangle extraction in the Merge section is positional, so a
# mismatch on either axis would pair different genes across methods.
assert clustermatch_df.index.equals(spearman_df.index)
assert clustermatch_df.columns.equals(spearman_df.columns)

## Merge

In [24]:
def get_upper_triag(data, k=1):
    """
    Keep only the upper triangle of a 2D table, blanking out everything else.

    Args:
        data: a pandas DataFrame (anything exposing `.shape` and `.where`).
        k: diagonal offset forwarded to `np.triu`. The default (1) keeps only
            entries strictly above the main diagonal; 0 also keeps the
            diagonal.

    Returns:
        An object like `data` where entries outside the selected triangle are
        replaced with NaN (the default replacement value of `.where`).
    """
    # Build the mask directly as booleans: a float matrix of ones that is cast
    # afterwards needs 8x the memory, which adds up for large gene x gene inputs.
    mask = np.triu(np.ones(data.shape, dtype=bool), k=k)
    return data.where(mask)

In [25]:
# # make sure genes match
# clustermatch_df = clustermatch_df.loc[pearson_df.index, pearson_df.columns]

In [26]:
# Blank out (NaN) everything except the strict upper triangle, so the diagonal
# is excluded and each gene pair is kept once (assuming rows and columns list
# the genes in the same order).
# NOTE: the name is rebound, so this cell is not safe to re-run on its own.
clustermatch_df = get_upper_triag(clustermatch_df)

In [27]:
# Flatten to a Series indexed by gene pair, clear the index level names and
# drop the NaN cells left by the upper-triangle mask (any NaN already present
# in the matrix is dropped as well).
# NOTE: from here on `clustermatch_df` is a Series despite its name.
clustermatch_df = clustermatch_df.unstack().rename_axis((None, None)).dropna()

In [28]:
# For an n x n matrix without NaN above the diagonal this is n * (n - 1) / 2 pairs.
clustermatch_df.shape

(12497500,)

In [29]:
# The index levels are (column gene, row gene), the order produced by
# `DataFrame.unstack()`.
clustermatch_df.head()

ENSG00000202198  ENSG00000283293    0.997409
ENSG00000277027  ENSG00000283293    0.388822
                 ENSG00000202198    0.388687
ENSG00000269900  ENSG00000283293    0.388148
                 ENSG00000202198    0.388014
dtype: float64

In [30]:
# Blank out (NaN) everything except the strict upper triangle, so the diagonal
# is excluded and each gene pair is kept once (assuming rows and columns list
# the genes in the same order).
# NOTE: the name is rebound, so this cell is not safe to re-run on its own.
pearson_df = get_upper_triag(pearson_df)

In [31]:
# Flatten to a Series indexed by gene pair, clear the index level names and
# drop the NaN cells left by the upper-triangle mask. The absolute value is then
# taken, so only the strength of the correlation is kept, not its sign.
# NOTE: from here on `pearson_df` is a Series despite its name.
pearson_df = pearson_df.unstack().rename_axis((None, None)).dropna().abs()

In [32]:
# Expected to hold the same number of gene pairs as the clustermatch series.
pearson_df.shape

(12497500,)

In [33]:
# Values are non-negative now that `.abs()` has been applied.
pearson_df.head()

ENSG00000202198  ENSG00000283293    0.999999
ENSG00000277027  ENSG00000283293    0.515202
                 ENSG00000202198    0.515256
ENSG00000269900  ENSG00000283293    0.514507
                 ENSG00000202198    0.514562
dtype: float64

In [34]:
# Both flattened series must cover exactly the same gene pairs, in the same
# order, before they are combined into one dataframe.
assert clustermatch_df.index.equals(pearson_df.index)

In [35]:
# Blank out (NaN) everything except the strict upper triangle, so the diagonal
# is excluded and each gene pair is kept once (assuming rows and columns list
# the genes in the same order).
# NOTE: the name is rebound, so this cell is not safe to re-run on its own.
spearman_df = get_upper_triag(spearman_df)

In [36]:
# Flatten to a Series indexed by gene pair, clear the index level names and
# drop the NaN cells left by the upper-triangle mask. The absolute value is then
# taken, so only the strength of the correlation is kept, not its sign.
# NOTE: from here on `spearman_df` is a Series despite its name.
spearman_df = spearman_df.unstack().rename_axis((None, None)).dropna().abs()

In [37]:
# Expected to hold the same number of gene pairs as the clustermatch series.
spearman_df.shape

(12497500,)

In [38]:
# Values are non-negative now that `.abs()` has been applied.
spearman_df.head()

ENSG00000202198  ENSG00000283293    0.999995
ENSG00000277027  ENSG00000283293    0.775942
                 ENSG00000202198    0.776191
ENSG00000269900  ENSG00000283293    0.775672
                 ENSG00000202198    0.775924
dtype: float64

In [39]:
# Both flattened series must cover exactly the same gene pairs, in the same
# order, before they are combined into one dataframe.
assert clustermatch_df.index.equals(spearman_df.index)

In [40]:
# Combine the three flattened coefficient series into one table (one column per
# method, one row per gene pair), then order the rows by the gene-pair index.
df = pd.DataFrame(
    dict(
        clustermatch=clustermatch_df,
        pearson=pearson_df,
        spearman=spearman_df,
    )
)
df = df.sort_index()

In [41]:
# Every gene pair must have a value for every method; a NaN here would mean the
# series were not aligned when they were combined.
assert df.notna().all().all()

In [42]:
# One row per gene pair, one column per correlation method.
df.shape

(12497500, 3)

In [43]:
# Rows are ordered by the gene-pair index after `sort_index()`.
df.head()

Unnamed: 0,Unnamed: 1,clustermatch,pearson,spearman
ENSG00000000003,ENSG00000000419,0.16396,0.130873,0.471673
ENSG00000000003,ENSG00000000938,0.083831,0.107588,0.178042
ENSG00000000003,ENSG00000001036,0.274535,0.047559,0.494995
ENSG00000000003,ENSG00000001630,0.194099,0.226343,0.547457
ENSG00000000003,ENSG00000002586,0.067899,0.011981,0.204876


# Save

In [44]:
# Persist the combined table; the output filename uses "all" in place of a
# single correlation method name.
df.to_pickle(OUTPUT_FILE)