# Description

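This notebook combines the coefficient values of all correlation methods (Clustermatch, Pearson and Spearman) computed on the dataset selected in `Settings` below into a single dataframe, with one row per gene pair, for easier processing later.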

# Modules loading

In [1]:
import numpy as np
import pandas as pd

from clustermatch import conf

# Settings

In [2]:
# Dataset-specific configuration; the cells below read data paths and file name
# templates from it.
DATASET_CONFIG = conf.RECOUNT2
# GTEX_TISSUE = None
# NOTE(review): GENE_SEL_STRATEGY is not referenced by any cell visible in this
# notebook — confirm whether it is still needed.
GENE_SEL_STRATEGY = "var_pc_log2"

# Paths

In [5]:
# Gene expression matrix of the selected dataset. It is read through
# DATASET_CONFIG (rather than hard-coding conf.RECOUNT2) so that every path in
# this notebook follows the dataset chosen in Settings.
INPUT_GENE_EXPR_DATA_FILE = DATASET_CONFIG["DATA_FILE"]
display(INPUT_GENE_EXPR_DATA_FILE)

# Fail early, with the offending path in the message, if the input is missing.
assert (
    INPUT_GENE_EXPR_DATA_FILE.exists()
), f"Gene expression file not found: {INPUT_GENE_EXPR_DATA_FILE}"

PosixPath('/opt/data/data/recount2/recount_data_prep_PLIER.pkl')

In [6]:
# Path template of the per-method similarity matrices. The file name contains a
# `{corr_method}` placeholder that is filled in with `str.format` when each
# matrix is loaded.
INPUT_CORR_FILE_TEMPLATE = (
    DATASET_CONFIG["SIMILARITY_MATRICES_DIR"]
    / DATASET_CONFIG["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
)
display(INPUT_CORR_FILE_TEMPLATE)

PosixPath('/opt/data/results/recount2/similarity_matrices/recount_data_prep_PLIER-{corr_method}.pkl')

In [7]:
# The combined dataframe is saved next to the per-method similarity matrices,
# using "all" in place of the method name.
#
# INPUT_CORR_FILE_TEMPLATE already includes the similarity matrices directory,
# so only its file name is formatted and re-attached to its own parent.
# Prepending the directory again would only work for absolute paths (pathlib
# drops the left operand when the right one is absolute) and would duplicate
# the directory for relative ones.
OUTPUT_FILE = INPUT_CORR_FILE_TEMPLATE.parent / INPUT_CORR_FILE_TEMPLATE.name.format(
    corr_method="all",
)
display(OUTPUT_FILE)

PosixPath('/opt/data/results/recount2/similarity_matrices/recount_data_prep_PLIER-all.pkl')

# Load data

## Gene Ensembl ID -> Symbol mapping

In [None]:
# Load the table with gene Ensembl IDs and gene symbols.
# NOTE(review): this cell has no execution count in the saved notebook, and
# `gene_map` is not used by any later cell visible here. The file name refers to
# GTEx while DATASET_CONFIG points to recount2 — confirm whether this section
# still applies.
gene_map = pd.read_pickle(
    DATASET_CONFIG["DATA_DIR"] / "gtex_gene_id_symbol_mappings.pkl"
)

In [None]:
# Turn the mapping table into a plain dict: Ensembl gene ID -> gene symbol.
gene_map = gene_map.set_index("gene_ens_id")["gene_symbol"].to_dict()

In [None]:
# Spot-check one entry to make sure the mapping has the expected content.
assert gene_map["ENSG00000145309.5"] == "CABS1"

## Gene expression

In [9]:
# Gene expression matrix. Besides being previewed, it is used below only to
# check that each similarity matrix is indexed by the same genes in the same
# order.
data = pd.read_pickle(INPUT_GENE_EXPR_DATA_FILE)

In [10]:
# Dimensions of the expression matrix (rows x columns).
data.shape

(6750, 37032)

In [11]:
# Preview the first rows of the expression matrix.
data.head()

Unnamed: 0,SRP000599.SRR013549,SRP000599.SRR013550,SRP000599.SRR013551,SRP000599.SRR013552,SRP000599.SRR013553,SRP000599.SRR013554,SRP000599.SRR013555,SRP000599.SRR013556,SRP000599.SRR013557,SRP000599.SRR013558,...,SRP035599.SRR1139372,SRP035599.SRR1139393,SRP035599.SRR1139388,SRP035599.SRR1139378,SRP035599.SRR1139399,SRP035599.SRR1139386,SRP035599.SRR1139375,SRP035599.SRR1139382,SRP035599.SRR1139356,SRP035599.SRR1139370
GAS6,-0.3125,-0.312931,-0.312931,-0.312931,-0.312931,-0.308253,-0.312931,-0.312931,-0.312931,-0.312931,...,-0.301711,-0.305581,-0.303344,-0.2978,-0.307122,-0.285499,-0.309599,-0.30022,-0.297667,-0.310151
MMP14,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.32514,...,-0.314587,-0.322952,-0.326439,-0.325994,-0.326272,-0.322523,-0.326375,-0.326339,-0.322127,-0.327438
DSP,-0.286319,-0.286859,-0.286859,-0.286859,-0.286859,-0.286859,-0.277195,-0.256862,-0.27879,-0.269701,...,-0.286859,-0.286859,-0.286745,-0.286688,-0.286725,-0.286529,-0.286859,-0.286671,-0.286859,-0.28674
MARCKSL1,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,...,0.807663,1.294564,1.527655,1.404788,1.047931,0.892119,1.507099,2.458255,2.919662,1.410846
SPARC,-0.370498,-0.370498,-0.369171,-0.370498,-0.370498,-0.370498,-0.370498,-0.370498,-0.370498,-0.370498,...,-0.345409,-0.31075,-0.34812,-0.356938,-0.355206,-0.366197,-0.351174,-0.363703,-0.350825,-0.360762


## Clustermatch

In [13]:
# Load the Clustermatch similarity matrix; its file is identified by the
# "clustermatch_k2to5" method name in the path template.
clustermatch_df = pd.read_pickle(
    str(INPUT_CORR_FILE_TEMPLATE).format(
        corr_method="clustermatch_k2to5",
    )
)

In [14]:
# Dimensions of the Clustermatch similarity matrix.
clustermatch_df.shape

(6750, 6750)

In [15]:
# Preview the first rows of the Clustermatch similarity matrix.
clustermatch_df.head()

Unnamed: 0,GAS6,MMP14,DSP,MARCKSL1,SPARC,CTSD,EPAS1,PALLD,PHC2,LGALS3BP,...,LDHB,LDHC,ACAP2,ACAP3,CFL2,CFL1,NFIB,PLEKHG6,GNGT2,SERPINH1
GAS6,1.0,0.359879,0.161606,0.096449,0.179718,0.241732,0.273178,0.240247,0.22299,0.220751,...,0.059111,0.078427,0.108364,0.251818,0.129664,0.085054,0.106808,0.144849,0.11544,0.255519
MMP14,0.359879,1.0,0.144209,0.078681,0.271937,0.192011,0.258037,0.207851,0.187494,0.193339,...,0.052428,0.079176,0.089386,0.1653,0.096144,0.079255,0.076691,0.144493,0.123675,0.264664
DSP,0.161606,0.144209,1.0,0.05851,0.067755,0.067983,0.207457,0.277144,0.066032,0.147427,...,0.02879,0.068707,0.073084,0.084873,0.135008,0.031805,0.161311,0.151766,0.064669,0.312919
MARCKSL1,0.096449,0.078681,0.05851,1.0,0.041695,0.101608,0.052736,0.068859,0.112262,0.077731,...,0.099796,0.064212,0.112536,0.140569,0.091698,0.131773,0.037773,0.089773,0.076254,0.108979
SPARC,0.179718,0.271937,0.067755,0.041695,1.0,0.067178,0.123818,0.169873,0.081537,0.117885,...,0.053518,0.033891,0.04988,0.042892,0.128344,0.036825,0.104421,0.039843,0.090324,0.13684


In [16]:
# The similarity matrix rows must follow the same gene order as the expression
# data. Only the row index is compared here; the columns are not checked.
assert data.index.equals(clustermatch_df.index)

## Pearson

In [18]:
# Load the Pearson correlation matrix ("pearson" method name in the path
# template).
pearson_df = pd.read_pickle(
    str(INPUT_CORR_FILE_TEMPLATE).format(
        corr_method="pearson",
    )
)

In [19]:
# Dimensions of the Pearson correlation matrix.
pearson_df.shape

(6750, 6750)

In [20]:
# Preview the first rows of the Pearson correlation matrix.
pearson_df.head()

Unnamed: 0,GAS6,MMP14,DSP,MARCKSL1,SPARC,CTSD,EPAS1,PALLD,PHC2,LGALS3BP,...,LDHB,LDHC,ACAP2,ACAP3,CFL2,CFL1,NFIB,PLEKHG6,GNGT2,SERPINH1
GAS6,1.0,0.294974,0.056459,-0.025141,0.200635,0.103548,0.131867,0.276463,0.112967,0.130925,...,-0.056567,-0.007278,-0.030214,0.148051,0.039434,0.047084,-0.029201,0.028263,-0.010433,0.386991
MMP14,0.294974,1.0,0.055232,0.019443,0.162131,0.288713,0.078418,0.152664,0.200287,0.230657,...,-0.075008,-0.017159,-0.022373,0.195568,-0.008378,0.06035,-0.035702,0.019021,-0.001449,0.348298
DSP,0.056459,0.055232,1.0,-0.036595,0.035897,0.040336,0.044387,0.14176,-0.023011,0.027952,...,-0.052872,-0.010162,0.033149,0.003191,-0.025231,-0.019664,-0.020029,0.135147,-0.017221,0.061731
MARCKSL1,-0.025141,0.019443,-0.036595,1.0,0.029414,-0.003729,-0.030208,0.018872,0.130432,0.022778,...,0.069346,-0.003312,-0.015761,0.201729,-0.022687,0.018768,0.024176,0.027592,0.004332,0.146742
SPARC,0.200635,0.162131,0.035897,0.029414,1.0,-0.007102,0.079556,0.278528,0.028905,0.089256,...,0.084137,-0.009165,-0.058129,-0.005859,0.10202,0.077243,-0.020692,-0.009489,-0.016504,0.41296


In [21]:
# The correlation matrix rows must follow the same gene order as the expression
# data. Only the row index is compared here; the columns are not checked.
assert data.index.equals(pearson_df.index)

## Spearman

In [22]:
# Load the Spearman correlation matrix ("spearman" method name in the path
# template).
spearman_df = pd.read_pickle(
    str(INPUT_CORR_FILE_TEMPLATE).format(
        corr_method="spearman",
    )
)

In [23]:
# Dimensions of the Spearman correlation matrix.
spearman_df.shape

(6750, 6750)

In [24]:
# Preview the first rows of the Spearman correlation matrix.
spearman_df.head()

Unnamed: 0,GAS6,MMP14,DSP,MARCKSL1,SPARC,CTSD,EPAS1,PALLD,PHC2,LGALS3BP,...,LDHB,LDHC,ACAP2,ACAP3,CFL2,CFL1,NFIB,PLEKHG6,GNGT2,SERPINH1
GAS6,1.0,0.767277,0.530435,0.376058,0.492968,0.652806,0.694218,0.598491,0.599584,0.59529,...,0.096758,0.319135,0.243757,0.669781,0.433053,0.397045,0.298712,0.484416,0.357766,0.689587
MMP14,0.767277,1.0,0.490369,0.372943,0.593585,0.608753,0.652577,0.566899,0.596954,0.591671,...,0.108111,0.328656,0.295322,0.573781,0.372953,0.388056,0.23266,0.498833,0.371384,0.707476
DSP,0.530435,0.490369,1.0,0.309156,0.25051,0.378827,0.5705,0.603818,0.261268,0.48351,...,0.102505,0.322113,0.18112,0.336312,0.389993,0.143608,0.396473,0.567024,0.022467,0.615777
MARCKSL1,0.376058,0.372943,0.309156,1.0,0.251116,0.380241,0.254586,0.286491,0.441862,0.339611,...,0.386577,0.314713,0.302932,0.548047,0.313833,0.454887,0.228858,0.352535,0.312589,0.478413
SPARC,0.492968,0.593585,0.25051,0.251116,1.0,0.299277,0.404538,0.481395,0.353212,0.4263,...,0.259254,0.219185,0.128572,0.276951,0.441842,0.182518,0.339385,0.180845,0.240672,0.503508


In [25]:
# The correlation matrix rows must follow the same gene order as the expression
# data. Only the row index is compared here; the columns are not checked.
assert data.index.equals(spearman_df.index)

## Merge

In [26]:
def get_upper_triag(data, k=1):
    """
    Keeps only the upper triangle of a dataframe, masking the rest with NaN.

    Args:
        data: a pandas DataFrame (here, a gene-by-gene similarity matrix).
        k: diagonal offset passed to `numpy.triu`. With the default (1) the
            main diagonal is masked as well; with 0 the diagonal is kept.

    Returns:
        A dataframe with the same shape and labels as `data` where every entry
        outside the selected triangle is NaN.
    """
    # Create the mask directly as booleans: building a float matrix of ones and
    # casting it afterwards needs eight times more memory for the same result.
    mask = np.triu(np.ones(data.shape, dtype=bool), k=k)
    return data.where(mask)

In [27]:
# NOTE(review): disabled code. Agreement between methods is checked by the
# index-equality assertions in this notebook rather than by reindexing; remove
# this cell if it is no longer needed.
# # make sure genes match
# clustermatch_df = clustermatch_df.loc[pearson_df.index, pearson_df.columns]

In [28]:
# Mask everything except the upper triangle (the default k=1 also masks the
# diagonal). This rebinds `clustermatch_df`, so the matrix has to be reloaded
# before re-running this cell.
clustermatch_df = get_upper_triag(clustermatch_df)

In [29]:
# Flatten the matrix into a Series with one value per gene pair: `unstack`
# produces a two-level index, `rename_axis((None, None))` clears the level
# names, and `dropna` removes the masked (NaN) entries.
clustermatch_df = clustermatch_df.unstack().rename_axis((None, None)).dropna()

In [30]:
# Number of gene pairs kept after flattening.
clustermatch_df.shape

(22777875,)

In [31]:
# Preview the first gene pairs and their Clustermatch values.
clustermatch_df.head()

MMP14     GAS6     0.359879
DSP       GAS6     0.161606
          MMP14    0.144209
MARCKSL1  GAS6     0.096449
          MMP14    0.078681
dtype: float64

In [32]:
# Mask everything except the upper triangle (the default k=1 also masks the
# diagonal). This rebinds `pearson_df`, so the matrix has to be reloaded before
# re-running this cell.
pearson_df = get_upper_triag(pearson_df)

In [33]:
# Flatten the matrix into a Series with one value per gene pair (dropping the
# masked NaN entries) and take absolute values, so the sign of the correlation
# is discarded.
pearson_df = pearson_df.unstack().rename_axis((None, None)).dropna().abs()

In [34]:
# Number of gene pairs kept after flattening.
pearson_df.shape

(22777875,)

In [35]:
# Preview the first gene pairs and their absolute Pearson values.
pearson_df.head()

MMP14     GAS6     0.294974
DSP       GAS6     0.056459
          MMP14    0.055232
MARCKSL1  GAS6     0.025141
          MMP14    0.019443
dtype: float64

In [36]:
# Both methods must cover exactly the same gene pairs, in the same order.
assert clustermatch_df.index.equals(pearson_df.index)

In [37]:
# Mask everything except the upper triangle (the default k=1 also masks the
# diagonal). This rebinds `spearman_df`, so the matrix has to be reloaded before
# re-running this cell.
spearman_df = get_upper_triag(spearman_df)

In [38]:
# Flatten the matrix into a Series with one value per gene pair (dropping the
# masked NaN entries) and take absolute values, so the sign of the correlation
# is discarded.
spearman_df = spearman_df.unstack().rename_axis((None, None)).dropna().abs()

In [39]:
# Number of gene pairs kept after flattening.
spearman_df.shape

(22777875,)

In [40]:
# Preview the first gene pairs and their absolute Spearman values.
spearman_df.head()

MMP14     GAS6     0.767277
DSP       GAS6     0.530435
          MMP14    0.490369
MARCKSL1  GAS6     0.376058
          MMP14    0.372943
dtype: float64

In [41]:
# Both methods must cover exactly the same gene pairs, in the same order.
assert clustermatch_df.index.equals(spearman_df.index)

In [42]:
# Combine the three flattened series into one dataframe: one row per gene pair,
# one column per method. Rows are then sorted by the gene pair labels.
df = pd.DataFrame(
    {
        "clustermatch": clustermatch_df,
        "pearson": pearson_df,
        "spearman": spearman_df,
    }
).sort_index()

In [43]:
# Every gene pair must have a value for all three methods.
assert not df.isna().any().any()

In [44]:
# Dimensions of the combined dataframe (gene pairs x methods).
df.shape

(22777875, 3)

In [45]:
# Preview the first rows of the combined dataframe.
df.head()

Unnamed: 0,Unnamed: 1,clustermatch,pearson,spearman
A2M,AATK,0.134712,0.032274,0.50452
A2M,ABCA1,0.10958,0.110068,0.443315
A2M,ABCA6,0.189934,0.050592,0.536541
A2M,ABCC3,0.088957,0.053941,0.373502
A2M,ABHD14A,0.082554,0.012871,0.367407


# Save

In [46]:
# Write the combined dataframe to OUTPUT_FILE in pickle format.
df.to_pickle(OUTPUT_FILE)