# Description

This notebook combines the coefficient values (CCC, Pearson, Spearman and MIC) computed for one tissue (see `Settings` below) into a single dataframe for easier processing later.

This notebook incorporates results using MIC, which was computed only for a subset of gene pairs due to its computational complexity.

# Modules loading

In [1]:
import pandas as pd

from ccc import conf
from ccc.utils import get_upper_triag

# Settings

In [2]:
# Dataset-specific configuration (directories and filename templates) used below.
DATASET_CONFIG = conf.GTEX
# Whole blood by default; this is a parameters cell, so the values can be
# changed when running the notebook with papermill.
GTEX_TISSUE = "whole_blood"
# Value substituted for the gene selection strategy in input/output file names.
GENE_SEL_STRATEGY = "var_pc_log2"

# Name of the subdirectory of COMPARISONS_DIR used to build INPUT_DIR.
METHOD_NAME = "mic"

# Paths

In [3]:
# Directory where the combined dataframe is saved (see OUTPUT_FILE); it is
# created here, together with any missing parents, if it does not exist yet.
COMPARISONS_DIR = DATASET_CONFIG["RESULTS_DIR"] / "comparison_others"
COMPARISONS_DIR.mkdir(parents=True, exist_ok=True)
display(COMPARISONS_DIR)

PosixPath('/opt/data/results/gtex_v8/comparison_others')

In [4]:
# Method-specific subdirectory of COMPARISONS_DIR.
# NOTE(review): INPUT_DIR is not referenced by any other cell visible in this
# notebook (the inputs are read through INPUT_CORR_FILE_TEMPLATE) — confirm
# whether it is still needed.
INPUT_DIR = COMPARISONS_DIR / METHOD_NAME
display(INPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/comparison_others/mic')

In [5]:
# Path template of the per-method similarity matrices. Its {tissue},
# {gene_sel_strategy} and {corr_method} placeholders are filled in with
# str.format() when each matrix is loaded.
INPUT_CORR_FILE_TEMPLATE = (
    DATASET_CONFIG["SIMILARITY_MATRICES_DIR"]
    / DATASET_CONFIG["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
)
display(INPUT_CORR_FILE_TEMPLATE)

PosixPath('/opt/data/results/gtex_v8/similarity_matrices/gtex_v8_data_{tissue}-{gene_sel_strategy}-{corr_method}.pkl')

In [6]:
# INPUT_CORR_FILE_TEMPLATE is already defined in the previous cell with this
# exact expression, so it is only displayed here instead of being rebuilt.
display(INPUT_CORR_FILE_TEMPLATE)

PosixPath('/opt/data/results/gtex_v8/similarity_matrices/gtex_v8_data_{tissue}-{gene_sel_strategy}-{corr_method}.pkl')

In [7]:
# Destination of the combined dataframe (one column per coefficient) for the
# selected tissue and gene selection strategy.
OUTPUT_FILE = (
    COMPARISONS_DIR / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}-all.pkl"
)
display(OUTPUT_FILE)

PosixPath('/opt/data/results/gtex_v8/comparison_others/gtex_v8_data_whole_blood-var_pc_log2-all.pkl')

# Load data

## CCC

In [8]:
# Load the gene-by-gene CCC matrix for the selected tissue by filling in the
# placeholders of the similarity matrix path template.
clustermatch_df = pd.read_pickle(
    str(INPUT_CORR_FILE_TEMPLATE).format(
        tissue=GTEX_TISSUE,
        gene_sel_strategy=GENE_SEL_STRATEGY,
        corr_method="ccc",
    )
)

In [9]:
clustermatch_df.shape

(5000, 5000)

In [10]:
clustermatch_df.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000122033.14,ENSG00000145779.7,ENSG00000196396.9,ENSG00000216490.3,ENSG00000135521.8,ENSG00000198478.7,ENSG00000168137.15,ENSG00000182197.10,ENSG00000111641.11,ENSG00000168528.11
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.780181,0.401717,0.789583,0.139359,0.524274,0.048667,0.365544,0.799041,0.343411,...,0.212632,0.037166,0.012264,0.016958,0.053866,0.133457,0.081476,0.212632,0.112897,0.022719
ENSG00000135245.9,0.780181,1.0,0.429092,0.780181,0.159947,0.516615,0.042631,0.334136,0.752314,0.355974,...,0.237858,0.051243,0.042777,0.035816,0.094566,0.145388,0.075009,0.217565,0.099542,0.01966
ENSG00000163631.16,0.401717,0.429092,1.0,0.408476,0.127681,0.401717,0.020316,0.2327,0.381776,0.261762,...,0.153623,0.041971,0.008775,0.009087,0.053706,0.090792,0.021106,0.153623,0.047663,0.009211
ENSG00000277632.1,0.789583,0.780181,0.408476,1.0,0.151543,0.547588,0.055712,0.34653,0.761547,0.381776,...,0.222554,0.036365,0.023933,0.017412,0.044225,0.129593,0.065463,0.188812,0.147426,0.011364
ENSG00000239839.6,0.139359,0.159947,0.127681,0.151543,1.0,0.272752,0.008065,0.100699,0.135411,0.318958,...,0.048843,0.012149,0.018039,0.005696,0.021103,0.022719,0.003596,0.100699,0.018589,0.010845


In [11]:
# Keep only the upper triangle so that each gene pair is represented once.
# NOTE(review): presumably get_upper_triag sets the diagonal and the lower
# triangle to NaN (those entries are removed by dropna() in the next cell) —
# confirm against ccc.utils.
clustermatch_df = get_upper_triag(clustermatch_df)

In [12]:
# Flatten the matrix into a Series indexed by gene pairs:
#  - unstack() produces a long Series with a two-level (gene, gene) index,
#  - rename_axis() clears the names of both index levels,
#  - dropna() discards the NaN entries (presumably the ones blanked by
#    get_upper_triag),
#  - sort_index() orders the pairs the same way for every coefficient,
#  - rename() sets the name that becomes the column label after merging.
clustermatch_df = (
    clustermatch_df.unstack()
    .rename_axis((None, None))
    .dropna()
    .sort_index()
    .rename("ccc")
)

In [13]:
clustermatch_df.shape

(12497500,)

In [14]:
clustermatch_df.head()

ENSG00000000419.12  ENSG00000002834.17    0.418721
                    ENSG00000002919.14    0.405090
                    ENSG00000002933.7     0.007466
                    ENSG00000003402.19    0.391683
                    ENSG00000004478.7     0.099013
Name: ccc, dtype: float64

## Pearson

In [15]:
# Load the gene-by-gene Pearson matrix for the selected tissue by filling in
# the placeholders of the similarity matrix path template.
pearson_df = pd.read_pickle(
    str(INPUT_CORR_FILE_TEMPLATE).format(
        tissue=GTEX_TISSUE,
        gene_sel_strategy=GENE_SEL_STRATEGY,
        corr_method="pearson",
    )
)

In [16]:
pearson_df.shape

(5000, 5000)

In [17]:
pearson_df.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000122033.14,ENSG00000145779.7,ENSG00000196396.9,ENSG00000216490.3,ENSG00000135521.8,ENSG00000198478.7,ENSG00000168137.15,ENSG00000182197.10,ENSG00000111641.11,ENSG00000168528.11
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.571672,0.126906,0.456538,0.12714,0.256347,0.300879,-0.268024,0.472277,0.355504,...,-0.087559,0.181072,0.098032,0.012809,0.048569,-0.125537,0.449339,-0.198314,0.298775,0.041489
ENSG00000135245.9,0.571672,1.0,0.110612,0.438929,0.209161,0.223671,0.20337,-0.325667,0.532544,0.406272,...,-0.198912,0.018133,0.165431,0.052732,-0.123459,-0.192481,0.478504,-0.223546,0.259206,0.116396
ENSG00000163631.16,0.126906,0.110612,1.0,0.129248,0.021091,0.054572,0.072717,-0.131141,0.235882,0.098683,...,-0.095401,-0.077452,-0.084369,-0.077808,-0.09349,-0.037774,0.02526,-0.12335,0.049174,0.106998
ENSG00000277632.1,0.456538,0.438929,0.129248,1.0,0.199647,0.238356,0.366739,-0.361011,0.406964,0.416603,...,-0.17595,0.107436,0.118997,0.027369,0.060482,-0.212037,0.420059,-0.304038,0.509854,0.027239
ENSG00000239839.6,0.12714,0.209161,0.021091,0.199647,1.0,0.229287,0.022563,-0.183318,0.07741,0.541731,...,-0.058021,0.051451,-0.040586,-0.013576,-0.015835,-0.115248,0.072186,-0.219001,0.076205,0.112401


In [18]:
# Keep only the upper triangle so that each gene pair is represented once.
# NOTE(review): presumably get_upper_triag sets the diagonal and the lower
# triangle to NaN (those entries are removed by dropna() in the next cell) —
# confirm against ccc.utils.
pearson_df = get_upper_triag(pearson_df)

In [19]:
# Flatten the matrix into a sorted Series indexed by gene pairs and take the
# absolute value: Pearson is signed, and only its magnitude is kept here.
pearson_df = (
    pearson_df.unstack()
    .rename_axis((None, None))
    .dropna()
    .abs()
    .sort_index()
    .rename("pearson")
)

In [20]:
pearson_df.shape

(12497500,)

In [21]:
pearson_df.head()

ENSG00000000419.12  ENSG00000002834.17    0.681847
                    ENSG00000002919.14    0.734699
                    ENSG00000002933.7     0.013825
                    ENSG00000003402.19    0.727347
                    ENSG00000004478.7     0.094147
Name: pearson, dtype: float64

## Spearman

In [22]:
# Load the gene-by-gene Spearman matrix for the selected tissue by filling in
# the placeholders of the similarity matrix path template.
spearman_df = pd.read_pickle(
    str(INPUT_CORR_FILE_TEMPLATE).format(
        tissue=GTEX_TISSUE,
        gene_sel_strategy=GENE_SEL_STRATEGY,
        corr_method="spearman",
    )
)

In [23]:
spearman_df.shape

(5000, 5000)

In [24]:
spearman_df.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000122033.14,ENSG00000145779.7,ENSG00000196396.9,ENSG00000216490.3,ENSG00000135521.8,ENSG00000198478.7,ENSG00000168137.15,ENSG00000182197.10,ENSG00000111641.11,ENSG00000168528.11
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.853836,0.70924,0.850371,0.379835,0.760102,0.243599,-0.694446,0.866183,0.657452,...,-0.376521,-0.008912,-0.002147,-0.069521,-0.11974,-0.341219,0.457231,-0.458056,0.43551,-0.126189
ENSG00000135245.9,0.853836,1.0,0.675719,0.834496,0.422539,0.73584,0.233504,-0.574572,0.827193,0.626538,...,-0.350526,-0.017997,0.052715,0.029882,-0.129671,-0.361843,0.4594,-0.373947,0.449254,0.043939
ENSG00000163631.16,0.70924,0.675719,1.0,0.683018,0.388814,0.730922,0.177607,-0.558213,0.667917,0.613913,...,-0.427955,-0.190712,-0.125631,-0.109937,-0.253229,-0.332238,0.205314,-0.452284,0.309117,-0.073344
ENSG00000277632.1,0.850371,0.834496,0.683018,1.0,0.402836,0.756369,0.281333,-0.667793,0.808998,0.659031,...,-0.359078,-0.009396,0.04782,-0.001894,-0.07627,-0.343856,0.41941,-0.436411,0.546328,-0.08034
ENSG00000239839.6,0.379835,0.422539,0.388814,0.402836,1.0,0.626519,0.045501,-0.341542,0.297492,0.691271,...,-0.21208,-0.041432,-0.137451,-0.072243,-0.097927,-0.151915,0.086806,-0.350749,0.188874,0.140163


In [25]:
# Keep only the upper triangle so that each gene pair is represented once.
# NOTE(review): presumably get_upper_triag sets the diagonal and the lower
# triangle to NaN (those entries are removed by dropna() in the next cell) —
# confirm against ccc.utils.
spearman_df = get_upper_triag(spearman_df)

In [26]:
# Flatten the matrix into a sorted Series indexed by gene pairs and take the
# absolute value: Spearman is signed, and only its magnitude is kept here.
spearman_df = (
    spearman_df.unstack()
    .rename_axis((None, None))
    .dropna()
    .abs()
    .sort_index()
    .rename("spearman")
)

In [27]:
spearman_df.shape

(12497500,)

In [28]:
spearman_df.head()

ENSG00000000419.12  ENSG00000002834.17    0.786595
                    ENSG00000002919.14    0.816991
                    ENSG00000002933.7     0.004128
                    ENSG00000003402.19    0.803653
                    ENSG00000004478.7     0.231269
Name: spearman, dtype: float64

## MIC

In [29]:
# Load the gene-by-gene MIC matrix for the selected tissue. Note that the file
# is stored under the "mic_parallel" method name, while the resulting column
# is later named "mic".
mic_df = pd.read_pickle(
    str(INPUT_CORR_FILE_TEMPLATE).format(
        tissue=GTEX_TISSUE,
        gene_sel_strategy=GENE_SEL_STRATEGY,
        corr_method="mic_parallel",
    )
)

In [30]:
mic_df.shape

(5000, 5000)

In [31]:
mic_df.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000122033.14,ENSG00000145779.7,ENSG00000196396.9,ENSG00000216490.3,ENSG00000135521.8,ENSG00000198478.7,ENSG00000168137.15,ENSG00000182197.10,ENSG00000111641.11,ENSG00000168528.11
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.898121,0.559862,0.928075,0.274772,0.650528,0.214942,0.512846,0.920077,0.477458,...,0.360014,0.236945,0.175348,0.174412,0.240515,0.273663,0.271071,0.348093,0.272177,0.162018
ENSG00000135245.9,0.898121,1.0,0.600884,0.893781,0.272898,0.651888,0.184421,0.489723,0.869179,0.497482,...,0.424468,0.262601,0.25913,0.230842,0.334221,0.291673,0.294321,0.376545,0.307811,0.214136
ENSG00000163631.16,0.559862,0.600884,1.0,0.557656,0.247089,0.549351,0.161035,0.369418,0.553536,0.388951,...,0.27446,0.181878,0.151251,0.146727,0.190666,0.250033,0.157165,0.275665,0.192901,0.16343
ENSG00000277632.1,0.928075,0.893781,0.557656,1.0,0.27596,0.682808,0.214711,0.501672,0.87271,0.528294,...,0.358499,0.209312,0.195997,0.174349,0.241799,0.27128,0.236502,0.357468,0.311581,0.14656
ENSG00000239839.6,0.274772,0.272898,0.247089,0.27596,1.0,0.393518,0.164051,0.237542,0.251768,0.480873,...,0.207756,0.185433,0.153967,0.14414,0.184685,0.166501,0.142441,0.245541,0.174094,0.156663


In [32]:
# Keep only the upper triangle so that each gene pair is represented once.
# NOTE(review): presumably get_upper_triag sets the diagonal and the lower
# triangle to NaN (those entries are removed by dropna() in the next cell) —
# confirm against ccc.utils.
mic_df = get_upper_triag(mic_df)

In [33]:
# Flatten the matrix into a sorted Series indexed by gene pairs, following the
# same chain of steps used for the other coefficients.
mic_df = (
    mic_df.unstack()
    .rename_axis((None, None))
    .dropna()
    .sort_index()
    .rename("mic")
)

In [34]:
mic_df.shape

(12497500,)

In [35]:
mic_df.head()

ENSG00000000419.12  ENSG00000002834.17    0.569503
                    ENSG00000002919.14    0.613696
                    ENSG00000002933.7     0.146093
                    ENSG00000003402.19    0.565137
                    ENSG00000004478.7     0.253103
Name: mic, dtype: float64

## Checks

In [36]:
# every CCC gene pair must also be present in the MIC results
assert len(set(clustermatch_df.index) & set(mic_df.index)) == len(clustermatch_df.index)

In [37]:
# every MIC gene pair must also be present in the CCC results
assert len(set(clustermatch_df.index) & set(mic_df.index)) == len(mic_df.index)

In [38]:
# every MIC gene pair must also be present in the Pearson results
assert len(set(pearson_df.index) & set(mic_df.index)) == len(mic_df.index)

In [39]:
# every MIC gene pair must also be present in the Spearman results
assert len(set(spearman_df.index) & set(mic_df.index)) == len(mic_df.index)

## Merge

In [40]:
# One column per coefficient; the inner join keeps only the gene pairs that are
# present in all four Series.
coef_series = [clustermatch_df, pearson_df, spearman_df, mic_df]
df = pd.concat(coef_series, axis=1, join="inner").sort_index()

In [41]:
display(df.shape)
# the merged dataframe must have exactly as many gene pairs as the MIC results
assert len(df) == len(mic_df)

(12497500, 4)

In [42]:
# every coefficient must have a value for every gene pair
assert df.notna().all().all()

In [43]:
df.head()

Unnamed: 0,Unnamed: 1,ccc,pearson,spearman,mic
ENSG00000000419.12,ENSG00000002834.17,0.418721,0.681847,0.786595,0.569503
ENSG00000000419.12,ENSG00000002919.14,0.40509,0.734699,0.816991,0.613696
ENSG00000000419.12,ENSG00000002933.7,0.007466,0.013825,0.004128,0.146093
ENSG00000000419.12,ENSG00000003402.19,0.391683,0.727347,0.803653,0.565137
ENSG00000000419.12,ENSG00000004478.7,0.099013,0.094147,0.231269,0.253103


# Save

In [44]:
# Persist the combined dataframe (gene pairs x coefficients) for later processing.
df.to_pickle(OUTPUT_FILE)