# Description

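This notebook combines the correlation coefficients computed with CCC, Pearson and Spearman for a single tissue (see `Settings` below) into one dataframe, with one row per gene pair and one column per coefficient, for easier processing later. Pearson and Spearman coefficients are stored as absolute values.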

# Modules loading

In [1]:
import pandas as pd

from ccc import conf
from ccc.utils import get_upper_triag

# Settings

In [2]:
# dataset configuration; it provides the directories and file name templates used below
DATASET_CONFIG = conf.GTEX
# whole blood by default, but this is a parameters cell, so these values can be
# overridden when running the notebook with papermill
GTEX_TISSUE = "whole_blood"
# identifier that is part of the input/output file names
GENE_SEL_STRATEGY = "var_pc_log2"

In [3]:
# the tissue can be overridden externally (see the parameters cell above); fail early if it was unset
assert GTEX_TISSUE is not None, "Tissue not selected"

# Paths

In [4]:
# Gene expression file for the selected tissue and gene selection strategy,
# stored under GENE_SELECTION_DIR.
INPUT_GENE_EXPR_DATA_FILE = (
    DATASET_CONFIG["GENE_SELECTION_DIR"]
    / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_EXPR_DATA_FILE)

# fail early and report which file is missing (e.g. a mistyped tissue name)
assert (
    INPUT_GENE_EXPR_DATA_FILE.exists()
), f"Gene expression file not found: {INPUT_GENE_EXPR_DATA_FILE}"

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [5]:
# Path template for the per-method similarity matrices; its placeholders are
# filled in with `str.format` when each matrix is loaded.
INPUT_CORR_FILE_TEMPLATE = DATASET_CONFIG["SIMILARITY_MATRICES_DIR"].joinpath(
    DATASET_CONFIG["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
)
display(INPUT_CORR_FILE_TEMPLATE)

PosixPath('/opt/data/results/gtex_v8/similarity_matrices/gtex_v8_data_{tissue}-{gene_sel_strategy}-{corr_method}.pkl')

In [6]:
# The output file lives next to the input similarity matrices and uses the same
# file name template, with "all" as the correlation method.
# INPUT_CORR_FILE_TEMPLATE already includes SIMILARITY_MATRICES_DIR, so only its
# file name is filled in here. Joining the directory again with the formatted
# full path would only work when that path is absolute (pathlib then discards
# the left operand) and would duplicate the directory otherwise.
OUTPUT_FILE = INPUT_CORR_FILE_TEMPLATE.with_name(
    INPUT_CORR_FILE_TEMPLATE.name.format(
        tissue=GTEX_TISSUE,
        gene_sel_strategy=GENE_SEL_STRATEGY,
        corr_method="all",
    )
)
display(OUTPUT_FILE)

PosixPath('/opt/data/results/gtex_v8/similarity_matrices/gtex_v8_data_whole_blood-var_pc_log2-all.pkl')

# Load data

## Gene Ensembl ID -> Symbol mapping

In [7]:
# Load the table with the Ensembl ID / gene symbol mappings from DATA_DIR
gene_map = pd.read_pickle(
    DATASET_CONFIG["DATA_DIR"].joinpath("gtex_gene_id_symbol_mappings.pkl")
)

In [8]:
# Convert the mapping table into a plain dict: Ensembl ID -> gene symbol.
# NOTE(review): this rebinds `gene_map` from the loaded table to a dict, so
# re-running this cell without re-running the previous one fails (a dict has no
# `set_index`).
gene_map = gene_map.set_index("gene_ens_id")["gene_symbol"].to_dict()

In [9]:
# sanity check of the mapping with one known Ensembl ID / symbol pair
assert gene_map["ENSG00000145309.5"] == "CABS1"

## Gene expression

In [10]:
# gene expression matrix; per the preview below, rows are Ensembl gene IDs and columns are GTEx samples
data = pd.read_pickle(INPUT_GENE_EXPR_DATA_FILE)

In [11]:
data.shape

(5000, 755)

In [12]:
data.head()

Unnamed: 0_level_0,GTEX-111YS-0006-SM-5NQBE,GTEX-1122O-0005-SM-5O99J,GTEX-1128S-0005-SM-5P9HI,GTEX-113IC-0006-SM-5NQ9C,GTEX-113JC-0006-SM-5O997,GTEX-117XS-0005-SM-5PNU6,GTEX-117YW-0005-SM-5NQ8Z,GTEX-1192W-0005-SM-5NQBQ,GTEX-1192X-0005-SM-5NQC3,GTEX-11DXW-0006-SM-5NQ7Y,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,0.5623,0.8067,116.9,4.047,211.0,58.11,68.38,249.5,5.095,295.9,...,39.96,0.1393,0.2238,245.0,513.6,1626.0,0.5633,515.7,1.194,1163.0
ENSG00000135245.9,0.6529,1.385,199.2,2.266,116.7,192.3,161.5,263.5,23.54,251.9,...,114.3,1.833,0.4115,149.0,935.3,233.6,0.8882,134.0,1.12,295.7
ENSG00000163631.16,1.848,0.2503,0.08429,1.251,1348.0,9.971,101.3,95.09,1.264,119.3,...,2.092,2.11,0.03588,171.8,107.1,71.25,1.772,309.6,0.07361,17.75
ENSG00000277632.1,1.696,1.345,235.1,11.77,141.7,199.1,525.5,659.9,10.91,209.3,...,61.34,2.25,0.7231,261.2,400.0,288.5,2.696,287.5,3.323,618.9
ENSG00000239839.6,185.2,1.779,694.3,23.84,297.3,3122.0,2521.0,1504.0,80.06,652.0,...,1010.0,253.8,94.52,6083.0,2768.0,52.06,34.57,17.36,352.3,63.85


## CCC

In [13]:
# Resolve the CCC similarity matrix file for the configured tissue and gene
# selection strategy, then load it.
ccc_matrix_path = str(INPUT_CORR_FILE_TEMPLATE).format(
    corr_method="ccc",
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
)
clustermatch_df = pd.read_pickle(ccc_matrix_path)

In [14]:
clustermatch_df.shape

(5000, 5000)

In [15]:
clustermatch_df.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000122033.14,ENSG00000145779.7,ENSG00000196396.9,ENSG00000216490.3,ENSG00000135521.8,ENSG00000198478.7,ENSG00000168137.15,ENSG00000182197.10,ENSG00000111641.11,ENSG00000168528.11
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.780181,0.401717,0.789583,0.139359,0.524274,0.048667,0.365544,0.799041,0.343411,...,0.212632,0.037166,0.012264,0.016958,0.053866,0.133457,0.081476,0.212632,0.112897,0.022719
ENSG00000135245.9,0.780181,1.0,0.429092,0.780181,0.159947,0.516615,0.042631,0.334136,0.752314,0.355974,...,0.237858,0.051243,0.042777,0.035816,0.094566,0.145388,0.075009,0.217565,0.099542,0.01966
ENSG00000163631.16,0.401717,0.429092,1.0,0.408476,0.127681,0.401717,0.020316,0.2327,0.381776,0.261762,...,0.153623,0.041971,0.008775,0.009087,0.053706,0.090792,0.021106,0.153623,0.047663,0.009211
ENSG00000277632.1,0.789583,0.780181,0.408476,1.0,0.151543,0.547588,0.055712,0.34653,0.761547,0.381776,...,0.222554,0.036365,0.023933,0.017412,0.044225,0.129593,0.065463,0.188812,0.147426,0.011364
ENSG00000239839.6,0.139359,0.159947,0.127681,0.151543,1.0,0.272752,0.008065,0.100699,0.135411,0.318958,...,0.048843,0.012149,0.018039,0.005696,0.021103,0.022719,0.003596,0.100699,0.018589,0.010845


In [16]:
# the CCC matrix rows must be the same genes, in the same order, as the expression data
assert data.index.equals(clustermatch_df.index)

## Pearson

In [17]:
# Resolve the Pearson similarity matrix file for the configured tissue and gene
# selection strategy, then load it.
pearson_matrix_path = str(INPUT_CORR_FILE_TEMPLATE).format(
    corr_method="pearson",
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
)
pearson_df = pd.read_pickle(pearson_matrix_path)

In [18]:
pearson_df.shape

(5000, 5000)

In [19]:
pearson_df.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000122033.14,ENSG00000145779.7,ENSG00000196396.9,ENSG00000216490.3,ENSG00000135521.8,ENSG00000198478.7,ENSG00000168137.15,ENSG00000182197.10,ENSG00000111641.11,ENSG00000168528.11
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.571672,0.126906,0.456538,0.12714,0.256347,0.300879,-0.268024,0.472277,0.355504,...,-0.087559,0.181072,0.098032,0.012809,0.048569,-0.125537,0.449339,-0.198314,0.298775,0.041489
ENSG00000135245.9,0.571672,1.0,0.110612,0.438929,0.209161,0.223671,0.20337,-0.325667,0.532544,0.406272,...,-0.198912,0.018133,0.165431,0.052732,-0.123459,-0.192481,0.478504,-0.223546,0.259206,0.116396
ENSG00000163631.16,0.126906,0.110612,1.0,0.129248,0.021091,0.054572,0.072717,-0.131141,0.235882,0.098683,...,-0.095401,-0.077452,-0.084369,-0.077808,-0.09349,-0.037774,0.02526,-0.12335,0.049174,0.106998
ENSG00000277632.1,0.456538,0.438929,0.129248,1.0,0.199647,0.238356,0.366739,-0.361011,0.406964,0.416603,...,-0.17595,0.107436,0.118997,0.027369,0.060482,-0.212037,0.420059,-0.304038,0.509854,0.027239
ENSG00000239839.6,0.12714,0.209161,0.021091,0.199647,1.0,0.229287,0.022563,-0.183318,0.07741,0.541731,...,-0.058021,0.051451,-0.040586,-0.013576,-0.015835,-0.115248,0.072186,-0.219001,0.076205,0.112401


In [20]:
# the Pearson matrix rows must be the same genes, in the same order, as the expression data
assert data.index.equals(pearson_df.index)

## Spearman

In [21]:
# Resolve the Spearman similarity matrix file for the configured tissue and gene
# selection strategy, then load it.
spearman_matrix_path = str(INPUT_CORR_FILE_TEMPLATE).format(
    corr_method="spearman",
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
)
spearman_df = pd.read_pickle(spearman_matrix_path)

In [22]:
spearman_df.shape

(5000, 5000)

In [23]:
spearman_df.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000122033.14,ENSG00000145779.7,ENSG00000196396.9,ENSG00000216490.3,ENSG00000135521.8,ENSG00000198478.7,ENSG00000168137.15,ENSG00000182197.10,ENSG00000111641.11,ENSG00000168528.11
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.853836,0.70924,0.850371,0.379835,0.760102,0.243599,-0.694446,0.866183,0.657452,...,-0.376521,-0.008912,-0.002147,-0.069521,-0.11974,-0.341219,0.457231,-0.458056,0.43551,-0.126189
ENSG00000135245.9,0.853836,1.0,0.675719,0.834496,0.422539,0.73584,0.233504,-0.574572,0.827193,0.626538,...,-0.350526,-0.017997,0.052715,0.029882,-0.129671,-0.361843,0.4594,-0.373947,0.449254,0.043939
ENSG00000163631.16,0.70924,0.675719,1.0,0.683018,0.388814,0.730922,0.177607,-0.558213,0.667917,0.613913,...,-0.427955,-0.190712,-0.125631,-0.109937,-0.253229,-0.332238,0.205314,-0.452284,0.309117,-0.073344
ENSG00000277632.1,0.850371,0.834496,0.683018,1.0,0.402836,0.756369,0.281333,-0.667793,0.808998,0.659031,...,-0.359078,-0.009396,0.04782,-0.001894,-0.07627,-0.343856,0.41941,-0.436411,0.546328,-0.08034
ENSG00000239839.6,0.379835,0.422539,0.388814,0.402836,1.0,0.626519,0.045501,-0.341542,0.297492,0.691271,...,-0.21208,-0.041432,-0.137451,-0.072243,-0.097927,-0.151915,0.086806,-0.350749,0.188874,0.140163


In [24]:
# the Spearman matrix rows must be the same genes, in the same order, as the expression data
assert data.index.equals(spearman_df.index)

## Merge

In [25]:
# Disabled: explicit reindexing of the CCC matrix to the genes in the Pearson
# matrix. The row indexes of the three matrices are already asserted equal to
# the gene expression index in the cells above.
# clustermatch_df = clustermatch_df.loc[pearson_df.index, pearson_df.columns]

In [26]:
# Keep a single entry per gene pair of the gene x gene matrix.
# NOTE(review): presumably `get_upper_triag` sets everything outside the strict
# upper triangle to NaN (the `dropna()` in the next cell then leaves
# n * (n - 1) / 2 values) — confirm in ccc.utils.
clustermatch_df = get_upper_triag(clustermatch_df)

In [27]:
# Flatten the matrix into a Series indexed by gene pair, remove the names of the
# two index levels and discard the NaN entries.
clustermatch_df = (
    clustermatch_df
    .unstack()
    .rename_axis((None, None))
    .dropna()
)

In [28]:
clustermatch_df.shape

(12497500,)

In [29]:
clustermatch_df.head()

ENSG00000135245.9   ENSG00000169429.10    0.780181
ENSG00000163631.16  ENSG00000169429.10    0.401717
                    ENSG00000135245.9     0.429092
ENSG00000277632.1   ENSG00000169429.10    0.789583
                    ENSG00000135245.9     0.780181
dtype: float64

In [30]:
# Keep a single entry per gene pair of the gene x gene matrix.
# NOTE(review): presumably `get_upper_triag` sets everything outside the strict
# upper triangle to NaN (the `dropna()` in the next cell then leaves
# n * (n - 1) / 2 values) — confirm in ccc.utils.
pearson_df = get_upper_triag(pearson_df)

In [31]:
# Flatten the matrix into a Series indexed by gene pair, discard the NaN
# entries and keep only the magnitude (absolute value) of each coefficient.
pearson_df = (
    pearson_df
    .unstack()
    .rename_axis((None, None))
    .dropna()
    .abs()
)

In [32]:
pearson_df.shape

(12497500,)

In [33]:
pearson_df.head()

ENSG00000135245.9   ENSG00000169429.10    0.571672
ENSG00000163631.16  ENSG00000169429.10    0.126906
                    ENSG00000135245.9     0.110612
ENSG00000277632.1   ENSG00000169429.10    0.456538
                    ENSG00000135245.9     0.438929
dtype: float64

In [34]:
# CCC and Pearson values must refer to the same gene pairs, in the same order
assert clustermatch_df.index.equals(pearson_df.index)

In [35]:
# Keep a single entry per gene pair of the gene x gene matrix.
# NOTE(review): presumably `get_upper_triag` sets everything outside the strict
# upper triangle to NaN (the `dropna()` in the next cell then leaves
# n * (n - 1) / 2 values) — confirm in ccc.utils.
spearman_df = get_upper_triag(spearman_df)

In [36]:
# Flatten the matrix into a Series indexed by gene pair, discard the NaN
# entries and keep only the magnitude (absolute value) of each coefficient.
spearman_df = (
    spearman_df
    .unstack()
    .rename_axis((None, None))
    .dropna()
    .abs()
)

In [37]:
spearman_df.shape

(12497500,)

In [38]:
spearman_df.head()

ENSG00000135245.9   ENSG00000169429.10    0.853836
ENSG00000163631.16  ENSG00000169429.10    0.709240
                    ENSG00000135245.9     0.675719
ENSG00000277632.1   ENSG00000169429.10    0.850371
                    ENSG00000135245.9     0.834496
dtype: float64

In [39]:
# CCC and Spearman values must refer to the same gene pairs, in the same order
assert clustermatch_df.index.equals(spearman_df.index)

In [40]:
# Combine the three coefficients: one row per gene pair, one column per
# method, with rows sorted by gene pair.
df = pd.DataFrame(
    dict(ccc=clustermatch_df, pearson=pearson_df, spearman=spearman_df)
).sort_index()

In [41]:
# every gene pair must have a value for all three coefficients
assert not df.isna().any().any()

In [42]:
df.shape

(12497500, 3)

In [43]:
df.head()

Unnamed: 0,Unnamed: 1,ccc,pearson,spearman
ENSG00000000419.12,ENSG00000002834.17,0.418721,0.681847,0.786595
ENSG00000000419.12,ENSG00000002919.14,0.40509,0.734699,0.816991
ENSG00000000419.12,ENSG00000002933.7,0.007466,0.013825,0.004128
ENSG00000000419.12,ENSG00000003402.19,0.391683,0.727347,0.803653
ENSG00000000419.12,ENSG00000004478.7,0.099013,0.094147,0.231269


# Save

In [44]:
# persist the combined coefficients as a pickle file in the similarity matrices directory
df.to_pickle(OUTPUT_FILE)