# Description

This notebook combines the clustermatch, Pearson and Spearman coefficient values computed for one GTEx tissue (see `Settings` below) into a single dataframe for easier processing later.

# Modules loading

In [1]:
import numpy as np
import pandas as pd

from clustermatch import conf

# Settings

In [2]:
# GTEx settings from the project's `conf` module; indexed by string keys below.
DATASET_CONFIG = conf.GTEX
# Placeholder: a later cell must assign the tissue (a check below fails on None).
GTEX_TISSUE = None
# Gene selection strategy tag; it is embedded in the input/output file names.
GENE_SEL_STRATEGY = "var_pc_log2"

In [3]:
# Parameters
# Replaces the `None` placeholder from the Settings cell with the tissue to process.
GTEX_TISSUE = "skin_sun_exposed_lower_leg"

In [4]:
# Fail early if no tissue was provided: the input and output file names below
# embed the tissue name.
assert GTEX_TISSUE is not None, "Tissue not selected"

# Paths

In [5]:
# Gene expression file for the selected tissue and gene selection strategy.
INPUT_GENE_EXPR_DATA_FILE = (
    DATASET_CONFIG["GENE_SELECTION_DIR"]
    / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_EXPR_DATA_FILE)

# Include the path in the failure message so a missing input is easy to diagnose.
assert (
    INPUT_GENE_EXPR_DATA_FILE.exists()
), f"Gene expression file not found: {INPUT_GENE_EXPR_DATA_FILE}"

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2.pkl')

In [6]:
# Path template for the per-method similarity matrices; its placeholders are
# filled in with `str.format` by the cells that load or save those files.
INPUT_CORR_FILE_TEMPLATE = DATASET_CONFIG["SIMILARITY_MATRICES_DIR"].joinpath(
    DATASET_CONFIG["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
)
display(INPUT_CORR_FILE_TEMPLATE)

PosixPath('/opt/data/results/gtex_v8/similarity_matrices/gtex_v8_data_{tissue}-{gene_sel_strategy}-{corr_method}.pkl')

In [7]:
# Output file: same directory and naming scheme as the per-method similarity
# matrices, with "all" in place of the correlation method name.
#
# INPUT_CORR_FILE_TEMPLATE already contains the similarity matrices directory,
# so the directory must not be joined again: that only works when the directory
# is absolute and would duplicate it otherwise. Only the file name is formatted,
# so braces in a directory name cannot interfere with `str.format`.
OUTPUT_FILE = INPUT_CORR_FILE_TEMPLATE.parent / INPUT_CORR_FILE_TEMPLATE.name.format(
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
    corr_method="all",
)
display(OUTPUT_FILE)

PosixPath('/opt/data/results/gtex_v8/similarity_matrices/gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-all.pkl')

# Load data

## Gene Ensembl ID -> Symbol mapping

In [8]:
# Load the table with the Ensembl gene ID / gene symbol mappings.
_gene_map_path = DATASET_CONFIG["DATA_DIR"] / "gtex_gene_id_symbol_mappings.pkl"
gene_map = pd.read_pickle(_gene_map_path)

In [9]:
# Collapse the mapping table into a plain dict: Ensembl gene ID -> gene symbol.
# This cell rebinds `gene_map` from a dataframe to a dict, so the conversion is
# guarded to keep the cell safe to re-run without reloading the table.
if not isinstance(gene_map, dict):
    gene_map = gene_map.set_index("gene_ens_id")["gene_symbol"].to_dict()

In [10]:
# Sanity check of the mapping on one known Ensembl ID / symbol pair.
assert gene_map["ENSG00000145309.5"] == "CABS1"

## Gene expression

In [11]:
# Gene expression data for the selected tissue; used below to check that the
# similarity matrices follow the same gene order.
data = pd.read_pickle(INPUT_GENE_EXPR_DATA_FILE)

In [12]:
# (number of genes, number of samples)
data.shape

(5000, 701)

In [13]:
# Preview: rows are indexed by Ensembl gene ID, columns are GTEx sample IDs.
data.head()

Unnamed: 0_level_0,GTEX-111FC-0126-SM-5N9DL,GTEX-111VG-2426-SM-5GZXD,GTEX-1122O-2126-SM-5EGIR,GTEX-1128S-2326-SM-5GZZY,GTEX-113IC-0126-SM-5HL6T,GTEX-113JC-2326-SM-5EQ4E,GTEX-117XS-2726-SM-5N9BL,GTEX-117YW-2626-SM-5GZZH,GTEX-117YX-2326-SM-5H12W,GTEX-1192W-2626-SM-5Q5AF,...,GTEX-ZXG5-0126-SM-5GIEU,GTEX-ZY6K-1826-SM-5GZXK,GTEX-ZYFC-0226-SM-5NQ75,GTEX-ZYFD-0126-SM-5GIDL,GTEX-ZYFG-2326-SM-5E44B,GTEX-ZYT6-0226-SM-5NQ6T,GTEX-ZYW4-0126-SM-5E44A,GTEX-ZYY3-0126-SM-5GZY5,GTEX-ZZ64-1726-SM-5GZYB,GTEX-ZZPT-0226-SM-5E43X
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000129824.15,189.0,207.2,0.3053,0.3572,239.1,0.5725,187.4,243.7,207.0,214.5,...,234.0,205.9,238.9,177.8,0.3282,234.9,237.6,0.7213,234.6,251.5
ENSG00000135443.8,0.1686,0.5698,0.07342,0.1438,0.1385,0.01785,0.0131,144.9,2.615,0.03579,...,0.0,0.7029,0.141,0.1027,0.2386,0.1443,0.3567,18.76,0.3739,0.1523
ENSG00000134184.12,0.1042,0.4526,0.3969,0.2019,144.4,0.2757,36.33,44.48,0.0,0.2764,...,113.5,0.5192,0.3268,0.4757,0.3103,33.07,0.5799,66.89,0.2625,43.67
ENSG00000204897.6,0.1456,0.1845,0.08913,0.04231,0.0,0.02889,0.1696,343.7,4.717,0.08689,...,0.02978,0.1484,0.08561,0.2327,0.08129,0.1401,0.2431,3.046,0.4952,0.02466
ENSG00000224114.1,0.9824,93.3,2.979,90.49,129.6,117.8,1.063,0.501,63.33,87.24,...,1.608,0.7631,99.16,94.47,1.097,0.7206,119.2,1.011,0.2122,118.5


## Clustermatch

In [14]:
# Resolve the file name for the clustermatch coefficients and load the matrix.
_clustermatch_file = str(INPUT_CORR_FILE_TEMPLATE).format(
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
    corr_method="clustermatch",
)
clustermatch_df = pd.read_pickle(_clustermatch_file)

In [15]:
# Expected to be square: one row and one column per gene.
clustermatch_df.shape

(5000, 5000)

In [16]:
# Preview of the gene-by-gene clustermatch coefficient matrix.
clustermatch_df.head()

gene_ens_id,ENSG00000129824.15,ENSG00000135443.8,ENSG00000134184.12,ENSG00000204897.6,ENSG00000224114.1,ENSG00000102891.3,ENSG00000205076.4,ENSG00000149968.11,ENSG00000182591.5,ENSG00000212901.3,...,ENSG00000077092.18,ENSG00000206800.1,ENSG00000173156.6,ENSG00000283175.1,ENSG00000139946.9,ENSG00000104951.15,ENSG00000244267.1,ENSG00000235297.3,ENSG00000275426.1,ENSG00000100504.16
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000129824.15,1.0,0.006389,0.004035,0.005282,0.00488,0.005663,0.004179,0.034413,0.007675,0.005549,...,0.004222,0.00325,0.00495,0.009983,0.011104,0.004968,0.003042,0.008672,0.004757,0.027658
ENSG00000135443.8,0.006389,1.0,0.003032,0.430518,0.006299,0.019512,0.005515,0.004505,0.295279,0.289493,...,0.003384,0.00158,0.006088,0.008653,0.011816,0.003302,0.005873,0.005765,0.003738,0.001839
ENSG00000134184.12,0.004035,0.003032,1.0,0.001339,0.00434,0.010379,0.005208,0.007941,0.004895,0.001413,...,0.003296,0.003422,0.001304,0.002881,0.002143,0.023849,0.003708,0.004115,0.006504,0.005597
ENSG00000204897.6,0.005282,0.430518,0.001339,1.0,0.005176,0.018529,0.009803,0.010833,0.268407,0.250168,...,0.00718,0.003628,0.016976,0.006506,0.02062,0.003193,0.00109,0.007587,0.004157,0.001735
ENSG00000224114.1,0.00488,0.006299,0.00434,0.005176,1.0,0.001827,0.011288,0.002426,0.002286,0.00196,...,0.001645,0.005332,0.005387,0.00527,0.003761,0.004261,0.001955,0.042307,0.003158,0.005041


In [17]:
# Rows must follow the same gene order as the expression data.
assert data.index.equals(
    clustermatch_df.index
), "Clustermatch matrix rows do not match the gene expression index"
# The upper-triangle extraction in the Merge section is positional, so columns
# must be in the same order as rows; otherwise gene pairs would be silently
# duplicated or dropped.
assert clustermatch_df.columns.equals(
    clustermatch_df.index
), "Clustermatch matrix columns are not aligned with its rows"

## Pearson

In [18]:
# Resolve the file name for the Pearson coefficients and load the matrix.
_pearson_file = str(INPUT_CORR_FILE_TEMPLATE).format(
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
    corr_method="pearson",
)
pearson_df = pd.read_pickle(_pearson_file)

In [19]:
# Expected to be square: one row and one column per gene.
pearson_df.shape

(5000, 5000)

In [20]:
# Preview of the gene-by-gene Pearson matrix (signed values at this point).
pearson_df.head()

gene_ens_id,ENSG00000129824.15,ENSG00000135443.8,ENSG00000134184.12,ENSG00000204897.6,ENSG00000224114.1,ENSG00000102891.3,ENSG00000205076.4,ENSG00000149968.11,ENSG00000182591.5,ENSG00000212901.3,...,ENSG00000077092.18,ENSG00000206800.1,ENSG00000173156.6,ENSG00000283175.1,ENSG00000139946.9,ENSG00000104951.15,ENSG00000244267.1,ENSG00000235297.3,ENSG00000275426.1,ENSG00000100504.16
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000129824.15,1.0,0.080008,-0.092589,0.059517,0.03382,-0.095897,-0.006031,0.167971,0.07132,0.041228,...,0.034722,0.076105,0.006976,0.01951,-0.092884,0.02559,0.006502,-0.009664,0.003751,0.043949
ENSG00000135443.8,0.080008,1.0,-0.036381,0.884581,0.012467,0.246753,-0.015795,0.060674,0.937499,0.839319,...,0.077306,-0.042748,-0.040638,0.015104,0.033041,-0.049119,-0.052033,-0.027207,0.018714,-0.034333
ENSG00000134184.12,-0.092589,-0.036381,1.0,-0.022172,-0.024376,0.180749,0.02427,0.032569,-0.014736,-0.011177,...,-0.014043,-0.004809,0.002525,-0.054538,-0.003346,-0.113142,-0.023637,-0.045748,-0.044974,-0.075449
ENSG00000204897.6,0.059517,0.884581,-0.022172,1.0,0.026718,0.228186,-0.031767,0.070858,0.801291,0.74154,...,0.068188,-0.067381,-0.049454,0.029482,0.039879,-0.035164,-0.045703,-0.048379,0.030126,-0.015271
ENSG00000224114.1,0.03382,0.012467,-0.024376,0.026718,1.0,-0.038439,-0.02845,-0.086315,0.000396,-0.016647,...,0.016796,0.011701,0.008087,0.000151,-0.007604,-0.006679,-0.005657,0.085799,0.026087,0.033813


In [21]:
# Rows must follow the same gene order as the expression data.
assert data.index.equals(
    pearson_df.index
), "Pearson matrix rows do not match the gene expression index"
# The upper-triangle extraction in the Merge section is positional, so columns
# must be in the same order as rows; otherwise gene pairs would be silently
# duplicated or dropped.
assert pearson_df.columns.equals(
    pearson_df.index
), "Pearson matrix columns are not aligned with its rows"

## Spearman

In [22]:
# Resolve the file name for the Spearman coefficients and load the matrix.
_spearman_file = str(INPUT_CORR_FILE_TEMPLATE).format(
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
    corr_method="spearman",
)
spearman_df = pd.read_pickle(_spearman_file)

In [23]:
# Expected to be square: one row and one column per gene.
spearman_df.shape

(5000, 5000)

In [24]:
# Preview of the gene-by-gene Spearman matrix (signed values at this point).
spearman_df.head()

gene_ens_id,ENSG00000129824.15,ENSG00000135443.8,ENSG00000134184.12,ENSG00000204897.6,ENSG00000224114.1,ENSG00000102891.3,ENSG00000205076.4,ENSG00000149968.11,ENSG00000182591.5,ENSG00000212901.3,...,ENSG00000077092.18,ENSG00000206800.1,ENSG00000173156.6,ENSG00000283175.1,ENSG00000139946.9,ENSG00000104951.15,ENSG00000244267.1,ENSG00000235297.3,ENSG00000275426.1,ENSG00000100504.16
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000129824.15,1.0,-0.026356,-0.048471,-0.026656,0.067988,-0.093766,-0.043562,0.161506,-0.088255,-0.018416,...,0.033364,0.041656,-0.021513,0.102007,-0.000594,0.070636,-0.02252,0.050635,0.030483,0.130514
ENSG00000135443.8,-0.026356,1.0,-0.09545,0.828284,0.067067,0.138146,0.031945,-0.027511,0.699591,0.670228,...,0.059375,-0.064857,0.011348,0.116777,0.084842,0.036756,0.014813,0.045234,0.020591,0.016381
ENSG00000134184.12,-0.048471,-0.09545,1.0,-0.060467,-0.035682,0.10987,0.021547,0.025481,-0.04894,-0.047545,...,0.052675,0.007082,0.003952,-0.034271,0.0376,-0.082645,0.007562,0.014705,-0.000948,-0.052054
ENSG00000204897.6,-0.026656,0.828284,-0.060467,1.0,0.064647,0.153981,0.078705,-0.068483,0.663045,0.633745,...,0.003412,-0.027296,0.083848,0.089167,0.030968,0.009919,0.011464,0.08671,0.014844,0.025546
ENSG00000224114.1,0.067988,0.067067,-0.035682,0.064647,1.0,0.004041,-0.024308,-0.045348,0.018274,0.025719,...,0.006032,-0.016157,-0.04362,0.033866,0.036977,0.04358,-0.022898,0.23488,0.025996,0.055765


In [25]:
# Rows must follow the same gene order as the expression data.
assert data.index.equals(
    spearman_df.index
), "Spearman matrix rows do not match the gene expression index"
# The upper-triangle extraction in the Merge section is positional, so columns
# must be in the same order as rows; otherwise gene pairs would be silently
# duplicated or dropped.
assert spearman_df.columns.equals(
    spearman_df.index
), "Spearman matrix columns are not aligned with its rows"

## Merge

In [26]:
def get_upper_triag(data, k=1):
    """
    Keep only the upper triangle of a matrix, replacing the rest with NaN.

    Args:
        data: a two-dimensional pandas DataFrame (for instance, a square
            gene-by-gene similarity matrix).
        k: diagonal offset passed to numpy.triu. Entries below the k-th
            diagonal are masked; the default (1) also masks the main diagonal.

    Returns:
        A DataFrame with the same shape and labels as `data` where the entries
        outside the selected triangle are NaN. The selection is positional: row
        and column labels are not inspected.
    """
    # Build the mask directly as booleans instead of allocating a float matrix
    # of ones and casting it afterwards (8x less memory for the temporary).
    mask = np.triu(np.ones(data.shape, dtype=bool), k=k)
    return data.where(mask)

In [27]:
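# Disabled: explicit realignment of the clustermatch matrix to the pearson
# labels. Row order is already checked by the assertions in "Load data".
# clustermatch_df = clustermatch_df.loc[pearson_df.index, pearson_df.columns]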

In [28]:
# Set the diagonal and lower triangle to NaN (default k=1) so they can be
# dropped after flattening.
clustermatch_df = get_upper_triag(clustermatch_df)

In [29]:
# Flatten the matrix into a Series indexed by gene pairs, clear the index level
# names, and drop the NaN entries left by the triangle mask.
clustermatch_df = clustermatch_df.unstack().rename_axis((None, None)).dropna()

In [30]:
# For n genes, n * (n - 1) / 2 entries are expected (one per gene pair).
clustermatch_df.shape

(12497500,)

In [31]:
# Preview of the flattened clustermatch values, indexed by gene pair.
clustermatch_df.head()

ENSG00000135443.8   ENSG00000129824.15    0.006389
ENSG00000134184.12  ENSG00000129824.15    0.004035
                    ENSG00000135443.8     0.003032
ENSG00000204897.6   ENSG00000129824.15    0.005282
                    ENSG00000135443.8     0.430518
dtype: float64

In [32]:
# Set the diagonal and lower triangle to NaN (default k=1) so they can be
# dropped after flattening.
pearson_df = get_upper_triag(pearson_df)

In [33]:
# Flatten to a Series indexed by gene pairs, drop the masked (NaN) entries and
# take absolute values, so only the magnitude of the correlation is kept.
pearson_df = pearson_df.unstack().rename_axis((None, None)).dropna().abs()

In [34]:
# For n genes, n * (n - 1) / 2 entries are expected (one per gene pair).
pearson_df.shape

(12497500,)

In [35]:
# Preview of the flattened absolute Pearson values, indexed by gene pair.
pearson_df.head()

ENSG00000135443.8   ENSG00000129824.15    0.080008
ENSG00000134184.12  ENSG00000129824.15    0.092589
                    ENSG00000135443.8     0.036381
ENSG00000204897.6   ENSG00000129824.15    0.059517
                    ENSG00000135443.8     0.884581
dtype: float64

In [36]:
# Both flattened Series must list the same gene pairs in the same order.
assert clustermatch_df.index.equals(pearson_df.index)

In [37]:
# Set the diagonal and lower triangle to NaN (default k=1) so they can be
# dropped after flattening.
spearman_df = get_upper_triag(spearman_df)

In [38]:
# Flatten to a Series indexed by gene pairs, drop the masked (NaN) entries and
# take absolute values, so only the magnitude of the correlation is kept.
spearman_df = spearman_df.unstack().rename_axis((None, None)).dropna().abs()

In [39]:
# For n genes, n * (n - 1) / 2 entries are expected (one per gene pair).
spearman_df.shape

(12497500,)

In [40]:
# Preview of the flattened absolute Spearman values, indexed by gene pair.
spearman_df.head()

ENSG00000135443.8   ENSG00000129824.15    0.026356
ENSG00000134184.12  ENSG00000129824.15    0.048471
                    ENSG00000135443.8     0.095450
ENSG00000204897.6   ENSG00000129824.15    0.026656
                    ENSG00000135443.8     0.828284
dtype: float64

In [41]:
# Both flattened Series must list the same gene pairs in the same order.
assert clustermatch_df.index.equals(spearman_df.index)

In [42]:
# One column per coefficient; rows are the gene pairs shared by the three
# flattened Series, sorted by index.
_coefficients = {
    "clustermatch": clustermatch_df,
    "pearson": pearson_df,
    "spearman": spearman_df,
}
df = pd.DataFrame(_coefficients)
df = df.sort_index()

In [43]:
# Every gene pair must have a value for all three coefficients.
assert not df.isna().any().any()

In [44]:
# (number of gene pairs, number of coefficients)
df.shape

(12497500, 3)

In [45]:
# Preview of the combined dataframe: one row per gene pair, one column per coefficient.
df.head()

Unnamed: 0,Unnamed: 1,clustermatch,pearson,spearman
ENSG00000000005.5,ENSG00000004776.12,0.137394,0.357174,0.49426
ENSG00000000005.5,ENSG00000004799.7,0.008434,0.123811,0.018815
ENSG00000000005.5,ENSG00000006059.3,0.039085,0.1834,0.275066
ENSG00000000005.5,ENSG00000007216.14,0.021553,0.079729,0.207325
ENSG00000000005.5,ENSG00000007908.15,0.017744,0.052055,0.18977


# Save

In [46]:
# Write the combined dataframe as a pickle to OUTPUT_FILE.
df.to_pickle(OUTPUT_FILE)