# Description

This notebook selects one gene pair (see `Settings` below) and computes its correlation coefficients (CCC, Pearson and Spearman) and their p-values in each GTEx tissue (tissues with 10 or fewer complete observations for the pair are skipped).
We do this to check whether a pattern found in whole blood also replicates in other tissues.

# Modules

In [1]:
import pandas as pd

from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
import seaborn as sns

from ccc import conf
from ccc.coef import ccc

# Settings

In [2]:
# Gene pair under study: Ensembl gene IDs and their gene symbols.
# The pair was originally found with ccc on whole blood; see also:
# https://clincancerres.aacrjournals.org/content/26/21/5567.figures-only
gene0_id = "ENSG00000147050.14"
gene0_symbol = "KDM6A"

gene1_id = "ENSG00000183878.15"
gene1_symbol = "UTY"

# value passed as `pvalue_n_perms` to ccc when computing p-values
CCC_PVALUE_N_PERMS = 1_000_000

# Paths

In [3]:
# directory holding the per-tissue pickles; later cells glob "*.pkl" inside it
TISSUE_DIR = conf.GTEX["DATA_DIR"] / "data_by_tissue"
# fail early, naming the offending path, if it is missing or is not a directory
assert TISSUE_DIR.is_dir(), f"Tissue data directory not found: {TISSUE_DIR}"

In [4]:
# Results for this gene pair are written under
# <RESULTS_DIR>/other_tissues/<gene0 symbol>_vs_<gene1 symbol> (symbols lowercased).
OUTPUT_DIR = conf.GTEX["RESULTS_DIR"].joinpath(
    "other_tissues",
    f"{gene0_symbol.lower()}_vs_{gene1_symbol.lower()}",
)
# create the folder (and any missing parents); no error if it already exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/other_tissues/kdm6a_vs_uty')

# Data

## GTEx metadata

In [5]:
# GTEx v8 sample metadata table; it is only inspected in the next two cells and
# is not referenced by the correlation/p-value cells visible in this notebook
gtex_metadata = pd.read_pickle(conf.GTEX["DATA_DIR"] / "gtex_v8-sample_metadata.pkl")

In [6]:
# (rows, columns) of the sample metadata table
gtex_metadata.shape

(22951, 66)

In [7]:
# preview of the first rows of the sample metadata table
gtex_metadata.head()

Unnamed: 0_level_0,SUBJID,SEX,AGE,DTHHRDY,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0003-SM-58Q7G,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0003-SM-5DWSB,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0003-SM-6WBT7,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0011-R10a-SM-AHZ7F,GTEX-1117F,Female,60-69,4.0,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),...,,,,,,,,,,
GTEX-1117F-0011-R10b-SM-CYKQ8,GTEX-1117F,Female,60-69,4.0,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),...,,,,,,,,,,


## Gene Ensembl ID -> Symbol mapping

In [8]:
# table of gene ID <-> symbol mappings; the next cell indexes it by its
# "gene_ens_id" column and keeps the "gene_symbol" column
gene_map = pd.read_pickle(conf.GTEX["DATA_DIR"] / "gtex_gene_id_symbol_mappings.pkl")

In [9]:
# Ensembl gene ID -> gene symbol lookup.
# `gene_map` is rebound here from the loaded table to a plain dict; the guard
# keeps this cell safe to re-run, since a dict has no `set_index` method.
if not isinstance(gene_map, dict):
    gene_map = gene_map.set_index("gene_ens_id")["gene_symbol"].to_dict()

In [10]:
# spot-check the lookup with one hard-coded ID/symbol pair
assert gene_map["ENSG00000145309.5"] == "CABS1"

In [11]:
# the IDs and symbols chosen in Settings must agree with the loaded mapping;
# a KeyError here means the (versioned) Ensembl ID is not present in the mapping
assert gene_map[gene0_id] == gene0_symbol
assert gene_map[gene1_id] == gene1_symbol

# Compute correlation on all tissues

In [12]:
def _iter_tissue_coefs():
    """
    Yield `(tissue_name, coefficients)` for every usable "*.pkl" file in TISSUE_DIR.

    Each pickle is transposed and restricted to the `gene0_id` / `gene1_id`
    columns, and rows with a missing value in either gene are dropped. Files
    left with 10 or fewer rows are skipped.

    `tissue_name` is the part of the file stem after "_data_"; `coefficients`
    is a dict with the (signed) values under "cm", "pearson" and "spearman".
    Files are visited in the order returned by `Path.glob`.
    """
    for tissue_path in TISSUE_DIR.glob("*.pkl"):
        pair_data = pd.read_pickle(tissue_path).T[[gene0_id, gene1_id]].dropna()

        # not enough complete observations for this tissue
        if pair_data.shape[0] <= 10:
            continue

        tissue_name = tissue_path.stem.split("_data_")[1]
        x, y = pair_data[gene0_id], pair_data[gene1_id]

        yield tissue_name, {
            "cm": ccc(x, y),
            # index 0 of the scipy results is the correlation statistic
            "pearson": pearsonr(x, y)[0],
            "spearman": spearmanr(x, y)[0],
        }


# rows: tissues; columns: coefficients. Absolute values are taken, so the sign
# of Pearson/Spearman is discarded in this table.
res_all = pd.DataFrame(dict(_iter_tissue_coefs())).T.abs()

In [13]:
# (tissues that passed the size filter, coefficients)
res_all.shape

(50, 3)

In [14]:
# preview of the absolute coefficient values per tissue
res_all.head()

Unnamed: 0,cm,pearson,spearman
colon_transverse,0.336727,0.517899,0.408343
brain_amygdala,0.280524,0.037541,0.147571
artery_coronary,0.274554,0.413862,0.391764
artery_aorta,0.429771,0.485788,0.36351
adrenal_gland,0.260197,0.45919,0.35419


In [15]:
# tissues in ascending order of the absolute "cm" value (smallest first)
res_all.sort_values("cm")

Unnamed: 0,cm,pearson,spearman
uterus,0.0,0.016338,0.041994
ovary,0.006722,0.104096,0.013011
vagina,0.013525,0.074307,0.086761
brain_cerebellum,0.187717,0.124874,0.036402
small_intestine_terminal_ileum,0.193668,0.35915,0.327394
brain_spinal_cord_cervical_c1,0.198741,0.24385,0.125183
testis,0.206704,0.692661,0.69099
stomach,0.209703,0.437367,0.346836
brain_cortex,0.2199,0.162729,0.144386
brain_anterior_cingulate_cortex_ba24,0.23162,0.002993,0.182207


In [16]:
# tissues in ascending order of the absolute Pearson value (smallest first)
res_all.sort_values("pearson")

Unnamed: 0,cm,pearson,spearman
brain_anterior_cingulate_cortex_ba24,0.23162,0.002993,0.182207
uterus,0.0,0.016338,0.041994
brain_amygdala,0.280524,0.037541,0.147571
brain_frontal_cortex_ba9,0.265633,0.051099,0.188904
bladder,0.330097,0.061798,0.022107
vagina,0.013525,0.074307,0.086761
heart_atrial_appendage,0.246321,0.095973,0.039109
ovary,0.006722,0.104096,0.013011
brain_caudate_basal_ganglia,0.28785,0.118342,0.199696
brain_cerebellum,0.187717,0.124874,0.036402


In [17]:
# tissues in ascending order of the absolute Spearman value (smallest first)
res_all.sort_values("spearman")

Unnamed: 0,cm,pearson,spearman
heart_left_ventricle,0.33444,0.183529,0.00603
ovary,0.006722,0.104096,0.013011
bladder,0.330097,0.061798,0.022107
brain_cerebellum,0.187717,0.124874,0.036402
heart_atrial_appendage,0.246321,0.095973,0.039109
uterus,0.0,0.016338,0.041994
adipose_visceral_omentum,0.280211,0.198846,0.053794
vagina,0.013525,0.074307,0.086761
whole_blood,0.294391,0.23987,0.100621
brain_spinal_cord_cervical_c1,0.198741,0.24385,0.125183


# Compute p-values on all tissues

In [18]:
def _iter_tissue_pvalues():
    """
    Yield `(tissue_name, pvalues)` for every usable "*.pkl" file in TISSUE_DIR.

    Each pickle is transposed and restricted to the `gene0_id` / `gene1_id`
    columns, and rows with a missing value in either gene are dropped. Files
    left with 10 or fewer rows are skipped.

    `tissue_name` is the part of the file stem after "_data_"; `pvalues` is a
    dict with keys "cm", "pearson" and "spearman".
    Files are visited in the order returned by `Path.glob`.
    """
    for tissue_path in TISSUE_DIR.glob("*.pkl"):
        pair_data = pd.read_pickle(tissue_path).T[[gene0_id, gene1_id]].dropna()

        # not enough complete observations for this tissue
        if pair_data.shape[0] <= 10:
            continue

        tissue_name = tissue_path.stem.split("_data_")[1]
        x, y = pair_data[gene0_id], pair_data[gene1_id]

        # with `pvalue_n_perms` set, element [1] of the ccc result is used as
        # the p-value.
        # NOTE(review): no random seed is set in this notebook; confirm whether
        # the permutation p-value is reproducible across runs.
        ccc_result = ccc(
            x,
            y,
            pvalue_n_perms=CCC_PVALUE_N_PERMS,
            n_jobs=conf.GENERAL["N_JOBS"],
        )

        yield tissue_name, {
            "cm": ccc_result[1],
            # index 1 of the scipy results is the p-value
            "pearson": pearsonr(x, y)[1],
            "spearman": spearmanr(x, y)[1],
        }


# rows: tissues; columns: p-value of each coefficient.
# No `.abs()` here: p-values are non-negative, so taking absolute values (as is
# done for the coefficients) was a no-op.
res_pval_all = pd.DataFrame(dict(_iter_tissue_pvalues())).T

In [19]:
# (tissues that passed the size filter, coefficients)
res_pval_all.shape

(50, 3)

In [20]:
# preview of the p-values per tissue
res_pval_all.head()

Unnamed: 0,cm,pearson,spearman
colon_transverse,9.99999e-07,3.063714e-29,9.539164e-18
brain_amygdala,9.99999e-07,0.6461089,0.06963023
artery_coronary,9.99999e-07,2.38997e-11,3.159321e-10
artery_aorta,9.99999e-07,5.775754e-27,6.092383e-15
adrenal_gland,9.99999e-07,7.334489e-15,4.847677e-09


In [21]:
# tissues in ascending order of the "cm" p-value (smallest p-value first)
res_pval_all.sort_values("cm")

Unnamed: 0,cm,pearson,spearman
colon_transverse,9.99999e-07,3.063714e-29,9.539164e-18
brain_frontal_cortex_ba9,9.99999e-07,0.4624712,0.006157446
brain_caudate_basal_ganglia,9.99999e-07,0.06385581,0.001644387
testis,9.99999e-07,6.885883e-53,1.5301660000000002e-52
muscle_skeletal,9.99999e-07,9.476952e-13,6.688791e-10
stomach,9.99999e-07,3.314971e-18,1.383202e-11
skin_not_sun_exposed_suprapubic,9.99999e-07,1.328216e-14,0.0001756507
adipose_visceral_omentum,9.99999e-07,3.14714e-06,0.211586
nerve_tibial,9.99999e-07,1.544394e-64,6.654551e-19
pancreas,9.99999e-07,1.336164e-07,0.0003350402


In [22]:
# tissues in ascending order of the Pearson p-value (smallest p-value first)
res_pval_all.sort_values("pearson")

Unnamed: 0,cm,pearson,spearman
nerve_tibial,9.99999e-07,1.544394e-64,6.654551e-19
thyroid,9.99999e-07,4.016259e-62,1.3075270000000001e-23
prostate,9.99999e-07,8.574051e-57,8.19215e-56
artery_tibial,9.99999e-07,2.616759e-53,2.412324e-16
testis,9.99999e-07,6.885883e-53,1.5301660000000002e-52
cells_cultured_fibroblasts,9.99999e-07,5.273449999999999e-38,9.384953e-18
adipose_subcutaneous,9.99999e-07,3.380047e-37,1.605395e-12
esophagus_muscularis,9.99999e-07,4.264868e-36,5.670023e-15
colon_sigmoid,9.99999e-07,1.492599e-30,3.222739e-13
colon_transverse,9.99999e-07,3.063714e-29,9.539164e-18


In [23]:
# tissues in ascending order of the Spearman p-value (smallest p-value first)
res_pval_all.sort_values("spearman")

Unnamed: 0,cm,pearson,spearman
prostate,9.99999e-07,8.574051e-57,8.19215e-56
testis,9.99999e-07,6.885883e-53,1.5301660000000002e-52
thyroid,9.99999e-07,4.016259e-62,1.3075270000000001e-23
nerve_tibial,9.99999e-07,1.544394e-64,6.654551e-19
cells_cultured_fibroblasts,9.99999e-07,5.273449999999999e-38,9.384953e-18
colon_transverse,9.99999e-07,3.063714e-29,9.539164e-18
artery_tibial,9.99999e-07,2.616759e-53,2.412324e-16
esophagus_muscularis,9.99999e-07,4.264868e-36,5.670023e-15
artery_aorta,9.99999e-07,5.775754e-27,6.092383e-15
colon_sigmoid,9.99999e-07,1.492599e-30,3.222739e-13


# Save

## Coefficient values

In [24]:
# persist the per-tissue absolute coefficient values (overwrites an existing file)
res_all.to_pickle(OUTPUT_DIR / "coef_values.pkl")

## Coefficient p-values

In [25]:
# persist the per-tissue p-values (overwrites an existing file)
res_pval_all.to_pickle(OUTPUT_DIR / "coef_pvalues.pkl")