# Description

It selects one gene pair (see `Settings` below) and computes the correlation coefficients and p-values across all the tissues in GTEx.
We do this to check whether one pattern found in whole blood also replicates in other tissues.

# Modules

In [1]:
import pandas as pd

from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
import seaborn as sns

from ccc import conf
from ccc.coef import ccc

# Settings

In [2]:
# this gene pair was originally found with ccc on whole blood
# interesting: https://clincancerres.aacrjournals.org/content/26/21/5567.figures-only
gene0_id, gene1_id = "ENSG00000147050.14", "ENSG00000067048.16"
gene0_symbol, gene1_symbol = "KDM6A", "DDX3Y"

CCC_PVALUE_N_PERMS = 1000000

# Paths

In [3]:
TISSUE_DIR = conf.GTEX["DATA_DIR"] / "data_by_tissue"
assert TISSUE_DIR.exists()

In [4]:
OUTPUT_DIR = (
    conf.GTEX["RESULTS_DIR"]
    / "other_tissues"
    / f"{gene0_symbol.lower()}_vs_{gene1_symbol.lower()}"
)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/other_tissues/kdm6a_vs_ddx3y')

# Data

## GTEx metadata

In [5]:
gtex_metadata = pd.read_pickle(conf.GTEX["DATA_DIR"] / "gtex_v8-sample_metadata.pkl")

In [6]:
gtex_metadata.shape

(22951, 66)

In [7]:
gtex_metadata.head()

Unnamed: 0_level_0,SUBJID,SEX,AGE,DTHHRDY,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0003-SM-58Q7G,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0003-SM-5DWSB,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0003-SM-6WBT7,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0011-R10a-SM-AHZ7F,GTEX-1117F,Female,60-69,4.0,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),...,,,,,,,,,,
GTEX-1117F-0011-R10b-SM-CYKQ8,GTEX-1117F,Female,60-69,4.0,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),...,,,,,,,,,,


## Gene Ensembl ID -> Symbol mapping

In [8]:
gene_map = pd.read_pickle(conf.GTEX["DATA_DIR"] / "gtex_gene_id_symbol_mappings.pkl")

In [9]:
gene_map = gene_map.set_index("gene_ens_id")["gene_symbol"].to_dict()

In [10]:
assert gene_map["ENSG00000145309.5"] == "CABS1"

In [11]:
assert gene_map[gene0_id] == gene0_symbol
assert gene_map[gene1_id] == gene1_symbol

# Compute correlation on all tissues

In [12]:
res_all = pd.DataFrame(
    {
        f.stem.split("_data_")[1]: {
            "cm": ccc(data[gene0_id], data[gene1_id]),
            "pearson": pearsonr(data[gene0_id], data[gene1_id])[0],
            "spearman": spearmanr(data[gene0_id], data[gene1_id])[0],
        }
        for f in TISSUE_DIR.glob("*.pkl")
        if (data := pd.read_pickle(f).T[[gene0_id, gene1_id]].dropna()) is not None
        and data.shape[0] > 10
    }
).T

In [13]:
res_all.shape

(50, 3)

In [14]:
res_all.head()

Unnamed: 0,cm,pearson,spearman
colon_transverse,0.280814,-0.393881,-0.465898
brain_amygdala,0.305677,0.038732,0.154331
artery_coronary,0.24272,-0.48761,-0.426164
artery_aorta,0.38197,-0.579236,-0.409761
adrenal_gland,0.188929,-0.4894,-0.418784


In [15]:
res_all.sort_values("cm")

Unnamed: 0,cm,pearson,spearman
testis,0.006332,0.017759,-0.028708
ovary,0.016698,-0.10052,0.125249
vagina,0.016935,0.080397,0.190372
uterus,0.025004,0.07848,0.228621
bladder,0.08951,-0.440277,-0.216883
small_intestine_terminal_ileum,0.164531,-0.361532,-0.436642
brain_spinal_cord_cervical_c1,0.166019,0.119269,0.067168
prostate,0.16653,0.576071,0.614194
stomach,0.172034,-0.36428,-0.45374
adrenal_gland,0.188929,-0.4894,-0.418784


In [16]:
res_all.sort_values("pearson")

Unnamed: 0,cm,pearson,spearman
cells_cultured_fibroblasts,0.302399,-0.694025,-0.592887
nerve_tibial,0.374104,-0.647834,-0.403768
artery_tibial,0.29844,-0.617718,-0.387765
artery_aorta,0.38197,-0.579236,-0.409761
thyroid,0.326157,-0.550632,-0.428463
pituitary,0.212682,-0.534453,-0.28915
colon_sigmoid,0.408355,-0.529455,-0.414515
minor_salivary_gland,0.230149,-0.493186,-0.273789
esophagus_muscularis,0.391141,-0.49228,-0.366355
adrenal_gland,0.188929,-0.4894,-0.418784


In [17]:
res_all.sort_values("spearman")

Unnamed: 0,cm,pearson,spearman
cells_cultured_fibroblasts,0.302399,-0.694025,-0.592887
colon_transverse,0.280814,-0.393881,-0.465898
stomach,0.172034,-0.36428,-0.45374
small_intestine_terminal_ileum,0.164531,-0.361532,-0.436642
thyroid,0.326157,-0.550632,-0.428463
artery_coronary,0.24272,-0.48761,-0.426164
adrenal_gland,0.188929,-0.4894,-0.418784
colon_sigmoid,0.408355,-0.529455,-0.414515
artery_aorta,0.38197,-0.579236,-0.409761
nerve_tibial,0.374104,-0.647834,-0.403768


# Compute p-values on all tissues

In [18]:
res_pval_all = pd.DataFrame(
    {
        f.stem.split("_data_")[1]: {
            "cm": ccc(
                data[gene0_id],
                data[gene1_id],
                pvalue_n_perms=CCC_PVALUE_N_PERMS,
                n_jobs=conf.GENERAL["N_JOBS"],
            )[1],
            "pearson": pearsonr(data[gene0_id], data[gene1_id])[1],
            "spearman": spearmanr(data[gene0_id], data[gene1_id])[1],
        }
        for f in TISSUE_DIR.glob("*.pkl")
        if (data := pd.read_pickle(f).T[[gene0_id, gene1_id]].dropna()) is not None
        and data.shape[0] > 10
    }
).T

In [19]:
res_pval_all.shape

(50, 3)

In [20]:
res_pval_all.head()

Unnamed: 0,cm,pearson,spearman
colon_transverse,9.99999e-07,1.613504e-16,2.880714e-23
brain_amygdala,9.99999e-07,0.6356755,0.05764275
artery_coronary,9.99999e-07,9.731623e-16,5.220895e-12
artery_aorta,9.99999e-07,4.513966e-40,6.380371999999999e-19
adrenal_gland,9.99999e-07,6.058615e-17,2.230048e-12


In [21]:
res_pval_all.sort_values("cm")

Unnamed: 0,cm,pearson,spearman
colon_transverse,9.99999e-07,1.613504e-16,2.880714e-23
prostate,9.99999e-07,4.637571e-23,8.471438000000001e-27
brain_frontal_cortex_ba9,9.99999e-07,0.4371643,0.006734313
brain_caudate_basal_ganglia,9.99999e-07,0.007044831,0.0006797072
muscle_skeletal,9.99999e-07,3.656202e-13,1.56302e-07
stomach,9.99999e-07,1.045907e-12,1.2404499999999999e-19
skin_not_sun_exposed_suprapubic,9.99999e-07,4.78434e-21,3.167647e-11
adipose_visceral_omentum,9.99999e-07,6.191569e-12,0.001373269
nerve_tibial,9.99999e-07,6.07387e-75,1.130228e-25
pancreas,9.99999e-07,1.432674e-08,4.739023e-06


In [22]:
res_pval_all.sort_values("pearson")

Unnamed: 0,cm,pearson,spearman
nerve_tibial,9.99999e-07,6.07387e-75,1.130228e-25
cells_cultured_fibroblasts,9.99999e-07,1.188516e-73,3.662235e-49
artery_tibial,9.99999e-07,5.248493e-71,3.246061e-25
thyroid,9.99999e-07,4.844271e-53,1.541651e-30
artery_aorta,9.99999e-07,4.513966e-40,6.380371999999999e-19
adipose_subcutaneous,9.99999e-07,6.287645e-40,2.6187090000000003e-17
esophagus_muscularis,9.99999e-07,8.650204e-33,8.375537e-18
colon_sigmoid,9.99999e-07,2.450898e-28,6.395478000000001e-17
skin_sun_exposed_lower_leg,9.99999e-07,5.836576e-27,2.245602e-16
esophagus_mucosa,9.99999e-07,6.374962e-27,2.030847e-21


In [23]:
res_pval_all.sort_values("spearman")

Unnamed: 0,cm,pearson,spearman
cells_cultured_fibroblasts,9.99999e-07,1.188516e-73,3.662235e-49
thyroid,9.99999e-07,4.844271e-53,1.541651e-30
prostate,9.99999e-07,4.637571e-23,8.471438000000001e-27
nerve_tibial,9.99999e-07,6.07387e-75,1.130228e-25
artery_tibial,9.99999e-07,5.248493e-71,3.246061e-25
colon_transverse,9.99999e-07,1.613504e-16,2.880714e-23
esophagus_mucosa,9.99999e-07,6.374962e-27,2.030847e-21
stomach,9.99999e-07,1.045907e-12,1.2404499999999999e-19
artery_aorta,9.99999e-07,4.513966e-40,6.380371999999999e-19
esophagus_muscularis,9.99999e-07,8.650204e-33,8.375537e-18


# Save

## Coefficient values

In [24]:
res_all.to_pickle(OUTPUT_DIR / "coef_values.pkl")

## Coefficient p-values

In [25]:
res_pval_all.to_pickle(OUTPUT_DIR / "coef_pvalues.pkl")