# Description

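Computes the CCC, Pearson and Spearman correlation coefficients, together with their p-values, for previously sampled gene pairs from GTEx v8 (whole blood). The gene pairs are grouped by how the three coefficients agree or disagree (for example `all_high` or `ccc_high_and_pearson_low`). The CCC p-value is computed using permutations. Results are saved as a pickled data frame.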

# Modules loading

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import pandas as pd
from concurrent.futures import as_completed, ProcessPoolExecutor

from ccc.coef import ccc
from ccc import conf

# Settings

In [2]:
# Dataset configuration taken from the project's `conf` module; it provides the
# directories used below to build input and output paths.
DATASET_CONFIG = conf.GTEX
# Tissue and gene selection strategy; both are interpolated into the input file names.
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

# Passed to `ccc` as `pvalue_n_perms` when computing the CCC p-value.
# NOTE(review): the stored output below shows ccc_pvalue = 0.009901 (~1/101), which
# looks like it came from a run with far fewer permutations than this value —
# confirm the outputs were regenerated with the current setting.
PVALUE_N_PERMS = 100000

# NOTE(review): not referenced by any other cell visible in this notebook, so the
# permutation-based p-values do not appear to be seeded from it — confirm.
RANDOM_STATE = np.random.RandomState(0)

# Paths

In [3]:
# Gene expression file for the configured tissue and gene selection strategy.
INPUT_GENE_EXPR_FILE = (
    DATASET_CONFIG["GENE_SELECTION_DIR"]
    / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_EXPR_FILE)

# Fail early if the input file is missing.
assert INPUT_GENE_EXPR_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [4]:
# Gene pair intersections file for the configured tissue and gene selection strategy.
# NOTE(review): this path is only checked for existence here; no visible cell reads it.
INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (
    DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"]
    / f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

# Fail early if the input file is missing.
assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')

In [5]:
# Directory holding the gene pair samples (read below) and the p-values (written below).
OUTPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"] / "pvalues"
# Create the directory (and any missing parents); do nothing if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [6]:
OUTPUT_DIR

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues')

# Load gene expression data

In [7]:
data = pd.read_pickle(INPUT_GENE_EXPR_FILE).sort_index()

In [8]:
data.shape

(5000, 755)

In [9]:
data.head()

Unnamed: 0_level_0,GTEX-111YS-0006-SM-5NQBE,GTEX-1122O-0005-SM-5O99J,GTEX-1128S-0005-SM-5P9HI,GTEX-113IC-0006-SM-5NQ9C,GTEX-113JC-0006-SM-5O997,GTEX-117XS-0005-SM-5PNU6,GTEX-117YW-0005-SM-5NQ8Z,GTEX-1192W-0005-SM-5NQBQ,GTEX-1192X-0005-SM-5NQC3,GTEX-11DXW-0006-SM-5NQ7Y,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419.12,20.65,25.05,7.155,49.13,6.147,4.143,5.39,4.389,1.158,6.824,...,4.407,32.34,18.68,9.251,7.828,7.46,33.24,5.848,25.76,17.08
ENSG00000000938.12,906.0,1344.0,633.5,719.2,392.6,166.5,338.2,413.2,51.54,423.6,...,354.8,1102.0,774.9,206.0,620.4,346.3,1304.0,232.9,631.6,884.5
ENSG00000001167.14,8.19,20.01,20.47,21.22,16.46,8.619,18.22,16.58,1.602,35.68,...,11.34,11.25,11.18,9.523,41.86,24.58,8.892,13.39,13.47,42.64
ENSG00000001561.6,0.7104,1.771,2.234,6.014,3.206,0.3962,2.445,1.418,0.5531,0.7447,...,0.9269,2.555,0.5976,3.417,2.645,1.883,0.5391,0.9816,1.036,6.729
ENSG00000002549.12,22.5,21.33,19.29,157.1,29.33,9.577,14.17,23.33,1.407,28.3,...,4.493,50.47,16.21,32.74,18.15,11.92,20.1,15.55,11.98,35.37


# Load gene pair samples

In [10]:
# NOTE: despite its name, this path is used as an *input* (it is read with
# `pd.read_pickle` in the next cell). `output_file` is rebound further down to the
# p-values file, which is the one actually written.
output_file = OUTPUT_DIR / "gene_pair-samples.pkl"

In [11]:
gene_pair_samples = pd.read_pickle(output_file)

In [12]:
len(gene_pair_samples)

9

In [13]:
sorted(gene_pair_samples.keys())

['all_high',
 'all_low',
 'ccc_high_and_pearson_low',
 'ccc_high_and_spearman_low',
 'ccc_high_and_spearman_pearson_low',
 'ccc_spearman_high_and_pearson_low',
 'pearson_high_and_ccc_low',
 'pearson_high_and_ccc_spearman_low',
 'selected_in_manuscript']

In [14]:
gene_pair_samples["all_high"].head()

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc,pearson,spearman
ENSG00000052749.13,ENSG00000165025.14,True,False,True,False,True,False,0.36234,0.709449,0.795566
ENSG00000102897.9,ENSG00000086544.2,True,False,True,False,True,False,0.429092,0.698537,0.822212
ENSG00000110628.13,ENSG00000267078.1,True,False,True,False,True,False,0.230143,0.509499,0.632816
ENSG00000169554.18,ENSG00000132424.14,True,False,True,False,True,False,0.509012,0.773762,0.878352
ENSG00000143933.16,ENSG00000135378.3,True,False,True,False,True,False,0.471842,0.531121,0.819382


In [15]:
[i for i in gene_pair_samples["all_high"].head(10).index]

[('ENSG00000052749.13', 'ENSG00000165025.14'),
 ('ENSG00000102897.9', 'ENSG00000086544.2'),
 ('ENSG00000110628.13', 'ENSG00000267078.1'),
 ('ENSG00000169554.18', 'ENSG00000132424.14'),
 ('ENSG00000143933.16', 'ENSG00000135378.3'),
 ('ENSG00000170776.21', 'ENSG00000155903.11'),
 ('ENSG00000136111.12', 'ENSG00000065911.11'),
 ('ENSG00000131042.14', 'ENSG00000141367.11'),
 ('ENSG00000160703.15', 'ENSG00000231964.1'),
 ('ENSG00000008394.12', 'ENSG00000101347.8')]

# Compute pvalues on sampled gene pairs

In [16]:
output_file = OUTPUT_DIR / "gene_pair-samples-pvalues.pkl"

In [17]:
def corr_single(x, y):
    """
    Compute the CCC, Pearson and Spearman coefficients, and their p-values,
    between two vectors of values.

    Args:
        x: first vector of values.
        y: second vector of values.

    Returns:
        A tuple with six elements, in this order: CCC coefficient, CCC p-value,
        Pearson coefficient, Pearson p-value, Spearman coefficient, Spearman
        p-value.
    """
    # The CCC p-value uses PVALUE_N_PERMS permutations. n_jobs=1 keeps each call
    # single-job; the calling cell distributes gene pairs over a process pool.
    ccc_coef, ccc_p = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=1)

    pearson_coef, pearson_p = stats.pearsonr(x, y)
    spearman_coef, spearman_p = stats.spearmanr(x, y)

    return (
        ccc_coef,
        ccc_p,
        pearson_coef,
        pearson_p,
        spearman_coef,
        spearman_p,
    )

In [18]:
results = []

# Number of completed gene pairs between two checkpoints of the partial results.
CHECKPOINT_EVERY = 10

with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
    # Submit one task per sampled gene pair. Each future is mapped to the gene
    # pair and the group it belongs to, so results can be labeled on completion.
    tasks = {
        executor.submit(corr_single, data.loc[gene0], data.loc[gene1]): (
            gene0,
            gene1,
            group,
        )
        for group, group_pairs in gene_pair_samples.items()
        for gene0, gene1 in group_pairs.index
    }

    # Results are collected in completion order, not submission order.
    # n_done is 1-based: it is the number of tasks finished so far.
    for n_done, t in enumerate(as_completed(tasks), start=1):
        gene0, gene1, k = tasks[t]
        ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval = t.result()

        results.append({
            "gene0": gene0,
            "gene1": gene1,
            "group": k,
            "ccc": ccc_val,
            "ccc_pvalue": ccc_pval,
            "pearson": p_val,
            "pearson_pvalue": p_pval,
            "spearman": s_val,
            "spearman_pvalue": s_pval,
        })

        # Checkpoint the partial results once every CHECKPOINT_EVERY completed
        # tasks. The previous condition (`if t_idx % 10:`) was inverted: it
        # rewrote the file on every iteration EXCEPT multiples of 10.
        if n_done % CHECKPOINT_EVERY == 0:
            _df = pd.DataFrame(results)
            _df["group"] = _df["group"].astype("category")
            _df.to_pickle(output_file)

In [19]:
len(results)

644

In [20]:
# Build the final data frame (one row per gene pair) and store the group label
# as a categorical column.
results_df = pd.DataFrame(results).astype({"group": "category"})

In [21]:
results_df.shape

(644, 9)

In [22]:
results_df.head()

Unnamed: 0,gene0,gene1,group,ccc,ccc_pvalue,pearson,pearson_pvalue,spearman,spearman_pvalue
0,ENSG00000170776.21,ENSG00000155903.11,all_high,0.324987,0.009901,0.751337,4.609357e-138,0.769746,6.110239e-149
1,ENSG00000169554.18,ENSG00000132424.14,all_high,0.509012,0.009901,0.773762,1.8934870000000002e-151,0.878352,1.3744550000000002e-243
2,ENSG00000102978.12,ENSG00000144224.16,all_high,0.18194,0.009901,0.481308,4.903005e-45,0.586014,8.034343e-71
3,ENSG00000135828.11,ENSG00000250138.4,all_high,0.388366,0.009901,0.751843,2.38259e-138,0.794799,1.7194320000000001e-165
4,ENSG00000174705.12,ENSG00000164105.3,all_high,0.261762,0.009901,0.586937,4.31239e-71,0.674704,2.085742e-101


# Save

In [23]:
results_df.to_pickle(output_file)