# Description

Reads the gene pairs sampled from each category and computes, for every pair, the CCC, Pearson and Spearman coefficients together with their p-values.

# Modules loading

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import pandas as pd
from concurrent.futures import as_completed, ProcessPoolExecutor
from tqdm import tqdm

from ccc.coef import ccc
from ccc import conf

# Settings

In [2]:
# Dataset configuration, tissue and gene selection strategy used to build
# the input/output file names below.
DATASET_CONFIG = conf.GTEX
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

# Number of permutations passed to ccc() to compute its p-value.
PVALUE_N_PERMS = 10_000_000

# Fixed-seed random generator.
# NOTE(review): not referenced by any of the cells visible in this notebook.
RANDOM_STATE = np.random.RandomState(seed=0)

# Paths

In [3]:
# Gene expression file for the selected tissue / gene selection strategy.
INPUT_GENE_EXPR_FILE = DATASET_CONFIG["GENE_SELECTION_DIR"].joinpath(
    f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_EXPR_FILE)

# Fail early if the input file is not there.
assert INPUT_GENE_EXPR_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [4]:
# Gene pair intersections file for the selected tissue / gene selection strategy.
INPUT_GENE_PAIRS_INTERSECTIONS_FILE = DATASET_CONFIG[
    "GENE_PAIR_INTERSECTIONS"
].joinpath(f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl")
display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

# Fail early if the input file is not there.
assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')

In [5]:
# Folder holding the sampled gene pairs and the p-values computed here;
# it is created (with any missing parents) if it does not exist yet.
OUTPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"].joinpath("pvalues")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [6]:
OUTPUT_DIR

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues')

# Load gene expression data

In [7]:
# Expression matrix with genes in rows (indexed by Ensembl gene ID) and GTEx
# samples in columns; rows are sorted by gene ID.
data = pd.read_pickle(INPUT_GENE_EXPR_FILE).sort_index()

In [8]:
data.shape

(5000, 755)

In [9]:
data.head()

Unnamed: 0_level_0,GTEX-111YS-0006-SM-5NQBE,GTEX-1122O-0005-SM-5O99J,GTEX-1128S-0005-SM-5P9HI,GTEX-113IC-0006-SM-5NQ9C,GTEX-113JC-0006-SM-5O997,GTEX-117XS-0005-SM-5PNU6,GTEX-117YW-0005-SM-5NQ8Z,GTEX-1192W-0005-SM-5NQBQ,GTEX-1192X-0005-SM-5NQC3,GTEX-11DXW-0006-SM-5NQ7Y,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419.12,20.65,25.05,7.155,49.13,6.147,4.143,5.39,4.389,1.158,6.824,...,4.407,32.34,18.68,9.251,7.828,7.46,33.24,5.848,25.76,17.08
ENSG00000000938.12,906.0,1344.0,633.5,719.2,392.6,166.5,338.2,413.2,51.54,423.6,...,354.8,1102.0,774.9,206.0,620.4,346.3,1304.0,232.9,631.6,884.5
ENSG00000001167.14,8.19,20.01,20.47,21.22,16.46,8.619,18.22,16.58,1.602,35.68,...,11.34,11.25,11.18,9.523,41.86,24.58,8.892,13.39,13.47,42.64
ENSG00000001561.6,0.7104,1.771,2.234,6.014,3.206,0.3962,2.445,1.418,0.5531,0.7447,...,0.9269,2.555,0.5976,3.417,2.645,1.883,0.5391,0.9816,1.036,6.729
ENSG00000002549.12,22.5,21.33,19.29,157.1,29.33,9.577,14.17,23.33,1.407,28.3,...,4.493,50.47,16.21,32.74,18.15,11.92,20.1,15.55,11.98,35.37


# Load gene pair samples

In [10]:
# NOTE: despite its name, this path is used as an *input* (it is read in the
# next cell); `output_file` is reassigned to the real output path further below.
output_file = OUTPUT_DIR / "gene_pair-samples.pkl"

In [11]:
gene_pair_samples = pd.read_pickle(output_file)

In [12]:
len(gene_pair_samples)

25

In [13]:
sorted(gene_pair_samples.keys())

['all_high-top_ccc',
 'all_high-top_pearson',
 'all_high-top_spearman',
 'all_low-top_ccc',
 'all_low-top_pearson',
 'all_low-top_spearman',
 'ccc_high_and_pearson_low-top_ccc',
 'ccc_high_and_pearson_low-top_pearson',
 'ccc_high_and_pearson_low-top_spearman',
 'ccc_high_and_spearman_low-top_ccc',
 'ccc_high_and_spearman_low-top_pearson',
 'ccc_high_and_spearman_low-top_spearman',
 'ccc_high_and_spearman_pearson_low-top_ccc',
 'ccc_high_and_spearman_pearson_low-top_pearson',
 'ccc_high_and_spearman_pearson_low-top_spearman',
 'ccc_spearman_high_and_pearson_low-top_ccc',
 'ccc_spearman_high_and_pearson_low-top_pearson',
 'ccc_spearman_high_and_pearson_low-top_spearman',
 'pearson_high_and_ccc_low-top_ccc',
 'pearson_high_and_ccc_low-top_pearson',
 'pearson_high_and_ccc_low-top_spearman',
 'pearson_high_and_ccc_spearman_low-top_ccc',
 'pearson_high_and_ccc_spearman_low-top_pearson',
 'pearson_high_and_ccc_spearman_low-top_spearman',
 'selected_in_manuscript']

In [14]:
# Preview the sampled gene pairs of the first group.
_k = list(gene_pair_samples.keys())[0]
gene_pair_samples[_k].head()

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc,pearson,spearman
ENSG00000277089.4,ENSG00000277632.1,True,False,True,False,True,False,0.916918,0.916024,0.955427
ENSG00000171148.13,ENSG00000247596.8,True,False,True,False,True,False,0.89671,0.965141,0.973215
ENSG00000168298.6,ENSG00000124575.6,True,False,True,False,True,False,0.89671,0.869247,0.914341
ENSG00000125910.5,ENSG00000165879.8,True,False,True,False,True,False,0.88669,0.896396,0.940058
ENSG00000130772.13,ENSG00000163154.5,True,False,True,False,True,False,0.876726,0.945764,0.949664


In [15]:
# The index entries are (gene0, gene1) tuples.
list(gene_pair_samples[_k].head(10).index)

[('ENSG00000277089.4', 'ENSG00000277632.1'),
 ('ENSG00000171148.13', 'ENSG00000247596.8'),
 ('ENSG00000168298.6', 'ENSG00000124575.6'),
 ('ENSG00000125910.5', 'ENSG00000165879.8'),
 ('ENSG00000130772.13', 'ENSG00000163154.5')]

# Compute p-values on sampled gene pairs

In [16]:
output_file = OUTPUT_DIR / "gene_pair-samples-pvalues.pkl"

In [17]:
def corr_single(x, y, pvalue_n_perms=None, n_jobs=None):
    """
    Computes the CCC, Pearson and Spearman coefficients, with their p-values,
    for a single pair of vectors.

    Args:
        x: first vector of values (in this notebook, the expression values
            of one gene across samples).
        y: second vector of values, paired with x.
        pvalue_n_perms: number of permutations forwarded to ccc() for its
            p-value. If None (default), the notebook-level PVALUE_N_PERMS
            is used.
        n_jobs: value forwarded to ccc() as n_jobs. If None (default),
            conf.GENERAL["N_JOBS"] is used. Pass n_jobs=1 when the
            parallelization is done across gene pairs instead of across
            permutations.

    Returns:
        A tuple (ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval) with each
        coefficient followed by its p-value, in the order CCC, Pearson,
        Spearman.
    """
    # Defaults are resolved at call time, so calling corr_single(x, y) behaves
    # exactly as before.
    if pvalue_n_perms is None:
        pvalue_n_perms = PVALUE_N_PERMS
    if n_jobs is None:
        n_jobs = conf.GENERAL["N_JOBS"]

    ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=pvalue_n_perms, n_jobs=n_jobs)
    p_val, p_pval = stats.pearsonr(x, y)
    s_val, s_pval = stats.spearmanr(x, y)

    return ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval

In [18]:
results = []

# The ProcessPoolExecutor is kept here to make it easy to switch between
# parallelizing across gene pairs (max_workers=conf.GENERAL["N_JOBS"] here and
# n_jobs=1 inside function corr_single) or across permutations for one gene
# pair (max_workers=1 here and n_jobs=conf.GENERAL["N_JOBS"] in corr_single).
with ProcessPoolExecutor(max_workers=1) as executor:
    # Map each submitted future to the (gene0, gene1, group) it was created for,
    # so the result can be labeled when the future completes.
    tasks = {
        executor.submit(corr_single, data.loc[gene0], data.loc[gene1]): (
            gene0,
            gene1,
            group,
        )
        for group, group_pairs in gene_pair_samples.items()
        for gene0, gene1 in group_pairs.index
    }

    for task in tqdm(as_completed(tasks), total=len(tasks), ncols=100):
        gene0, gene1, group = tasks[task]
        ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval = task.result()

        results.append(
            {
                "gene0": gene0,
                "gene1": gene1,
                "group": group,
                "ccc": ccc_val,
                "ccc_pvalue": ccc_pval,
                "pearson": p_val,
                "pearson_pvalue": p_pval,
                "spearman": s_val,
                "spearman_pvalue": s_pval,
            }
        )

        # Checkpoint: overwrite the output file with everything computed so
        # far after each gene pair, so partial results survive an interruption
        # of this long-running cell.
        _df = pd.DataFrame(results)
        _df["group"] = _df["group"].astype("category")
        _df.to_pickle(output_file)

100%|█████████████████████████████████████████████████████████| 128/128 [10:48:12<00:00, 303.84s/it]


In [19]:
len(results)

128

In [20]:
# One row per gene pair; the "group" column is stored as a categorical.
results_df = pd.DataFrame(results).astype({"group": "category"})

In [21]:
results_df.shape

(128, 9)

In [22]:
results_df.head()

Unnamed: 0,gene0,gene1,group,ccc,ccc_pvalue,pearson,pearson_pvalue,spearman,spearman_pvalue
0,ENSG00000277089.4,ENSG00000277632.1,all_high-top_ccc,0.916918,9.999999e-08,0.916024,5.888683e-301,0.955427,0.0
1,ENSG00000171148.13,ENSG00000247596.8,all_high-top_ccc,0.89671,9.999999e-08,0.965141,0.0,0.973215,0.0
2,ENSG00000168298.6,ENSG00000124575.6,all_high-top_ccc,0.89671,9.999999e-08,0.869247,1.416256e-232,0.914341,7.434755e-298
3,ENSG00000125910.5,ENSG00000165879.8,all_high-top_ccc,0.88669,9.999999e-08,0.896396,2.7581260000000003e-268,0.940058,0.0
4,ENSG00000130772.13,ENSG00000163154.5,all_high-top_ccc,0.876726,9.999999e-08,0.945764,0.0,0.949664,0.0


# Save

In [23]:
# Final write of the complete results; the computation loop above already
# overwrites this same file after every processed gene pair.
results_df.to_pickle(output_file)