# Description

Reads the gene pair samples across different categories and computes, for each gene pair, the CCC, Pearson and Spearman coefficients together with their p-values.

# Modules loading

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import pandas as pd
from concurrent.futures import as_completed, ProcessPoolExecutor
from tqdm import tqdm

from ccc.coef import ccc
from ccc import conf

# Settings

In [None]:
# Dataset configuration plus the tissue and gene selection strategy that are
# embedded in the input file names below.
DATASET_CONFIG = conf.GTEX
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

# Number of permutations handed to ccc() (via pvalue_n_perms) for its p-value.
PVALUE_N_PERMS = 10_000_000

# Fixed-seed random state.
# NOTE(review): not referenced by any other cell visible in this notebook.
RANDOM_STATE = np.random.RandomState(seed=0)

# Paths

In [None]:
# Pickle with the gene expression data for the selected tissue and gene
# selection strategy.
INPUT_GENE_EXPR_FILE = DATASET_CONFIG["GENE_SELECTION_DIR"].joinpath(
    f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_EXPR_FILE)

# Stop right away if the expected input file is missing.
assert INPUT_GENE_EXPR_FILE.exists()

In [None]:
# Pickle with the gene pair intersections for the selected tissue and gene
# selection strategy.
INPUT_GENE_PAIRS_INTERSECTIONS_FILE = DATASET_CONFIG[
    "GENE_PAIR_INTERSECTIONS"
].joinpath(f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl")
display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

# Stop right away if the expected input file is missing.
assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()

In [None]:
# Folder holding the p-value files of this notebook; it is created (together
# with any missing parents) if it does not exist yet.
OUTPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"].joinpath("pvalues")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# Display the output directory path.
OUTPUT_DIR

# Load gene expression data

In [None]:
# Read the pickled gene expression data, then order it by its index labels.
data = pd.read_pickle(INPUT_GENE_EXPR_FILE)
data = data.sort_index()

In [None]:
# Display the dimensions of the loaded gene expression data.
data.shape

In [None]:
# Preview the first rows of the loaded gene expression data.
data.head()

# Load gene pair samples

In [None]:
# Location of the previously generated gene pair samples.
# NOTE: despite the variable name, this file is read as input by the next cell.
output_file = OUTPUT_DIR.joinpath("gene_pair-samples.pkl")

In [None]:
# Load the gene pair samples. Later cells use this object through keys() and
# items(), and iterate each value's index as (gene0, gene1) pairs.
gene_pair_samples = pd.read_pickle(output_file)

In [None]:
# Display how many groups of gene pair samples were loaded.
len(gene_pair_samples)

In [None]:
# Display the group names in sorted order.
sorted(gene_pair_samples.keys())

In [None]:
# Take the first group key and preview the first rows of that group's samples.
_k = list(gene_pair_samples.keys())[0]
gene_pair_samples[_k].head()

In [None]:
# Display the first ten index entries of that group as a plain list.
list(gene_pair_samples[_k].head(10).index)

# Compute p-values on sampled gene pairs

In [None]:
# Destination file for the coefficients and p-values computed below.
output_file = OUTPUT_DIR.joinpath("gene_pair-samples-pvalues.pkl")

In [None]:
def corr_single(x, y):
    """Compute CCC, Pearson and Spearman (value and p-value each) for x and y.

    Parameters
    ----------
    x, y
        The two data vectors to compare; they are passed unchanged to
        ``ccc``, ``stats.pearsonr`` and ``stats.spearmanr``.

    Returns
    -------
    tuple
        ``(ccc, ccc_pvalue, pearson, pearson_pvalue, spearman,
        spearman_pvalue)``, in that order. The CCC p-value is obtained by
        calling ``ccc`` with ``pvalue_n_perms=PVALUE_N_PERMS``.
    """
    ccc_coef, ccc_p = ccc(
        x,
        y,
        pvalue_n_perms=PVALUE_N_PERMS,
        n_jobs=conf.GENERAL["N_JOBS"],
    )
    pearson_r, pearson_p = stats.pearsonr(x, y)
    spearman_rho, spearman_p = stats.spearmanr(x, y)

    return (ccc_coef, ccc_p, pearson_r, pearson_p, spearman_rho, spearman_p)

In [None]:
# One record (dict) per processed gene pair.
results = []

# The ProcessPoolExecutor is kept here to make it easy to switch between
# parallelizing across gene pairs (max_workers=conf.GENERAL["N_JOBS"] here and
# n_jobs=1 inside function corr_single) and parallelizing across permutations
# for one gene pair (max_workers=1 here and n_jobs=conf.GENERAL["N_JOBS"]
# inside corr_single, which is the current setup).
with ProcessPoolExecutor(max_workers=1) as executor:
    # Submit one task per sampled gene pair and remember, for every future,
    # which (gene0, gene1, group) it belongs to.
    tasks = {
        executor.submit(corr_single, data.loc[gene0], data.loc[gene1]): (
            gene0,
            gene1,
            k,
        )
        for k, v in gene_pair_samples.items()
        for gene0, gene1 in v.index
    }

    # Collect results in completion order while showing a progress bar.
    for t in tqdm(as_completed(tasks), total=len(tasks), ncols=100):
        gene0, gene1, k = tasks[t]
        ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval = t.result()

        results.append(
            {
                "gene0": gene0,
                "gene1": gene1,
                "group": k,
                "ccc": ccc_val,
                "ccc_pvalue": ccc_pval,
                "pearson": p_val,
                "pearson_pvalue": p_pval,
                "spearman": s_val,
                "spearman_pvalue": s_pval,
            }
        )

        # Checkpoint: after every completed gene pair, rewrite the output file
        # with all results collected so far.
        _df = pd.DataFrame(results)
        _df["group"] = _df["group"].astype("category")
        _df.to_pickle(output_file)

In [None]:
# Display how many gene pairs were processed.
len(results)

In [None]:
# Assemble the final table from the collected records, storing the group
# label as a categorical column.
results_df = pd.DataFrame(results).astype({"group": "category"})

In [None]:
# Display the dimensions of the final results table.
results_df.shape

In [None]:
# Preview the first rows of the final results table.
results_df.head()

# Save

In [None]:
# Write the final results table to the output file (the same path used for the
# per-pair checkpoints in the computation loop).
results_df.to_pickle(output_file)