# Description

This notebook reads the previously generated p-values and adjusts them for multiple testing using FDR (Benjamini-Hochberg).

# Modules loading

In [1]:
import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests

from ccc import conf

# Settings

In [2]:
# Dataset-specific configuration (GTEx) taken from the project's conf module
DATASET_CONFIG = conf.GTEX
# NOTE(review): the two settings below are not referenced by any cell visible
# in this notebook - confirm whether they are still needed
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

# Paths

In [3]:
# Folder with the p-values files; it must exist before this notebook runs
OUTPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"].joinpath("pvalues")
assert OUTPUT_DIR.exists()

In [4]:
# Show the resolved folder so the run records where files are read/written
OUTPUT_DIR

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues')

In [5]:
# Pickle with the unadjusted p-values; it must already be on disk
INPUT_PVALUES_FILE = OUTPUT_DIR.joinpath("gene_pair-samples-pvalues.pkl")
display(INPUT_PVALUES_FILE)
assert INPUT_PVALUES_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues/gene_pair-samples-pvalues.pkl')

# Load pvalues

In [6]:
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project's own pipeline.
pvalues = pd.read_pickle(INPUT_PVALUES_FILE)
# Order rows by their index so the tables shown below are stable between runs
pvalues = pvalues.sort_index()

In [7]:
# (rows, columns) of the table as loaded from disk
pvalues.shape

(12116, 9)

In [8]:
# Peek at the first rows: gene pair, group, coefficients and their p-values
pvalues.head()

Unnamed: 0,gene0,gene1,group,ccc,ccc_pvalue,pearson,pearson_pvalue,spearman,spearman_pvalue
0,ENSG00000197324.8,ENSG00000171302.16,all_high-top_ccc,0.866819,9.99999e-07,0.958808,0.0,0.972106,0.0
1,ENSG00000197561.6,ENSG00000172232.9,all_high-top_ccc,0.876726,9.99999e-07,0.953421,0.0,0.97859,0.0
2,ENSG00000147872.9,ENSG00000135245.9,all_high-top_ccc,0.876726,9.99999e-07,0.849159,5.792244e-211,0.911077,5.115786000000001e-292
3,ENSG00000235568.6,ENSG00000169403.11,all_high-top_ccc,0.866819,9.99999e-07,0.934582,0.0,0.951794,0.0
4,ENSG00000125910.5,ENSG00000165879.8,all_high-top_ccc,0.88669,9.99999e-07,0.896396,2.7581260000000003e-268,0.940058,0.0


# Get set of unique gene pairs

In [9]:
# Index rows by gene pair. The gene columns are moved into the index, so this
# cell cannot be re-run on its own output (gene0/gene1 are no longer columns).
pvalues = pvalues.set_index(keys=["gene0", "gene1"], drop=True)

In [10]:
# The same gene pair can show up under more than one group, so repeated index
# entries are expected here; stop if there are none
assert pvalues.index.has_duplicates

In [11]:
# List every row whose (gene0, gene1) pair occurs more than once
pvalues[pvalues.index.duplicated(keep=False)].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,group,ccc,ccc_pvalue,pearson,pearson_pvalue,spearman,spearman_pvalue
gene0,gene1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000002834.17,ENSG00000068976.13,ccc_spearman_high_and_pearson_low-top_ccc,0.429092,9.999990e-07,0.114457,1.632100e-03,0.761734,4.379293e-144
ENSG00000002834.17,ENSG00000068976.13,ccc_spearman_high_and_pearson_low-top_spearman,0.429092,9.999990e-07,0.114457,1.632100e-03,0.761734,4.379293e-144
ENSG00000003756.16,ENSG00000183542.5,pearson_high_and_ccc_low-top_ccc,0.034734,9.999990e-07,0.437469,1.218991e-36,0.299159,4.481318e-17
ENSG00000003756.16,ENSG00000183542.5,pearson_high_and_ccc_low-top_spearman,0.034734,9.999990e-07,0.437469,1.218991e-36,0.299159,4.481318e-17
ENSG00000004700.15,ENSG00000169884.13,pearson_high_and_ccc_low-top_ccc,0.033224,9.999990e-07,0.438002,9.795726e-37,0.303437,1.519754e-17
...,...,...,...,...,...,...,...,...
ENSG00000281649.1,ENSG00000133067.17,pearson_high_and_ccc_low-top_spearman,0.031776,9.999990e-07,0.466467,4.680131e-42,0.316109,5.551746e-19
ENSG00000281649.1,ENSG00000200259.1,pearson_high_and_ccc_low-top_ccc,0.034577,9.999990e-07,0.449314,8.640972e-39,0.301519,2.473806e-17
ENSG00000281649.1,ENSG00000200259.1,pearson_high_and_ccc_low-top_spearman,0.034577,9.999990e-07,0.449314,8.640972e-39,0.301519,2.473806e-17
ENSG00000284526.1,ENSG00000068976.13,ccc_spearman_high_and_pearson_low-top_ccc,0.422164,9.999990e-07,0.085627,1.861099e-02,0.707076,1.948370e-115


In [12]:
# One row per gene pair: keep the first occurrence and drop later repeats
pvalues_nodup = pvalues[~pvalues.index.duplicated(keep="first")]

In [13]:
# Rows left after dropping repeated gene pairs
pvalues_nodup.shape

(9824, 7)

# Adjust p-values for all methods

In [14]:
# Trial run on the Pearson p-values only, to inspect what multipletests returns
adj_pvals = multipletests(
    pvals=pvalues_nodup["pearson_pvalue"],
    alpha=0.05,
    method="fdr_bh",
)

In [15]:
# Element 1 holds the corrected p-values: one per deduplicated gene pair
adj_pvals[1].shape

(9824,)

In [16]:
# Full return value of multipletests, shown for inspection
adj_pvals

(array([ True,  True,  True, ...,  True, False, False]),
 array([0.00000000e+000, 0.00000000e+000, 5.12639712e-210, ...,
        2.81036608e-020, 9.19067279e-001, 1.32862419e-001]),
 5.2212093323289466e-06,
 5.089576547231271e-06)

In [17]:
# Benjamini-Hochberg adjustment, one coefficient at a time; each pass adds a
# "<coef>_fdr" column computed from the matching "<coef>_pvalue" column.
for coef in ("ccc", "pearson", "spearman"):
    pval_col_name, fdr_col_name = f"{coef}_pvalue", f"{coef}_fdr"
    print(f"{pval_col_name} - {fdr_col_name}")

    adj_pvals = multipletests(
        pvals=pvalues_nodup[pval_col_name], alpha=0.05, method="fdr_bh"
    )
    # position 1 of the returned tuple is the array of corrected p-values
    pvalues_nodup = pvalues_nodup.assign(**{fdr_col_name: adj_pvals[1]})

ccc_pvalue - ccc_fdr
pearson_pvalue - pearson_fdr
spearman_pvalue - spearman_fdr


In [18]:
# Same rows as before, plus the three *_fdr columns added by the loop
pvalues_nodup.shape

(9824, 10)

In [19]:
# Reorder columns: "group" first, then every other column in alphabetical
# order, so each coefficient is followed by its *_fdr and *_pvalue columns.
# The order is stated explicitly rather than relying on a temporary rename
# that happens to sort before the other column names.
_tmp = pvalues_nodup[
    ["group"] + sorted(c for c in pvalues_nodup.columns if c != "group")
]
display(_tmp.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,group,ccc,ccc_fdr,ccc_pvalue,pearson,pearson_fdr,pearson_pvalue,spearman,spearman_fdr,spearman_pvalue
gene0,gene1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000197324.8,ENSG00000171302.16,all_high-top_ccc,0.866819,1e-06,9.99999e-07,0.958808,0.0,0.0,0.972106,0.0,0.0
ENSG00000197561.6,ENSG00000172232.9,all_high-top_ccc,0.876726,1e-06,9.99999e-07,0.953421,0.0,0.0,0.97859,0.0,0.0
ENSG00000147872.9,ENSG00000135245.9,all_high-top_ccc,0.876726,1e-06,9.99999e-07,0.849159,5.126397e-210,5.792244e-211,0.911077,5.295835e-291,5.115786000000001e-292
ENSG00000235568.6,ENSG00000169403.11,all_high-top_ccc,0.866819,1e-06,9.99999e-07,0.934582,0.0,0.0,0.951794,0.0,0.0
ENSG00000125910.5,ENSG00000165879.8,all_high-top_ccc,0.88669,1e-06,9.99999e-07,0.896396,2.834292e-267,2.7581260000000003e-268,0.940058,0.0,0.0


In [20]:
# Adopt the column-reordered frame as the working deduplicated table
pvalues_nodup = _tmp

In [21]:
# Reordering must not change the number of rows or columns
pvalues_nodup.shape

(9824, 10)

In [22]:
# First rows of the deduplicated table with adjusted p-values
pvalues_nodup.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,group,ccc,ccc_fdr,ccc_pvalue,pearson,pearson_fdr,pearson_pvalue,spearman,spearman_fdr,spearman_pvalue
gene0,gene1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000197324.8,ENSG00000171302.16,all_high-top_ccc,0.866819,1e-06,9.99999e-07,0.958808,0.0,0.0,0.972106,0.0,0.0
ENSG00000197561.6,ENSG00000172232.9,all_high-top_ccc,0.876726,1e-06,9.99999e-07,0.953421,0.0,0.0,0.97859,0.0,0.0
ENSG00000147872.9,ENSG00000135245.9,all_high-top_ccc,0.876726,1e-06,9.99999e-07,0.849159,5.126397e-210,5.792244e-211,0.911077,5.295835e-291,5.115786000000001e-292
ENSG00000235568.6,ENSG00000169403.11,all_high-top_ccc,0.866819,1e-06,9.99999e-07,0.934582,0.0,0.0,0.951794,0.0,0.0
ENSG00000125910.5,ENSG00000165879.8,all_high-top_ccc,0.88669,1e-06,9.99999e-07,0.896396,2.834292e-267,2.7581260000000003e-268,0.940058,0.0,0.0


# Reassign adjusted pvalues to original file

In [23]:
# Full table (repeated gene pairs included) before the *_fdr columns are added
pvalues.shape

(12116, 7)

In [24]:
# Copy the adjusted p-values from the deduplicated table back to the full one.
# `pvalues_nodup` has a single row per (gene0, gene1), so each FDR series is
# aligned on the index and every repeated gene pair receives the same value.
pvalues = pvalues.assign(
    **{
        f"{coef}_fdr": pvalues_nodup[f"{coef}_fdr"]
        for coef in ("ccc", "pearson", "spearman")
    }
)
# Use the same column order as the deduplicated table
pvalues = pvalues[pvalues_nodup.columns]

In [25]:
# Row count must be unchanged; three *_fdr columns were added
pvalues.shape

(12116, 10)

In [26]:
# First rows of the full table, now including adjusted p-values
pvalues.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,group,ccc,ccc_fdr,ccc_pvalue,pearson,pearson_fdr,pearson_pvalue,spearman,spearman_fdr,spearman_pvalue
gene0,gene1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000197324.8,ENSG00000171302.16,all_high-top_ccc,0.866819,1e-06,9.99999e-07,0.958808,0.0,0.0,0.972106,0.0,0.0
ENSG00000197561.6,ENSG00000172232.9,all_high-top_ccc,0.876726,1e-06,9.99999e-07,0.953421,0.0,0.0,0.97859,0.0,0.0
ENSG00000147872.9,ENSG00000135245.9,all_high-top_ccc,0.876726,1e-06,9.99999e-07,0.849159,5.126397e-210,5.792244e-211,0.911077,5.295835e-291,5.115786000000001e-292
ENSG00000235568.6,ENSG00000169403.11,all_high-top_ccc,0.866819,1e-06,9.99999e-07,0.934582,0.0,0.0,0.951794,0.0,0.0
ENSG00000125910.5,ENSG00000165879.8,all_high-top_ccc,0.88669,1e-06,9.99999e-07,0.896396,2.834292e-267,2.7581260000000003e-268,0.940058,0.0,0.0


In [27]:
# Visual check: rows of a repeated gene pair should show identical
# coefficients and adjusted p-values
pvalues[pvalues.index.duplicated(keep=False)].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,group,ccc,ccc_fdr,ccc_pvalue,pearson,pearson_fdr,pearson_pvalue,spearman,spearman_fdr,spearman_pvalue
gene0,gene1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000002834.17,ENSG00000068976.13,ccc_spearman_high_and_pearson_low-top_ccc,0.429092,0.000001,9.999990e-07,0.114457,2.939815e-03,1.632100e-03,0.761734,2.764921e-143,4.379293e-144
ENSG00000002834.17,ENSG00000068976.13,ccc_spearman_high_and_pearson_low-top_spearman,0.429092,0.000001,9.999990e-07,0.114457,2.939815e-03,1.632100e-03,0.761734,2.764921e-143,4.379293e-144
ENSG00000003756.16,ENSG00000183542.5,pearson_high_and_ccc_low-top_ccc,0.034734,0.000001,9.999990e-07,0.437469,3.582223e-36,1.218991e-36,0.299159,6.953793e-17,4.481318e-17
ENSG00000003756.16,ENSG00000183542.5,pearson_high_and_ccc_low-top_spearman,0.034734,0.000001,9.999990e-07,0.437469,3.582223e-36,1.218991e-36,0.299159,6.953793e-17,4.481318e-17
ENSG00000004700.15,ENSG00000169884.13,pearson_high_and_ccc_low-top_ccc,0.033224,0.000001,9.999990e-07,0.438002,2.895974e-36,9.795726e-37,0.303437,2.381950e-17,1.519754e-17
...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000281649.1,ENSG00000133067.17,pearson_high_and_ccc_low-top_spearman,0.031776,0.000001,9.999990e-07,0.466467,1.628679e-41,4.680131e-42,0.316109,8.901641e-19,5.551746e-19
ENSG00000281649.1,ENSG00000200259.1,pearson_high_and_ccc_low-top_ccc,0.034577,0.000001,9.999990e-07,0.449314,2.760615e-38,8.640972e-39,0.301519,3.862471e-17,2.473806e-17
ENSG00000281649.1,ENSG00000200259.1,pearson_high_and_ccc_low-top_spearman,0.034577,0.000001,9.999990e-07,0.449314,2.760615e-38,8.640972e-39,0.301519,3.862471e-17,2.473806e-17
ENSG00000284526.1,ENSG00000068976.13,ccc_spearman_high_and_pearson_low-top_ccc,0.422164,0.000001,9.999990e-07,0.085627,2.553196e-02,1.861099e-02,0.707076,9.171437e-115,1.948370e-115


In [28]:
def _assert_same_values(x):
    for coef in ("ccc", "pearson", "spearman"):
        assert x[f"{coef}"].unique().shape[0] == 1
        assert x[f"{coef}_fdr"].unique().shape[0] == 1

        # for CCC, the pvalue column is computed via permutations, so we don't expect to be all the same
        if coef == "ccc":
            assert x[f"{coef}_pvalue"].unique().shape[0] >= 1, x
        else:
            assert x[f"{coef}_pvalue"].unique().shape[0] == 1, x

In [29]:
# Run the consistency check once per gene pair that appears more than once
(
    pvalues[pvalues.index.duplicated(keep=False)]
    .groupby(["gene0", "gene1"])
    .apply(_assert_same_values)
)
print("values seem correct")

values seem correct


# Save

In [30]:
# Folder of the input file; the output is written next to it
INPUT_PVALUES_FILE.parent

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues')

In [31]:
# File name without extension; reused as the base of the output name
INPUT_PVALUES_FILE.stem

'gene_pair-samples-pvalues'

In [32]:
# Extension of the input file; the output keeps the same one
INPUT_PVALUES_FILE.suffix

'.pkl'

In [33]:
# Same folder and extension as the input, with "-fdr" appended to the stem
output_file = INPUT_PVALUES_FILE.with_name(
    f"{INPUT_PVALUES_FILE.stem}-fdr{INPUT_PVALUES_FILE.suffix}"
)
display(output_file)

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues/gene_pair-samples-pvalues-fdr.pkl')

In [34]:
# Write the full table (repeated gene pairs included) with the *_fdr columns
pvalues.to_pickle(output_file)