# Description

It samples gene pairs from each of the categories in Figure 3b.

# Modules loading

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import pandas as pd

from ccc import conf

# Settings

In [2]:
# Dataset configuration, tissue and gene selection strategy; the last two are
# interpolated into the input file names built in the "Paths" section.
DATASET_CONFIG = conf.GTEX
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

# Seeded generator passed to DataFrame.sample further down. It is a stateful
# object: every draw advances it, so re-running a sampling cell without
# re-running this one produces different samples.
RANDOM_STATE = np.random.RandomState(0)

# Paths

In [3]:
# Path to the gene expression pickle for the configured tissue and gene
# selection strategy.
# NOTE(review): this path is only displayed and checked here; no visible cell
# reads the file.
INPUT_GENE_EXPR_FILE = (
    DATASET_CONFIG["GENE_SELECTION_DIR"]
    / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_EXPR_FILE)

# Fail early if the expected input file is missing.
assert INPUT_GENE_EXPR_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [4]:
# Path to the gene pair intersections pickle for the configured tissue and gene
# selection strategy; it is loaded into `df_plot` below.
INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (
    DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"]
    / f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

# Fail early if the expected input file is missing.
assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')

In [5]:
# Directory that receives the sampled gene pairs; created (with parents) if it
# does not exist yet.
OUTPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"] / "pvalues"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [6]:
# Show the resolved output directory.
OUTPUT_DIR

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues')

# Load gene pairs intersection

In [7]:
# Load the gene pair intersections table. Unpickling can execute arbitrary
# code, so this must only be pointed at trusted, project-generated files.
df_plot = pd.read_pickle(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

In [8]:
# Number of gene pairs (rows) and columns loaded.
df_plot.shape

(12497500, 9)

In [9]:
# Preview the first rows: boolean "(high)"/"(low)" flags per coefficient plus
# the ccc, pearson and spearman values.
df_plot.head()

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc,pearson,spearman
ENSG00000000419.12,ENSG00000002834.17,True,False,True,False,True,False,0.418721,0.681847,0.786595
ENSG00000000419.12,ENSG00000002919.14,True,False,True,False,True,False,0.40509,0.734699,0.816991
ENSG00000000419.12,ENSG00000002933.7,False,True,False,True,False,True,0.007466,0.013825,0.004128
ENSG00000000419.12,ENSG00000003402.19,True,False,True,False,True,False,0.391683,0.727347,0.803653
ENSG00000000419.12,ENSG00000004478.7,False,True,False,False,False,False,0.099013,0.094147,0.231269


In [10]:
# Exploratory view: pairs flagged "Spearman (high)" and "Pearson (low)" for
# which neither Clustermatch flag is set.
df_plot.loc[
    df_plot["Pearson (low)"]
    & df_plot["Spearman (high)"]
    & ~(df_plot["Clustermatch (high)"] | df_plot["Clustermatch (low)"])
]

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc,pearson,spearman
ENSG00000003402.19,ENSG00000133800.8,False,True,True,False,False,False,0.157825,0.048650,0.570855
ENSG00000003756.16,ENSG00000163739.4,False,True,True,False,False,False,0.164233,0.040470,0.565410
ENSG00000004776.12,ENSG00000196616.13,False,True,True,False,False,False,0.159947,0.097615,0.568500
ENSG00000008083.13,ENSG00000163739.4,False,True,True,False,False,False,0.159947,0.069861,0.571621
ENSG00000008988.9,ENSG00000229769.2,False,True,True,False,False,False,0.151281,0.091727,0.568723
...,...,...,...,...,...,...,...,...,...,...
ENSG00000278274.1,ENSG00000169136.10,False,True,True,False,False,False,0.172974,0.076674,0.569907
ENSG00000279520.1,ENSG00000163739.4,False,True,True,False,False,False,0.168576,0.061147,0.584956
ENSG00000281358.1,ENSG00000139194.7,False,True,True,False,False,False,0.177429,0.100793,0.615723
ENSG00000281649.1,ENSG00000163739.4,False,True,True,False,False,False,0.177429,0.080855,0.574391


# Select gene pairs from each category in Figure 3b

In [11]:
# Maps a category name to the subset of `df_plot` rows that belong to it.
gene_pair_cats = {}

In [12]:
# Pairs whose "(high)" flag is set for all three coefficients.
gene_pair_cats["all_high"] = df_plot.loc[
    df_plot["Pearson (high)"]
    & df_plot["Spearman (high)"]
    & df_plot["Clustermatch (high)"]
]
display(gene_pair_cats["all_high"].shape)

(3120576, 9)

In [13]:
# Pairs whose "(low)" flag is set for all three coefficients.
gene_pair_cats["all_low"] = df_plot.loc[
    df_plot["Pearson (low)"]
    & df_plot["Spearman (low)"]
    & df_plot["Clustermatch (low)"]
]
display(gene_pair_cats["all_low"].shape)

(2545332, 9)

In [14]:
# Pairs flagged high by both Clustermatch and Spearman but low by Pearson.
gene_pair_cats["ccc_spearman_high_and_pearson_low"] = df_plot.loc[
    df_plot["Pearson (low)"]
    & df_plot["Spearman (high)"]
    & df_plot["Clustermatch (high)"]
]
display(gene_pair_cats["ccc_spearman_high_and_pearson_low"].shape)

(9748, 9)

In [15]:
# Pairs flagged high by Clustermatch and low by Pearson, with neither of the
# Spearman flags set.
gene_pair_cats["ccc_high_and_pearson_low"] = df_plot.loc[
    df_plot["Clustermatch (high)"]
    & df_plot["Pearson (low)"]
    & ~(df_plot["Spearman (high)"] | df_plot["Spearman (low)"])
]
display(gene_pair_cats["ccc_high_and_pearson_low"].shape)

(20951, 9)

In [16]:
# Pairs flagged high by Clustermatch and low by Spearman, with neither of the
# Pearson flags set.
gene_pair_cats["ccc_high_and_spearman_low"] = df_plot.loc[
    df_plot["Clustermatch (high)"]
    & df_plot["Spearman (low)"]
    & ~(df_plot["Pearson (low)"] | df_plot["Pearson (high)"])
]
display(gene_pair_cats["ccc_high_and_spearman_low"].shape)

(28, 9)

In [17]:
# Pairs flagged high by Clustermatch but low by both Spearman and Pearson.
gene_pair_cats["ccc_high_and_spearman_pearson_low"] = df_plot.loc[
    df_plot["Pearson (low)"]
    & df_plot["Spearman (low)"]
    & df_plot["Clustermatch (high)"]
]
display(gene_pair_cats["ccc_high_and_spearman_pearson_low"].shape)

(8, 9)

In [18]:
# Pairs flagged high by Pearson and low by Clustermatch, with neither of the
# Spearman flags set.
gene_pair_cats["pearson_high_and_ccc_low"] = df_plot.loc[
    df_plot["Pearson (high)"]
    & df_plot["Clustermatch (low)"]
    & ~(df_plot["Spearman (low)"] | df_plot["Spearman (high)"])
]
display(gene_pair_cats["pearson_high_and_ccc_low"].shape)

(1075, 9)

In [19]:
# Pairs flagged high by Pearson but low by both Clustermatch and Spearman.
gene_pair_cats["pearson_high_and_ccc_spearman_low"] = df_plot.loc[
    df_plot["Pearson (high)"]
    & df_plot["Spearman (low)"]
    & df_plot["Clustermatch (low)"]
]
display(gene_pair_cats["pearson_high_and_ccc_spearman_low"].shape)

(531, 9)

In [20]:
# Sanity check: exactly eight categories were registered above (guards against
# a cell being skipped or a key being overwritten by a typo).
assert len(gene_pair_cats) == 8

# Sample gene pairs

In [21]:
# Upper bound on the number of gene pairs drawn from each category.
MAX_SAMPLES_PER_CATEGORY = 100

# Maps a category name to a DataFrame holding the gene pairs sampled from it.
gene_pair_samples = {}

for cat_name, cat_pairs in gene_pair_cats.items():
    # Categories with fewer pairs than the cap are taken in full: sampling
    # without replacement cannot return more rows than are available.
    n_samples = min(MAX_SAMPLES_PER_CATEGORY, cat_pairs.shape[0])

    # RANDOM_STATE is a shared, stateful generator, so the rows drawn depend on
    # the order in which the categories are visited, and re-running this cell
    # on its own continues from the generator's current state.
    gene_pair_samples[cat_name] = cat_pairs.sample(
        n=n_samples, replace=False, random_state=RANDOM_STATE
    )

    display(f"{cat_name}: {gene_pair_samples[cat_name].shape}")

'all_high: (100, 9)'

'all_low: (100, 9)'

'ccc_spearman_high_and_pearson_low: (100, 9)'

'ccc_high_and_pearson_low: (100, 9)'

'ccc_high_and_spearman_low: (28, 9)'

'ccc_high_and_spearman_pearson_low: (8, 9)'

'pearson_high_and_ccc_low: (100, 9)'

'pearson_high_and_ccc_spearman_low: (100, 9)'

# Include gene pairs mentioned in the paper

In [22]:
# Gene pairs mentioned in the paper, given as tuples of versioned ENSG
# identifiers; the comment above each tuple gives the corresponding gene
# symbols. The tuples are used below to look rows up in `df_plot`.
selected_gene_pairs = [
    # ('SDS', 'IFNG')
    ("ENSG00000135094.10", "ENSG00000111537.4"),
    # ('APOC1', 'JUN')
    ("ENSG00000130208.9", "ENSG00000177606.6"),
    # ('CCL18', 'ZDHHC12')
    ("ENSG00000275385.1", "ENSG00000160446.18"),
    # ('KDM6A', 'UTY')
    ("ENSG00000147050.14", "ENSG00000183878.15"),
    # ('CYTIP', 'RASSF2')
    ("ENSG00000115165.9", "ENSG00000101265.15"),
    # ('KLHL21', 'AC068580.6')
    ("ENSG00000162413.16", "ENSG00000235027.1"),
    # ('TNNI2', 'MYOZ1')
    ("ENSG00000130598.15", "ENSG00000177791.11"),
    # ('TPM2', 'PYGM')
    ("ENSG00000198467.13", "ENSG00000068976.13"),
]

In [23]:
# Add the manually selected pairs as an extra entry next to the sampled
# categories. The lookup is by index label.
# NOTE(review): presumably `df_plot` has a two-level (gene, gene) index, as the
# displayed rows suggest, so each tuple must be in the same order as the index
# stores the pair - confirm.
gene_pair_samples["selected_in_manuscript"] = df_plot.loc[selected_gene_pairs]
display(gene_pair_samples["selected_in_manuscript"].shape)

(8, 9)

In [24]:
# Show the flags and coefficient values of the manually selected pairs.
gene_pair_samples["selected_in_manuscript"]

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc,pearson,spearman
ENSG00000135094.10,ENSG00000111537.4,False,True,True,False,True,False,0.706993,0.090451,0.765177
ENSG00000130208.9,ENSG00000177606.6,False,True,True,False,True,False,0.612233,0.084623,0.748265
ENSG00000275385.1,ENSG00000160446.18,False,True,False,False,True,False,0.446659,0.099853,0.560171
ENSG00000147050.14,ENSG00000183878.15,False,False,False,True,True,False,0.294391,0.23987,0.100621
ENSG00000115165.9,ENSG00000101265.15,False,False,False,True,True,False,0.201962,0.15606,0.107882
ENSG00000162413.16,ENSG00000235027.1,False,True,False,True,True,False,0.184217,0.062875,0.186421
ENSG00000130598.15,ENSG00000177791.11,True,False,False,False,False,True,0.034593,0.967834,0.284206
ENSG00000198467.13,ENSG00000068976.13,True,False,False,True,False,True,0.034912,0.94443,0.029852


# Save

In [25]:
# Destination pickle for the sampled gene pairs.
output_file = OUTPUT_DIR / "gene_pair-samples.pkl"

In [26]:
# Persist the whole dict (category name -> DataFrame of gene pairs) as a
# single pickle file.
pd.to_pickle(gene_pair_samples, output_file)