# Description

It samples gene pairs from the categories in Figure 3b.

# Modules loading

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import pandas as pd

from ccc import conf

# Settings

In [2]:
# GTEx configuration taken from `ccc.conf`; indexed below to get input/output directories
DATASET_CONFIG = conf.GTEX
# tissue and gene selection strategy; both are interpolated into the input file names
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"
# upper bound on the number of gene pairs taken for each sampled subcategory
N_MAX_SAMPLES_PER_CATEGORY = 500

# shared, seeded random generator passed to the sampling calls below
RANDOM_STATE = np.random.RandomState(0)

# Paths

In [3]:
# input file whose name is built from the tissue and the gene selection strategy
INPUT_GENE_EXPR_FILE = DATASET_CONFIG["GENE_SELECTION_DIR"].joinpath(
    f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_EXPR_FILE)

# fail early if the file is not where it is expected
assert INPUT_GENE_EXPR_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [4]:
# gene pair intersections file for the same tissue and gene selection strategy
INPUT_GENE_PAIRS_INTERSECTIONS_FILE = DATASET_CONFIG[
    "GENE_PAIR_INTERSECTIONS"
].joinpath(
    f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

# fail early if the file is not where it is expected
assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')

In [5]:
# directory where the sampled gene pairs are saved; created if it does not exist yet
OUTPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"].joinpath("pvalues")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [6]:
# show the resolved output directory
OUTPUT_DIR

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues')

# Load gene pairs intersection

In [7]:
# load the gene pair intersections table
# NOTE: unpickling can execute arbitrary code; only read files produced by this project's own pipeline
df_plot = pd.read_pickle(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

In [8]:
# (number of rows, number of columns) of the loaded table
df_plot.shape

(12497500, 9)

In [9]:
# preview: boolean "(high)"/"(low)" flags per coefficient plus the three coefficient values
df_plot.head()

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc,pearson,spearman
ENSG00000000419.12,ENSG00000002834.17,True,False,True,False,True,False,0.418721,0.681847,0.786595
ENSG00000000419.12,ENSG00000002919.14,True,False,True,False,True,False,0.40509,0.734699,0.816991
ENSG00000000419.12,ENSG00000002933.7,False,True,False,True,False,True,0.007466,0.013825,0.004128
ENSG00000000419.12,ENSG00000003402.19,True,False,True,False,True,False,0.391683,0.727347,0.803653
ENSG00000000419.12,ENSG00000004478.7,False,True,False,False,False,False,0.099013,0.094147,0.231269


In [10]:
# gene pairs flagged "Spearman (high)" and "Pearson (low)" for which neither
# Clustermatch flag ("high" or "low") is set
df_plot[
    (df_plot["Spearman (high)"] & df_plot["Pearson (low)"])
    & ~(df_plot["Clustermatch (high)"] | df_plot["Clustermatch (low)"])
]

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc,pearson,spearman
ENSG00000003402.19,ENSG00000133800.8,False,True,True,False,False,False,0.157825,0.048650,0.570855
ENSG00000003756.16,ENSG00000163739.4,False,True,True,False,False,False,0.164233,0.040470,0.565410
ENSG00000004776.12,ENSG00000196616.13,False,True,True,False,False,False,0.159947,0.097615,0.568500
ENSG00000008083.13,ENSG00000163739.4,False,True,True,False,False,False,0.159947,0.069861,0.571621
ENSG00000008988.9,ENSG00000229769.2,False,True,True,False,False,False,0.151281,0.091727,0.568723
...,...,...,...,...,...,...,...,...,...,...
ENSG00000278274.1,ENSG00000169136.10,False,True,True,False,False,False,0.172974,0.076674,0.569907
ENSG00000279520.1,ENSG00000163739.4,False,True,True,False,False,False,0.168576,0.061147,0.584956
ENSG00000281358.1,ENSG00000139194.7,False,True,True,False,False,False,0.177429,0.100793,0.615723
ENSG00000281649.1,ENSG00000163739.4,False,True,True,False,False,False,0.177429,0.080855,0.574391


# Select gene pairs from each category in Figure 3b

In [11]:
# maps a category name to the subset of `df_plot` rows that belong to that category
gene_pair_cats = {}

In [12]:
# gene pairs whose three "(high)" flags are all True
_all_high_mask = (
    df_plot["Pearson (high)"]
    & df_plot["Spearman (high)"]
    & df_plot["Clustermatch (high)"]
)
gene_pair_cats["all_high"] = df_plot.loc[_all_high_mask]
display(gene_pair_cats["all_high"].shape)

(3120576, 9)

In [13]:
# gene pairs whose three "(low)" flags are all True
_all_low_mask = (
    df_plot["Pearson (low)"]
    & df_plot["Spearman (low)"]
    & df_plot["Clustermatch (low)"]
)
gene_pair_cats["all_low"] = df_plot.loc[_all_low_mask]
display(gene_pair_cats["all_low"].shape)

(2545332, 9)

In [14]:
# "Clustermatch (high)" and "Spearman (high)" are set while "Pearson (low)" is set
_ccc_spearman_high = df_plot["Clustermatch (high)"] & df_plot["Spearman (high)"]
gene_pair_cats["ccc_spearman_high_and_pearson_low"] = df_plot.loc[
    _ccc_spearman_high & df_plot["Pearson (low)"]
]
display(gene_pair_cats["ccc_spearman_high_and_pearson_low"].shape)

(9748, 9)

In [15]:
# "Clustermatch (high)" and "Pearson (low)" are set, and neither Spearman flag is set
_spearman_unflagged = ~(df_plot["Spearman (high)"] | df_plot["Spearman (low)"])
gene_pair_cats["ccc_high_and_pearson_low"] = df_plot.loc[
    df_plot["Clustermatch (high)"] & _spearman_unflagged & df_plot["Pearson (low)"]
]
display(gene_pair_cats["ccc_high_and_pearson_low"].shape)

(20951, 9)

In [16]:
# "Clustermatch (high)" and "Spearman (low)" are set, and neither Pearson flag is set
_pearson_unflagged = ~(df_plot["Pearson (low)"] | df_plot["Pearson (high)"])
gene_pair_cats["ccc_high_and_spearman_low"] = df_plot.loc[
    df_plot["Clustermatch (high)"] & df_plot["Spearman (low)"] & _pearson_unflagged
]
display(gene_pair_cats["ccc_high_and_spearman_low"].shape)

(28, 9)

In [17]:
# "Clustermatch (high)" is set while both "Spearman (low)" and "Pearson (low)" are set
_spearman_pearson_low = df_plot["Spearman (low)"] & df_plot["Pearson (low)"]
gene_pair_cats["ccc_high_and_spearman_pearson_low"] = df_plot.loc[
    df_plot["Clustermatch (high)"] & _spearman_pearson_low
]
display(gene_pair_cats["ccc_high_and_spearman_pearson_low"].shape)

(8, 9)

In [18]:
# "Pearson (high)" and "Clustermatch (low)" are set, and neither Spearman flag is set
_spearman_not_flagged = ~(df_plot["Spearman (low)"] | df_plot["Spearman (high)"])
gene_pair_cats["pearson_high_and_ccc_low"] = df_plot.loc[
    df_plot["Clustermatch (low)"] & _spearman_not_flagged & df_plot["Pearson (high)"]
]
display(gene_pair_cats["pearson_high_and_ccc_low"].shape)

(1075, 9)

In [19]:
# "Pearson (high)" is set while both "Clustermatch (low)" and "Spearman (low)" are set
_ccc_spearman_low = df_plot["Clustermatch (low)"] & df_plot["Spearman (low)"]
gene_pair_cats["pearson_high_and_ccc_spearman_low"] = df_plot.loc[
    _ccc_spearman_low & df_plot["Pearson (high)"]
]
display(gene_pair_cats["pearson_high_and_ccc_spearman_low"].shape)

(531, 9)

In [20]:
# guard against a category cell being skipped or a key being overwritten
assert len(gene_pair_cats) == 8

# Sample gene pairs

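Here I take all the categories defined above (the keys of `gene_pair_cats`) and create up to four subcategories for each: the top gene pairs ranked by each of the three coefficients, plus a random sample of gene pairs that were not already selected (skipped when fewer than `n` unselected pairs remain).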

In [21]:
# prepare weights for sampling, where I will put zeros on already sampled gene pairs
# prepare weights for sampling, where I will put zeros on already sampled gene pairs.
# Every gene pair starts with weight 1.0. The Series is built directly from the
# index of `df_plot` so the result is always a Series named "weight" (squeezing a
# one-column frame would collapse to a scalar if the frame had a single row).
gene_pairs_weights = pd.Series(
    1.0, index=df_plot.index, name="weight"
).sort_index()

In [22]:
# show the initial sampling weights (one value per gene pair)
gene_pairs_weights

ENSG00000000419.12  ENSG00000002834.17    1.0
                    ENSG00000002919.14    1.0
                    ENSG00000002933.7     1.0
                    ENSG00000003402.19    1.0
                    ENSG00000004478.7     1.0
                                         ... 
ENSG00000284574.1   ENSG00000282651.2     1.0
                    ENSG00000282780.1     1.0
                    ENSG00000282939.1     1.0
                    ENSG00000283063.1     1.0
                    ENSG00000283602.1     1.0
Name: weight, Length: 12497500, dtype: float64

In [23]:
# sanity check: weighted sampling without replacement works with the weights above.
# A fixed integer seed is used here (instead of RANDOM_STATE) so that this preview is
# reproducible without advancing the generator used for the real sampling below.
_tmp = df_plot.sample(
    n=10, replace=False, weights=gene_pairs_weights, random_state=0
)
assert _tmp.shape[0] == 10

display(_tmp)

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc,pearson,spearman
ENSG00000160808.9,ENSG00000160712.12,False,True,False,True,False,True,0.005865,0.069605,0.014116
ENSG00000187257.15,ENSG00000129538.13,False,False,False,False,False,False,0.041353,0.195403,0.328354
ENSG00000196576.14,ENSG00000223648.4,False,False,False,False,False,False,0.0601,0.244487,0.309536
ENSG00000167984.17,ENSG00000169583.12,True,False,True,False,True,False,0.245699,0.590416,0.676359
ENSG00000182118.7,ENSG00000197622.12,False,False,True,False,True,False,0.210187,0.396896,0.614692
ENSG00000134698.10,ENSG00000255328.1,True,False,True,False,True,False,0.267229,0.624077,0.669723
ENSG00000106546.12,ENSG00000224137.1,False,True,False,True,False,True,0.004956,0.035826,0.083771
ENSG00000145022.4,ENSG00000147604.13,False,True,False,True,False,True,0.004948,0.043008,0.104607
ENSG00000153310.19,ENSG00000224650.2,False,True,False,False,False,False,0.037414,0.032336,0.212447
ENSG00000139218.17,ENSG00000169087.10,True,False,True,False,True,False,0.355974,0.735773,0.786861


In [24]:
# maps "<category>-top_<coef>" / "<category>-random" to the sampled gene pairs
gene_pair_samples = {}

# NOTE: this cell mutates `gene_pairs_weights` and advances RANDOM_STATE, so it is
# not idempotent: re-run the notebook from the top to reproduce the same samples.
for k, df in gene_pair_cats.items():
    # sample at most N_MAX_SAMPLES_PER_CATEGORY gene pairs per subcategory
    # (`n` keeps its plain name because a later cell reads it)
    n = min(N_MAX_SAMPLES_PER_CATEGORY, df.shape[0])

    # top gene pairs according to each coefficient
    for coef in ("ccc", "pearson", "spearman"):
        new_k = f"{k}-top_{coef}"

        sample_n = df.sort_values(coef, ascending=False).head(n)

        # when taking the top gene pairs by a coefficient, I do not remove repeated ones

        # do not sample these gene pairs again
        gene_pairs_weights.loc[sample_n.index] = 0.0

        gene_pair_samples[new_k] = sample_n

        display(f"{new_k}: {gene_pair_samples[new_k].shape}")

    # random gene pairs among those not selected so far (weight still > 0)
    new_k = f"{k}-random"

    # skip the random subcategory when fewer than n gene pairs of this category
    # are still available (i.e. not already sampled)
    df_weights = gene_pairs_weights.loc[df.index]
    if (df_weights > 0).sum() < n:
        display(f"  WARNING: {new_k}: none selected")
        continue

    # the weights restricted to this category are passed directly: pandas would
    # otherwise re-align the full weights Series to `df` and get the same values
    sample_n = df.sample(
        n=n,
        replace=False,
        random_state=RANDOM_STATE,
        weights=df_weights,
    )

    # do not sample these gene pairs again
    gene_pairs_weights.loc[sample_n.index] = 0.0

    gene_pair_samples[new_k] = sample_n

    display(f"{new_k}: {gene_pair_samples[new_k].shape}")

'all_high-top_ccc: (500, 9)'

'all_high-top_pearson: (500, 9)'

'all_high-top_spearman: (500, 9)'

'all_high-random: (500, 9)'

'all_low-top_ccc: (500, 9)'

'all_low-top_pearson: (500, 9)'

'all_low-top_spearman: (500, 9)'

'all_low-random: (500, 9)'

'ccc_spearman_high_and_pearson_low-top_ccc: (500, 9)'

'ccc_spearman_high_and_pearson_low-top_pearson: (500, 9)'

'ccc_spearman_high_and_pearson_low-top_spearman: (500, 9)'

'ccc_spearman_high_and_pearson_low-random: (500, 9)'

'ccc_high_and_pearson_low-top_ccc: (500, 9)'

'ccc_high_and_pearson_low-top_pearson: (500, 9)'

'ccc_high_and_pearson_low-top_spearman: (500, 9)'

'ccc_high_and_pearson_low-random: (500, 9)'

'ccc_high_and_spearman_low-top_ccc: (28, 9)'

'ccc_high_and_spearman_low-top_pearson: (28, 9)'

'ccc_high_and_spearman_low-top_spearman: (28, 9)'



'ccc_high_and_spearman_pearson_low-top_ccc: (8, 9)'

'ccc_high_and_spearman_pearson_low-top_pearson: (8, 9)'

'ccc_high_and_spearman_pearson_low-top_spearman: (8, 9)'



'pearson_high_and_ccc_low-top_ccc: (500, 9)'

'pearson_high_and_ccc_low-top_pearson: (500, 9)'

'pearson_high_and_ccc_low-top_spearman: (500, 9)'



'pearson_high_and_ccc_spearman_low-top_ccc: (500, 9)'

'pearson_high_and_ccc_spearman_low-top_pearson: (500, 9)'

'pearson_high_and_ccc_spearman_low-top_spearman: (500, 9)'



# Include gene pairs mentioned in the paper

In [25]:
# Gene pairs highlighted in the manuscript, as tuples of gene IDs used below to
# index the rows of `df_plot`. The comment above each tuple presumably gives the
# corresponding gene symbols -- TODO confirm against the manuscript.
selected_gene_pairs = [
    # ('SDS', 'IFNG')
    ("ENSG00000135094.10", "ENSG00000111537.4"),
    # ('APOC1', 'JUN')
    ("ENSG00000130208.9", "ENSG00000177606.6"),
    # ('CCL18', 'ZDHHC12')
    ("ENSG00000275385.1", "ENSG00000160446.18"),
    # ('KDM6A', 'UTY')
    ("ENSG00000147050.14", "ENSG00000183878.15"),
    # ('CYTIP', 'RASSF2')
    ("ENSG00000115165.9", "ENSG00000101265.15"),
    # ('KLHL21', 'AC068580.6')
    ("ENSG00000162413.16", "ENSG00000235027.1"),
    # ('TNNI2', 'MYOZ1')
    ("ENSG00000130598.15", "ENSG00000177791.11"),
    # ('TPM2', 'PYGM')
    ("ENSG00000198467.13", "ENSG00000068976.13"),
]

In [26]:
# rows of the full table for the gene pairs listed in `selected_gene_pairs`
_manuscript_pairs = df_plot.loc[selected_gene_pairs]
gene_pair_samples["selected_in_manuscript"] = _manuscript_pairs
display(_manuscript_pairs.shape)

(8, 9)

In [27]:
# show the flags and coefficient values of the manuscript gene pairs
gene_pair_samples["selected_in_manuscript"]

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc,pearson,spearman
ENSG00000135094.10,ENSG00000111537.4,False,True,True,False,True,False,0.706993,0.090451,0.765177
ENSG00000130208.9,ENSG00000177606.6,False,True,True,False,True,False,0.612233,0.084623,0.748265
ENSG00000275385.1,ENSG00000160446.18,False,True,False,False,True,False,0.446659,0.099853,0.560171
ENSG00000147050.14,ENSG00000183878.15,False,False,False,True,True,False,0.294391,0.23987,0.100621
ENSG00000115165.9,ENSG00000101265.15,False,False,False,True,True,False,0.201962,0.15606,0.107882
ENSG00000162413.16,ENSG00000235027.1,False,True,False,True,True,False,0.184217,0.062875,0.186421
ENSG00000130598.15,ENSG00000177791.11,True,False,False,False,False,True,0.034593,0.967834,0.284206
ENSG00000198467.13,ENSG00000068976.13,True,False,False,True,False,True,0.034912,0.94443,0.029852


# Include a random sample across the entire dataset

This includes all possible gene pairs from the top 5k genes initially selected, not the filtered list derived from the intersections.

## Load all correlations

In [28]:
# path template (directory + file name with placeholders) of the similarity matrices
INPUT_CORR_FILE_TEMPLATE = DATASET_CONFIG["SIMILARITY_MATRICES_DIR"].joinpath(
    DATASET_CONFIG["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
)
display(INPUT_CORR_FILE_TEMPLATE)

PosixPath('/opt/data/results/gtex_v8/similarity_matrices/gtex_v8_data_{tissue}-{gene_sel_strategy}-{corr_method}.pkl')

In [29]:
# INPUT_CORR_FILE_TEMPLATE already contains the directory, so only the placeholders
# of its file name are filled in here. Joining the directory a second time gave the
# right path only because the template path is absolute.
INPUT_CORR_FILE = INPUT_CORR_FILE_TEMPLATE.with_name(
    INPUT_CORR_FILE_TEMPLATE.name.format(
        tissue=GTEX_TISSUE,
        gene_sel_strategy=GENE_SEL_STRATEGY,
        corr_method="all",
    )
)
display(INPUT_CORR_FILE)

# fail early if the file is missing, as done for the other input files
assert INPUT_CORR_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/similarity_matrices/gtex_v8_data_whole_blood-var_pc_log2-all.pkl')

In [30]:
# load the coefficient values for all gene pairs
# NOTE: this rebinds `df`, which the sampling loop above also used for each category
# NOTE: unpickling can execute arbitrary code; only read files produced by this project's own pipeline
df = pd.read_pickle(INPUT_CORR_FILE)

In [31]:
# (number of rows, number of columns) of the correlations table
df.shape

(12497500, 3)

In [32]:
# preview of the ccc / pearson / spearman columns
df.head()

Unnamed: 0,Unnamed: 1,ccc,pearson,spearman
ENSG00000000419.12,ENSG00000002834.17,0.418721,0.681847,0.786595
ENSG00000000419.12,ENSG00000002919.14,0.40509,0.734699,0.816991
ENSG00000000419.12,ENSG00000002933.7,0.007466,0.013825,0.004128
ENSG00000000419.12,ENSG00000003402.19,0.391683,0.727347,0.803653
ENSG00000000419.12,ENSG00000004478.7,0.099013,0.094147,0.231269


## Select 2n here (double)

In [33]:
# sample twice the per-category maximum from the entire dataset. The configured
# constant is used instead of `n`, which is only whatever value the last iteration
# of the sampling loop left behind (and would shrink if that category were small).
sample_n = df.sample(
    n=2 * N_MAX_SAMPLES_PER_CATEGORY,
    replace=False,
    random_state=RANDOM_STATE,
)

new_k = "entire_dataset-random"
gene_pair_samples[new_k] = sample_n

In [34]:
# (number of sampled gene pairs, number of columns)
gene_pair_samples[new_k].shape

(1000, 3)

In [35]:
# show the gene pairs sampled from the entire dataset
gene_pair_samples[new_k]

Unnamed: 0,Unnamed: 1,ccc,pearson,spearman
ENSG00000150527.16,ENSG00000188895.11,0.579461,0.862097,0.912108
ENSG00000175895.3,ENSG00000222037.5,0.007740,0.034657,0.032004
ENSG00000213866.3,ENSG00000242887.1,0.034012,0.156495,0.270456
ENSG00000170542.5,ENSG00000230202.1,0.089190,0.110683,0.413265
ENSG00000211779.3,ENSG00000253616.5,0.024536,0.075756,0.073756
...,...,...,...,...
ENSG00000129347.19,ENSG00000132694.18,0.021054,0.031069,0.000816
ENSG00000059728.10,ENSG00000205364.3,0.017633,0.121704,0.182822
ENSG00000152520.13,ENSG00000148965.9,0.009905,0.057705,0.099234
ENSG00000164751.14,ENSG00000173575.20,0.045276,0.217614,0.307587


# Save

In [36]:
# destination of the pickled dictionary with all the samples
output_file = OUTPUT_DIR.joinpath("gene_pair-samples.pkl")

In [37]:
# save the whole dictionary (sample name -> DataFrame of gene pairs) as a single pickle
pd.to_pickle(gene_pair_samples, output_file)