# Description

Creates **Supplementary File 1**.

*Description*: Classification and correlations of gene pairs used in Figure 3a (top 5,000 most variable genes in GTEx v8 whole blood). P-values are only included for a subset of gene pairs, as explained in the Methods section of the manuscript.

# Modules

In [1]:
import pandas as pd
import numpy as np
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

from ccc import conf

In [2]:
# handle to R's `readRDS` function, used below to load the saved .rds file again
readRDS = ro.r["readRDS"]

In [3]:
# handle to R's `saveRDS` function, used below to write the .rds file
saveRDS = ro.r["saveRDS"]

# Settings

In [4]:
# GTEx section of the project configuration; input paths are read from it
DATASET_CONFIG = conf.GTEX
# tissue and gene selection strategy: both are interpolated into the name of
# the gene pair intersections input file
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

# Paths

In [5]:
# stop early if the manuscript directory is not configured or does not exist
_manuscript_dir = conf.MANUSCRIPT["BASE_DIR"]
assert (
    _manuscript_dir is not None and _manuscript_dir.exists()
), "Manuscript dir not set"

In [10]:
# the file name encodes the tissue and the gene selection strategy
_intersections_filename = (
    f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (
    DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"] / _intersections_filename
)
display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')

In [11]:
# p-values are stored in a "pvalues" subfolder of the intersections directory
INPUT_PVALUES_FILE = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"].joinpath(
    "pvalues", "gene_pair-samples-pvalues-fdr.pkl"
)
display(INPUT_PVALUES_FILE)
assert INPUT_PVALUES_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues/gene_pair-samples-pvalues-fdr.pkl')

In [7]:
# directory where the supplementary file is written; created if it is missing
OUTPUT_DIR = conf.MANUSCRIPT["SUPPLEMENTARY_MATERIAL_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/supplementary_material')

In [8]:
# base name of the output files; the extension of each format (.pkl.gz, .rds,
# .tsv.gz) is appended when saving
OUTPUT_FILENAME = "Supplementary_File_01-Gene_pair_intersections"

# Data

## Gene Ensembl ID -> Symbol mapping

In [12]:
# table with the Ensembl ID / gene symbol mappings for GTEx
_gene_map_file = DATASET_CONFIG["DATA_DIR"] / "gtex_gene_id_symbol_mappings.pkl"
gene_map = pd.read_pickle(_gene_map_file)

In [13]:
# Ensembl ID -> symbol lookup
# NOTE: this rebinds `gene_map` from a DataFrame to a dict, so re-running this
# cell without reloading the table first fails
gene_map = dict(zip(gene_map["gene_ens_id"], gene_map["gene_symbol"]))

In [14]:
# sanity check: a known Ensembl ID must map to its expected gene symbol
assert gene_map["ENSG00000145309.5"] == "CABS1"

## Gene pairs intersection

In [18]:
# load the intersections, name the two index levels and sort by gene pair
_intersections_raw = pd.read_pickle(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)
gene_pair_intersections = _intersections_raw.rename_axis(
    ("gene0_id", "gene1_id")
).sort_index()
# do not keep the unsorted frame alive
del _intersections_raw

In [19]:
# (number of gene pairs, number of columns)
gene_pair_intersections.shape

(12497500, 9)

In [20]:
# preview the first rows
gene_pair_intersections.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc,pearson,spearman
gene0_id,gene1_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSG00000000419.12,ENSG00000002834.17,True,False,True,False,True,False,0.418721,0.681847,0.786595
ENSG00000000419.12,ENSG00000002919.14,True,False,True,False,True,False,0.40509,0.734699,0.816991
ENSG00000000419.12,ENSG00000002933.7,False,True,False,True,False,True,0.007466,0.013825,0.004128
ENSG00000000419.12,ENSG00000003402.19,True,False,True,False,True,False,0.391683,0.727347,0.803653
ENSG00000000419.12,ENSG00000004478.7,False,True,False,False,False,False,0.099013,0.094147,0.231269


## p-values

In [21]:
# load the p-values using the same index level names and ordering as the
# gene pair intersections
_pvalues_raw = pd.read_pickle(INPUT_PVALUES_FILE)
df_pvalues = _pvalues_raw.rename_axis(("gene0_id", "gene1_id")).sort_index()
del _pvalues_raw

In [22]:
# (number of rows, number of columns), before removing duplicated gene pairs
df_pvalues.shape

(12116, 10)

In [23]:
# preview the first rows
df_pvalues.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,group,ccc,ccc_fdr,ccc_pvalue,pearson,pearson_fdr,pearson_pvalue,spearman,spearman_fdr,spearman_pvalue
gene0_id,gene1_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000000938.12,ENSG00000128342.4,ccc_high_and_pearson_low-top_pearson,0.2327,1e-06,9.99999e-07,-0.116906,0.002597,0.001291,-0.473951,2.7873200000000003e-43,1.534672e-43
ENSG00000000938.12,ENSG00000249138.1,ccc_high_and_pearson_low-random,0.198172,1e-06,9.99999e-07,-0.093076,0.015275,0.010503,-0.308136,7.175155e-18,4.538519e-18
ENSG00000001167.14,ENSG00000119862.12,all_low-top_ccc,0.034955,1e-06,9.99999e-07,0.024366,0.532093,0.50382,0.091169,0.0137764,0.0122058
ENSG00000001167.14,ENSG00000131475.6,all_low-top_pearson,0.028636,4e-06,2.999997e-06,-0.117799,0.002597,0.001184,-0.046469,0.2136647,0.2021593
ENSG00000001561.6,ENSG00000086730.16,all_low-random,0.01855,0.000149,0.0001289999,-0.002443,0.951313,0.946568,0.059267,0.1116711,0.1036914


In [24]:
# keep only the first row of each (gene0_id, gene1_id) pair
_is_repeated_pair = df_pvalues.index.duplicated(keep="first")
df_pvalues = df_pvalues.loc[~_is_repeated_pair]

**Note**: Here the "group" column specifies the category in Figure 3a followed by a suffix such as `top_[coef]`, which means that, within that category, I sorted gene pairs by `coef` (a `random` suffix also appears, e.g., `all_low-random`). This allows me, for instance, to take the gene pairs where Pearson is high and CCC is low, and sort them by either of those coefficient values.

# Combine data

In [25]:
# coefficient columns get a "_coef" suffix so they read consistently next to
# the "_fdr" columns taken from the p-values table
_coef_renames = {
    "ccc": "ccc_coef",
    "pearson": "pearson_coef",
    "spearman": "spearman_coef",
}
_fdr_cols = ["ccc_fdr", "pearson_fdr", "spearman_fdr"]

# left join: every gene pair is kept; pairs without p-values get NaN
df_full = gene_pair_intersections.rename(columns=_coef_renames).join(
    df_pvalues[_fdr_cols], how="left"
)

In [26]:
# the left join must not add or drop gene pairs
_n_rows_before, _n_rows_after = len(gene_pair_intersections), len(df_full)
assert _n_rows_before == _n_rows_after
display(df_full.shape)

(12497500, 12)

In [27]:
# preview: the *_fdr columns are NaN for gene pairs without a match in df_pvalues
df_full.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc_coef,pearson_coef,spearman_coef,ccc_fdr,pearson_fdr,spearman_fdr
gene0_id,gene1_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ENSG00000000419.12,ENSG00000002834.17,True,False,True,False,True,False,0.418721,0.681847,0.786595,,,
ENSG00000000419.12,ENSG00000002919.14,True,False,True,False,True,False,0.40509,0.734699,0.816991,,,
ENSG00000000419.12,ENSG00000002933.7,False,True,False,True,False,True,0.007466,0.013825,0.004128,,,
ENSG00000000419.12,ENSG00000003402.19,True,False,True,False,True,False,0.391683,0.727347,0.803653,,,
ENSG00000000419.12,ENSG00000004478.7,False,True,False,False,False,False,0.099013,0.094147,0.231269,,,


## Add gene symbols

In [28]:
# Look up the symbols from each index level directly instead of using a
# row-wise `DataFrame.apply(axis=1)`: the latter builds a Series for every one
# of the ~12.5M rows (twice) just to read `x.name`. A plain dict lookup keeps
# the original behavior of raising KeyError if an Ensembl ID has no symbol.
_gene0_ids = df_full.index.get_level_values(0)
_gene1_ids = df_full.index.get_level_values(1)

df_full = df_full.assign(
    gene0_symbol=[gene_map[_gene_id] for _gene_id in _gene0_ids],
    gene1_symbol=[gene_map[_gene_id] for _gene_id in _gene1_ids],
)

In [29]:
# shape after adding the gene0_symbol and gene1_symbol columns
df_full.shape

(12497500, 14)

In [30]:
# reorder columns
col_name = "gene1_symbol"
col = df_full.pop(col_name)
df_full.insert(0, col_name, col)

col_name = "gene0_symbol"
col = df_full.pop(col_name)
df_full.insert(0, col_name, col)

In [31]:
# preview after moving the symbol columns to the front
df_full.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,gene0_symbol,gene1_symbol,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc_coef,pearson_coef,spearman_coef,ccc_fdr,pearson_fdr,spearman_fdr
gene0_id,gene1_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ENSG00000000419.12,ENSG00000002834.17,DPM1,LASP1,True,False,True,False,True,False,0.418721,0.681847,0.786595,,,
ENSG00000000419.12,ENSG00000002919.14,DPM1,SNX11,True,False,True,False,True,False,0.40509,0.734699,0.816991,,,
ENSG00000000419.12,ENSG00000002933.7,DPM1,TMEM176A,False,True,False,True,False,True,0.007466,0.013825,0.004128,,,
ENSG00000000419.12,ENSG00000003402.19,DPM1,CFLAR,True,False,True,False,True,False,0.391683,0.727347,0.803653,,,
ENSG00000000419.12,ENSG00000004478.7,DPM1,FKBP4,False,True,False,False,False,False,0.099013,0.094147,0.231269,,,


## Optimize DataFrame dtypes

In [32]:
# keep a reference to the frame before the optimizations so the result can be
# compared against it later (no copy is made here; `reset_index` below returns
# a new frame, which is the one modified afterwards)
df_full_orig = df_full

In [33]:
# memory per column (bytes) and total, before any optimization
_mem_bytes = df_full.memory_usage()
display(_mem_bytes)
display(f"{_mem_bytes.sum():,}")

Index                  320867034
gene0_symbol            99980000
gene1_symbol            99980000
Pearson (high)          12497500
Pearson (low)           12497500
Spearman (high)         12497500
Spearman (low)          12497500
Clustermatch (high)     12497500
Clustermatch (low)      12497500
ccc_coef                99980000
pearson_coef            99980000
spearman_coef           99980000
ccc_fdr                 99980000
pearson_fdr             99980000
spearman_fdr            99980000
dtype: int64

'1,195,692,034'

### Remove MultiIndex

A MultiIndex is not necessary for a supplementary file.

In [34]:
# index dtype and content while the gene pair is still stored in the index
display(df_full.index.dtype, df_full.index)

dtype('O')

MultiIndex([('ENSG00000000419.12', 'ENSG00000002834.17'),
            ('ENSG00000000419.12', 'ENSG00000002919.14'),
            ('ENSG00000000419.12',  'ENSG00000002933.7'),
            ('ENSG00000000419.12', 'ENSG00000003402.19'),
            ('ENSG00000000419.12',  'ENSG00000004478.7'),
            ('ENSG00000000419.12', 'ENSG00000004660.14'),
            ('ENSG00000000419.12',  'ENSG00000004799.7'),
            ('ENSG00000000419.12', 'ENSG00000004939.13'),
            ('ENSG00000000419.12', 'ENSG00000005020.12'),
            ('ENSG00000000419.12', 'ENSG00000005302.17'),
            ...
            ( 'ENSG00000284574.1',  'ENSG00000282416.1'),
            ( 'ENSG00000284574.1',  'ENSG00000282420.1'),
            ( 'ENSG00000284574.1',  'ENSG00000282499.1'),
            ( 'ENSG00000284574.1',  'ENSG00000282572.2'),
            ( 'ENSG00000284574.1',  'ENSG00000282639.1'),
            ( 'ENSG00000284574.1',  'ENSG00000282651.2'),
            ( 'ENSG00000284574.1',  'ENSG00000282780.1')

In [35]:
# turn the gene0_id/gene1_id index levels into regular columns; this returns a
# new frame, so `df_full_orig` keeps pointing to the MultiIndex version
df_full = df_full.reset_index()

In [36]:
# preview: gene IDs are now regular columns
df_full.head()

Unnamed: 0,gene0_id,gene1_id,gene0_symbol,gene1_symbol,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc_coef,pearson_coef,spearman_coef,ccc_fdr,pearson_fdr,spearman_fdr
0,ENSG00000000419.12,ENSG00000002834.17,DPM1,LASP1,True,False,True,False,True,False,0.418721,0.681847,0.786595,,,
1,ENSG00000000419.12,ENSG00000002919.14,DPM1,SNX11,True,False,True,False,True,False,0.40509,0.734699,0.816991,,,
2,ENSG00000000419.12,ENSG00000002933.7,DPM1,TMEM176A,False,True,False,True,False,True,0.007466,0.013825,0.004128,,,
3,ENSG00000000419.12,ENSG00000003402.19,DPM1,CFLAR,True,False,True,False,True,False,0.391683,0.727347,0.803653,,,
4,ENSG00000000419.12,ENSG00000004478.7,DPM1,FKBP4,False,True,False,False,False,False,0.099013,0.094147,0.231269,,,


In [37]:
# index dtype and content after resetting the index
display(df_full.index.dtype, df_full.index)

dtype('int64')

RangeIndex(start=0, stop=12497500, step=1)

In [38]:
# memory per column (bytes) and total, after removing the MultiIndex
_mem_bytes = df_full.memory_usage()
display(_mem_bytes)
display(f"{_mem_bytes.sum():,}")

Index                       128
gene0_id               99980000
gene1_id               99980000
gene0_symbol           99980000
gene1_symbol           99980000
Pearson (high)         12497500
Pearson (low)          12497500
Spearman (high)        12497500
Spearman (low)         12497500
Clustermatch (high)    12497500
Clustermatch (low)     12497500
ccc_coef               99980000
pearson_coef           99980000
spearman_coef          99980000
ccc_fdr                99980000
pearson_fdr            99980000
spearman_fdr           99980000
dtype: int64

'1,074,785,128'

### Downcast dtypes

In [39]:
# column dtypes before downcasting
df_full.dtypes

gene0_id                object
gene1_id                object
gene0_symbol            object
gene1_symbol            object
Pearson (high)            bool
Pearson (low)             bool
Spearman (high)           bool
Spearman (low)            bool
Clustermatch (high)       bool
Clustermatch (low)        bool
ccc_coef               float64
pearson_coef           float64
spearman_coef          float64
ccc_fdr                float64
pearson_fdr            float64
spearman_fdr           float64
dtype: object

In [40]:
# categorical values
for _col in ("gene0_id", "gene1_id", "gene0_symbol", "gene1_symbol"):
    df_full[_col] = df_full[_col].astype("category")

In [41]:
# column dtypes after converting gene IDs and symbols to categories
df_full.dtypes

gene0_id               category
gene1_id               category
gene0_symbol           category
gene1_symbol           category
Pearson (high)             bool
Pearson (low)              bool
Spearman (high)            bool
Spearman (low)             bool
Clustermatch (high)        bool
Clustermatch (low)         bool
ccc_coef                float64
pearson_coef            float64
spearman_coef           float64
ccc_fdr                 float64
pearson_fdr             float64
spearman_fdr            float64
dtype: object

In [42]:
# memory per column (bytes) and total, after the categorical conversion
_mem_bytes = df_full.memory_usage()
display(_mem_bytes)
display(f"{_mem_bytes.sum():,}")

Index                       128
gene0_id               25167128
gene1_id               25167128
gene0_symbol           25167112
gene1_symbol           25167112
Pearson (high)         12497500
Pearson (low)          12497500
Spearman (high)        12497500
Spearman (low)         12497500
Clustermatch (high)    12497500
Clustermatch (low)     12497500
ccc_coef               99980000
pearson_coef           99980000
spearman_coef          99980000
ccc_fdr                99980000
pearson_fdr            99980000
spearman_fdr           99980000
dtype: int64

'775,533,608'

In [43]:
# float
for _col in ("ccc_coef", "pearson_coef", "spearman_coef"):
    df_full[_col] = pd.to_numeric(df_full[_col], downcast="float")

In [44]:
# column dtypes after downcasting the coefficient columns
df_full.dtypes

gene0_id               category
gene1_id               category
gene0_symbol           category
gene1_symbol           category
Pearson (high)             bool
Pearson (low)              bool
Spearman (high)            bool
Spearman (low)             bool
Clustermatch (high)        bool
Clustermatch (low)         bool
ccc_coef                float32
pearson_coef            float32
spearman_coef           float32
ccc_fdr                 float64
pearson_fdr             float64
spearman_fdr            float64
dtype: object

In [45]:
# memory per column (bytes) and total, after downcasting the coefficients
_mem_bytes = df_full.memory_usage()
display(_mem_bytes)
display(f"{_mem_bytes.sum():,}")

Index                       128
gene0_id               25167128
gene1_id               25167128
gene0_symbol           25167112
gene1_symbol           25167112
Pearson (high)         12497500
Pearson (low)          12497500
Spearman (high)        12497500
Spearman (low)         12497500
Clustermatch (high)    12497500
Clustermatch (low)     12497500
ccc_coef               49990000
pearson_coef           49990000
spearman_coef          49990000
ccc_fdr                99980000
pearson_fdr            99980000
spearman_fdr           99980000
dtype: int64

'625,563,608'

### Check results

In [46]:
# shape of the optimized frame
df_full.shape

(12497500, 16)

In [47]:
# preview of the optimized frame
df_full.head()

Unnamed: 0,gene0_id,gene1_id,gene0_symbol,gene1_symbol,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc_coef,pearson_coef,spearman_coef,ccc_fdr,pearson_fdr,spearman_fdr
0,ENSG00000000419.12,ENSG00000002834.17,DPM1,LASP1,True,False,True,False,True,False,0.418721,0.681847,0.786595,,,
1,ENSG00000000419.12,ENSG00000002919.14,DPM1,SNX11,True,False,True,False,True,False,0.40509,0.734699,0.816991,,,
2,ENSG00000000419.12,ENSG00000002933.7,DPM1,TMEM176A,False,True,False,True,False,True,0.007466,0.013825,0.004128,,,
3,ENSG00000000419.12,ENSG00000003402.19,DPM1,CFLAR,True,False,True,False,True,False,0.391683,0.727347,0.803653,,,
4,ENSG00000000419.12,ENSG00000004478.7,DPM1,FKBP4,False,True,False,False,False,False,0.099013,0.094147,0.231269,,,


In [48]:
# testing
pd.testing.assert_frame_equal(
    df_full_orig.reset_index(),
    df_full,
    check_categorical=False,
    check_dtype=False,
)

In [49]:
# the pre-optimization frame is not used after the check above; drop the reference
del df_full_orig

# Save

In [50]:
# frame that is exported to every output format below (same object as df_full)
data = df_full

In [51]:
# reset index to avoid problems with MultiIndex in Pandas
if isinstance(data.index, pd.MultiIndex):
    display("MultiIndex")
    data = data.reset_index()

## Pickle

In [52]:
# compressed pickle output
_pickle_file = OUTPUT_DIR / f"{OUTPUT_FILENAME}.pkl.gz"
data.to_pickle(_pickle_file)

## RDS

In [53]:
# rds format
output_file = OUTPUT_DIR.joinpath(f"{OUTPUT_FILENAME}.rds")
display(output_file)

PosixPath('/opt/data/supplementary_material/Supplementary_File_01-Gene_pair_intersections.rds')

In [54]:
# convert the pandas DataFrame into an R object using the pandas converter
_pandas_to_r = ro.default_converter + pandas2ri.converter
with localconverter(_pandas_to_r):
    data_r = ro.conversion.py2rpy(data)

In [55]:
# preview the converted R object
data_r

gene0_id,gene1_id,gene0_symbol,...,ccc_fdr,pearson_fdr,spearman_fdr
ENSG0...,ENSG0...,DPM1,...,,,
ENSG0...,ENSG0...,DPM1,,,,
ENSG0...,ENSG0...,DPM1,,,,
ENSG0...,ENSG0...,DPM1,,,,
...,...,...,,...,...,...
ENSG0...,ENSG0...,MIR6787,,,,
ENSG0...,ENSG0...,MIR6787,,,,
ENSG0...,ENSG0...,MIR6787,,,,
ENSG0...,ENSG0...,MIR6787,,,,


In [56]:
# write the R object to the .rds file (the call returns R's NULL, which is
# what the cell output shows)
saveRDS(data_r, str(output_file))

<rpy2.rinterface_lib.sexp.NULLType object at 0x7330d5def940> [RTYPES.NILSXP]

In [57]:
# testing: load the rds file again
data_r = readRDS(str(output_file))

In [58]:
# convert the reloaded R object back into a pandas DataFrame
_r_to_pandas = ro.default_converter + pandas2ri.converter
with localconverter(_r_to_pandas):
    data_again = ro.conversion.rpy2py(data_r)

# cast the index to integers so it can be compared with the index of `data`
# (presumably the R row names come back as strings - confirm)
data_again.index = data_again.index.astype(int)

In [59]:
# shape of the frame reloaded from the .rds file
data_again.shape

(12497500, 16)

In [60]:
# preview of the frame reloaded from the .rds file
data_again.head()

Unnamed: 0,gene0_id,gene1_id,gene0_symbol,gene1_symbol,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc_coef,pearson_coef,spearman_coef,ccc_fdr,pearson_fdr,spearman_fdr
0,ENSG00000000419.12,ENSG00000002834.17,DPM1,LASP1,1,0,1,0,1,0,0.418721,0.681847,0.786595,,,
1,ENSG00000000419.12,ENSG00000002919.14,DPM1,SNX11,1,0,1,0,1,0,0.40509,0.734699,0.816991,,,
2,ENSG00000000419.12,ENSG00000002933.7,DPM1,TMEM176A,0,1,0,1,0,1,0.007466,0.013825,0.004128,,,
3,ENSG00000000419.12,ENSG00000003402.19,DPM1,CFLAR,1,0,1,0,1,0,0.391683,0.727347,0.803653,,,
4,ENSG00000000419.12,ENSG00000004478.7,DPM1,FKBP4,0,1,0,0,0,0,0.099013,0.094147,0.231269,,,


In [61]:
# testing
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_dtype=False,
)

## Text

In [62]:
# tsv format
output_file = OUTPUT_DIR / f"{OUTPUT_FILENAME}.tsv.gz"
display(output_file)

PosixPath('/opt/data/supplementary_material/Supplementary_File_01-Gene_pair_intersections.tsv.gz')

In [63]:
# write without the row index; floats use scientific notation with 5 decimals
data.to_csv(
    output_file,
    index=False,
    sep="\t",
    float_format="%.5e",
)

In [64]:
# testing
data2 = data.copy()
data2.index = list(range(0, data2.shape[0]))

data_again = pd.read_csv(output_file, sep="\t")
data_again.index = list(data_again.index)

In [65]:
# shape of the frame reloaded from the .tsv.gz file
data_again.shape

(12497500, 16)

In [66]:
# preview of the frame reloaded from the .tsv.gz file
data_again.head()

Unnamed: 0,gene0_id,gene1_id,gene0_symbol,gene1_symbol,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc_coef,pearson_coef,spearman_coef,ccc_fdr,pearson_fdr,spearman_fdr
0,ENSG00000000419.12,ENSG00000002834.17,DPM1,LASP1,True,False,True,False,True,False,0.418721,0.681847,0.786595,,,
1,ENSG00000000419.12,ENSG00000002919.14,DPM1,SNX11,True,False,True,False,True,False,0.40509,0.734699,0.816991,,,
2,ENSG00000000419.12,ENSG00000002933.7,DPM1,TMEM176A,False,True,False,True,False,True,0.007466,0.013825,0.004128,,,
3,ENSG00000000419.12,ENSG00000003402.19,DPM1,CFLAR,True,False,True,False,True,False,0.391683,0.727347,0.803653,,,
4,ENSG00000000419.12,ENSG00000004478.7,DPM1,FKBP4,False,True,False,False,False,False,0.099013,0.094147,0.231269,,,


In [67]:
# testing
pd.testing.assert_frame_equal(
    data2,
    data_again,
    check_categorical=False,
    check_dtype=False,
)