# Description

Creates **Supplementary File 2**.

*Description*: Percentiles for Pearson, Spearman and CCC computed on Supplementary File 1.

# Modules

In [1]:
import pandas as pd
import numpy as np
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

from ccc import conf

In [2]:
# R's `readRDS` function, looked up by name through rpy2's `ro.r` accessor.
readRDS = ro.r["readRDS"]

In [3]:
# R's `saveRDS` function, looked up by name through rpy2's `ro.r` accessor.
saveRDS = ro.r["saveRDS"]

# Settings

In [4]:
# Dataset configuration entry; the input directory is read from it further down.
DATASET_CONFIG = conf.GTEX
# Tissue and gene-selection labels; both are interpolated into the input file name.
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

# Paths

In [5]:
# Stop early when the manuscript directory is not configured or is missing on disk.
manuscript_base_dir = conf.MANUSCRIPT["BASE_DIR"]
assert (
    manuscript_base_dir is not None and manuscript_base_dir.exists()
), "Manuscript dir not set"

In [6]:
# File name encodes the tissue and the gene-selection strategy chosen in Settings.
_intersections_filename = (
    f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (
    DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"] / _intersections_filename
)
display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

# The notebook cannot proceed without this input.
assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')

In [7]:
# Every exported copy of the file is written under this directory.
OUTPUT_DIR = conf.MANUSCRIPT["SUPPLEMENTARY_MATERIAL_DIR"]
# Create the directory (and any missing parents); an existing directory is not an error.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/supplementary_material')

In [8]:
# Extension-less base name; each export cell appends its own suffix (.pkl.gz, .rds, .tsv).
OUTPUT_FILENAME = "Supplementary_File_02-Coefficients_percentiles_GTEx_whole_blood"

# Data

## Gene pairs intersection

In [9]:
# Load the gene-pair table, name its two index levels, then order rows by index.
# The same name is rebound at each step so no intermediate copy of the frame
# stays alive in the kernel.
# NOTE(review): unpickling runs arbitrary code; presumably this file is produced
# by the project's own pipeline - confirm before pointing at external files.
gene_pair_intersections = pd.read_pickle(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)
gene_pair_intersections = gene_pair_intersections.rename_axis(
    ("gene0_id", "gene1_id")
)
gene_pair_intersections = gene_pair_intersections.sort_index()

In [10]:
# (rows, columns) of the loaded gene-pair table
gene_pair_intersections.shape

(12497500, 9)

In [11]:
# preview the first rows: boolean group flags plus the ccc/pearson/spearman values
gene_pair_intersections.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),ccc,pearson,spearman
gene0_id,gene1_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSG00000000419.12,ENSG00000002834.17,True,False,True,False,True,False,0.418721,0.681847,0.786595
ENSG00000000419.12,ENSG00000002919.14,True,False,True,False,True,False,0.40509,0.734699,0.816991
ENSG00000000419.12,ENSG00000002933.7,False,True,False,True,False,True,0.007466,0.013825,0.004128
ENSG00000000419.12,ENSG00000003402.19,True,False,True,False,True,False,0.391683,0.727347,0.803653
ENSG00000000419.12,ENSG00000004478.7,False,True,False,False,False,False,0.099013,0.094147,0.231269


# Compute percentiles

In [12]:
# Quantile levels 0.00, 0.01, ..., 1.00.
# `np.linspace` fixes both the number of levels (101) and the last level
# (exactly 1.0). `np.arange` with a fractional step derives its length from
# floating-point arithmetic, which NumPy's documentation warns can be
# inconsistent; a level above 1.0 would make `quantile` fail.
percentile_levels = np.linspace(0.0, 1.0, num=101)

# One row per quantile level, one column per correlation coefficient.
percentiles = (
    gene_pair_intersections[["ccc", "pearson", "spearman"]]
    .quantile(percentile_levels)
    .rename_axis("percentile")
)

In [13]:
# Replace the float quantile levels with fixed two-decimal text labels
# ("0.00", "0.01", ..., "1.00").
percentiles.index = percentiles.index.map("{:.2f}".format)
display(percentiles.index)

Index(['0.00', '0.01', '0.02', '0.03', '0.04', '0.05', '0.06', '0.07', '0.08',
       '0.09',
       ...
       '0.91', '0.92', '0.93', '0.94', '0.95', '0.96', '0.97', '0.98', '0.99',
       '1.00'],
      dtype='object', name='percentile', length=101)

In [14]:
# Temporarily lift the row limit so the whole percentile table is rendered.
show_all_rows = pd.option_context("display.max_rows", None)
with show_all_rows:
    display(percentiles)

Unnamed: 0_level_0,ccc,pearson,spearman
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.0,1.507747e-08,1.394156e-08
0.01,0.003946,0.003388963,0.006152752
0.02,0.005268,0.006768538,0.01227735
0.03,0.00638,0.01013848,0.01844244
0.04,0.007398,0.0135214,0.02461349
0.05,0.008373,0.01693231,0.03080856
0.06,0.009316,0.02036774,0.03698643
0.07,0.010252,0.02383742,0.0432374
0.08,0.011229,0.02732681,0.0495207
0.09,0.012138,0.03085061,0.0558841


# Save

In [15]:
# `data` is the name the export cells use; it refers to the same object as `percentiles` (no copy).
data = percentiles

In [16]:
# A MultiIndex is flattened into regular columns before exporting, to avoid
# problems with MultiIndex in pandas.
has_multiindex = isinstance(data.index, pd.MultiIndex)
if has_multiindex:
    display("MultiIndex")
    data = data.reset_index()

## Pickle

In [17]:
# Pickle export; with pandas' default settings the ".gz" suffix selects gzip compression.
data.to_pickle(OUTPUT_DIR / f"{OUTPUT_FILENAME}.pkl.gz")

## RDS

In [18]:
# Destination of the RDS copy.
output_file = OUTPUT_DIR.joinpath(f"{OUTPUT_FILENAME}.rds")
display(output_file)

PosixPath('/opt/data/supplementary_material/Supplementary_File_02-Coefficients_percentiles_GTEx_whole_blood.rds')

In [19]:
# Convert the pandas frame to an R object using the default + pandas conversion rules.
pandas_to_r_rules = ro.default_converter + pandas2ri.converter
with localconverter(pandas_to_r_rules):
    data_r = ro.conversion.py2rpy(data)

In [20]:
# show the converted R object
data_r

ccc,pearson,spearman
...,...,...


In [21]:
# Write the converted object with R's saveRDS; the path is passed as a plain string.
saveRDS(data_r, str(output_file))

<rpy2.rinterface_lib.sexp.NULLType object at 0x78b08513d800> [RTYPES.NILSXP]

In [22]:
# testing: load the rds file again
# (rebinds `data_r` to the object read back from disk)
data_r = readRDS(str(output_file))

In [23]:
# Convert the reloaded R object back to pandas with the same conversion rules.
r_to_pandas_rules = ro.default_converter + pandas2ri.converter
with localconverter(r_to_pandas_rules):
    data_again = ro.conversion.rpy2py(data_r)

In [24]:
# (rows, columns) of the frame read back from the RDS file
data_again.shape

(101, 3)

In [25]:
# preview the frame read back from the RDS file
data_again.head()

Unnamed: 0,ccc,pearson,spearman
0.0,0.0,1.507747e-08,1.394156e-08
0.01,0.003946,0.003388963,0.006152752
0.02,0.005268,0.006768538,0.01227735
0.03,0.00638,0.01013848,0.01844244
0.04,0.007398,0.0135214,0.02461349


In [26]:
# testing: the RDS round trip must reproduce the exported frame. The index name
# is set again on the reloaded frame before comparing; dtypes are not compared.
rds_roundtrip = data_again.rename_axis("percentile")
pd.testing.assert_frame_equal(data, rds_roundtrip, check_dtype=False)

## Text

In [27]:
# Destination of the tab-separated text copy.
output_file = OUTPUT_DIR.joinpath(f"{OUTPUT_FILENAME}.tsv")
display(output_file)

PosixPath('/opt/data/supplementary_material/Supplementary_File_02-Coefficients_percentiles_GTEx_whole_blood.tsv')

In [28]:
# Tab-separated export with the index written as the first column. Floats are
# formatted as "%.5e" (scientific notation, 5 decimals), so the text copy is rounded.
data.to_csv(output_file, sep="\t", index=True, float_format="%.5e")

In [29]:
# testing: reload the text copy. `data2` is the reference frame for the
# comparison below and refers to the same object as `data`.
data2 = data

# The "percentile" column becomes the index and is re-rendered as two-decimal
# text so its labels match the ones in the exported frame.
data_again = pd.read_csv(output_file, sep="\t", index_col="percentile")
data_again.index = data_again.index.map("{:.2f}".format)

In [30]:
# (rows, columns) of the frame read back from the TSV file
data_again.shape

(101, 3)

In [31]:
# preview the frame read back from the TSV file
data_again.head()

Unnamed: 0_level_0,ccc,pearson,spearman
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.0,1.50775e-08,1.39416e-08
0.01,0.003946,0.00338896,0.00615275
0.02,0.005268,0.00676854,0.0122774
0.03,0.00638,0.0101385,0.0184424
0.04,0.007398,0.0135214,0.0246135


In [32]:
# testing: the text round trip must match the exported frame within
# assert_frame_equal's default tolerance; dtypes and categoricals are not compared.
pd.testing.assert_frame_equal(
    left=data2,
    right=data_again,
    check_dtype=False,
    check_categorical=False,
)