# Description

Creates **Supplementary File 3**.

*Description*: Correlation coefficients (CCC, Pearson and Spearman) and their p-values for a subset of gene pairs (KDM6A-UTY and KDM6A-DDX3Y) across all tissues in GTEx v8.

# Modules

In [1]:
import pandas as pd
import numpy as np
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

from ccc import conf

In [2]:
# Look up R's `readRDS` function through rpy2 so it can be called from Python.
readRDS = ro.r["readRDS"]

In [3]:
# Look up R's `saveRDS` function through rpy2 so it can be called from Python.
saveRDS = ro.r["saveRDS"]

# Settings

In [4]:
# Dataset-specific configuration; used below to locate the gene ID -> symbol
# mapping file.
DATASET_CONFIG = conf.GTEX

# Paths

In [5]:
# Fail early if the manuscript directory is not configured or is missing on
# disk. The two conditions are checked separately so that the error message
# states which one actually failed.
assert conf.MANUSCRIPT["BASE_DIR"] is not None, "Manuscript dir not set"
assert conf.MANUSCRIPT["BASE_DIR"].exists(), (
    f"Manuscript dir does not exist: {conf.MANUSCRIPT['BASE_DIR']}"
)

In [6]:
# Results for each gene pair live in their own sub-directory of "other_tissues".
# DATASET_CONFIG is used (instead of conf.GTEX directly) so that every input
# path in this notebook is derived from the dataset chosen in the Settings
# section.
INPUT_DIR = DATASET_CONFIG["RESULTS_DIR"] / "other_tissues"
display(INPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/other_tissues')

In [7]:
# Directory where the supplementary file is written; it is created (including
# any missing parents) if it does not exist yet.
OUTPUT_DIR = conf.MANUSCRIPT["SUPPLEMENTARY_MATERIAL_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/supplementary_material')

In [8]:
# Base file name (without extension) shared by the .pkl.gz, .rds and .tsv
# outputs written in the Save section.
OUTPUT_FILENAME = "Supplementary_File_03-Gene_pairs_correlations_all_GTEx_tissues"

# Data

## Gene Ensembl ID -> Symbol mapping

In [9]:
# Load the gene ID/symbol mapping; its `gene_ens_id` and `gene_symbol` columns
# are used in the next cell.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source (presumably it is generated by this project — confirm).
gene_map = pd.read_pickle(
    DATASET_CONFIG["DATA_DIR"] / "gtex_gene_id_symbol_mappings.pkl"
)

In [10]:
# Collapse the mapping into a plain dict: Ensembl gene ID -> gene symbol.
gene_map = dict(zip(gene_map["gene_ens_id"], gene_map["gene_symbol"]))

In [11]:
# Sanity check: a known Ensembl ID must map to its expected gene symbol.
assert gene_map["ENSG00000145309.5"] == "CABS1"

# List of dataframes to combine

In [12]:
# Collects one dataframe per gene pair; they are concatenated in the final
# "Combine" section.
df_list = []

# KDM6A - UTY

In [13]:
# Gene pair analyzed in this section: Ensembl IDs and their symbols.
gene0_id = "ENSG00000147050.14"
gene1_id = "ENSG00000183878.15"
gene0_symbol = "KDM6A"
gene1_symbol = "UTY"

# The IDs and symbols above must agree with the gene mapping.
assert (gene_map[gene0_id], gene_map[gene1_id]) == (gene0_symbol, gene1_symbol)

In [14]:
# Results for this pair are stored under "<gene0>_vs_<gene1>", with both
# symbols in lowercase.
GENE_PAIR_INPUT_DIR = INPUT_DIR / "_vs_".join(
    (gene0_symbol.lower(), gene1_symbol.lower())
)
display(GENE_PAIR_INPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/other_tissues/kdm6a_vs_uty')

## Correlation values

In [15]:
# Load the per-tissue coefficients and give every column an explicit "_coef"
# suffix ("cm" is exported as "ccc").
res_all = pd.read_pickle(GENE_PAIR_INPUT_DIR / "coef_values.pkl")
res_all = res_all.rename(
    columns=dict(cm="ccc_coef", pearson="pearson_coef", spearman="spearman_coef")
)

In [16]:
res_all.shape

(50, 3)

In [17]:
res_all.head()

Unnamed: 0,ccc_coef,pearson_coef,spearman_coef
colon_transverse,0.336727,-0.517899,-0.408343
brain_amygdala,0.280524,0.037541,0.147571
artery_coronary,0.274554,-0.413862,-0.391764
artery_aorta,0.429771,-0.485788,-0.36351
adrenal_gland,0.260197,-0.45919,-0.35419


## P-values

In [18]:
# Load the per-tissue p-values and give every column an explicit "_pvalue"
# suffix ("cm" is exported as "ccc").
res_pval_all = pd.read_pickle(GENE_PAIR_INPUT_DIR / "coef_pvalues.pkl")
res_pval_all = res_pval_all.rename(
    columns=dict(
        cm="ccc_pvalue", pearson="pearson_pvalue", spearman="spearman_pvalue"
    )
)

In [19]:
res_pval_all.shape

(50, 3)

In [20]:
res_pval_all.head()

Unnamed: 0,ccc_pvalue,pearson_pvalue,spearman_pvalue
colon_transverse,9.99999e-07,3.063714e-29,9.539164e-18
brain_amygdala,9.99999e-07,0.6461089,0.06963023
artery_coronary,9.99999e-07,2.38997e-11,3.159321e-10
artery_aorta,9.99999e-07,5.775754e-27,6.092383e-15
adrenal_gland,9.99999e-07,7.334489e-15,4.847677e-09


## Combine

In [21]:
# Put coefficients and p-values side by side (matched on the tissue index),
# then turn the tissue index into a regular "tissue" column.
df = res_all.join(res_pval_all, how="inner")
df = df.rename_axis("tissue").reset_index()

# The inner join must not drop any tissue from either table.
assert df.shape[0] == res_all.shape[0] == res_pval_all.shape[0]

In [22]:
# Prepend the gene pair identifiers as the first four columns, so rows stay
# identifiable once all pairs are concatenated.
for position, (column, value) in enumerate(
    (
        ("gene0_id", gene0_id),
        ("gene1_id", gene1_id),
        ("gene0_symbol", gene0_symbol),
        ("gene1_symbol", gene1_symbol),
    )
):
    df.insert(position, column, value)

In [23]:
df.shape

(50, 11)

In [24]:
df.head()

Unnamed: 0,gene0_id,gene1_id,gene0_symbol,gene1_symbol,tissue,ccc_coef,pearson_coef,spearman_coef,ccc_pvalue,pearson_pvalue,spearman_pvalue
0,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,colon_transverse,0.336727,-0.517899,-0.408343,9.99999e-07,3.063714e-29,9.539164e-18
1,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,brain_amygdala,0.280524,0.037541,0.147571,9.99999e-07,0.6461089,0.06963023
2,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,artery_coronary,0.274554,-0.413862,-0.391764,9.99999e-07,2.38997e-11,3.159321e-10
3,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,artery_aorta,0.429771,-0.485788,-0.36351,9.99999e-07,5.775754e-27,6.092383e-15
4,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,adrenal_gland,0.260197,-0.45919,-0.35419,9.99999e-07,7.334489e-15,4.847677e-09


In [25]:
# Queue this gene pair for the final concatenation.
# NOTE: re-running this cell appends the same dataframe again.
df_list.append(df)

# KDM6A - DDX3Y

In [26]:
# Gene pair analyzed in this section: Ensembl IDs and their symbols.
gene0_id = "ENSG00000147050.14"
gene1_id = "ENSG00000067048.16"
gene0_symbol = "KDM6A"
gene1_symbol = "DDX3Y"

# The IDs and symbols above must agree with the gene mapping.
assert (gene_map[gene0_id], gene_map[gene1_id]) == (gene0_symbol, gene1_symbol)

In [27]:
# Results for this pair are stored under "<gene0>_vs_<gene1>", with both
# symbols in lowercase.
GENE_PAIR_INPUT_DIR = INPUT_DIR / "_vs_".join(
    (gene0_symbol.lower(), gene1_symbol.lower())
)
display(GENE_PAIR_INPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/other_tissues/kdm6a_vs_ddx3y')

## Correlation values

In [28]:
# Load the per-tissue coefficients and give every column an explicit "_coef"
# suffix ("cm" is exported as "ccc").
res_all = pd.read_pickle(GENE_PAIR_INPUT_DIR / "coef_values.pkl")
res_all = res_all.rename(
    columns=dict(cm="ccc_coef", pearson="pearson_coef", spearman="spearman_coef")
)

In [29]:
res_all.shape

(50, 3)

In [30]:
res_all.head()

Unnamed: 0,ccc_coef,pearson_coef,spearman_coef
colon_transverse,0.280814,-0.393881,-0.465898
brain_amygdala,0.305677,0.038732,0.154331
artery_coronary,0.24272,-0.48761,-0.426164
artery_aorta,0.38197,-0.579236,-0.409761
adrenal_gland,0.188929,-0.4894,-0.418784


## P-values

In [31]:
# Load the per-tissue p-values and give every column an explicit "_pvalue"
# suffix ("cm" is exported as "ccc").
res_pval_all = pd.read_pickle(GENE_PAIR_INPUT_DIR / "coef_pvalues.pkl")
res_pval_all = res_pval_all.rename(
    columns=dict(
        cm="ccc_pvalue", pearson="pearson_pvalue", spearman="spearman_pvalue"
    )
)

In [32]:
res_pval_all.shape

(50, 3)

In [33]:
res_pval_all.head()

Unnamed: 0,ccc_pvalue,pearson_pvalue,spearman_pvalue
colon_transverse,9.99999e-07,1.613504e-16,2.880714e-23
brain_amygdala,9.99999e-07,0.6356755,0.05764275
artery_coronary,9.99999e-07,9.731623e-16,5.220895e-12
artery_aorta,9.99999e-07,4.513966e-40,6.380371999999999e-19
adrenal_gland,9.99999e-07,6.058615e-17,2.230048e-12


## Combine

In [34]:
# Put coefficients and p-values side by side (matched on the tissue index),
# then turn the tissue index into a regular "tissue" column.
df = res_all.join(res_pval_all, how="inner")
df = df.rename_axis("tissue").reset_index()

# The inner join must not drop any tissue from either table.
assert df.shape[0] == res_all.shape[0] == res_pval_all.shape[0]

In [35]:
# Prepend the gene pair identifiers as the first four columns, so rows stay
# identifiable once all pairs are concatenated.
for position, (column, value) in enumerate(
    (
        ("gene0_id", gene0_id),
        ("gene1_id", gene1_id),
        ("gene0_symbol", gene0_symbol),
        ("gene1_symbol", gene1_symbol),
    )
):
    df.insert(position, column, value)

In [36]:
df.shape

(50, 11)

In [37]:
df.head()

Unnamed: 0,gene0_id,gene1_id,gene0_symbol,gene1_symbol,tissue,ccc_coef,pearson_coef,spearman_coef,ccc_pvalue,pearson_pvalue,spearman_pvalue
0,ENSG00000147050.14,ENSG00000067048.16,KDM6A,DDX3Y,colon_transverse,0.280814,-0.393881,-0.465898,9.99999e-07,1.613504e-16,2.880714e-23
1,ENSG00000147050.14,ENSG00000067048.16,KDM6A,DDX3Y,brain_amygdala,0.305677,0.038732,0.154331,9.99999e-07,0.6356755,0.05764275
2,ENSG00000147050.14,ENSG00000067048.16,KDM6A,DDX3Y,artery_coronary,0.24272,-0.48761,-0.426164,9.99999e-07,9.731623e-16,5.220895e-12
3,ENSG00000147050.14,ENSG00000067048.16,KDM6A,DDX3Y,artery_aorta,0.38197,-0.579236,-0.409761,9.99999e-07,4.513966e-40,6.380371999999999e-19
4,ENSG00000147050.14,ENSG00000067048.16,KDM6A,DDX3Y,adrenal_gland,0.188929,-0.4894,-0.418784,9.99999e-07,6.058615e-17,2.230048e-12


In [38]:
# Queue this gene pair for the final concatenation.
# NOTE: re-running this cell appends the same dataframe again.
df_list.append(df)

# Combine

In [39]:
# Stack the per-pair dataframes row-wise; the original row labels are
# discarded and replaced by a fresh 0..n-1 index.
df_final = pd.concat(df_list, ignore_index=True, axis=0)

In [40]:
# The concatenation must keep every row and must not add or drop columns.
n_rows, n_cols = df_final.shape
assert n_rows == sum(len(d) for d in df_list)
assert all(d.shape[1] == n_cols for d in df_list)
display(df_final.shape)

(100, 11)

In [41]:
df_final

Unnamed: 0,gene0_id,gene1_id,gene0_symbol,gene1_symbol,tissue,ccc_coef,pearson_coef,spearman_coef,ccc_pvalue,pearson_pvalue,spearman_pvalue
0,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,colon_transverse,0.336727,-0.517899,-0.408343,9.999990e-07,3.063714e-29,9.539164e-18
1,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,brain_amygdala,0.280524,0.037541,0.147571,9.999990e-07,6.461089e-01,6.963023e-02
2,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,artery_coronary,0.274554,-0.413862,-0.391764,9.999990e-07,2.389970e-11,3.159321e-10
3,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,artery_aorta,0.429771,-0.485788,-0.363510,9.999990e-07,5.775754e-27,6.092383e-15
4,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,adrenal_gland,0.260197,-0.459190,-0.354190,9.999990e-07,7.334489e-15,4.847677e-09
...,...,...,...,...,...,...,...,...,...,...,...
95,ENSG00000147050.14,ENSG00000067048.16,KDM6A,DDX3Y,artery_tibial,0.298440,-0.617718,-0.387765,9.999990e-07,5.248493e-71,3.246061e-25
96,ENSG00000147050.14,ENSG00000067048.16,KDM6A,DDX3Y,brain_hypothalamus,0.232632,0.118391,0.251149,9.999990e-07,9.332407e-02,3.117929e-04
97,ENSG00000147050.14,ENSG00000067048.16,KDM6A,DDX3Y,lung,0.289771,-0.252442,-0.224470,9.999990e-07,7.462864e-10,4.905714e-08
98,ENSG00000147050.14,ENSG00000067048.16,KDM6A,DDX3Y,brain_cerebellum,0.219113,-0.106469,0.034902,9.999990e-07,9.916004e-02,5.897648e-01


# Save

In [42]:
# The saving code below operates on `data`.
data = df_final

In [43]:
# Inspect the index type and the index itself before saving.
display(data.index.dtype, data.index)

dtype('int64')

RangeIndex(start=0, stop=100, step=1)

In [44]:
# Flatten a MultiIndex into regular columns before saving.
# NOTE(review): presumably needed because the export formats below do not
# handle a pandas MultiIndex well — confirm.
# For the data shown above the index is a RangeIndex, so this branch is skipped.
if isinstance(data.index, pd.MultiIndex):
    display("MultiIndex")
    data = data.reset_index()

## Pickle

In [45]:
# Save as a pickle; pandas infers gzip compression from the ".gz" extension.
data.to_pickle(OUTPUT_DIR / f"{OUTPUT_FILENAME}.pkl.gz")

## RDS

In [46]:
# Path of the RDS (R serialized object) output file.
output_file = OUTPUT_DIR / f"{OUTPUT_FILENAME}.rds"
display(output_file)

PosixPath('/opt/data/supplementary_material/Supplementary_File_03-Gene_pairs_correlations_all_GTEx_tissues.rds')

In [47]:
# Convert the pandas dataframe into an R object, using rpy2's pandas converter
# only within this block.
with localconverter(ro.default_converter + pandas2ri.converter):
    data_r = ro.conversion.py2rpy(data)

In [48]:
data_r

gene0_id,gene1_id,gene0_symbol,...,ccc_pvalue,pearson_pvalue,spearman_pvalue
'ENSG0000...,'ENSG0000...,'KDM6A',...,0.000001,0.000000,0.000000
'ENSG0000...,'ENSG0000...,'KDM6A',,0.000001,0.646109,0.069630
'ENSG0000...,'ENSG0000...,'KDM6A',,0.000001,0.000000,0.000000
'ENSG0000...,'ENSG0000...,'KDM6A',,0.000001,0.000000,0.000000
...,...,...,,...,...,...
'ENSG0000...,'ENSG0000...,'KDM6A',,0.000001,0.093324,0.000312
'ENSG0000...,'ENSG0000...,'KDM6A',,0.000001,0.000000,0.000000
'ENSG0000...,'ENSG0000...,'KDM6A',,0.000001,0.099160,0.589765
'ENSG0000...,'ENSG0000...,'KDM6A',,0.000001,0.006596,0.000023


In [49]:
# Write the R object to disk with R's saveRDS; the path is passed as a string.
saveRDS(data_r, str(output_file))

<rpy2.rinterface_lib.sexp.NULLType object at 0x7150ddd228c0> [RTYPES.NILSXP]

In [50]:
# testing: load the rds file again to verify the round trip below
# (`data_r` now holds the object read back from disk)
data_r = readRDS(str(output_file))

In [51]:
# Convert the reloaded R object back into a pandas dataframe.
with localconverter(ro.default_converter + pandas2ri.converter):
    data_again = ro.conversion.rpy2py(data_r)
    # Cast the index to integers so it is comparable with the original index.
    # NOTE(review): presumably the R row names come back as strings — confirm.
    data_again.index = data_again.index.astype(int)

In [52]:
data_again.shape

(100, 11)

In [53]:
data_again.head()

Unnamed: 0,gene0_id,gene1_id,gene0_symbol,gene1_symbol,tissue,ccc_coef,pearson_coef,spearman_coef,ccc_pvalue,pearson_pvalue,spearman_pvalue
0,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,colon_transverse,0.336727,-0.517899,-0.408343,9.99999e-07,3.063714e-29,9.539164e-18
1,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,brain_amygdala,0.280524,0.037541,0.147571,9.99999e-07,0.6461089,0.06963023
2,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,artery_coronary,0.274554,-0.413862,-0.391764,9.99999e-07,2.38997e-11,3.159321e-10
3,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,artery_aorta,0.429771,-0.485788,-0.36351,9.99999e-07,5.775754e-27,6.092383e-15
4,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,adrenal_gland,0.260197,-0.45919,-0.35419,9.99999e-07,7.334489e-15,4.847677e-09


In [54]:
# testing: the data read back from the RDS file must match the original
# dataframe (column dtypes are allowed to differ).
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_dtype=False,
)

## Text

In [55]:
# tsv format: path of the tab-separated text output file
output_file = OUTPUT_DIR / f"{OUTPUT_FILENAME}.tsv"
display(output_file)

PosixPath('/opt/data/supplementary_material/Supplementary_File_03-Gene_pairs_correlations_all_GTEx_tissues.tsv')

In [56]:
# Write floats in scientific notation with 5 decimals (6 significant digits),
# which keeps very small p-values representable in the text file. The index is
# not written.
data.to_csv(output_file, sep="\t", index=False, float_format="%.5e")

In [57]:
# testing: read the TSV file back to compare it with the in-memory data.
data2 = data  # alias of `data` (same object, not a copy)

data_again = pd.read_csv(output_file, sep="\t", index_col=None)

In [58]:
data_again.shape

(100, 11)

In [59]:
data_again.head()

Unnamed: 0,gene0_id,gene1_id,gene0_symbol,gene1_symbol,tissue,ccc_coef,pearson_coef,spearman_coef,ccc_pvalue,pearson_pvalue,spearman_pvalue
0,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,colon_transverse,0.336727,-0.517899,-0.408343,9.99999e-07,3.0637100000000003e-29,9.53916e-18
1,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,brain_amygdala,0.280524,0.037541,0.147571,9.99999e-07,0.646109,0.0696302
2,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,artery_coronary,0.274554,-0.413862,-0.391764,9.99999e-07,2.38997e-11,3.15932e-10
3,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,artery_aorta,0.429771,-0.485788,-0.36351,9.99999e-07,5.77575e-27,6.09238e-15
4,ENSG00000147050.14,ENSG00000183878.15,KDM6A,UTY,adrenal_gland,0.260197,-0.45919,-0.35419,9.99999e-07,7.33449e-15,4.84768e-09


In [60]:
# testing: the data read back from the TSV file must match the original.
# Floats were written with "%.5e" (6 significant digits), so the comparison is
# approximate: a relative tolerance of 1e-5 covers the rounding error (at most
# 5e-6). The absolute tolerance is set to zero because the default (1e-8) would
# make any two values below 1e-8 compare equal, leaving the many tiny p-values
# in this table effectively unchecked.
pd.testing.assert_frame_equal(
    data2,
    data_again,
    check_categorical=False,
    check_dtype=False,
    check_exact=False,
    rtol=1e-5,
    atol=0,
)