# Description

It gets all the gene pairs prioritized by different correlation coefficients and writes a file with gene ID mappings (symbols and Entrez IDs).

# Modules

In [1]:
%load_ext rpy2.ipython

In [2]:
import pandas as pd

from ccc import conf

# Settings

In [3]:
DATASET_CONFIG = conf.GTEX

# Paths

In [4]:
INPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"]
display(INPUT_DIR)

assert INPUT_DIR.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections')

In [5]:
OUTPUT_DIR = conf.GIANT["RESULTS_DIR"] / "intersection_genes"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/giant/intersection_genes')

# Get gene entrez ids

In [6]:
genes = set()

In [7]:
data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_pearson.pkl")
_tmp0 = set(data.index.get_level_values(0))
_tmp1 = set(data.index.get_level_values(1))
genes.update(_tmp0.union(_tmp1))
display(len(genes))

2887

In [8]:
data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_pearson_spearman.pkl")
_tmp0 = set(data.index.get_level_values(0))
_tmp1 = set(data.index.get_level_values(1))
genes.update(_tmp0.union(_tmp1))
display(len(genes))

2888

In [9]:
data = pd.read_pickle(INPUT_DIR / "clustermatch_vs_spearman.pkl")
_tmp0 = set(data.index.get_level_values(0))
_tmp1 = set(data.index.get_level_values(1))
genes.update(_tmp0.union(_tmp1))
display(len(genes))

2892

In [10]:
data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch.pkl")
_tmp0 = set(data.index.get_level_values(0))
_tmp1 = set(data.index.get_level_values(1))
genes.update(_tmp0.union(_tmp1))
display(len(genes))

3224

In [11]:
data = pd.read_pickle(INPUT_DIR / "pearson_vs_clustermatch_spearman.pkl")
_tmp0 = set(data.index.get_level_values(0))
_tmp1 = set(data.index.get_level_values(1))
genes.update(_tmp0.union(_tmp1))
display(len(genes))

3243

In [12]:
genes = list(genes)
assert not pd.Series(genes).isna().any()

In [13]:
%%R -i genes -o symbol_to_entrezid
library(org.Hs.eg.db)
hs <- org.Hs.eg.db

symbol_to_entrezid <- select(hs,
       keys = unlist(genes),
       columns = c("ENTREZID", "SYMBOL"),
       keytype = "SYMBOL")

R[write to console]: Loading required package: AnnotationDbi

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Positi

In [14]:
symbol_to_entrezid.shape

(3243, 2)

In [15]:
assert symbol_to_entrezid.shape[0] == len(genes)

In [16]:
symbol_to_entrezid.head()

Unnamed: 0,SYMBOL,ENTREZID
1,RP11-326C3.2,
2,C6orf89,221477.0
3,TRMT1L,81627.0
4,RP4-671O14.5,
5,SMCHD1,23347.0


In [17]:
symbol_to_entrezid.isna().any().any()

True

In [18]:
symbol_to_entrezid = symbol_to_entrezid.dropna()

In [19]:
symbol_to_entrezid.shape

(2878, 2)

In [20]:
assert symbol_to_entrezid[symbol_to_entrezid["SYMBOL"] == "IFNG"].shape[0] == 1
assert symbol_to_entrezid[symbol_to_entrezid["SYMBOL"] == "RASSF2"].shape[0] == 1

# Save

In [21]:
symbol_to_entrezid.to_pickle(OUTPUT_DIR / "gene_map-symbol_to_entrezid.pkl")