# Description

Combines all gene enrichment results found in the input directory into a single data frame, and saves it in pickle, RDS and compressed text (`tsv.gz`) formats.

# Modules loading

In [1]:
import re

import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from clustermatch import conf

# Settings

In [2]:
# dataset-specific configuration; the input/output paths and the filename
# pattern used below are all read from this mapping
DATASET_CONFIG = conf.GTEX

In [3]:
# ENRICH_FUNCTION = "enrichGO"

In [4]:
# CORRELATION_METHOD_NAME = "clustermatch"

In [5]:
# GENE_SELECTION_STRATEGY = "var_pc_log2"

In [6]:
# # clusterProfiler settings
# ENRICH_FUNCTION = "enrichGO"
# SIMPLIFY_CUTOFF = 0.7
# GO_ONTOLOGIES = ("BP", "CC", "MF")

In [7]:
# SIMILARITY_MATRICES_DIR = conf.GTEX["SIMILARITY_MATRICES_DIR"]
# display(SIMILARITY_MATRICES_DIR)

In [8]:
# SIMILARITY_MATRIX_FILENAME_TEMPLATE = conf.GTEX["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
# display(SIMILARITY_MATRIX_FILENAME_TEMPLATE)

# Paths

In [9]:
# directory whose files are scanned for per-file gene enrichment results
INPUT_DIR = DATASET_CONFIG["GENE_ENRICHMENT_DIR"]
display(INPUT_DIR)
# fail early if the input directory is missing
assert INPUT_DIR.exists()

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment')

In [10]:
# path of the combined (pickle) output; the other output formats written
# below derive their paths from it by swapping the suffix
OUTPUT_FILE = DATASET_CONFIG["GENE_ENRICHMENT_COMBINED_FILE"]
display(OUTPUT_FILE)

# make sure the destination folder exists before anything is written
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data-gene_set_enrichment.pkl')

# Get data files

In [11]:
# compiled once; used both to select the input files and to extract metadata
# (tissue, correlation method, etc.) from their names through named groups
filename_pattern = re.compile(DATASET_CONFIG["GENE_ENRICHMENT_FILENAME_PATTERN"])

In [12]:
# Select the input files whose *name* matches the expected filename pattern.
# The pattern is applied to the file name only (not to the full path) because
# that is exactly what the metadata extraction does later for each file; using
# the same target here guarantees that every selected file yields metadata.
input_files = sorted(
    f for f in INPUT_DIR.iterdir() if filename_pattern.search(f.name) is not None
)
display(len(input_files))
display(input_files[:5])

assert len(input_files) > 0, f"No gene enrichment result files found in {INPUT_DIR}"

20

[PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichKEGG-hsa.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch_k2-SpectralClustering-enrichKEGG-hsa.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-pearson_abs-SpectralClustering-enrichKEGG-hsa.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-spearman_abs-SpectralClustering-enrichKEGG-hsa.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatc

## Preview data

In [13]:
display(input_files[0])

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichKEGG-hsa.pkl')

In [14]:
# load the first input file only to preview the structure of a single result
_tmp_df = pd.read_pickle(input_files[0])

In [15]:
_tmp_df.shape

(4620, 16)

In [16]:
_tmp_df.sample(n=5, random_state=0)

Unnamed: 0,cluster_id,term_id,term_desc,gene_ratio,bg_ratio,pvalue,pvalue_adjust,qvalue,gene_id,gene_count,n_clusters,gene_total,bg_count,bg_total,rich_factor,fold_enrich
802,C14,hsa05145,Toxoplasmosis,0.049724,0.024803,0.001345,0.02242,0.020768,3118/3127/7043/3119/3117/3111/3109/7042/3115/3...,18,45,362,44,1774,0.409091,2.004771
2980,C50,hsa04145,Phagosome,0.12,0.038331,0.001736,0.022971,0.020665,2214/3689/4481/64581/3684/1520/4688/11151/1535,9,95,75,68,1774,0.132353,3.130588
330,C2,hsa05143,African trypanosomiasis,0.020147,0.009019,0.00189,0.017779,0.014883,3043/6401/3040/3039/4878/7412/3383/3620/5332/3...,11,25,546,16,1774,0.6875,2.233745
133,C8,hsa05171,Coronavirus disease - COVID-19,0.074468,0.03044,0.000977,0.013061,0.011436,713/2162/714/1536/712/728/717/719/2212/7097/68...,14,10,188,54,1774,0.259259,2.446414
3934,C24,hsa05202,Transcriptional misregulation in cancer,0.666667,0.034949,0.003526,0.005165,,1668/1669,2,146,3,62,1774,0.032258,19.075269


# Run

## Read data, convert dtypes, add new metrics

In [17]:
# columns kept from each per-file result, in the order they appear in the output
selected_columns = [
    "n_clusters",
    "cluster_id",
    "term_id",
    "term_desc",
    "gene_count",
    "gene_total",
    "gene_ratio",
    "bg_count",
    "bg_total",
    "bg_ratio",
    "pvalue",
    "pvalue_adjust",
]

# one data frame per input file; they are concatenated afterwards
all_results = []

for f_full in tqdm(input_files, ncols=100):
    f_name = f_full.name

    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by this project's own pipeline.
    f_data = pd.read_pickle(f_full)

    # the experiment settings are encoded in the file name
    metadata = filename_pattern.search(f_name)
    if metadata is None:
        raise ValueError(f"File name does not match the expected pattern: {f_name}")

    # `assign` returns a new frame, so the metadata columns are never written
    # into a slice of the frame that was read from disk (this avoids pandas'
    # SettingWithCopyWarning)
    f_data = f_data[selected_columns].assign(
        tissue=metadata.group("tissue"),
        gene_sel_strategy=metadata.group("gene_sel_strategy"),
        corr_method=metadata.group("corr_method"),
        clust_method=metadata.group("clust_method"),
        enrich_func=metadata.group("enrich_func"),
        enrich_params=metadata.group("enrich_params"),
    )

    all_results.append(f_data)

100%|██████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 137.25it/s]


In [18]:
# stack the per-file results into one frame (with a fresh integer index) and
# set the final dtype of each identifier/count column in a single pass
df = pd.concat(all_results, ignore_index=True).astype(
    {
        # identifiers and experiment settings -> category dtype
        "cluster_id": "category",
        "term_id": "category",
        "term_desc": "category",
        "tissue": "category",
        "gene_sel_strategy": "category",
        "corr_method": "category",
        "clust_method": "category",
        "enrich_func": "category",
        "enrich_params": "category",
        # counts -> int32
        "n_clusters": "int32",
        "gene_count": "int32",
        "gene_total": "int32",
        "bg_count": "int32",
        "bg_total": "int32",
    }
)

# # convert ratios to numbers
# df["gene_ratio"] = df["gene_count"].div(df["gene_total"])
# df["bg_ratio"] = df["bg_count"].div(df["bg_total"])

# # add other metrics
# df["rich_factor"] = df["gene_count"].div(df["bg_count"])
# df["fold_enrich"] = df["gene_ratio"].div(df["bg_ratio"])

In [19]:
# # adjust for multiple testing across all results
# adj_pval = multipletests(df["pvalue"], alpha=0.05, method="fdr_bh")
# df = df.assign(fdr=adj_pval[1])

In [20]:
df.shape

(84471, 19)

In [21]:
df.head()

Unnamed: 0,n_clusters,cluster_id,term_id,term_desc,gene_count,gene_total,gene_ratio,bg_count,bg_total,bg_ratio,pvalue,pvalue_adjust,qvalue,tissue,gene_sel_strategy,corr_method,clust_method,enrich_func,enrich_params
0,2,C0,hsa04380,Osteoclast differentiation,27,248,0.108871,51,1774,0.028749,2.376904e-11,5.229189e-09,4.678748e-09,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichKEGG,hsa
1,2,C0,hsa04613,Neutrophil extracellular trap formation,25,248,0.100806,50,1774,0.028185,6.975013e-10,7.672515e-08,6.864882e-08,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichKEGG,hsa
2,2,C0,hsa04662,B cell receptor signaling pathway,18,248,0.072581,29,1774,0.016347,1.939872e-09,1.422573e-07,1.272828e-07,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichKEGG,hsa
3,2,C0,hsa05152,Tuberculosis,25,248,0.100806,58,1774,0.032694,3.349306e-08,1.842118e-06,1.648211e-06,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichKEGG,hsa
4,2,C0,hsa04972,Pancreatic secretion,19,248,0.076613,41,1774,0.023112,4.180927e-07,1.839608e-05,1.645965e-05,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichKEGG,hsa


In [22]:
display(df.dtypes)
# sanity check: the categorical conversion must have been applied
assert df["cluster_id"].dtype == "category"

n_clusters              int32
cluster_id           category
term_id              category
term_desc            category
gene_count              int32
gene_total              int32
gene_ratio            float64
bg_count                int32
bg_total                int32
bg_ratio              float64
pvalue                float64
pvalue_adjust         float64
qvalue                float64
tissue               category
gene_sel_strategy    category
corr_method          category
clust_method         category
enrich_func          category
enrich_params        category
dtype: object

In [24]:
df.sample(n=5, random_state=0)

Unnamed: 0,n_clusters,cluster_id,term_id,term_desc,gene_count,gene_total,gene_ratio,bg_count,bg_total,bg_ratio,pvalue,pvalue_adjust,qvalue,tissue,gene_sel_strategy,corr_method,clust_method,enrich_func,enrich_params
84207,186,C53,hsa05204,Chemical carcinogenesis - DNA adducts,5,22,0.227273,10,1991,0.005023,2.468158e-08,2.96179e-07,1.299031e-07,whole_blood,var_pc_log2,spearman_abs,SpectralClustering,enrichKEGG,hsa
29441,89,C86,hsa05016,Huntington disease,6,27,0.222222,35,1576,0.022208,1.629067e-05,0.0001656218,0.0001171785,artery_tibial,var_pc_log2,pearson_abs,SpectralClustering,enrichKEGG,hsa
46892,30,C23,hsa04261,Adrenergic signaling in cardiomyocytes,13,99,0.131313,50,1819,0.027488,1.112786e-06,9.848155e-05,9.07799e-05,muscle_skeletal,var_pc_log2,spearman_abs,SpectralClustering,enrichKEGG,hsa
76190,30,C15,hsa04218,Cellular senescence,6,29,0.206897,63,1991,0.031642,0.0002120447,0.002162856,0.00178564,whole_blood,var_pc_log2,pearson_abs,SpectralClustering,enrichKEGG,hsa
81494,49,C38,hsa00010,Glycolysis / Gluconeogenesis,4,25,0.16,25,1991,0.012557,0.0002050851,0.000865915,0.0004317582,whole_blood,var_pc_log2,spearman_abs,SpectralClustering,enrichKEGG,hsa


## Some stats

In [25]:
# summary of the adjusted p-values, which must lie strictly between 0 and 1
display(df["pvalue_adjust"].describe())
assert 0.0 < df["pvalue_adjust"].min()
assert 1.0 > df["pvalue_adjust"].max()

count    7.294900e+04
mean     8.483141e-03
std      1.106547e-02
min      2.030879e-57
25%      1.380923e-04
50%      3.181741e-03
75%      1.316917e-02
max      4.947642e-02
Name: qvalue, dtype: float64

In [26]:
df["n_clusters"].unique()

array([  2,   5,  10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60,
        65,  69,  75,  80,  85,  89,  95,  98, 124, 146, 168, 189,  14,
        18,  23,  28,  33,  38,  43,  48,  53,  58,  62,  67,  71,  81,
        84,  90, 119, 139, 158, 173,  70,  74,  79,  94, 122, 166, 187,
        78,  83,  93,  99, 118, 137, 154, 169,  63,  68,  77,  86, 117,
       162, 182,  39,  44,  49,  54,  64,  72, 114, 133, 159, 142, 157,
        88,  92,  96, 115, 135, 147, 161,  59,  82, 121, 138, 185,  13,
        19,  22,  27,  32,  37,  42,  47,  52,  57,  66, 112, 134, 152,
       172,  73,  91, 116, 148, 174,  29,  87,  97, 141, 164, 188,  24,
        56,  61,  76, 136, 171, 120, 140, 170, 132, 143, 167, 144, 186],
      dtype=int32)

In [27]:
df["tissue"].unique()

['adipose_subcutaneous', 'artery_tibial', 'muscle_skeletal', 'skin_sun_exposed_lower_leg', 'whole_blood']
Categories (5, object): ['adipose_subcutaneous', 'artery_tibial', 'muscle_skeletal', 'skin_sun_exposed_lower_leg', 'whole_blood']

In [28]:
df["gene_sel_strategy"].unique()

['var_pc_log2']
Categories (1, object): ['var_pc_log2']

In [29]:
df["corr_method"].unique()

['clustermatch', 'clustermatch_k2', 'pearson_abs', 'spearman_abs']
Categories (4, object): ['clustermatch', 'clustermatch_k2', 'pearson_abs', 'spearman_abs']

In [30]:
df["clust_method"].unique()

['SpectralClustering']
Categories (1, object): ['SpectralClustering']

In [31]:
df["enrich_params"].unique()

['hsa']
Categories (1, object): ['hsa']

In [32]:
# no column may contain missing values before the results are saved; when the
# check fails, the message lists the offending columns
columns_with_na = df.columns[df.isna().any()].tolist()
assert not columns_with_na, f"Columns with missing values: {columns_with_na}"

AssertionError: 

# Save

In [31]:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

# handles to the R functions used below to write and read .rds files
saveRDS = ro.r["saveRDS"]
readRDS = ro.r["readRDS"]

In [32]:
# `data` is the frame written to every output format below (same object as `df`)
data = df

## Pickle

In [33]:
display(OUTPUT_FILE)

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data-gene_set_enrichment.pkl')

In [34]:
# save the combined results in pandas' pickle format
data.to_pickle(OUTPUT_FILE)

## RDS

In [None]:
# same path as the pickle output, with the suffix replaced by .rds
output_rds_file = OUTPUT_FILE.with_suffix(".rds")
display(output_rds_file)

In [None]:
# convert the pandas DataFrame into its R counterpart; the pandas converter is
# only active inside this context
with localconverter(ro.default_converter + pandas2ri.converter):
    data_r = ro.conversion.py2rpy(data)

In [None]:
data_r

In [None]:
saveRDS(data_r, str(output_rds_file))

In [None]:
# testing: read the RDS file back to check the round trip
data_r = readRDS(str(output_rds_file))

In [None]:
# convert the R object that was read back into a pandas DataFrame
with localconverter(ro.default_converter + pandas2ri.converter):
    data_again = ro.conversion.rpy2py(data_r)

# the index comes back as strings after the round trip; restore it to integers
data_again.index = data_again.index.astype(int)

In [None]:
data_again.shape

In [None]:
data_again.head()

In [None]:
# the frame read back from the RDS file must be identical to the one saved
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_names=False,  # do not check "name" attribute of index and column
    check_exact=True,  # since this is a binary format, it should match exactly
)

## tsv.gz

In [None]:
# same path as the pickle output, with the suffix replaced by .tsv.gz
output_text_file = OUTPUT_FILE.with_suffix(".tsv.gz")
display(output_text_file)

In [None]:
# tab-separated text without the index column; floats are written in
# scientific notation with 5 decimal digits, so some precision is lost
data.to_csv(output_text_file, sep="\t", index=False, float_format="%.5e")

In [None]:
# testing: read the text file back to check the round trip (the file was
# written without an index column, so none is read here)
data_again = pd.read_csv(output_text_file, sep="\t")  # , index_col=0)

In [None]:
data_again.shape

In [None]:
data_again.head()

In [None]:
# the frame read back from the text file must match the one saved, up to the
# precision lost when the floats were formatted; dtypes are not compared
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_names=False,  # do not check "name" attribute of index and column
    check_dtype=False,  # do not check dtypes: do not distinguish between int64 and int32, for instance
    check_categorical=False,
    check_exact=False,
    rtol=1e-5,
    atol=5e-5,
)