# Description

Combines all gene enrichment results found in the input directory into a single data frame, which is saved in pickle, RDS and tsv.gz formats.

# Modules loading

In [1]:
import re

import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from clustermatch import conf

# Settings

In [2]:
# Configuration of the dataset to process; the input/output paths and the
# filename pattern used below are all read from it
DATASET_CONFIG = conf.GTEX

In [3]:
# ENRICH_FUNCTION = "enrichGO"

In [4]:
# CORRELATION_METHOD_NAME = "clustermatch"

In [5]:
# GENE_SELECTION_STRATEGY = "var_pc_log2"

In [6]:
# # clusterProfiler settings
# ENRICH_FUNCTION = "enrichGO"
# SIMPLIFY_CUTOFF = 0.7
# GO_ONTOLOGIES = ("BP", "CC", "MF")

In [7]:
# SIMILARITY_MATRICES_DIR = conf.GTEX["SIMILARITY_MATRICES_DIR"]
# display(SIMILARITY_MATRICES_DIR)

In [8]:
# SIMILARITY_MATRIX_FILENAME_TEMPLATE = conf.GTEX["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
# display(SIMILARITY_MATRIX_FILENAME_TEMPLATE)

# Paths

In [9]:
# Directory that contains the per-file gene enrichment results to combine
INPUT_DIR = DATASET_CONFIG["GENE_ENRICHMENT_DIR"]
display(INPUT_DIR)
# fail early if the input directory is missing
assert INPUT_DIR.exists()

PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment')

In [10]:
# Path of the combined results file (pickle); the RDS and tsv.gz outputs
# written in the "Save" section are derived from it by changing the suffix
OUTPUT_FILE = DATASET_CONFIG["GENE_ENRICHMENT_COMBINED_FILE"]
display(OUTPUT_FILE)

# make sure the output directory exists before writing
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment/gtex_v8_data-gene_set_enrichment.pkl')

# Get data files

In [11]:
# Compiled regular expression used both to select the input files and to parse
# the metadata encoded in their names (named groups: tissue, gene_sel_strategy,
# corr_method, clust_method, enrich_func, enrich_params)
filename_pattern = re.compile(DATASET_CONFIG["GENE_ENRICHMENT_FILENAME_PATTERN"])

In [12]:
# get input data files according to Settings
input_files = sorted(
    [
        f
        for f in INPUT_DIR.iterdir()
        if (m := re.search(filename_pattern, str(f))) is not None
    ]
)
display(len(input_files))
display(input_files[:5])

assert len(input_files) > 0

130

[PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichGO-BP_full.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichGO-CC_full.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichGO-MF_full.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichKEGG-hsa.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichPathway-human.pkl')]

## Preview data

In [13]:
# First input file (in sorted order), used below to preview the data format
display(input_files[0])

PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichGO-BP_full.pkl')

In [14]:
# Load one enrichment result table to inspect its structure
_tmp_df = pd.read_pickle(input_files[0])

In [15]:
# (rows, columns) of the previewed table
_tmp_df.shape

(50515, 17)

In [16]:
# Reproducible random sample of rows from the previewed table
_tmp_df.sample(n=5, random_state=0)

Unnamed: 0,cluster_id,term_id,term_desc,gene_ratio,bg_ratio,pvalue,pvalue_adjust,qvalue,gene_id,gene_count,n_clusters,ontology,gene_total,bg_count,bg_total,rich_factor,fold_enrich
17492,C21,GO:0035383,thioester metabolic process,0.045593,0.009637,9.214483e-08,6.388708e-06,5.708822e-06,FASN/ACSL1/ACACB/ACOT1/ACSM5/ELOVL5/SLC25A1/AA...,15,65,BP,329,34,3528,0.441176,4.730914
25252,C60,GO:0032103,positive regulation of response to external st...,0.122807,0.048186,0.0009718798,0.0160994,0.01372641,CD209/CLEC10A/CSF1R/TYROBP/AIF1/CD28/P2RY12/BT...,14,80,BP,114,170,3528,0.082353,2.548607
38316,C1,GO:0071398,cellular response to fatty acid,0.15,0.005385,0.0001425567,0.009785604,0.008874987,AKR1C2/AKR1C1/AKR1C3,3,125,BP,20,19,3528,0.157895,27.852632
3849,C3,GO:0045321,leukocyte activation,0.483871,0.138889,5.381953e-16,1.085833e-15,5.068876e-16,IGHG2/IGKC/IGLC1/IGHG1/IGHG3/IGHM/IGHA1/IGLC3/...,45,20,BP,93,490,3528,0.091837,3.483871
26471,C36,GO:0060627,regulation of vesicle-mediated transport,0.428571,0.036565,0.001500944,0.01795773,0.009874634,FGB/APOC3/FGG,3,80,BP,7,129,3528,0.023256,11.72093


# Run

## Read data, add metadata, convert dtypes

In [17]:
# Columns kept from each per-file enrichment table, in output order. The input
# files already provide the count/total/ratio columns and the rich_factor and
# fold_enrich metrics, so nothing is recomputed here.
result_columns = [
    "n_clusters",
    "cluster_id",
    "term_id",
    "term_desc",
    "gene_count",
    "gene_total",
    "gene_ratio",
    "bg_count",
    "bg_total",
    "bg_ratio",
    "pvalue",
    "pvalue_adjust",
    "rich_factor",
    "fold_enrich",
]

# Named groups of the filename pattern that are added as metadata columns,
# in the order in which they are appended to the table.
metadata_fields = [
    "tissue",
    "gene_sel_strategy",
    "corr_method",
    "clust_method",
    "enrich_func",
    "enrich_params",
]

all_results = []

for f_full in tqdm(input_files, ncols=100):
    f_name = f_full.name

    # parse the metadata encoded in the file name; fail with a clear message
    # instead of an AttributeError on None if the name does not match
    metadata = re.search(filename_pattern, f_name)
    if metadata is None:
        raise ValueError(
            f"File name does not match the gene enrichment filename pattern: {f_name}"
        )

    # NOTE: unpickling can execute arbitrary code; only files generated by this
    # pipeline should be present in the input directory
    f_data = pd.read_pickle(f_full)

    # select the columns of interest and add the metadata as constant columns;
    # `assign` returns a new data frame, so the loaded table is never modified
    # through a column subset of it
    f_data = f_data[result_columns].assign(
        **{field: metadata.group(field) for field in metadata_fields}
    )

    all_results.append(f_data)

100%|█████████████████████████████████████████████████████████████| 130/130 [00:01<00:00, 65.77it/s]


In [18]:
# Stack the per-file tables into a single data frame with a fresh integer index
df = pd.concat(all_results, ignore_index=True)

# identifiers, descriptions and metadata are repeated many times: store them
# as categories
category_columns = (
    "cluster_id",
    "term_id",
    "term_desc",
    "tissue",
    "gene_sel_strategy",
    "corr_method",
    "clust_method",
    "enrich_func",
    "enrich_params",
)

# counts and totals are stored as 32-bit integers
int32_columns = (
    "n_clusters",
    "gene_count",
    "gene_total",
    "bg_count",
    "bg_total",
)

# apply all dtype conversions in one pass
df = df.astype(
    {
        **{column: "category" for column in category_columns},
        **{column: "int32" for column in int32_columns},
    }
)

# # convert ratios to numbers
# df["gene_ratio"] = df["gene_count"].div(df["gene_total"])
# df["bg_ratio"] = df["bg_count"].div(df["bg_total"])

# # add other metrics
# df["rich_factor"] = df["gene_count"].div(df["bg_count"])
# df["fold_enrich"] = df["gene_ratio"].div(df["bg_ratio"])

In [19]:
# # adjust for multiple testing across all results
# adj_pval = multipletests(df["pvalue"], alpha=0.05, method="fdr_bh")
# df = df.assign(fdr=adj_pval[1])

In [20]:
# (rows, columns) of the combined table
df.shape

(2187619, 20)

In [21]:
# First rows of the combined table
df.head()

Unnamed: 0,n_clusters,cluster_id,term_id,term_desc,gene_count,gene_total,gene_ratio,bg_count,bg_total,bg_ratio,pvalue,pvalue_adjust,rich_factor,fold_enrich,tissue,gene_sel_strategy,corr_method,clust_method,enrich_func,enrich_params
0,2,C0,GO:0002443,leukocyte mediated immunity,182,523,0.347992,357,3528,0.10119,1.930523e-67,5.5135729999999995e-64,0.509804,3.438983,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
1,2,C0,GO:0002250,adaptive immune response,153,523,0.292543,260,3528,0.073696,3.935201e-67,5.619467e-64,0.588462,3.969584,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
2,2,C0,GO:0002252,immune effector process,210,523,0.40153,478,3528,0.135488,1.9947619999999998e-64,1.8990140000000002e-61,0.439331,2.963591,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
3,2,C0,GO:0050776,regulation of immune response,178,523,0.340344,372,3528,0.105442,3.9086730000000004e-60,2.7907919999999996e-57,0.478495,3.22778,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
4,2,C0,GO:0006909,phagocytosis,112,523,0.214149,171,3528,0.048469,5.2857330000000004e-55,3.019211e-52,0.654971,4.418235,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full


In [22]:
# Show the column dtypes and spot-check that the conversion to category was applied
display(df.dtypes)
assert df.dtypes.loc["cluster_id"] == "category"

n_clusters              int32
cluster_id           category
term_id              category
term_desc            category
gene_count              int32
gene_total              int32
gene_ratio            float64
bg_count                int32
bg_total                int32
bg_ratio              float64
pvalue                float64
pvalue_adjust         float64
rich_factor           float64
fold_enrich           float64
tissue               category
gene_sel_strategy    category
corr_method          category
clust_method         category
enrich_func          category
enrich_params        category
dtype: object

In [23]:
# Reproducible random sample of rows from the combined table
df.sample(n=5, random_state=0)

Unnamed: 0,n_clusters,cluster_id,term_id,term_desc,gene_count,gene_total,gene_ratio,bg_count,bg_total,bg_ratio,pvalue,pvalue_adjust,rich_factor,fold_enrich,tissue,gene_sel_strategy,corr_method,clust_method,enrich_func,enrich_params
1123961,80,C61,R-HSA-5688426,Deubiquitination,16,142,0.112676,70,2433,0.028771,1e-06,8e-06,0.228571,3.916298,muscle_skeletal,var_pc_log2,spearman_abs,SpectralClustering,enrichPathway,human
872084,59,C22,R-HSA-1234176,Oxygen-dependent proline hydroxylation of Hypo...,16,595,0.026891,24,2433,0.009864,1.3e-05,0.000161,0.666667,2.72605,muscle_skeletal,var_pc_log2,clustermatch,SpectralClustering,enrichPathway,human
1528835,150,C93,GO:0016817,"hydrolase activity, acting on acid anhydrides",8,46,0.173913,143,3341,0.042802,0.000607,0.001855,0.055944,4.063241,skin_sun_exposed_lower_leg,var_pc_log2,pearson_full,SpectralClustering,enrichGO,MF_full
633906,15,C11,GO:0034122,negative regulation of toll-like receptor sign...,6,414,0.014493,11,3278,0.003356,0.001035,0.007627,0.545455,4.318841,artery_tibial,var_pc_log2,pearson_full,SpectralClustering,enrichGO,BP_full
1774114,143,C54,R-HSA-2894858,Signaling by NOTCH1 HD+PEST Domain Mutants in ...,5,106,0.04717,20,2694,0.007424,0.000829,0.024885,0.25,6.353774,whole_blood,var_pc_log2,clustermatch,SpectralClustering,enrichPathway,human


## Some stats

In [24]:
# Summary of the adjusted p-values, which must lie strictly between 0 and 1
_adj_pvalues = df["pvalue_adjust"]
display(_adj_pvalues.describe())
assert _adj_pvalues.min() > 0.0
assert _adj_pvalues.max() < 1.0

count     2.187619e+06
mean      1.324874e-02
std       1.552997e-02
min      6.807472e-203
25%       1.029856e-04
50%       5.434603e-03
75%       2.470489e-02
max       4.999997e-02
Name: pvalue_adjust, dtype: float64

In [25]:
# Distinct numbers of clusters present in the combined results
df["n_clusters"].unique()

array([  2,   5,  10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60,
        65,  70,  75,  80,  85,  90,  95, 100, 125, 150, 175, 200,  69,
        89,  98, 124, 146, 168, 189,  14,  18,  23,  28,  33,  38,  43,
        48,  53,  58,  62,  67,  71,  81,  84, 119, 139, 158, 173,  74,
        79,  94, 122, 166, 187,  78,  83,  93,  99, 118, 137, 154, 169,
        63,  68,  77,  86, 117, 162, 182,  39,  44,  49,  54,  64,  72,
       114, 133, 159, 142, 157,  88,  92,  96, 115, 135, 147, 161,  59,
        82, 121, 138, 185,  13,  19,  22,  27,  32,  37,  42,  47,  52,
        57,  66, 112, 134, 152, 172,  73,  91, 116, 148, 174,  29,  87,
        97, 141, 164, 188,  24,  56,  61,  76, 136, 171, 120, 140, 170,
       132, 143, 167, 144, 186], dtype=int32)

In [26]:
# Tissues present in the combined results
df["tissue"].unique()

['adipose_subcutaneous', 'artery_tibial', 'muscle_skeletal', 'skin_sun_exposed_lower_leg', 'whole_blood']
Categories (5, object): ['adipose_subcutaneous', 'artery_tibial', 'muscle_skeletal', 'skin_sun_exposed_lower_leg', 'whole_blood']

In [27]:
# Gene selection strategies present in the combined results
df["gene_sel_strategy"].unique()

['var_pc_log2']
Categories (1, object): ['var_pc_log2']

In [28]:
# Correlation methods present in the combined results
df["corr_method"].unique()

['clustermatch', 'clustermatch_k2', 'pearson_abs', 'pearson_full', 'spearman_abs', 'spearman_full']
Categories (6, object): ['clustermatch', 'clustermatch_k2', 'pearson_abs', 'pearson_full', 'spearman_abs', 'spearman_full']

In [29]:
# Clustering methods present in the combined results
df["clust_method"].unique()

['SpectralClustering']
Categories (1, object): ['SpectralClustering']

In [30]:
# Enrichment parameter values (parsed from the file names) present in the combined results
df["enrich_params"].unique()

['BP_full', 'CC_full', 'MF_full', 'hsa', 'human']
Categories (5, object): ['BP_full', 'CC_full', 'MF_full', 'hsa', 'human']

In [31]:
# The combined table must not contain missing values in any column
assert not df.isna().any().any()

# Save

In [32]:
# rpy2 is used to write the combined data frame in R's RDS format.
# NOTE(review): these imports live here rather than in the "Modules loading"
# cell at the top of the notebook.
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

# handles to the R functions used to write and read RDS files
saveRDS = ro.r["saveRDS"]
readRDS = ro.r["readRDS"]

In [33]:
# Alias (not a copy) of the combined data frame; this is the object saved below
data = df

## Pickle

In [34]:
# Destination of the pickle file
display(OUTPUT_FILE)

PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment/gtex_v8_data-gene_set_enrichment.pkl')

In [35]:
# Save the combined data frame as a pickle file
data.to_pickle(OUTPUT_FILE)

## RDS

In [36]:
# Same path as the pickle file, with the suffix replaced by ".rds"
output_rds_file = OUTPUT_FILE.with_suffix(".rds")
display(output_rds_file)

PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment/gtex_v8_data-gene_set_enrichment.rds')

In [37]:
# Convert the pandas data frame into an R object, enabling the pandas
# conversion rules only within this block
with localconverter(ro.default_converter + pandas2ri.converter):
    data_r = ro.conversion.py2rpy(data)

In [38]:
# Preview of the converted R object
data_r

n_clusters,cluster_id,term_id,...,clust_method,enrich_func,enrich_params
2,C0,GO:00...,...,Spect...,enric...,BP_full
2,C0,GO:00...,,Spect...,enric...,BP_full
2,C0,GO:00...,,Spect...,enric...,BP_full
2,C0,GO:00...,,Spect...,enric...,BP_full
...,...,...,,...,...,...
200,C54,GO:00...,,Spect...,enric...,MF_full
200,C47,GO:00...,,Spect...,enric...,MF_full
200,C164,GO:00...,,Spect...,enric...,MF_full
200,C164,GO:00...,,Spect...,enric...,MF_full


In [39]:
# Write the R object to disk; the path is passed to R as a plain string
saveRDS(data_r, str(output_rds_file))

<rpy2.rinterface_lib.sexp.NULLType object at 0x7f7da531dbc0> [RTYPES.NILSXP]

In [40]:
# testing: read the RDS file back (replacing the in-memory R object) to check
# the round trip
data_r = readRDS(str(output_rds_file))

In [41]:
# Convert the R object read from the RDS file back into a pandas data frame
_pandas_converter = ro.default_converter + pandas2ri.converter
with localconverter(_pandas_converter):
    data_again = ro.conversion.rpy2py(data_r)

# the index comes back as strings after the conversion; restore it to integers
data_again.index = data_again.index.astype(int)

In [42]:
# (rows, columns) of the table read back from the RDS file
data_again.shape

(2187619, 20)

In [43]:
# First rows of the table read back from the RDS file
data_again.head()

Unnamed: 0,n_clusters,cluster_id,term_id,term_desc,gene_count,gene_total,gene_ratio,bg_count,bg_total,bg_ratio,pvalue,pvalue_adjust,rich_factor,fold_enrich,tissue,gene_sel_strategy,corr_method,clust_method,enrich_func,enrich_params
0,2,C0,GO:0002443,leukocyte mediated immunity,182,523,0.347992,357,3528,0.10119,1.930523e-67,5.5135729999999995e-64,0.509804,3.438983,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
1,2,C0,GO:0002250,adaptive immune response,153,523,0.292543,260,3528,0.073696,3.935201e-67,5.619467e-64,0.588462,3.969584,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
2,2,C0,GO:0002252,immune effector process,210,523,0.40153,478,3528,0.135488,1.9947619999999998e-64,1.8990140000000002e-61,0.439331,2.963591,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
3,2,C0,GO:0050776,regulation of immune response,178,523,0.340344,372,3528,0.105442,3.9086730000000004e-60,2.7907919999999996e-57,0.478495,3.22778,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
4,2,C0,GO:0006909,phagocytosis,112,523,0.214149,171,3528,0.048469,5.2857330000000004e-55,3.019211e-52,0.654971,4.418235,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full


In [44]:
# The table read back from the RDS file must be identical to the one saved
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_names=False,  # do not check "name" attribute of index and column
    check_exact=True,  # since this is a binary format, it should match exactly
)

## tsv.gz

In [45]:
# Same path as the pickle file, with the suffix replaced by ".tsv.gz"
output_text_file = OUTPUT_FILE.with_suffix(".tsv.gz")
display(output_text_file)

PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment/gtex_v8_data-gene_set_enrichment.tsv.gz')

In [46]:
# Tab-separated text without the index column. Floats are written in
# scientific notation with five decimals (six significant digits), so, unlike
# the pickle and RDS files, this output is lossy.
data.to_csv(output_text_file, sep="\t", index=False, float_format="%.5e")

In [47]:
# testing: read the text file back to check the round trip (the file was
# written without an index column, so none is parsed here)
data_again = pd.read_csv(output_text_file, sep="\t")

In [48]:
# (rows, columns) of the table read back from the text file
data_again.shape

(2187619, 20)

In [49]:
# First rows of the table read back from the text file
data_again.head()

Unnamed: 0,n_clusters,cluster_id,term_id,term_desc,gene_count,gene_total,gene_ratio,bg_count,bg_total,bg_ratio,pvalue,pvalue_adjust,rich_factor,fold_enrich,tissue,gene_sel_strategy,corr_method,clust_method,enrich_func,enrich_params
0,2,C0,GO:0002443,leukocyte mediated immunity,182,523,0.347992,357,3528,0.10119,1.93052e-67,5.513569999999999e-64,0.509804,3.43898,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
1,2,C0,GO:0002250,adaptive immune response,153,523,0.292543,260,3528,0.073696,3.9352e-67,5.6194699999999994e-64,0.588462,3.96958,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
2,2,C0,GO:0002252,immune effector process,210,523,0.40153,478,3528,0.135488,1.9947599999999998e-64,1.89901e-61,0.439331,2.96359,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
3,2,C0,GO:0050776,regulation of immune response,178,523,0.340344,372,3528,0.105442,3.9086700000000004e-60,2.79079e-57,0.478495,3.22778,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full
4,2,C0,GO:0006909,phagocytosis,112,523,0.214149,171,3528,0.048469,5.2857300000000005e-55,3.0192100000000003e-52,0.654971,4.41823,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_full


In [50]:
# The text file stores floats with six significant digits ("%.5e"), so the
# values read back can only be compared approximately. The comparison is purely
# relative (atol=0): the p-value columns hold numbers that are many orders of
# magnitude smaller than any practical absolute tolerance, and a non-zero
# `atol` would let those values pass the check no matter what was read back.
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_names=False,  # do not check "name" attribute of index and column
    check_dtype=False,  # do not check dtypes: do not distinguish between int64 and int32, for instance
    check_categorical=False,
    check_exact=False,
    rtol=1e-5,  # rounding to six significant digits changes a value by at most 5e-6 (relative)
    atol=0.0,
)