# Description

Combines all gene enrichment results found in input directory.

# Modules loading

In [1]:
import re

import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from clustermatch import conf

# Settings

In [None]:
DATASET_CONFIG = conf.GTEX

In [2]:
# ENRICH_FUNCTION = "enrichGO"

In [3]:
# CORRELATION_METHOD_NAME = "clustermatch"

In [4]:
# GENE_SELECTION_STRATEGY = "var_pc_log2"

In [5]:
# # clusterProfiler settings
# ENRICH_FUNCTION = "enrichGO"
# SIMPLIFY_CUTOFF = 0.7
# GO_ONTOLOGIES = ("BP", "CC", "MF")

In [6]:
# SIMILARITY_MATRICES_DIR = conf.GTEX["SIMILARITY_MATRICES_DIR"]
# display(SIMILARITY_MATRICES_DIR)

In [7]:
# SIMILARITY_MATRIX_FILENAME_TEMPLATE = conf.GTEX["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
# display(SIMILARITY_MATRIX_FILENAME_TEMPLATE)

# Paths

In [8]:
INPUT_DIR = DATASET_CONFIG["GENE_ENRICHMENT_DIR"]
display(INPUT_DIR)
assert INPUT_DIR.exists()

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment')

In [9]:
OUTPUT_FILE = DATASET_CONFIG["GENE_ENRICHMENT_COMBINED_FILE"]
display(OUTPUT_FILE)

OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data-gene_set_enrichment.pkl')

# Get data files

In [10]:
filename_pattern = re.compile(DATASET_CONFIG["GENE_ENRICHMENT_FILENAME_PATTERN"])

In [11]:
# get input data files according to Settings
input_files = sorted(
    [
        f
        for f in INPUT_DIR.iterdir()
        if (m := re.search(filename_pattern, str(f))) is not None
    ]
)
display(len(input_files))
display(input_files[:5])

assert len(input_files) > 0

180

[PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichGO-BP_full.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichGO-BP_simplified_070.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichGO-CC_full.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichGO-CC_simplified_070.pkl'),
 PosixPath('/home/miltondp/projects/

## Preview data

In [12]:
display(input_files[0])

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering-enrichGO-BP_full.pkl')

In [13]:
_tmp_df = pd.read_pickle(input_files[0])

In [14]:
_tmp_df.shape

(50515, 12)

In [15]:
_tmp_df.sample(n=5, random_state=0)

Unnamed: 0,Cluster,ID,Description,GeneRatio,BgRatio,pvalue,p.adjust,qvalue,geneID,Count,clustering_id,clustering_n_clusters
17492,C21,GO:0035383,thioester metabolic process,15/329,34/3528,9.214483e-08,6.388708e-06,5.708822e-06,FASN/ACSL1/ACACB/ACOT1/ACSM5/ELOVL5/SLC25A1/AA...,15,SpectralClustering #13,65
25252,C60,GO:0032103,positive regulation of response to external st...,14/114,170/3528,0.0009718798,0.0160994,0.01372641,CD209/CLEC10A/CSF1R/TYROBP/AIF1/CD28/P2RY12/BT...,14,SpectralClustering #16,80
38316,C1,GO:0071398,cellular response to fatty acid,3/20,19/3528,0.0001425567,0.009785604,0.008874987,AKR1C2/AKR1C1/AKR1C3,3,SpectralClustering #21,125
3849,C3,GO:0045321,leukocyte activation,45/93,490/3528,5.381953e-16,1.085833e-15,5.068876e-16,IGHG2/IGKC/IGLC1/IGHG1/IGHG3/IGHM/IGHA1/IGLC3/...,45,SpectralClustering #4,20
26471,C36,GO:0060627,regulation of vesicle-mediated transport,3/7,129/3528,0.001500944,0.01795773,0.009874634,FGB/APOC3/FGG,3,SpectralClustering #16,80


# Run

## Read data, convert dtypes, add new metrics

In [16]:
all_results = []

for f_full in tqdm(input_files, ncols=100):
    f_name = f_full.name

    f_data = pd.read_pickle(f_full)
#     f_data = f_data.rename(
#         columns={
#             "Count": "gene_count",
#             "GeneRatio": "gene_ratio",
#             "BgRatio": "bg_ratio",
#             "ID": "go_term_id",
#             "Description": "go_term_desc",
#             "Cluster": "cluster_id",
#             "clustering_n_clusters": "n_clusters",
#             "p.adjust": "fdr_per_file",
#         }
#     )

#     # genes in cluster
#     f_data = f_data.assign(
#         gene_total=f_data["gene_ratio"].apply(lambda x: int(x.split("/")[1]))
#     )

#     # background genes
#     f_data = f_data.assign(
#         bg_count=f_data["bg_ratio"].apply(lambda x: int(x.split("/")[0]))
#     )
#     f_data = f_data.assign(
#         bg_total=f_data["bg_ratio"].apply(lambda x: int(x.split("/")[1]))
#     )

    # add metadata
    metadata = re.search(filename_pattern, f_name)

    f_data = f_data[
        [
            "n_clusters",
            "cluster_id",
            "term_id",
            "term_desc",
            "gene_count",
            "gene_total",
            "gene_ratio",
            "bg_count",
            "bg_total",
            "bg_ratio",
            "pvalue",
            "pvalue_adjust",
            "qvalue",
        ]
    ]

    f_data["tissue"] = metadata.group("tissue")
    f_data["gene_sel_strategy"] = metadata.group("gene_sel_strategy")
    f_data["corr_method"] = metadata.group("corr_method")
    f_data["clust_method"] = metadata.group("clust_method")
    f_data["enrich_func"] = metadata.group("enrich_func")
    f_data["enrich_params"] = metadata.group("enrich_params")

    all_results.append(f_data)

100%|█████████████████████████████████████████████████████████████| 180/180 [00:08<00:00, 21.23it/s]


In [17]:
df = pd.concat(all_results, ignore_index=True)

# to category dtype
df["cluster_id"] = df["cluster_id"].astype("category")
df["term_id"] = df["go_term_id"].astype("category")
df["term_desc"] = df["go_term_desc"].astype("category")
df["tissue"] = df["tissue"].astype("category")
df["gene_sel_strategy"] = df["gene_sel_strategy"].astype("category")
df["corr_method"] = df["corr_method"].astype("category")
df["clust_method"] = df["clust_method"].astype("category")
df["enrich_func"] = df["enrich_func"].astype("category")
df["enrich_params"] = df["enrich_params"].astype("category")

# convert to int32
df["n_clusters"] = df["n_clusters"].astype("uint32")
df["gene_count"] = df["gene_count"].astype("uint32")
df["gene_total"] = df["gene_total"].astype("uint32")
df["bg_count"] = df["bg_count"].astype("uint32")
df["bg_total"] = df["bg_total"].astype("uint32")

# # convert ratios to numbers
# df["gene_ratio"] = df["gene_count"].div(df["gene_total"])
# df["bg_ratio"] = df["bg_count"].div(df["bg_total"])

# # add other metrics
# df["rich_factor"] = df["gene_count"].div(df["bg_count"])
# df["fold_enrich"] = df["gene_ratio"].div(df["bg_ratio"])

In [18]:
# # adjust for multiple testing across all results
# adj_pval = multipletests(df["pvalue"], alpha=0.05, method="fdr_bh")
# df = df.assign(fdr=adj_pval[1])

In [19]:
df.shape

(2754014, 21)

In [20]:
display(df.dtypes)
assert df.dtypes.loc["cluster_id"] == "category"

n_clusters              int32
cluster_id           category
go_term_id           category
go_term_desc         category
gene_count              int32
gene_total              int32
gene_ratio            float64
bg_count                int32
bg_total                int32
bg_ratio              float64
pvalue                float64
fdr_per_file          float64
tissue               category
gene_sel_strategy    category
corr_method          category
clust_method         category
enrich_func          category
results_subset       category
rich_factor           float64
fold_enrich           float64
fdr                   float64
dtype: object

In [21]:
df.sample(n=5, random_state=0)

Unnamed: 0,n_clusters,cluster_id,go_term_id,go_term_desc,gene_count,gene_total,gene_ratio,bg_count,bg_total,bg_ratio,...,fdr_per_file,tissue,gene_sel_strategy,corr_method,clust_method,enrich_func,results_subset,rich_factor,fold_enrich,fdr
2682920,150,C2,GO:0030198,extracellular matrix organization,4,10,0.4,108,4013,0.026913,...,0.001633,whole_blood,var_pc_log2,spearman_full,SpectralClustering,enrichGO,BP_full,0.037037,14.862963,0.000224
1934389,50,C37,GO:0010959,regulation of metal ion transport,10,133,0.075188,67,3407,0.019665,...,0.007624,skin_sun_exposed_lower_leg,var_pc_log2,spearman_abs,SpectralClustering,enrichGO,BP_full,0.149254,3.823364,0.000448
671711,10,C9,GO:2000106,regulation of leukocyte apoptotic process,13,642,0.020249,31,3278,0.009457,...,0.029928,artery_tibial,var_pc_log2,clustermatch_k2,SpectralClustering,enrichGO,BP_simplified_070,0.419355,2.141192,0.004232
1069153,65,C40,GO:0046394,carboxylic acid biosynthetic process,5,32,0.15625,105,3746,0.02803,...,0.043229,muscle_skeletal,var_pc_log2,clustermatch,SpectralClustering,enrichGO,BP_simplified_070,0.047619,5.574405,0.002381
1217210,25,C7,GO:0051216,cartilage development,22,619,0.035541,76,3746,0.020288,...,0.027969,muscle_skeletal,var_pc_log2,pearson_abs,SpectralClustering,enrichGO,BP_simplified_070,0.289474,1.751807,0.005156


## Some stats

In [22]:
display(df["qvalue"].describe())
assert df["qvalue"].min() > 0.0
assert df["qvalue"].max() < 1.0

count     2.754014e+06
mean      2.496662e-03
std       4.778156e-03
min      2.403574e-198
25%       1.488134e-05
50%       5.062089e-04
75%       2.745984e-03
max       4.998964e-02
Name: fdr, dtype: float64

In [23]:
df["n_clusters"].unique()

array([  2,   5,  10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60,
        65,  70,  75,  80,  85,  90,  95, 100, 125, 150, 175, 200],
      dtype=int32)

In [24]:
df["tissue"].unique()

['adipose_subcutaneous', 'artery_tibial', 'muscle_skeletal', 'skin_sun_exposed_lower_leg', 'whole_blood']
Categories (5, object): ['adipose_subcutaneous', 'artery_tibial', 'muscle_skeletal', 'skin_sun_exposed_lower_leg', 'whole_blood']

In [25]:
df["gene_sel_strategy"].unique()

['var_pc_log2']
Categories (1, object): ['var_pc_log2']

In [26]:
df["corr_method"].unique()

['clustermatch', 'clustermatch_k2', 'pearson_abs', 'pearson_full', 'spearman_abs', 'spearman_full']
Categories (6, object): ['clustermatch', 'clustermatch_k2', 'pearson_abs', 'pearson_full', 'spearman_abs', 'spearman_full']

In [27]:
df["clust_method"].unique()

['SpectralClustering']
Categories (1, object): ['SpectralClustering']

In [28]:
df["enrich_params"].unique()

['BP_full', 'BP_simplified_070', 'CC_full', 'CC_simplified_070', 'MF_full', 'MF_simplified_070']
Categories (6, object): ['BP_full', 'BP_simplified_070', 'CC_full', 'CC_simplified_070', 'MF_full', 'MF_simplified_070']

In [29]:
assert not df.isna().any().any()

# Save

In [31]:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

saveRDS = ro.r["saveRDS"]
readRDS = ro.r["readRDS"]

In [32]:
data = df

## Pickle

In [33]:
display(OUTPUT_FILE)

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/gene_set_enrichment/gtex_v8_data-gene_set_enrichment.pkl')

In [34]:
data.to_pickle(OUTPUT_FILE)

## RDS

In [None]:
output_rds_file = OUTPUT_FILE.with_suffix(".rds")
display(output_rds_file)

In [None]:
with localconverter(ro.default_converter + pandas2ri.converter):
    data_r = ro.conversion.py2rpy(data)

In [None]:
data_r

In [None]:
saveRDS(data_r, str(output_rds_file))

In [None]:
# testing
data_r = readRDS(str(output_rds_file))

In [None]:
with localconverter(ro.default_converter + pandas2ri.converter):
    data_again = ro.conversion.rpy2py(data_r)

    # convert index to int, otherwise it's converted to string
    data_again.index = data_again.index.astype(int)

In [None]:
data_again.shape

In [None]:
data_again.head()

In [None]:
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_names=False,  # do not check "name" attribute of index and column
    check_exact=True,  # since this is a binary format, it should match exactly
)

## tsv.gz

In [None]:
output_text_file = OUTPUT_FILE.with_suffix(".tsv.gz")
display(output_text_file)

In [None]:
data.to_csv(output_text_file, sep="\t", index=False, float_format="%.5e")

In [None]:
# testing
data_again = pd.read_csv(output_text_file, sep="\t")  # , index_col=0)

In [None]:
data_again.shape

In [None]:
data_again.head()

In [None]:
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_names=False,  # do not check "name" attribute of index and column
    check_dtype=False,  # do not check dtypes: do not distinguish between int64 and int32, for instance
    check_categorical=False,
    check_exact=False,
    rtol=1e-5,
    atol=5e-5,
)