# Description

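Combines all gene enrichment results found in the input directory into a single data frame, adds derived metrics (rich factor, fold enrichment, and an FDR adjusted across all results), and saves the combined results in pickle, RDS and tsv.gz formats.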

# Modules loading

In [1]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rpy2.robjects as ro
import seaborn as sns
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from statsmodels.stats.multitest import multipletests
from tqdm import tqdm

from clustermatch import conf

# Settings

In [2]:
# Dataset-specific configuration; this notebook reads the keys
# GENE_ENRICHMENT_DIR, GENE_ENRICHMENT_COMBINED_FILE and
# GENE_ENRICHMENT_FILENAME_PATTERN from it.
DATASET_CONFIG = conf.RECOUNT2

# Paths

In [3]:
# Directory holding the per-file gene enrichment results to combine.
INPUT_DIR = DATASET_CONFIG["GENE_ENRICHMENT_DIR"]
display(INPUT_DIR)
# The path is iterated as a directory later on, so require a directory
# (a plain `exists()` would also accept a regular file).
assert INPUT_DIR.is_dir(), f"Input directory not found: {INPUT_DIR}"

PosixPath('/opt/data/results/recount2/gene_set_enrichment')

In [4]:
# Path of the combined (pickle) output; the RDS and tsv.gz copies saved at the
# end of the notebook are derived from it by swapping the file suffix.
OUTPUT_FILE = DATASET_CONFIG["GENE_ENRICHMENT_COMBINED_FILE"]
display(OUTPUT_FILE)

# make sure the destination directory exists before anything is written
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

PosixPath('/opt/data/results/recount2/gene_set_enrichment/recount_data_prep_PLIER-gene_set_enrichment.pkl')

# Get data files

In [5]:
# Compiled regular expression used to select input files and to parse their
# names; the named groups read from it later are corr_method, clust_method,
# enrich_func and enrich_params.
filename_pattern = re.compile(DATASET_CONFIG["GENE_ENRICHMENT_FILENAME_PATTERN"])

In [6]:
# get input data files according to Settings
#
# The pattern is matched against the file name only (not the full path),
# because the file name is what gets parsed for metadata when the files are
# read; selecting and parsing on the same string keeps both steps consistent.
input_files = sorted(
    f for f in INPUT_DIR.iterdir() if filename_pattern.search(f.name) is not None
)
display(len(input_files))
display(input_files[:5])

assert len(input_files) > 0, f"No input files found in {INPUT_DIR}"

36

[PosixPath('/opt/data/results/recount2/gene_set_enrichment/recount_data_prep_PLIER-clustermatch_k2-SpectralClustering-enrichGO-BP_full.pkl'),
 PosixPath('/opt/data/results/recount2/gene_set_enrichment/recount_data_prep_PLIER-clustermatch_k2-SpectralClustering-enrichGO-BP_simplified_070.pkl'),
 PosixPath('/opt/data/results/recount2/gene_set_enrichment/recount_data_prep_PLIER-clustermatch_k2-SpectralClustering-enrichGO-CC_full.pkl'),
 PosixPath('/opt/data/results/recount2/gene_set_enrichment/recount_data_prep_PLIER-clustermatch_k2-SpectralClustering-enrichGO-CC_simplified_070.pkl'),
 PosixPath('/opt/data/results/recount2/gene_set_enrichment/recount_data_prep_PLIER-clustermatch_k2-SpectralClustering-enrichGO-MF_full.pkl')]

## Preview data

In [7]:
# first input file in sorted order; it is the one loaded below as a preview
display(input_files[0])

PosixPath('/opt/data/results/recount2/gene_set_enrichment/recount_data_prep_PLIER-clustermatch_k2-SpectralClustering-enrichGO-BP_full.pkl')

In [8]:
# load a single input file to inspect its structure before combining all of them
_tmp_df = pd.read_pickle(input_files[0])

In [9]:
# (rows, columns) of the preview file
_tmp_df.shape

(108849, 11)

In [10]:
# random rows of the preview file; the fixed seed keeps the displayed rows stable across runs
_tmp_df.sample(n=5, random_state=0)

Unnamed: 0,cluster_id,go_term_id,go_term_desc,gene_ratio,bg_ratio,pvalue,fdr_per_partition,qvalue,geneID,gene_count,n_clusters
94584,C90,GO:0009636,response to toxic substance,1/1,133/6519,0.020402,0.037203,0.010106,MT2A,1,150
16957,C26,GO:0048729,tissue morphogenesis,32/260,324/6519,1e-06,0.000134,0.000118,DSP/MDK/JAG1/HOXA5/FZD5/EPHA2/FZD2/VCL/LIF/PLO...,32,30
52770,C55,GO:0009259,ribonucleotide metabolic process,18/131,270/6519,6e-06,0.000389,0.000343,ELOVL6/ACACA/ACLY/PPAT/AK4/DLAT/GMPS/PDHX/PFKP...,18,75
24685,C31,GO:0051251,positive regulation of lymphocyte activation,17/210,215/6519,0.000527,0.016359,0.013781,IL18/IL23A/KLRK1/CD40/CD5/LCK/MYB/NOD2/PYCARD/...,17,40
7933,C4,GO:0099173,postsynapse organization,14/537,80/6519,0.005234,0.041536,0.033806,CDH2/SLC7A11/NRP2/LGMN/ITGA3/NLGN2/SPTBN2/ARHG...,14,15


# Run

## Read data, convert dtypes, add new metrics

In [11]:
# Columns kept from every per-file result, in the order they appear in the
# combined data frame.
RESULT_COLUMNS = [
    "n_clusters",
    "cluster_id",
    "go_term_id",
    "go_term_desc",
    "gene_count",
    "gene_total",
    "gene_ratio",
    "bg_count",
    "bg_total",
    "bg_ratio",
    "pvalue",
    "fdr_per_partition",
]


def _ratio_part(ratios, position):
    """
    Extract one side of "numerator/denominator" strings as integers.

    Args:
        ratios: pandas Series of strings such as "32/260".
        position: 0 for the numerator, 1 for the denominator.

    Returns:
        An int64 pandas Series aligned with `ratios`.
    """
    return ratios.str.split("/").str[position].astype("int64")


all_results = []

for f_full in tqdm(input_files, ncols=100):
    f_name = f_full.name

    # The file name encodes how the results were generated. Fail with a clear
    # message (instead of an AttributeError on None) if it cannot be parsed.
    metadata = filename_pattern.search(f_name)
    if metadata is None:
        raise ValueError(
            f"File name does not match the expected pattern: {f_name}"
        )

    f_data = pd.read_pickle(f_full)

    f_data = (
        f_data.assign(
            # genes in cluster (denominator of gene_ratio)
            gene_total=_ratio_part(f_data["gene_ratio"], 1),
            # background genes (numerator and denominator of bg_ratio)
            bg_count=_ratio_part(f_data["bg_ratio"], 0),
            bg_total=_ratio_part(f_data["bg_ratio"], 1),
        )[RESULT_COLUMNS]
        # Add metadata with .assign(): setting columns in place on the column
        # subset selected above can trigger pandas' SettingWithCopyWarning.
        .assign(
            corr_method=metadata.group("corr_method"),
            clust_method=metadata.group("clust_method"),
            enrich_func=metadata.group("enrich_func"),
            enrich_params=metadata.group("enrich_params"),
        )
    )

    all_results.append(f_data)

100%|███████████████████████████████████████████████████████████████| 36/36 [00:03<00:00,  9.91it/s]


In [12]:
# Target dtypes: text-like columns become categories and the count columns are
# stored as 32-bit integers.
category_columns = [
    "cluster_id",
    "go_term_id",
    "go_term_desc",
    "corr_method",
    "clust_method",
    "enrich_func",
    "enrich_params",
]
int32_columns = ["n_clusters", "gene_count", "gene_total", "bg_count", "bg_total"]

# stack the per-file results and convert all dtypes in a single pass
df = pd.concat(all_results, ignore_index=True).astype(
    {
        **dict.fromkeys(category_columns, "category"),
        **dict.fromkeys(int32_columns, "int32"),
    }
)

df = df.assign(
    # replace the "a/b" ratio strings with their numeric values
    gene_ratio=lambda d: d["gene_count"].div(d["gene_total"]),
    bg_ratio=lambda d: d["bg_count"].div(d["bg_total"]),
    # derived metrics; fold_enrich uses the numeric ratios assigned just above
    rich_factor=lambda d: d["gene_count"].div(d["bg_count"]),
    fold_enrich=lambda d: d["gene_ratio"].div(d["bg_ratio"]),
)

In [13]:
# Benjamini-Hochberg correction computed over the p-values of every combined
# result (as opposed to the per-partition correction already in the inputs).
# NOTE(review): the largest adjusted value observed is below 0.05, which
# suggests the input files may only contain results passing a significance
# cutoff; BH over such a truncated set is not a valid global FDR -- confirm.
adj_pval = multipletests(df["pvalue"], method="fdr_bh", alpha=0.05)
# element 1 of the returned tuple holds the corrected p-values
df = df.assign(**{"fdr": adj_pval[1]})

In [14]:
# (rows, columns) of the combined data frame
df.shape

(1247015, 19)

In [15]:
display(df.dtypes)
# spot check that the categorical conversion was applied
assert df["cluster_id"].dtype == "category"

n_clusters              int32
cluster_id           category
go_term_id           category
go_term_desc         category
gene_count              int32
gene_total              int32
gene_ratio            float64
bg_count                int32
bg_total                int32
bg_ratio              float64
pvalue                float64
fdr_per_partition     float64
corr_method          category
clust_method         category
enrich_func          category
enrich_params        category
rich_factor           float64
fold_enrich           float64
fdr                   float64
dtype: object

In [16]:
# random rows of the combined data; seeded (like the preview sample earlier in
# the notebook) so the displayed rows do not change on every run
df.sample(n=5, random_state=0)

Unnamed: 0,n_clusters,cluster_id,go_term_id,go_term_desc,gene_count,gene_total,gene_ratio,bg_count,bg_total,bg_ratio,pvalue,fdr_per_partition,corr_method,clust_method,enrich_func,enrich_params,rich_factor,fold_enrich,fdr
405570,125,C52,GO:0061733,peptide-lysine-N-acetyltransferase activity,3,65,0.046154,18,6524,0.002759,0.000692,0.008004,clustermatch_k2to5,SpectralClustering,enrichGO,MF_full,0.166667,16.728205,0.001085
99488,175,C44,GO:0050657,nucleic acid transport,7,80,0.0875,112,6519,0.017181,0.000414,0.023512,clustermatch_k2,SpectralClustering,enrichGO,BP_full,0.0625,5.092969,0.00072
896742,200,C91,GO:0042254,ribosome biogenesis,1,1,1.0,118,6519,0.018101,0.018101,0.049087,spearman_abs,SpectralClustering,enrichGO,BP_full,0.008475,55.245763,0.018373
966550,175,C100,GO:0098793,presynapse,11,129,0.085271,222,6555,0.033867,0.004168,0.027095,spearman_abs,SpectralClustering,enrichGO,CC_full,0.04955,2.517809,0.004694
149404,200,C168,GO:0008285,negative regulation of cell population prolife...,9,36,0.25,359,6519,0.05507,0.000105,0.002581,clustermatch_k2,SpectralClustering,enrichGO,BP_simplified_070,0.02507,4.539694,0.000239


## Some stats

In [17]:
# summary of the globally adjusted p-values
fdr_summary = df["fdr"].describe()
display(fdr_summary)

# adjusted p-values are expected to lie strictly between 0 and 1
assert fdr_summary["min"] > 0.0
assert fdr_summary["max"] < 1.0

count     1.247015e+06
mean      1.927088e-03
std       4.054476e-03
min      6.607322e-144
25%       9.933220e-06
50%       4.080055e-04
75%       2.075541e-03
max       4.971338e-02
Name: fdr, dtype: float64

In [18]:
# distinct n_clusters values present in the combined results
df["n_clusters"].unique()

array([  2,   5,  10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60,
        65,  70,  75,  80,  85,  90,  95, 100, 125, 150, 175, 200],
      dtype=int32)

In [19]:
# df["tissue"].unique()

In [20]:
# df["gene_sel_strategy"].unique()

In [21]:
# distinct corr_method values parsed from the input file names
df["corr_method"].unique()

['clustermatch_k2', 'clustermatch_k2to5', 'pearson_abs', 'pearson_full', 'spearman_abs', 'spearman_full']
Categories (6, object): ['clustermatch_k2', 'clustermatch_k2to5', 'pearson_abs', 'pearson_full', 'spearman_abs', 'spearman_full']

In [22]:
# distinct clust_method values parsed from the input file names
df["clust_method"].unique()

['SpectralClustering']
Categories (1, object): ['SpectralClustering']

In [23]:
# distinct enrich_params values parsed from the input file names
df["enrich_params"].unique()

['BP_full', 'BP_simplified_070', 'CC_full', 'CC_simplified_070', 'MF_full', 'MF_simplified_070']
Categories (6, object): ['BP_full', 'BP_simplified_070', 'CC_full', 'CC_simplified_070', 'MF_full', 'MF_simplified_070']

## Testing

In [24]:
# every cell of the combined data frame must hold a value
assert df.notna().all().all()

In [25]:
# test if values are correctly calculated
# _tmp = df[
#    (df.go_term_id == "GO:0035383")
#    & (df.n_clusters == 65)
#    & (df.cluster_id == "C21")
#    # & (df.tissue == "adipose_subcutaneous")
#    # & (df.gene_sel_strategy == "var_pc_log2")
#    & (df.corr_method == "clustermatch")
#    & (df.clust_method == "SpectralClustering")
#    & (df.enrich_func == "enrichGO")
#    & (df.enrich_params == "BP_full")
# ]
# assert _tmp.shape[0] == 1
# _tmp = _tmp.iloc[0]
#
# assert _tmp["gene_count"] == 15
# assert _tmp["gene_total"] == 329
# assert _tmp["gene_ratio"] == 15.0 / 329.0
# assert _tmp["bg_count"] == 34
# assert _tmp["bg_total"] == 3528
# assert _tmp["bg_ratio"] == 34.0 / 3528.0
# assert _tmp["rich_factor"] == 15.0 / 34.0
# assert _tmp["fold_enrich"] == (15.0 / 329.0) / (34.0 / 3528.0)

# Save

In [26]:
# R's serialization functions, looked up through rpy2 (imported in the modules
# cell at the top of the notebook); used to write the .rds copy and read it back.
saveRDS = ro.r["saveRDS"]
readRDS = ro.r["readRDS"]

In [27]:
# alias (not a copy) of the combined data frame; this is the object written by
# each of the Save sections below
data = df

## Pickle

In [28]:
# destination of the pickle copy
display(OUTPUT_FILE)

PosixPath('/opt/data/results/recount2/gene_set_enrichment/recount_data_prep_PLIER-gene_set_enrichment.pkl')

In [29]:
# write the combined data frame as a pickle file
data.to_pickle(OUTPUT_FILE)

## RDS

In [30]:
# same path as the pickle output, with the final suffix replaced by .rds
output_rds_file = OUTPUT_FILE.with_suffix(".rds")
display(output_rds_file)

PosixPath('/opt/data/results/recount2/gene_set_enrichment/recount_data_prep_PLIER-gene_set_enrichment.rds')

In [31]:
# convert the pandas data frame into an R object, using rpy2's pandas converter
# on top of the default conversion rules
with localconverter(ro.default_converter + pandas2ri.converter):
    data_r = ro.conversion.py2rpy(data)

In [32]:
# show the converted R object
data_r

n_clusters,cluster_id,go_term_id,...,rich_factor,fold_enrich,fdr
2,C1,GO:00...,...,0.803965,1.714441,0.000000
2,C0,GO:00...,,0.872774,1.643446,0.000000
2,C0,GO:00...,,0.856383,1.612583,0.000000
2,C0,GO:00...,,0.843990,1.589246,0.000000
...,...,...,,...,...,...
200,C170,GO:00...,,0.037975,4.346436,0.002953
200,C170,GO:00...,,0.037975,4.346436,0.002953
200,C36,GO:00...,,0.086207,4.197118,0.006950
200,C36,GO:00...,,0.108108,5.263413,0.007094


In [33]:
# serialize the R object with R's saveRDS; the path is passed as a plain string
saveRDS(data_r, str(output_rds_file))

<rpy2.rinterface_lib.sexp.NULLType object at 0x7f83e9991680> [RTYPES.NILSXP]

In [34]:
# testing: read the .rds file back with R's readRDS to verify the round trip
# (this replaces the in-memory data_r with the version loaded from disk)
data_r = readRDS(str(output_rds_file))

In [35]:
# R -> pandas conversion of the object read back from the .rds file
with localconverter(ro.default_converter + pandas2ri.converter):
    data_again = ro.conversion.rpy2py(data_r)

# the round trip yields a string index; restore the integer index
data_again.index = data_again.index.astype(int)

In [36]:
# (rows, columns) of the data read back from the .rds file
data_again.shape

(1247015, 19)

In [37]:
# first rows of the data read back from the .rds file
data_again.head()

Unnamed: 0,n_clusters,cluster_id,go_term_id,go_term_desc,gene_count,gene_total,gene_ratio,bg_count,bg_total,bg_ratio,pvalue,fdr_per_partition,corr_method,clust_method,enrich_func,enrich_params,rich_factor,fold_enrich,fdr
0,2,C1,GO:0006954,inflammatory response,365,3057,0.119398,454,6519,0.069643,3.2743200000000003e-52,1.562178e-48,clustermatch_k2,SpectralClustering,enrichGO,BP_full,0.803965,1.714441,6.924072999999999e-50
1,2,C0,GO:0006396,RNA processing,343,3462,0.099076,393,6519,0.060285,4.1995809999999994e-50,2.020839e-46,clustermatch_k2,SpectralClustering,enrichGO,BP_full,0.872774,1.643446,8.120546999999999e-48
2,2,C0,GO:0006412,translation,322,3462,0.09301,376,6519,0.057678,4.688221e-43,1.1279860000000002e-39,clustermatch_k2,SpectralClustering,enrichGO,BP_full,0.856383,1.612583,6.48866e-41
3,2,C0,GO:0043043,peptide biosynthetic process,330,3462,0.095321,391,6519,0.059979,3.054631e-41,4.899628e-38,clustermatch_k2,SpectralClustering,enrichGO,BP_full,0.84399,1.589246,3.857779e-39
4,2,C1,GO:0007186,G protein-coupled receptor signaling pathway,322,3057,0.105332,419,6519,0.064274,2.7789309999999996e-38,6.62914e-35,clustermatch_k2,SpectralClustering,enrichGO,BP_full,0.768496,1.638805,3.029963e-36


In [38]:
# The .rds copy is a binary format, so the data read back must match the
# original exactly; only the "name" attribute of index/columns is ignored.
pd.testing.assert_frame_equal(
    left=data,
    right=data_again,
    check_exact=True,
    check_names=False,
)

## tsv.gz

In [39]:
# same path as the pickle output, with the final suffix replaced by .tsv.gz
output_text_file = OUTPUT_FILE.with_suffix(".tsv.gz")
display(output_text_file)

PosixPath('/opt/data/results/recount2/gene_set_enrichment/recount_data_prep_PLIER-gene_set_enrichment.tsv.gz')

In [40]:
# tab-separated text copy without the index; floats are written in scientific
# notation with 5 decimals (6 significant digits), so this copy is lossy
data.to_csv(output_text_file, sep="\t", index=False, float_format="%.5e")

In [41]:
# testing: read the text file back; no index column was written, so none is read
data_again = pd.read_csv(output_text_file, sep="\t")

In [42]:
# (rows, columns) of the data read back from the text file
data_again.shape

(1247015, 19)

In [43]:
# first rows of the data read back from the text file
data_again.head()

Unnamed: 0,n_clusters,cluster_id,go_term_id,go_term_desc,gene_count,gene_total,gene_ratio,bg_count,bg_total,bg_ratio,pvalue,fdr_per_partition,corr_method,clust_method,enrich_func,enrich_params,rich_factor,fold_enrich,fdr
0,2,C1,GO:0006954,inflammatory response,365,3057,0.119398,454,6519,0.069643,3.2743200000000003e-52,1.56218e-48,clustermatch_k2,SpectralClustering,enrichGO,BP_full,0.803965,1.71444,6.924069999999999e-50
1,2,C0,GO:0006396,RNA processing,343,3462,0.099076,393,6519,0.060285,4.1995799999999996e-50,2.02084e-46,clustermatch_k2,SpectralClustering,enrichGO,BP_full,0.872774,1.64345,8.120549999999999e-48
2,2,C0,GO:0006412,translation,322,3462,0.09301,376,6519,0.057678,4.68822e-43,1.12799e-39,clustermatch_k2,SpectralClustering,enrichGO,BP_full,0.856383,1.61258,6.48866e-41
3,2,C0,GO:0043043,peptide biosynthetic process,330,3462,0.095321,391,6519,0.059978,3.0546299999999996e-41,4.89963e-38,clustermatch_k2,SpectralClustering,enrichGO,BP_full,0.84399,1.58925,3.85778e-39
4,2,C1,GO:0007186,G protein-coupled receptor signaling pathway,322,3057,0.105332,419,6519,0.064274,2.7789299999999997e-38,6.62914e-35,clustermatch_k2,SpectralClustering,enrichGO,BP_full,0.768496,1.63881,3.02996e-36


In [44]:
# The text copy stores floats with 6 significant digits ("%.5e"), i.e. with a
# relative rounding error of at most 5e-6, so a purely relative tolerance of
# 1e-5 is enough. The absolute tolerance is set to zero on purpose: p-values
# and FDRs here are far smaller than any practical absolute tolerance, which
# would make the comparison of those columns always pass.
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_names=False,  # do not check "name" attribute of index and column
    check_dtype=False,  # do not check dtypes: do not distinguish between int64 and int32, for instance
    check_categorical=False,
    check_exact=False,
    rtol=1e-5,
    atol=0.0,
)