# Description

Combines all clustering results (partitions and their scores) found in the input directory into a single data frame and saves it.

# Modules loading

In [1]:
import ast
import re

import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from clustermatch import conf

# Settings

In [2]:
# ENRICH_FUNCTION = "enrichGO"

In [3]:
# CORRELATION_METHOD_NAME = "clustermatch"

In [4]:
# GENE_SELECTION_STRATEGY = "var_pc_log2"

In [5]:
# # clusterProfiler settings
# ENRICH_FUNCTION = "enrichGO"
# SIMPLIFY_CUTOFF = 0.7
# GO_ONTOLOGIES = ("BP", "CC", "MF")

In [6]:
# SIMILARITY_MATRICES_DIR = conf.GTEX["SIMILARITY_MATRICES_DIR"]
# display(SIMILARITY_MATRICES_DIR)

In [7]:
# SIMILARITY_MATRIX_FILENAME_TEMPLATE = conf.GTEX["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
# display(SIMILARITY_MATRIX_FILENAME_TEMPLATE)

# Paths

In [8]:
# Directory containing the individual clustering result files to combine.
INPUT_DIR = conf.GTEX["CLUSTERING_DIR"]
display(INPUT_DIR)
# The directory is listed with iterdir() further down, so a regular file at
# this path must be rejected just like a missing path.
assert INPUT_DIR.is_dir(), f"Input directory not found: {INPUT_DIR}"

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/clustering')

In [9]:
# Destination of the combined results (written as a pickle further down; the
# text export derives its file name from this path).
OUTPUT_FILE = conf.GTEX["CLUSTERING_COMBINED_FILE"]
display(OUTPUT_FILE)

# Make sure the destination folder exists before anything is written to it.
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/clustering/gtex_v8_data-clustering.pkl')

# Get data files

In [10]:
filename_pattern = re.compile(conf.GTEX["CLUSTERING_FILENAME_PATTERN"])

In [11]:
# get input data files according to Settings
# Collect every entry of INPUT_DIR whose *file name* matches the configured
# pattern. Matching on the name (not on the full path) keeps this filter
# consistent with the metadata extraction performed later on `f.name`: a match
# caused only by a parent directory name can no longer let a file through that
# the later parsing step would fail on.
input_files = sorted(
    f for f in INPUT_DIR.iterdir() if filename_pattern.search(f.name) is not None
)
display(len(input_files))
display(input_files[:5])

assert len(input_files) > 0, f"No clustering result files found in {INPUT_DIR}"

30

[PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/clustering/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/clustering/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch_k2-SpectralClustering.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/clustering/gtex_v8_data_adipose_subcutaneous-var_pc_log2-pearson_abs-SpectralClustering.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/clustering/gtex_v8_data_adipose_subcutaneous-var_pc_log2-pearson_full-SpectralClustering.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/clustering/gtex_v8_data_adipose_subcutaneous

## Preview data

In [12]:
display(input_files[0])

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/clustering/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering.pkl')

In [13]:
_tmp_df = pd.read_pickle(input_files[0])

In [14]:
_tmp_df.shape

(25, 4)

In [15]:
_tmp_df.sample(n=5, random_state=0)

Unnamed: 0_level_0,params,partition,n_clusters,si_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SpectralClustering #5,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[9, 24, 11, 0, 13, 9, 9, 9, 23, 7, 0, 17, 0, 1...",25,0.032697
SpectralClustering #2,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[1, 1, 6, 1, 9, 5, 5, 5, 9, 5, 1, 1, 6, 9, 9, ...",10,0.032722
SpectralClustering #19,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[38, 0, 79, 9, 25, 38, 38, 38, 25, 59, 9, 61, ...",95,0.027408
SpectralClustering #16,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[39, 53, 63, 7, 78, 39, 39, 39, 78, 28, 7, 71,...",80,0.028881
SpectralClustering #11,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[17, 38, 23, 2, 29, 17, 17, 17, 29, 34, 2, 30,...",55,0.029121


# Run

## Read data, add metadata, convert dtypes

In [16]:
def _load_clustering_file(results_path):
    """Read one clustering results pickle and tag it with file-name metadata.

    The index of the stored frame is moved into a regular column, only the
    columns selected below are kept, and the named groups that
    `filename_pattern` extracts from the file name are added as constant
    columns.
    """
    results = pd.read_pickle(results_path).reset_index()

    # add metadata
    name_fields = filename_pattern.search(results_path.name)

    return results[["id", "n_clusters", "partition", "si_score"]].assign(
        tissue=name_fields.group("tissue"),
        gene_sel_strategy=name_fields.group("gene_sel_strategy"),
        corr_method=name_fields.group("corr_method"),
        clust_method=name_fields.group("clust_method"),
    )


# one data frame per input file, in the same order as input_files
all_results = [
    _load_clustering_file(results_path)
    for results_path in tqdm(input_files, ncols=100)
]

100%|██████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 275.14it/s]


In [17]:
df = pd.concat(all_results, ignore_index=True)

In [18]:
df.shape

(750, 8)

In [19]:
df.head()

Unnamed: 0,id,n_clusters,partition,si_score,tissue,gene_sel_strategy,corr_method,clust_method
0,SpectralClustering #0,2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0.025231,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering
1,SpectralClustering #1,5,"[4, 0, 4, 1, 0, 0, 0, 4, 0, 4, 0, 4, 1, 4, 0, ...",0.026704,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering
2,SpectralClustering #2,10,"[1, 1, 6, 1, 9, 5, 5, 5, 9, 5, 1, 1, 6, 9, 9, ...",0.032722,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering
3,SpectralClustering #3,15,"[0, 0, 13, 10, 1, 10, 13, 10, 14, 12, 13, 10, ...",0.03206,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering
4,SpectralClustering #4,20,"[15, 0, 7, 18, 4, 15, 15, 15, 11, 8, 18, 13, 1...",0.033032,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering


In [20]:
df.dtypes

id                    object
n_clusters             int64
partition             object
si_score             float64
tissue                object
gene_sel_strategy     object
corr_method           object
clust_method          object
dtype: object

In [21]:
# convert to int32
df["n_clusters"] = df["n_clusters"].astype("int32")

# to category dtype
df["id"] = df["id"].astype("category")
df["tissue"] = df["tissue"].astype("category")
df["gene_sel_strategy"] = df["gene_sel_strategy"].astype("category")
df["corr_method"] = df["corr_method"].astype("category")
df["clust_method"] = df["clust_method"].astype("category")

In [22]:
display(df.dtypes)
assert df.dtypes.loc["id"] == "category"

id                   category
n_clusters              int32
partition              object
si_score              float64
tissue               category
gene_sel_strategy    category
corr_method          category
clust_method         category
dtype: object

In [23]:
df.iloc[0]["partition"]

array([1, 1, 1, ..., 1, 1, 0], dtype=int32)

In [24]:
# NOTE(review): unlike the earlier preview sample (random_state=0), no
# random_state is given here, so the rows displayed change on every run.
df.sample(n=5)

Unnamed: 0,id,n_clusters,partition,si_score,tissue,gene_sel_strategy,corr_method,clust_method
104,SpectralClustering #4,20,"[5, 1, 17, 4, 9, 5, 9, 5, 9, 5, 4, 15, 4, 9, 3...",0.097459,adipose_subcutaneous,var_pc_log2,spearman_abs,SpectralClustering
411,SpectralClustering #11,55,"[15, 4, 19, 28, 28, 28, 35, 51, 4, 5, 33, 1, 5...",0.057994,muscle_skeletal,var_pc_log2,spearman_abs,SpectralClustering
392,SpectralClustering #17,85,"[11, 3, 36, 34, 7, 49, 82, 37, 3, 14, 74, 80, ...",0.008016,muscle_skeletal,var_pc_log2,pearson_full,SpectralClustering
31,SpectralClustering #6,30,"[16, 15, 8, 1, 8, 16, 16, 16, 8, 0, 1, 18, 28,...",0.031441,adipose_subcutaneous,var_pc_log2,clustermatch_k2,SpectralClustering
195,SpectralClustering #20,100,"[64, 30, 30, 24, 36, 64, 24, 0, 42, 0, 42, 22,...",0.023271,artery_tibial,var_pc_log2,clustermatch_k2,SpectralClustering


## Some stats

In [25]:
df["n_clusters"].unique()

array([  2,   5,  10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60,
        65,  70,  75,  80,  85,  90,  95, 100, 125, 150, 175, 200],
      dtype=int32)

In [26]:
df["tissue"].unique()

['adipose_subcutaneous', 'artery_tibial', 'muscle_skeletal', 'skin_sun_exposed_lower_leg', 'whole_blood']
Categories (5, object): ['adipose_subcutaneous', 'artery_tibial', 'muscle_skeletal', 'skin_sun_exposed_lower_leg', 'whole_blood']

In [27]:
df["gene_sel_strategy"].unique()

['var_pc_log2']
Categories (1, object): ['var_pc_log2']

In [28]:
df["corr_method"].unique()

['clustermatch', 'clustermatch_k2', 'pearson_abs', 'pearson_full', 'spearman_abs', 'spearman_full']
Categories (6, object): ['clustermatch', 'clustermatch_k2', 'pearson_abs', 'pearson_full', 'spearman_abs', 'spearman_full']

In [29]:
df["clust_method"].unique()

['SpectralClustering']
Categories (1, object): ['SpectralClustering']

## Testing

In [30]:
assert not df.isna().any().any()

# Save

In [31]:
# R interface (rpy2). These names are only referenced by the RDS export cells
# below, which are currently commented out.
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

# Handles to the R functions saveRDS() and readRDS().
saveRDS = ro.r["saveRDS"]
readRDS = ro.r["readRDS"]

In [32]:
data = df

## Pickle

In [33]:
display(OUTPUT_FILE)

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/clustering/gtex_v8_data-clustering.pkl')

In [34]:
data.to_pickle(OUTPUT_FILE)

## RDS

In [35]:
# output_rds_file = OUTPUT_FILE.with_suffix(".rds")
# display(output_rds_file)

In [36]:
# with localconverter(ro.default_converter + pandas2ri.converter):
# #     data["partition"] = data["partition"].apply(lambda x: ro.IntVector(x.tolist()))
#     data_r = ro.conversion.py2rpy(data)

In [37]:
# data_r

In [38]:
# saveRDS(data_r, str(output_rds_file))

In [39]:
# # testing
# data_r = readRDS(str(output_rds_file))

In [40]:
# with localconverter(ro.default_converter + pandas2ri.converter):
#     data_again = ro.conversion.rpy2py(data_r)

#     # convert index to int, otherwise it's converted to string
#     data_again.index = data_again.index.astype(int)

In [41]:
# data_again.shape

In [42]:
# data_again.head()

In [43]:
# pd.testing.assert_frame_equal(
#     data,
#     data_again,
#     check_names=False,  # do not check "name" attribute of index and column
#     check_exact=True,  # since this is a binary format, it should match exactly
# )

## tsv.gz

In [44]:
output_text_file = OUTPUT_FILE.with_suffix(".tsv.gz")
display(output_text_file)

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/clustering/gtex_v8_data-clustering.tsv.gz')

In [45]:
# Text-friendly copy of the results: every partition array is replaced by the
# string form of a plain Python list so that it fits in a single TSV field.
data_text = data.assign(
    partition=data["partition"].apply(lambda labels: repr(labels.tolist()))
)

In [46]:
data_text.to_csv(output_text_file, sep="\t", index=False, float_format="%.5e")

In [47]:
# testing
data_again = pd.read_csv(output_text_file, sep="\t")  # , index_col=0)
data_again["partition"] = data_again["partition"].apply(
    lambda x: np.array(eval(x), dtype="int32")
)

In [48]:
data_again.shape

(750, 8)

In [49]:
data_again.head()

Unnamed: 0,id,n_clusters,partition,si_score,tissue,gene_sel_strategy,corr_method,clust_method
0,SpectralClustering #0,2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0.025232,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering
1,SpectralClustering #1,5,"[4, 0, 4, 1, 0, 0, 0, 4, 0, 4, 0, 4, 1, 4, 0, ...",0.026704,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering
2,SpectralClustering #2,10,"[1, 1, 6, 1, 9, 5, 5, 5, 9, 5, 1, 1, 6, 9, 9, ...",0.032722,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering
3,SpectralClustering #3,15,"[0, 0, 13, 10, 1, 10, 13, 10, 14, 12, 13, 10, ...",0.03206,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering
4,SpectralClustering #4,20,"[15, 0, 7, 18, 4, 15, 15, 15, 11, 8, 18, 13, 1...",0.033032,adipose_subcutaneous,var_pc_log2,clustermatch,SpectralClustering


In [50]:
# Round-trip check: the frame read back from the text file must match the
# in-memory results. The comparison is approximate because the file was
# written with float_format="%.5e", and dtypes are not compared because the
# text format does not preserve them.
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_names=False,  # do not check "name" attribute of index and column
    check_dtype=False,  # do not check dtypes: do not distinguish between int64 and int32, for instance
    check_categorical=False,
    check_exact=False,
    rtol=1e-5,
    atol=5e-5,
)