# Description

Combines all clustering results found in the input directory into a single file.

# Modules loading

In [1]:
import ast
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.stats.multitest import multipletests
from tqdm import tqdm

from clustermatch import conf

# Settings

In [2]:
# Dataset-specific settings; the input directory, output file and file name
# pattern used below are all read from this mapping.
DATASET_CONFIG = conf.RECOUNT2

In [3]:
# ENRICH_FUNCTION = "enrichGO"

In [4]:
# CORRELATION_METHOD_NAME = "clustermatch"

In [5]:
# GENE_SELECTION_STRATEGY = "var_pc_log2"

In [6]:
# # clusterProfiler settings
# ENRICH_FUNCTION = "enrichGO"
# SIMPLIFY_CUTOFF = 0.7
# GO_ONTOLOGIES = ("BP", "CC", "MF")

In [7]:
# SIMILARITY_MATRICES_DIR = conf.GTEX["SIMILARITY_MATRICES_DIR"]
# display(SIMILARITY_MATRICES_DIR)

In [8]:
# SIMILARITY_MATRIX_FILENAME_TEMPLATE = conf.GTEX["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
# display(SIMILARITY_MATRIX_FILENAME_TEMPLATE)

# Paths

In [9]:
# Directory containing the individual clustering result files to combine
INPUT_DIR = DATASET_CONFIG["CLUSTERING_DIR"]
display(INPUT_DIR)
# iterdir() is called on this path later, so it must be an existing directory
# (a plain exists() check would also accept a regular file)
assert INPUT_DIR.is_dir(), f"Input directory not found: {INPUT_DIR}"

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/recount2/clustering')

In [10]:
# Destination of the combined results; the text export further down derives
# its own path from this one.
OUTPUT_FILE = DATASET_CONFIG["CLUSTERING_COMBINED_FILE"]
display(OUTPUT_FILE)

# make sure the destination folder exists before anything is written
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/recount2/clustering/recount_data_prep_PLIER-clustering.pkl')

# Get data files

In [11]:
# Compiled once and reused to select the input files and to extract the
# "corr_method" / "clust_method" named groups from their names.
filename_pattern = re.compile(DATASET_CONFIG["CLUSTERING_FILENAME_PATTERN"])

In [12]:
# Select the result files whose *name* matches the expected pattern. Matching
# on the file name (not the full path) mirrors how metadata is extracted from
# each file later on, so every selected file is guaranteed to be parseable.
input_files = sorted(
    f for f in INPUT_DIR.iterdir() if filename_pattern.search(f.name) is not None
)
display(len(input_files))
display(input_files[:5])

assert len(input_files) > 0, f"No clustering result files found in {INPUT_DIR}"

6

[PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/recount2/clustering/recount_data_prep_PLIER-clustermatch_k2-SpectralClustering.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/recount2/clustering/recount_data_prep_PLIER-clustermatch_k2to5-SpectralClustering.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/recount2/clustering/recount_data_prep_PLIER-pearson_abs-SpectralClustering.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/recount2/clustering/recount_data_prep_PLIER-pearson_full-SpectralClustering.pkl'),
 PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/recount2/clustering/recount_data_prep_PLIER-spearman_abs-SpectralClustering.pkl')]

## Preview data

In [13]:
display(input_files[0])

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/recount2/clustering/recount_data_prep_PLIER-clustermatch_k2-SpectralClustering.pkl')

In [14]:
# Load the first result file only to inspect its structure
_tmp_df = pd.read_pickle(input_files[0])

In [15]:
_tmp_df.shape

(25, 4)

In [16]:
_tmp_df.sample(n=5, random_state=0)

Unnamed: 0_level_0,params,partition,n_clusters,si_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SpectralClustering #5,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[11, 11, 23, 17, 11, 16, 23, 23, 16, 23, 11, 1...",25,0.023936
SpectralClustering #2,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[3, 3, 3, 7, 3, 7, 3, 3, 7, 3, 3, 3, 3, 3, 3, ...",10,0.038756
SpectralClustering #19,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[9, 72, 88, 68, 65, 70, 60, 60, 70, 9, 22, 60,...",95,0.003103
SpectralClustering #16,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[70, 74, 69, 79, 77, 4, 39, 74, 4, 39, 77, 39,...",80,0.004088
SpectralClustering #11,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[16, 16, 46, 44, 30, 12, 39, 39, 12, 0, 22, 39...",55,0.012551


# Run

## Read data, add metadata, convert dtypes

In [18]:
# Load every clustering result file, keep the columns of interest and tag the
# rows with the methods encoded in the file name.
result_columns = ["id", "n_clusters", "partition", "si_score"]

all_results = []

for f_full in tqdm(input_files, ncols=100):
    f_name = f_full.name

    # the file name tells which correlation/clustering methods were used
    metadata = filename_pattern.search(f_name)
    if metadata is None:
        raise ValueError(f"File name does not match the expected pattern: {f_name}")

    # .assign() returns a new frame, so the metadata columns are never written
    # onto a column subset of the loaded data (avoids SettingWithCopyWarning)
    f_data = (
        pd.read_pickle(f_full)
        .reset_index()[result_columns]
        .assign(
            corr_method=metadata.group("corr_method"),
            clust_method=metadata.group("clust_method"),
        )
    )

    all_results.append(f_data)

100%|████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 265.41it/s]


In [19]:
# Stack the per-file frames into a single table; the original row indexes are
# discarded and replaced by a new consecutive one.
df = pd.concat(all_results, ignore_index=True)

In [20]:
df.shape

(150, 6)

In [21]:
df.head()

Unnamed: 0,id,n_clusters,partition,si_score,corr_method,clust_method
0,SpectralClustering #0,2,"[1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, ...",0.058329,clustermatch_k2,SpectralClustering
1,SpectralClustering #1,5,"[0, 0, 0, 4, 0, 4, 0, 0, 4, 0, 1, 0, 0, 0, 0, ...",0.054201,clustermatch_k2,SpectralClustering
2,SpectralClustering #2,10,"[3, 3, 3, 7, 3, 7, 3, 3, 7, 3, 3, 3, 3, 3, 3, ...",0.038756,clustermatch_k2,SpectralClustering
3,SpectralClustering #3,15,"[4, 4, 4, 10, 13, 12, 4, 4, 10, 4, 13, 12, 4, ...",0.031417,clustermatch_k2,SpectralClustering
4,SpectralClustering #4,20,"[13, 13, 11, 5, 19, 4, 11, 11, 4, 11, 19, 13, ...",0.027535,clustermatch_k2,SpectralClustering


In [22]:
df.dtypes

id               object
n_clusters        int64
partition        object
si_score        float64
corr_method      object
clust_method     object
dtype: object

In [23]:
# Cast columns in a single step: the number of clusters becomes int32 and the
# identifier/method columns become categoricals.
df = df.astype(
    {
        "n_clusters": "int32",
        "id": "category",
        "corr_method": "category",
        "clust_method": "category",
    }
)

In [24]:
# Show the resulting dtypes and verify that the categorical conversion took effect
column_dtypes = df.dtypes
display(column_dtypes)
assert column_dtypes["id"] == "category"

id              category
n_clusters         int32
partition         object
si_score         float64
corr_method     category
clust_method    category
dtype: object

In [25]:
df.iloc[0]["partition"]

array([1, 1, 0, ..., 1, 1, 0], dtype=int32)

In [26]:
# Fixed seed so that the rows shown here are the same on every run
df.sample(n=5, random_state=0)

Unnamed: 0,id,n_clusters,partition,si_score,corr_method,clust_method
124,SpectralClustering #24,200,"[117, 51, 82, 73, 51, 128, 117, 82, 38, 16, 16...",-0.111704,spearman_abs,SpectralClustering
55,SpectralClustering #5,25,"[22, 22, 18, 24, 22, 11, 20, 22, 21, 17, 8, 12...",0.042112,pearson_abs,SpectralClustering
105,SpectralClustering #5,25,"[18, 18, 8, 15, 18, 15, 20, 17, 15, 20, 18, 20...",0.003711,spearman_abs,SpectralClustering
77,SpectralClustering #2,10,"[5, 5, 8, 7, 5, 8, 5, 5, 0, 9, 8, 5, 5, 5, 5, ...",0.023363,pearson_full,SpectralClustering
99,SpectralClustering #24,200,"[32, 139, 124, 72, 41, 97, 31, 151, 79, 178, 8...",0.033938,pearson_full,SpectralClustering


## Some stats

In [27]:
df["n_clusters"].unique()

array([  2,   5,  10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60,
        65,  70,  75,  80,  85,  90,  95, 100, 125, 150, 175, 200],
      dtype=int32)

In [28]:
# df["tissue"].unique()

In [29]:
# df["gene_sel_strategy"].unique()

In [30]:
df["corr_method"].unique()

['clustermatch_k2', 'clustermatch_k2to5', 'pearson_abs', 'pearson_full', 'spearman_abs', 'spearman_full']
Categories (6, object): ['clustermatch_k2', 'clustermatch_k2to5', 'pearson_abs', 'pearson_full', 'spearman_abs', 'spearman_full']

In [31]:
df["clust_method"].unique()

['SpectralClustering']
Categories (1, object): ['SpectralClustering']

## Testing

In [32]:
# The combined table must not contain missing values in any column
assert not df.isna().any().any()

# Save

In [33]:
# rpy2 bridge to R; in this notebook these names are referenced only by the
# RDS export cells, which are currently commented out.
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

# R functions looked up by name through rpy2
saveRDS = ro.r["saveRDS"]
readRDS = ro.r["readRDS"]

In [34]:
# Alias (not a copy): the export cells refer to the combined table as `data`
data = df

## Pickle

In [35]:
display(OUTPUT_FILE)

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/recount2/clustering/recount_data_prep_PLIER-clustering.pkl')

In [36]:
# Write the combined table in pandas' pickle format
data.to_pickle(OUTPUT_FILE)

## RDS

In [None]:
# output_rds_file = OUTPUT_FILE.with_suffix(".rds")
# display(output_rds_file)

In [None]:
# with localconverter(ro.default_converter + pandas2ri.converter):
# #     data["partition"] = data["partition"].apply(lambda x: ro.IntVector(x.tolist()))
#     data_r = ro.conversion.py2rpy(data)

In [None]:
# data_r

In [None]:
# saveRDS(data_r, str(output_rds_file))

In [None]:
# # testing
# data_r = readRDS(str(output_rds_file))

In [None]:
# with localconverter(ro.default_converter + pandas2ri.converter):
#     data_again = ro.conversion.rpy2py(data_r)

#     # convert index to int, otherwise it's converted to string
#     data_again.index = data_again.index.astype(int)

In [None]:
# data_again.shape

In [None]:
# data_again.head()

In [None]:
# pd.testing.assert_frame_equal(
#     data,
#     data_again,
#     check_names=False,  # do not check "name" attribute of index and column
#     check_exact=True,  # since this is a binary format, it should match exactly
# )

## tsv.gz

In [None]:
# Same folder and base name as OUTPUT_FILE, with its last suffix replaced by .tsv.gz
output_text_file = OUTPUT_FILE.with_suffix(".tsv.gz")
display(output_text_file)

In [None]:
# Text-friendly version of the table: each partition array is rendered as the
# string form of a Python list so that it fits in a single TSV field.
data_text = data.assign(
    partition=data["partition"].map(lambda labels: repr(labels.tolist()))
)

In [None]:
# Tab-separated export without the row index. Floats are written in scientific
# notation with 5 decimals ("%.5e"), so the text file is a lossy copy of them.
data_text.to_csv(output_text_file, sep="\t", index=False, float_format="%.5e")

In [None]:
# testing: read the text file back and rebuild the partition arrays
data_again = pd.read_csv(output_text_file, sep="\t")  # , index_col=0)
# literal_eval only accepts Python literals, so nothing stored in the file can
# be executed while the list strings are parsed (unlike eval)
data_again["partition"] = data_again["partition"].apply(
    lambda x: np.array(ast.literal_eval(x), dtype="int32")
)

In [None]:
data_again.shape

In [None]:
data_again.head()

In [None]:
# Round-trip check: the table read back from the text file must match the
# in-memory one, up to the precision lost by the float formatting on export.
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_names=False,  # do not check "name" attribute of index and column
    check_dtype=False,  # do not check dtypes: do not distinguish between int64 and int32, for instance
    check_categorical=False,  # `data` has category columns; compare their values, not the categorical metadata
    check_exact=False,  # floats were written with limited precision, so compare within tolerance
    rtol=1e-5,
    atol=5e-5,
)