# Description

TODO/UPDATE: This notebook runs a gene set enrichment analysis on *all* the clustering results obtained with a given correlation measure on GTEx v8 (the measure is specified under `Settings` below).

# Modules loading

In [1]:
import re
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from tqdm import tqdm

from clustermatch import conf
from clustermatch.gene_enrich import run_enrich

# Settings

In [2]:
# dataset-specific configuration; the directories and filename patterns used
# below are read from it
DATASET_CONFIG = conf.GTEX

In [3]:
# only clustering files whose `corr_method` filename section equals this value
# are selected as input
CORRELATION_METHOD_NAME = "pearson_abs"

In [4]:
# GENE_SELECTION_STRATEGY = "var_pc_log2"

In [5]:
# clusterProfiler settings
# Both values are passed unchanged to `run_enrich` and are also appended to the
# name of every output file.
ENRICH_FUNCTION = "enrichPathway"
ENRICH_PARAMS = "human"

# Paths

In [6]:
# directory with the clustering results to analyze; it must already exist
INPUT_DIR = DATASET_CONFIG["CLUSTERING_DIR"]
display(INPUT_DIR)
assert INPUT_DIR.exists()

PosixPath('/opt/data/results/gtex_v8/clustering')

In [7]:
# this directory has the input data given to the clustering methods; the
# similarity matrices are read from it later to recover the list of genes, so
# fail early (as done for INPUT_DIR) if it is missing
SIMILARITY_MATRICES_DIR = DATASET_CONFIG["SIMILARITY_MATRICES_DIR"]
display(SIMILARITY_MATRICES_DIR)
assert (
    SIMILARITY_MATRICES_DIR.exists()
), f"Similarity matrices directory not found: {SIMILARITY_MATRICES_DIR}"

PosixPath('/opt/data/results/gtex_v8/similarity_matrices')

In [8]:
# filename template of the similarity matrices; it is filled in below with
# `str.format` using the `tissue`, `gene_sel_strategy` and `corr_method` fields
SIMILARITY_MATRIX_FILENAME_TEMPLATE = DATASET_CONFIG[
    "SIMILARITY_MATRIX_FILENAME_TEMPLATE"
]
display(SIMILARITY_MATRIX_FILENAME_TEMPLATE)

'gtex_v8_data_{tissue}-{gene_sel_strategy}-{corr_method}.pkl'

In [9]:
# directory where the enrichment results are saved; it is created (including
# parent directories) if it does not exist yet
OUTPUT_DIR = DATASET_CONFIG["GENE_ENRICHMENT_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment')

# Get data files

In [10]:
# compiled regular expression used to parse clustering filenames; the code
# below reads its named groups `tissue`, `gene_sel_strategy` and `corr_method`
filename_pattern = re.compile(DATASET_CONFIG["CLUSTERING_FILENAME_PATTERN"])

In [11]:
# get input data files according to Settings: keep the clustering files whose
# name matches the expected pattern and whose correlation method is the one
# selected above.
# The pattern is applied to the file name only (not to the full path), which is
# exactly what the Run section parses; this way every selected file is
# guaranteed to be parseable there.
input_files = sorted(
    f
    for f in INPUT_DIR.iterdir()
    if (m := filename_pattern.search(f.name)) is not None
    and m.group("corr_method") == CORRELATION_METHOD_NAME
)
display(len(input_files))
display(input_files[:5])

assert len(input_files) > 0, (
    f"No clustering files found in {INPUT_DIR} "
    f"for correlation method '{CORRELATION_METHOD_NAME}'"
)

5

[PosixPath('/opt/data/results/gtex_v8/clustering/gtex_v8_data_adipose_subcutaneous-var_pc_log2-pearson_abs-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/gtex_v8/clustering/gtex_v8_data_artery_tibial-var_pc_log2-pearson_abs-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/gtex_v8/clustering/gtex_v8_data_muscle_skeletal-var_pc_log2-pearson_abs-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/gtex_v8/clustering/gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-pearson_abs-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/gtex_v8/clustering/gtex_v8_data_whole_blood-var_pc_log2-pearson_abs-SpectralClustering.pkl')]

# Preview input data

## Clustering results

In [12]:
# load the first clustering results file as an example
tmp = pd.read_pickle(input_files[0])

In [13]:
# dimensions of the example clustering results table
tmp.shape

(25, 4)

In [14]:
# first rows of the example clustering results table
tmp.head()

Unnamed: 0_level_0,params,partition,n_clusters,si_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SpectralClustering #0,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",2,0.080422
SpectralClustering #1,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[1, 4, 0, 4, 0, 4, 4, 1, 0, 1, 4, 4, 4, 0, 4, ...",5,0.081886
SpectralClustering #2,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[6, 7, 7, 7, 6, 6, 7, 9, 6, 9, 7, 5, 7, 6, 6, ...",10,0.102141
SpectralClustering #3,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[11, 1, 0, 0, 11, 11, 0, 0, 11, 9, 0, 14, 0, 1...",15,0.117802
SpectralClustering #4,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[19, 8, 12, 2, 2, 2, 16, 11, 19, 6, 2, 13, 16,...",20,0.110024


## Similarity matrices (input to clustering methods)

In [15]:
# Build the filename of the similarity matrix for one example tissue.
# Unless the method name starts with "clustermatch", only the part of the name
# before the first underscore is used (e.g. "pearson_abs" -> "pearson").
if CORRELATION_METHOD_NAME.startswith("clustermatch"):
    _corr_method_in_filename = CORRELATION_METHOD_NAME
else:
    _corr_method_in_filename = CORRELATION_METHOD_NAME.split("_")[0]

similarity_matrix_filename = SIMILARITY_MATRIX_FILENAME_TEMPLATE.format(
    tissue="adipose_subcutaneous",
    gene_sel_strategy="var_pc_log2",
    corr_method=_corr_method_in_filename,
)
display(similarity_matrix_filename)

'gtex_v8_data_adipose_subcutaneous-var_pc_log2-pearson.pkl'

In [16]:
# load the example similarity matrix
# NOTE: this overwrites `tmp`, which held the clustering results until now
tmp = pd.read_pickle(SIMILARITY_MATRICES_DIR / similarity_matrix_filename)

In [17]:
# dimensions of the example similarity matrix
tmp.shape

(5000, 5000)

In [18]:
# first rows of the example similarity matrix
tmp.head()

gene_ens_id,ENSG00000129824.15,ENSG00000149968.11,ENSG00000134184.12,ENSG00000224114.1,ENSG00000173432.10,ENSG00000067048.16,ENSG00000229807.10,ENSG00000012817.15,ENSG00000134339.8,ENSG00000175084.11,...,ENSG00000197467.13,ENSG00000279807.1,ENSG00000107742.12,ENSG00000176454.13,ENSG00000212206.1,ENSG00000104611.11,ENSG00000181800.5,ENSG00000171049.8,ENSG00000257671.1,ENSG00000180448.10
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000129824.15,1.0,0.142708,0.044838,0.086169,-0.421258,0.865448,-0.859269,0.865491,-0.367116,0.138336,...,0.059709,-0.064784,-0.187353,0.170236,-0.121441,-0.045449,0.01852,-0.036597,0.020401,-0.080637
ENSG00000149968.11,0.142708,1.0,-0.007867,-0.010732,-0.053097,0.09384,-0.11733,0.145575,-0.042243,0.017482,...,0.016968,-0.108133,-0.085922,0.003982,-0.116853,0.084377,0.030756,0.001033,0.081805,-0.016195
ENSG00000134184.12,0.044838,-0.007867,1.0,0.048325,-0.060237,-0.003045,0.01888,0.030437,-0.058037,0.01318,...,0.142475,-0.012856,-0.001916,0.055313,-0.065313,0.066425,0.091907,-0.122295,-0.029169,0.038163
ENSG00000224114.1,0.086169,-0.010732,0.048325,1.0,-0.013719,0.020956,-0.033068,0.030229,0.002116,0.009568,...,-0.03495,0.002969,-0.026505,0.038772,-0.052049,0.007405,0.022898,0.076558,-0.012262,0.040187
ENSG00000173432.10,-0.421258,-0.053097,-0.060237,-0.013719,1.0,-0.415001,0.293888,-0.412644,0.928801,-0.13188,...,-0.106866,-0.067553,0.151411,-0.292238,0.024054,0.075032,-0.114333,0.081625,0.000631,0.009827


### Convert Ensembl Gene IDs to Entrez IDs

In [19]:
# file with the mappings from Ensembl gene IDs to Entrez gene IDs; the data
# directory is taken from DATASET_CONFIG, like every other path in this
# notebook, instead of referencing the GTEx configuration directly
input_filename = DATASET_CONFIG["DATA_DIR"] / "gtex_entrez_gene_ids_mappings.pkl"
display(input_filename)
assert input_filename.exists(), f"Gene IDs mappings file not found: {input_filename}"

PosixPath('/opt/data/data/gtex_v8/gtex_entrez_gene_ids_mappings.pkl')

In [20]:
# load the table with the Ensembl-to-Entrez gene ID mappings
gene_ids_mappings = pd.read_pickle(input_filename)

In [21]:
# dimensions of the gene ID mappings table
gene_ids_mappings.shape

(34527, 3)

In [22]:
# first rows of the gene ID mappings table
gene_ids_mappings.head()

Unnamed: 0,gene_ens_id_v,ensembl_id,entrez_id
0,ENSG00000145309.5,ENSG00000145309,85438
1,ENSG00000175820.3,ENSG00000175820,643677
2,ENSG00000083454.21,ENSG00000083454,5026
3,ENSG00000071794.15,ENSG00000071794,6596
4,ENSG00000211918.1,ENSG00000211918,28503


In [23]:
# dictionary from versioned Ensembl gene ID (`gene_ens_id_v`) to Entrez gene ID
gene_id_maps = gene_ids_mappings.set_index("gene_ens_id_v")["entrez_id"].to_dict()

In [24]:
# show the first two entries of the dictionary as a quick check
dict(list(gene_id_maps.items())[0:2])

{'ENSG00000145309.5': '85438', 'ENSG00000175820.3': '643677'}

In [25]:
# Translate the Ensembl IDs in the index of the example similarity matrix
# (`tmp`) into Entrez IDs, skipping genes without a mapping, and show how many
# genes were translated plus the first few Entrez IDs.
# NOTE(review): the question below is not actually answered by this cell, since
# uniqueness is never tested here; repeated Entrez IDs are skipped later in the
# Run section.
# is map from ensembl to entrez unique?
_tmp_index = [gene_id_maps[x] for x in tmp.index if x in gene_id_maps]
display(len(_tmp_index))
display(_tmp_index[:5])

4201

['6192', '4314', '2944', '100271063', '6288']

# Run

In [26]:
# number of rows (one per partition) in the first clustering file; it is used
# below to compute the total number of tasks, taking this count as
# representative of all input files
n_partitions_per_file = pd.read_pickle(input_files[0]).shape[0]
display(n_partitions_per_file)

25

In [27]:
# the number of tasks is the number of input files times number of partitions per file
# (the count of the first file is taken as representative of all files; the
# value is only used as the total of the progress bar)
n_tasks = int(len(input_files) * n_partitions_per_file)
display(f"number of tasks: {n_tasks}")

# For each clustering file: build the universe of genes (as unique Entrez IDs),
# run the enrichment analysis in parallel on each of its partitions, and save
# all the results of that file into a single pickle file.
with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor, tqdm(
    total=n_tasks, ncols=100
) as pbar:
    for clustering_filepath in input_files:
        # extract from input clustering filename some sections, such as tissue name, etc
        m = re.search(filename_pattern, str(clustering_filepath.name))
        if m is None:
            # fail with a clear message instead of an AttributeError on `None`
            raise ValueError(
                f"Unexpected clustering filename: {clustering_filepath.name}"
            )

        tissue = m.group("tissue")
        gene_sel_strategy = m.group("gene_sel_strategy")
        corr_method = m.group("corr_method")

        # update pbar description
        pbar.set_description(f"{tissue}/{gene_sel_strategy}")

        # output file: name of the clustering file plus the enrichment settings
        output_filepath = (
            OUTPUT_DIR
            / f"{clustering_filepath.stem}-{ENRICH_FUNCTION}-{ENRICH_PARAMS}.pkl"
        )

        # read clustering results
        clustering_df = pd.read_pickle(clustering_filepath)

        # use those sections to read the list of genes from the input data
        # file that the clustering algorithm received; unless the method name
        # starts with "clustermatch", only the part before the first underscore
        # is used in the filename (e.g. "pearson_abs" -> "pearson")
        similarity_matrix_filename = SIMILARITY_MATRIX_FILENAME_TEMPLATE.format(
            tissue=tissue,
            gene_sel_strategy=gene_sel_strategy,
            corr_method=corr_method.split("_")[0]
            if not corr_method.startswith("clustermatch")
            else corr_method,
        )

        # get the universe of genes
        all_gene_ens_ids = pd.read_pickle(
            SIMILARITY_MATRICES_DIR / similarity_matrix_filename
        ).index.tolist()

        # convert gene ensembl ids to entrez and create clustering partition mask:
        # the mask has one entry per gene in the similarity matrix and is True
        # only for genes that are kept in the universe
        partition_mask = []
        all_gene_ids = []
        entrez_ids_added = set()  # this is faster

        for gene_ens_id in all_gene_ens_ids:
            # a gene is kept only if it has an Entrez ID that was not seen before
            # TODO: maybe this avoiding of repeated gene ids is not necessary?
            keep_gene = (
                gene_ens_id in gene_id_maps
                and gene_id_maps[gene_ens_id] not in entrez_ids_added
            )
            partition_mask.append(keep_gene)

            if keep_gene:
                new_entrez_id = gene_id_maps[gene_ens_id]
                all_gene_ids.append(new_entrez_id)
                entrez_ids_added.add(new_entrez_id)

        partition_mask = np.array(partition_mask, dtype=bool)
        all_gene_ids = np.array(all_gene_ids)
        assert np.unique(all_gene_ids).shape[0] == all_gene_ids.shape[0]
        assert all_gene_ids.shape[0] == np.sum(partition_mask)

        # submit one task per clustering solution (partition), in increasing
        # order of number of clusters; each partition is restricted to the
        # genes kept in the universe
        futures = [
            executor.submit(
                run_enrich,
                all_gene_ids,
                "ENTREZID",
                cr.partition[partition_mask],
                ENRICH_FUNCTION,
                ENRICH_PARAMS,
            )
            for _, cr in clustering_df.sort_values("n_clusters").iterrows()
        ]

        # collect results as tasks finish
        results_full = []

        for task in as_completed(futures):
            task_results = task.result()
            pbar.update(1)

            # a result of None means that no enrichment was found
            if task_results is not None:
                results_full.append(task_results)

        if len(results_full) == 0:
            # no significant results, continue
            continue

        # merge and save
        pbar.set_description(f"{tissue}/{gene_sel_strategy}/saving")

        # full
        results_full_df = pd.concat(results_full, ignore_index=True).sort_values(
            ["n_clusters", "pvalue_adjust"]
        )

        results_full_df.to_pickle(output_filepath)

'number of tasks: 125'

whole_blood/var_pc_log2/saving: 100%|██████████████████████████| 125/125 [7:28:00<00:00, 215.04s/it]
