# Description

**TODO (update this description):** This notebook computes gene set enrichment on *all* the clustering results obtained with a given correlation measure on GTEx v8 (the measure is specified under `Settings` below).

# Modules loading

In [1]:
import re
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from tqdm import tqdm

from clustermatch import conf
from clustermatch.gene_enrich import run_enrich

# Settings

In [2]:
# dataset-specific configuration; the directories, filename template and
# filename pattern used below are all read from it
DATASET_CONFIG = conf.GTEX

In [3]:
# only clustering results whose filename reports this correlation method
# are selected as input files below
CORRELATION_METHOD_NAME = "clustermatch"

In [4]:
# GENE_SELECTION_STRATEGY = "var_pc_log2"

In [5]:
# clusterProfiler settings
# both values are passed as-is to `run_enrich` and are also part of the
# output filenames
ENRICH_FUNCTION = "enrichPathway"
ENRICH_PARAMS = "human"

# Paths

In [6]:
# directory with the clustering results that are used as input here;
# stop early if it does not exist
INPUT_DIR = DATASET_CONFIG["CLUSTERING_DIR"]
display(INPUT_DIR)
assert INPUT_DIR.exists()

PosixPath('/opt/data/results/gtex_v8/clustering')

In [7]:
# this directory has the input data given to the clustering methods
SIMILARITY_MATRICES_DIR = DATASET_CONFIG["SIMILARITY_MATRICES_DIR"]
display(SIMILARITY_MATRICES_DIR)
# fail early (as done for the other inputs) instead of in the middle of the
# long-running enrichment loop
assert (
    SIMILARITY_MATRICES_DIR.exists()
), f"Similarity matrices directory not found: {SIMILARITY_MATRICES_DIR}"

PosixPath('/opt/data/results/gtex_v8/similarity_matrices')

In [8]:
# filename template of the similarity matrices; its placeholders (tissue,
# gene_sel_strategy and corr_method) are filled in with `.format` below
SIMILARITY_MATRIX_FILENAME_TEMPLATE = DATASET_CONFIG[
    "SIMILARITY_MATRIX_FILENAME_TEMPLATE"
]
display(SIMILARITY_MATRIX_FILENAME_TEMPLATE)

'gtex_v8_data_{tissue}-{gene_sel_strategy}-{corr_method}.pkl'

In [9]:
# enrichment results are saved in this directory, which is created
# (including parents) if it does not exist yet
OUTPUT_DIR = DATASET_CONFIG["GENE_ENRICHMENT_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/gene_set_enrichment')

# Get data files

In [10]:
# compiled once and reused below; the named groups `tissue`,
# `gene_sel_strategy` and `corr_method` are read from its matches
filename_pattern = re.compile(DATASET_CONFIG["CLUSTERING_FILENAME_PATTERN"])

In [11]:
# get input data files according to Settings: keep only the clustering
# results whose filename matches the expected pattern and that were obtained
# with the selected correlation method.
# The pattern is matched against the file name only (not the full path), which
# is consistent with how filenames are parsed again in the Run section.
input_files = sorted(
    [
        f
        for f in INPUT_DIR.iterdir()
        if (m := filename_pattern.search(f.name)) is not None
        and m.group("corr_method") == CORRELATION_METHOD_NAME
    ]
)
display(len(input_files))
display(input_files[:5])

assert (
    len(input_files) > 0
), f"No clustering results found for '{CORRELATION_METHOD_NAME}' in {INPUT_DIR}"

5

[PosixPath('/opt/data/results/gtex_v8/clustering/gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/gtex_v8/clustering/gtex_v8_data_artery_tibial-var_pc_log2-clustermatch-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/gtex_v8/clustering/gtex_v8_data_muscle_skeletal-var_pc_log2-clustermatch-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/gtex_v8/clustering/gtex_v8_data_skin_sun_exposed_lower_leg-var_pc_log2-clustermatch-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/gtex_v8/clustering/gtex_v8_data_whole_blood-var_pc_log2-clustermatch-SpectralClustering.pkl')]

# Preview input data

## Clustering results

In [12]:
# load the clustering results of the first input file as an example
tmp = pd.read_pickle(input_files[0])

In [13]:
# the number of rows is taken later as the number of partitions per file
tmp.shape

(25, 4)

In [14]:
# preview the first clustering solutions; the `partition` and `n_clusters`
# columns are the ones used in the Run section
tmp.head()

Unnamed: 0_level_0,params,partition,n_clusters,si_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SpectralClustering #0,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2,0.025231
SpectralClustering #1,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[4, 0, 4, 1, 0, 0, 0, 4, 0, 4, 0, 4, 1, 4, 0, ...",5,0.026704
SpectralClustering #2,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[1, 1, 6, 1, 9, 5, 5, 5, 9, 5, 1, 1, 6, 9, 9, ...",10,0.032722
SpectralClustering #3,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[0, 0, 13, 10, 1, 10, 13, 10, 14, 12, 13, 10, ...",15,0.03206
SpectralClustering #4,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[15, 0, 7, 18, 4, 15, 15, 15, 11, 8, 18, 13, 1...",20,0.033032


## Similarity matrices (input to clustering methods)

In [15]:
# build the filename of the similarity matrix that was given as input to the
# clustering results previewed above (first input file); tissue and gene
# selection strategy are parsed from that filename instead of being hard-coded
_first_file_match = filename_pattern.search(input_files[0].name)
similarity_matrix_filename = SIMILARITY_MATRIX_FILENAME_TEMPLATE.format(
    tissue=_first_file_match.group("tissue"),
    gene_sel_strategy=_first_file_match.group("gene_sel_strategy"),
    # for methods other than clustermatch, only the part of the name before
    # the first "_" is used in the similarity matrix filename
    corr_method=CORRELATION_METHOD_NAME.split("_")[0]
    if not CORRELATION_METHOD_NAME.startswith("clustermatch")
    else CORRELATION_METHOD_NAME,
)
display(similarity_matrix_filename)

'gtex_v8_data_adipose_subcutaneous-var_pc_log2-clustermatch.pkl'

In [16]:
# NOTE: `tmp` is reused here; from this point on it holds the similarity
# matrix (not the clustering results loaded before)
tmp = pd.read_pickle(SIMILARITY_MATRICES_DIR / similarity_matrix_filename)

In [17]:
# shape of the similarity matrix loaded above
tmp.shape

(5000, 5000)

In [18]:
# preview the similarity matrix; its index is used later as the list of gene IDs
tmp.head()

gene_ens_id,ENSG00000129824.15,ENSG00000149968.11,ENSG00000134184.12,ENSG00000224114.1,ENSG00000173432.10,ENSG00000067048.16,ENSG00000229807.10,ENSG00000012817.15,ENSG00000134339.8,ENSG00000175084.11,...,ENSG00000197467.13,ENSG00000279807.1,ENSG00000107742.12,ENSG00000176454.13,ENSG00000212206.1,ENSG00000104611.11,ENSG00000181800.5,ENSG00000171049.8,ENSG00000257671.1,ENSG00000180448.10
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000129824.15,1.0,0.092378,0.007262,0.011841,0.105775,0.479273,0.478657,0.483544,0.117969,0.043658,...,0.004467,0.014411,0.014983,0.045078,0.007907,0.009798,0.011161,0.00962,0.003538,0.005812
ENSG00000149968.11,0.092378,1.0,0.00457,0.00917,0.015736,0.078307,0.073894,0.074441,0.017325,0.014241,...,0.04125,0.024692,0.008636,0.014949,0.029433,0.001359,0.019925,0.012707,0.030752,0.003618
ENSG00000134184.12,0.007262,0.00457,1.0,0.00567,0.005632,0.005024,0.005195,0.006747,0.002646,0.008142,...,0.023497,0.004689,0.00549,0.006992,0.006293,0.008088,0.003917,0.011875,0.001767,0.00645
ENSG00000224114.1,0.011841,0.00917,0.00567,1.0,0.007314,0.00442,0.004736,0.002574,0.007253,0.012894,...,0.007156,0.005168,0.002192,0.008732,0.003973,0.003198,0.004628,0.014917,0.0019,0.003556
ENSG00000173432.10,0.105775,0.015736,0.005632,0.007314,1.0,0.107761,0.089364,0.105775,0.815978,0.04639,...,0.04639,0.002897,0.040011,0.126462,0.003632,0.015732,0.004559,0.024573,0.008535,0.00771


### Convert Ensembl Gene IDs to Entrez IDs

In [19]:
# file with the gene ID mappings used to convert Ensembl IDs to Entrez IDs;
# stop early if it does not exist
input_filename = conf.GTEX["DATA_DIR"] / "gtex_entrez_gene_ids_mappings.pkl"
display(input_filename)
assert input_filename.exists()

PosixPath('/opt/data/data/gtex_v8/gtex_entrez_gene_ids_mappings.pkl')

In [20]:
# load the gene ID mappings table
gene_ids_mappings = pd.read_pickle(input_filename)

In [21]:
# size of the gene ID mappings table
gene_ids_mappings.shape

(34527, 3)

In [22]:
# preview the mappings; `gene_ens_id_v` and `entrez_id` are the columns used below
gene_ids_mappings.head()

Unnamed: 0,gene_ens_id_v,ensembl_id,entrez_id
0,ENSG00000145309.5,ENSG00000145309,85438
1,ENSG00000175820.3,ENSG00000175820,643677
2,ENSG00000083454.21,ENSG00000083454,5026
3,ENSG00000071794.15,ENSG00000071794,6596
4,ENSG00000211918.1,ENSG00000211918,28503


In [23]:
# lookup table: `gene_ens_id_v` value -> `entrez_id` value
_entrez_id_by_gene_ens_id_v = gene_ids_mappings.set_index("gene_ens_id_v")[
    "entrez_id"
]
gene_id_maps = _entrez_id_by_gene_ens_id_v.to_dict()

In [24]:
# show the first two entries of the lookup table
dict(list(gene_id_maps.items())[0:2])

{'ENSG00000145309.5': '85438', 'ENSG00000175820.3': '643677'}

In [25]:
# is map from ensembl to entrez unique?
# (`tmp` is the similarity matrix loaded above; genes in its index that have
# no entry in the lookup table are skipped)
_tmp_index = [gene_id_maps[x] for x in tmp.index if x in gene_id_maps]
display(len(_tmp_index))
display(_tmp_index[:5])
# number of distinct Entrez IDs: the map is unique for these genes only if
# this equals the number of mapped genes displayed first
display(len(set(_tmp_index)))

4201

['6192', '4314', '2944', '100271063', '6288']

# Run

In [26]:
# number of partitions (rows) in the first clustering file; it is used below
# to compute the total number of tasks, so all input files are taken to have
# this same number of partitions
n_partitions_per_file = pd.read_pickle(input_files[0]).shape[0]
display(n_partitions_per_file)

25

In [27]:
# the number of tasks is the number of input files times number of partitions per file
# (this is only used as the progress bar total, and takes every input file
# to have as many partitions as the first one)
n_tasks = int(len(input_files) * n_partitions_per_file)
display(f"number of tasks: {n_tasks}")

# For each clustering file: build the gene universe (as unique Entrez IDs),
# run the enrichment of every partition in parallel, and save all the results
# of that file into a single pickle file in OUTPUT_DIR.
with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor, tqdm(
    total=n_tasks, ncols=100
) as pbar:
    for clustering_filepath in input_files:
        # extract from input clustering filename some sections, such as tissue name, etc
        m = filename_pattern.search(clustering_filepath.name)

        tissue = m.group("tissue")
        gene_sel_strategy = m.group("gene_sel_strategy")
        corr_method = m.group("corr_method")

        # update pbar description
        pbar.set_description(f"{tissue}/{gene_sel_strategy}")

        # output filename for this clustering file (used when saving below)
        output_filename = (
            f"{clustering_filepath.stem}-{ENRICH_FUNCTION}-{ENRICH_PARAMS}.pkl"
        )

        # read clustering results
        clustering_df = pd.read_pickle(clustering_filepath)

        # number of genes that were clustered (taken from the first partition)
        n_genes = clustering_df.iloc[0].partition.shape[0]

        # use those sections to read the list of genes from the input data
        # file that the clustering algorithm received
        similarity_matrix_filename = SIMILARITY_MATRIX_FILENAME_TEMPLATE.format(
            tissue=tissue,
            gene_sel_strategy=gene_sel_strategy,
            # for methods other than clustermatch, only the part of the name
            # before the first "_" is used in the similarity matrix filename
            corr_method=corr_method.split("_")[0]
            if not corr_method.startswith("clustermatch")
            else corr_method,
        )

        # get the universe of genes
        all_gene_ens_ids = pd.read_pickle(
            SIMILARITY_MATRICES_DIR / similarity_matrix_filename
        ).index.tolist()

        # the boolean mask built below is applied to each partition, so both
        # must refer to the same list of genes
        assert n_genes == len(all_gene_ens_ids), (
            f"Partition size ({n_genes}) does not match the number of genes "
            f"in {similarity_matrix_filename} ({len(all_gene_ens_ids)})"
        )

        # convert gene ensembl ids to entrez and create clustering partition mask:
        # a gene is kept only if it has an Entrez ID and that Entrez ID was
        # not already added by a previous gene
        partition_mask = []
        all_gene_ids = []
        entrez_ids_added = set()  # set membership checks are faster than on a list

        for gene_ens_id in all_gene_ens_ids:
            # TODO: maybe this avoiding of repeated gene ids is not necessary?
            keep_gene = (
                gene_ens_id in gene_id_maps
                and gene_id_maps[gene_ens_id] not in entrez_ids_added
            )
            partition_mask.append(keep_gene)

            if keep_gene:
                new_entrez_id = gene_id_maps[gene_ens_id]
                all_gene_ids.append(new_entrez_id)
                entrez_ids_added.add(new_entrez_id)

        partition_mask = np.array(partition_mask, dtype=bool)
        all_gene_ids = np.array(all_gene_ids)
        assert np.unique(all_gene_ids).shape[0] == all_gene_ids.shape[0]
        assert all_gene_ids.shape[0] == np.sum(partition_mask)

        # submit one enrichment task per clustering solution (partition),
        # ordered by number of clusters
        futures = [
            executor.submit(
                run_enrich,
                all_gene_ids,
                "ENTREZID",
                cr.partition[partition_mask],
                ENRICH_FUNCTION,
                ENRICH_PARAMS,
            )
            for _, cr in clustering_df.sort_values("n_clusters").iterrows()
        ]

        # collect results as tasks finish
        results_full = []

        for task in as_completed(futures):
            task_results = task.result()
            pbar.update(1)

            # continue if no enrichment found
            if len(task_results) == 0:
                continue

            # only the first element of the task results is kept
            # NOTE(review): presumably this is the full results table returned
            # by `run_enrich` - confirm against its implementation
            results_full.append(task_results[0])

        if len(results_full) == 0:
            # no significant results, continue
            continue

        # merge and save
        pbar.set_description(f"{tissue}/{gene_sel_strategy}/saving")

        # full
        results_full_df = pd.concat(results_full, ignore_index=True).sort_values(
            ["n_clusters", "pvalue_adjust"]
        )

        results_full_df.to_pickle(OUTPUT_DIR / output_filename)

'number of tasks: 125'

whole_blood/var_pc_log2/saving: 100%|█████████████████████████████| 125/125 [05:16<00:00,  2.53s/it]
