# Description

This notebook runs gene enrichment analysis on *all* the clustering results (each obtained using a given correlation measure) of a dataset.
The dataset and the enrichment settings are specified below under `Settings`.

# Modules loading

In [1]:
import re
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from tqdm import tqdm

from clustermatch import conf
from clustermatch.gene_enrich import run_enrich

# Settings

In [2]:
# dataset-specific configuration (recount2); the directories and filename
# patterns used throughout this notebook are read from it
DATASET_CONFIG = conf.RECOUNT2

In [3]:
# we do not need to split by method for recount2
# CORRELATION_METHOD_NAME = "pearson_abs"

In [4]:
# GENE_SELECTION_STRATEGY = "var_pc_log2"

In [5]:
# clusterProfiler settings
# name of the enrichment function to run
ENRICH_FUNCTION = "enrichGO"
# cutoff used for the "simplified" version of the enrichment results
# (also embedded in the simplified output filenames)
SIMPLIFY_CUTOFF = 0.7
# GO ontologies to test; enrichment is run once per ontology
GO_ONTOLOGIES = ("BP", "CC", "MF")

# Paths

In [6]:
# directory with the clustering results (pickle files) to run enrichment on;
# it must already exist
INPUT_DIR = DATASET_CONFIG["CLUSTERING_DIR"]
display(INPUT_DIR)
assert INPUT_DIR.exists()

PosixPath('/opt/data/results/recount2/clustering')

In [7]:
# this directory has the input data given to the clustering methods; the list
# of genes of each similarity matrix is read from here, so fail early if it is
# missing (same check as for the clustering directory)
SIMILARITY_MATRICES_DIR = DATASET_CONFIG["SIMILARITY_MATRICES_DIR"]
display(SIMILARITY_MATRICES_DIR)
assert SIMILARITY_MATRICES_DIR.exists()

PosixPath('/opt/data/results/recount2/similarity_matrices')

In [8]:
# filename template of the similarity matrices; it is filled in with
# str.format using the `corr_method` field
SIMILARITY_MATRIX_FILENAME_TEMPLATE = DATASET_CONFIG[
    "SIMILARITY_MATRIX_FILENAME_TEMPLATE"
]
display(SIMILARITY_MATRIX_FILENAME_TEMPLATE)

'recount_data_prep_PLIER-{corr_method}.pkl'

In [9]:
# directory where the enrichment results are saved; it is created (including
# parent directories) if it does not exist yet
OUTPUT_DIR = DATASET_CONFIG["GENE_ENRICHMENT_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/recount2/gene_set_enrichment')

# Get data files

In [10]:
# regular expression that identifies clustering result files; it is expected to
# define a named group `corr_method`, which is read when running the enrichment
filename_pattern = re.compile(DATASET_CONFIG["CLUSTERING_FILENAME_PATTERN"])

In [11]:
# get input data files according to Settings
# The pattern is matched against the file *name* only (not the full path), the
# same way the filename is parsed later when running the enrichment; this way
# a directory component can never make an unrelated file look like a match.
input_files = sorted(
    f for f in INPUT_DIR.iterdir() if filename_pattern.search(f.name) is not None
)
display(len(input_files))
display(input_files[:5])

assert len(input_files) > 0

6

[PosixPath('/opt/data/results/recount2/clustering/recount_data_prep_PLIER-clustermatch_k2-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/recount2/clustering/recount_data_prep_PLIER-clustermatch_k2to5-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/recount2/clustering/recount_data_prep_PLIER-pearson_abs-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/recount2/clustering/recount_data_prep_PLIER-pearson_full-SpectralClustering.pkl'),
 PosixPath('/opt/data/results/recount2/clustering/recount_data_prep_PLIER-spearman_abs-SpectralClustering.pkl')]

# Preview input data

## Clustering results

In [12]:
# load the first clustering results file as an example
tmp = pd.read_pickle(input_files[0])

In [13]:
# (number of rows, number of columns) of the clustering results
tmp.shape

(25, 4)

In [14]:
# first rows of the clustering results
tmp.head()

Unnamed: 0_level_0,params,partition,n_clusters,si_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SpectralClustering #0,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, ...",2,0.058329
SpectralClustering #1,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[0, 0, 0, 4, 0, 4, 0, 0, 4, 0, 1, 0, 0, 0, 0, ...",5,0.054201
SpectralClustering #2,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[3, 3, 3, 7, 3, 7, 3, 3, 7, 3, 3, 3, 3, 3, 3, ...",10,0.038756
SpectralClustering #3,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[4, 4, 4, 10, 13, 12, 4, 4, 10, 4, 13, 12, 4, ...",15,0.031417
SpectralClustering #4,"{""affinity"": ""precomputed"", ""assign_labels"": ""...","[13, 13, 11, 5, 19, 4, 11, 11, 4, 11, 19, 13, ...",20,0.027535


## Similarity matrices (input to clustering methods)

In [15]:
# build the filename of one similarity matrix (here, the one for
# "clustermatch_k2") to preview it
similarity_matrix_filename = SIMILARITY_MATRIX_FILENAME_TEMPLATE.format(
    corr_method="clustermatch_k2",
)
display(similarity_matrix_filename)

'recount_data_prep_PLIER-clustermatch_k2.pkl'

In [16]:
# load the similarity matrix selected above
tmp = pd.read_pickle(SIMILARITY_MATRICES_DIR / similarity_matrix_filename)

In [17]:
# (number of rows, number of columns) of the similarity matrix
tmp.shape

(6750, 6750)

In [18]:
# first rows of the similarity matrix
tmp.head()

Unnamed: 0,GAS6,MMP14,DSP,MARCKSL1,SPARC,CTSD,EPAS1,PALLD,PHC2,LGALS3BP,...,LDHB,LDHC,ACAP2,ACAP3,CFL2,CFL1,NFIB,PLEKHG6,GNGT2,SERPINH1
GAS6,1.0,0.359879,0.161606,0.069152,0.179718,0.241732,0.273178,0.240247,0.22299,0.220751,...,0.008947,0.065011,0.020923,0.244929,0.129664,0.067795,0.106808,0.090599,0.099318,0.255519
MMP14,0.359879,1.0,0.144209,0.0602,0.271937,0.192011,0.258037,0.207851,0.187494,0.193339,...,0.011178,0.075366,0.033851,0.134849,0.096144,0.051084,0.068585,0.110153,0.111159,0.264664
DSP,0.161606,0.144209,1.0,0.042226,0.062718,0.048719,0.207457,0.277144,0.018145,0.147427,...,0.006362,0.060892,0.005772,0.027932,0.135008,0.003201,0.142329,0.143227,-2.7e-05,0.312919
MARCKSL1,0.069152,0.0602,0.042226,1.0,0.041695,0.066842,0.021648,0.039905,0.100069,0.043523,...,0.099796,0.05016,0.036485,0.13652,0.028806,0.131773,0.02184,0.044066,0.057527,0.099114
SPARC,0.179718,0.271937,0.062718,0.041695,1.0,0.067178,0.123818,0.169873,0.081537,0.117885,...,0.034611,0.033891,0.00314,0.040078,0.128344,0.009827,0.104421,0.013683,0.072254,0.13684


# Run

In [19]:
# cutoff rendered with two decimals and without the dot, so it can be embedded
# in the names of the simplified output files
simplified_cutoff_str = format(SIMPLIFY_CUTOFF, ".2f").replace(".", "")
display(simplified_cutoff_str)

'070'

In [20]:
# number of partitions (rows) in the first clustering results file
first_clustering_results = pd.read_pickle(input_files[0])
n_partitions_per_file = first_clustering_results.shape[0]
display(n_partitions_per_file)

25

In [21]:
# One enrichment task is run per (input file, partition, GO ontology). The
# number of partitions is taken from the first input file
# (n_partitions_per_file), so this total -- used only to size the progress
# bar -- is exact only when every input file has the same number of partitions.
n_tasks = int(len(input_files) * n_partitions_per_file * len(GO_ONTOLOGIES))
display(f"number of tasks: {n_tasks}")

with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor, tqdm(
    total=n_tasks, ncols=100
) as pbar:
    for clustering_filepath in input_files:
        # extract the correlation method name from the clustering filename
        m = re.search(filename_pattern, str(clustering_filepath.name))
        if m is None:
            raise ValueError(
                f"Unexpected clustering filename: {clustering_filepath.name}"
            )
        corr_method = m.group("corr_method")

        # update pbar description
        pbar.set_description(f"{corr_method}")

        # output filename templates; "{ontology}" is filled in per GO ontology
        full_output_filename_template = (
            f"{clustering_filepath.stem}-{ENRICH_FUNCTION}-{{ontology}}_full.pkl"
        )
        simplified_output_filename_template = (
            f"{clustering_filepath.stem}-{ENRICH_FUNCTION}-{{ontology}}"
            f"_simplified_{simplified_cutoff_str}.pkl"
        )

        # read clustering results (one row per partition)
        clustering_df = pd.read_pickle(clustering_filepath)
        n_partitions = clustering_df.shape[0]

        # An ontology is pending unless BOTH of its output files already exist.
        # This is checked once per input file instead of once per partition.
        # NOTE(review): an ontology for which no full or no simplified results
        # are produced gets no output file, so it is recomputed on every run.
        pending_ontologies = [
            ontology
            for ontology in GO_ONTOLOGIES
            if not (
                (
                    OUTPUT_DIR / full_output_filename_template.format(ontology=ontology)
                ).exists()
                and (
                    OUTPUT_DIR
                    / simplified_output_filename_template.format(ontology=ontology)
                ).exists()
            )
        ]

        # advance the progress bar for the tasks that are skipped
        n_skipped = (len(GO_ONTOLOGIES) - len(pending_ontologies)) * n_partitions
        if n_skipped > 0:
            pbar.update(n_skipped)

        # nothing to compute for this file: avoid loading the similarity matrix
        if len(pending_ontologies) == 0:
            continue

        # number of genes, taken from the first partition
        n_genes = clustering_df.iloc[0]["partition"].shape[0]

        # Get the filename of the input data that the clustering algorithm
        # received. For methods other than clustermatch, only the part of the
        # name before the first underscore identifies the similarity matrix.
        similarity_matrix_filename = SIMILARITY_MATRIX_FILENAME_TEMPLATE.format(
            corr_method=corr_method
            if corr_method.startswith("clustermatch")
            else corr_method.split("_")[0],
        )

        # get the universe of genes (index of the similarity matrix)
        all_gene_ids = np.array(
            pd.read_pickle(
                SIMILARITY_MATRICES_DIR / similarity_matrix_filename
            ).index.tolist()
        )
        assert all_gene_ids.shape[0] == n_genes

        # submit one task per clustering solution (partition) and pending GO
        # ontology; each future is mapped to its ontology
        futures = {
            executor.submit(
                run_enrich,
                all_gene_ids,
                cr["partition"],
                ENRICH_FUNCTION,
                ontology,
                key_type="SYMBOL",
                simplify_cutoff=SIMPLIFY_CUTOFF,
            ): ontology
            for _, cr in clustering_df.sort_values("n_clusters").iterrows()
            for ontology in pending_ontologies
        }

        # collect results, grouped by ontology
        results_full = defaultdict(list)
        results_simplified = defaultdict(list)

        for task in as_completed(futures):
            ont = futures[task]
            task_results = task.result()
            pbar.update(1)

            # continue if no enrichment found
            if len(task_results) == 0:
                continue

            # first element: full results; second (if present): simplified
            results_full[ont].append(task_results[0])

            if len(task_results) > 1:
                results_simplified[ont].append(task_results[1])

        if len(results_full) == 0:
            # no significant results, continue
            continue

        # merge and save
        pbar.set_description(f"{corr_method}/saving")

        for ontology in pending_ontologies:
            # Skip ontologies without results: pd.concat raises ValueError on
            # an empty list. `.get` avoids adding empty entries to the dicts.
            ontology_results_full = results_full.get(ontology)
            if not ontology_results_full:
                continue

            # full
            results_full_df = pd.concat(
                ontology_results_full, ignore_index=True
            ).sort_values(["n_clusters", "fdr_per_partition"])

            results_full_df.to_pickle(
                OUTPUT_DIR / full_output_filename_template.format(ontology=ontology)
            )

            # simplified (only if this ontology has simplified results)
            ontology_results_simplified = results_simplified.get(ontology)
            if ontology_results_simplified:
                results_simplified_df = pd.concat(
                    ontology_results_simplified, ignore_index=True
                ).sort_values(["n_clusters", "fdr_per_partition"])

                results_simplified_df.to_pickle(
                    OUTPUT_DIR
                    / simplified_output_filename_template.format(ontology=ontology)
                )

'number of tasks: 450'

spearman_full/saving: 100%|███████████████████████████████████| 450/450 [14:33:08<00:00, 116.42s/it]
