# Description

This notebook reads the clustering results, takes the top 4 partitions with the most clusters, and analyzes each cluster, providing a list of latent variables (LVs) that are driving that cluster. For example, for the hypertension traits, it might find an LV with genes expressed in cardiomyocytes or other potentially related cell types.

It uses the `papermill` API to run the notebook `interpret_cluster.run.ipynb` (which serves as a template) for each cluster. Results are saved in folder `cluster_analyses`.

# Modules loading

In [1]:
# Reload imported modules automatically before running each cell, so edits to
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import shutil
import subprocess
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

import pandas as pd
import papermill as pm

import conf

# Load best partitions

In [4]:
# Folder with the consensus clustering results, under the configured
# clustering results directory.
CONSENSUS_CLUSTERING_DIR = (
    Path(conf.RESULTS["CLUSTERING_DIR"]) / "consensus_clustering"
).resolve()

display(CONSENSUS_CLUSTERING_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering')

In [5]:
# Pickled table of partitions to read (one entry per k, judging by the file name
# and the index shown when it is displayed).
input_file = (CONSENSUS_CLUSTERING_DIR / "best_partitions_by_k.pkl").resolve()
display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering/best_partitions_by_k.pkl')

In [6]:
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source; this one is read from the project's own results
# folder (conf.RESULTS["CLUSTERING_DIR"]).
best_partitions = pd.read_pickle(input_file)

In [7]:
# Each k must appear only once in the index: the interpretation loop looks up
# a partition with `best_partitions.loc[part_k, "partition"]` and expects a
# single value back.
assert best_partitions.index.is_unique

In [8]:
best_partitions.shape

(59, 4)

In [9]:
best_partitions.head()

Unnamed: 0_level_0,method,partition,ami_mean,selected
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11,scc_020,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.301839,True
10,scc_020,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.301738,True
9,scc_020,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.301301,True
4,eac_complete_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.300385,True
8,scc_020,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.300171,True


# Select top k partitions

In [10]:
# I take the top 4 partitions (according to their number of clusters).
# These are the partitions that will be analyzed in the manuscript.
# Keep only the partitions flagged as "selected", order their k values (the
# index) from largest to smallest, and take the first 4.
selected_partition_ks = (
    best_partitions.loc[best_partitions["selected"]]
    .index.sort_values(ascending=False)[:4]
)
display(selected_partition_ks)

Int64Index([45, 41, 38, 28], dtype='int64', name='k')

# Run interpretation

In [11]:
# Folder where the executed per-cluster notebooks are written.
CLUSTER_ANALYSIS_OUTPUT_DIR = (
    Path(conf.RESULTS["CLUSTERING_INTERPRETATION_OUTPUT_DIR"]) / "cluster_analyses"
).resolve()
display(CLUSTER_ANALYSIS_OUTPUT_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/nbs/20_cluster_interpretation/cluster_analyses')

In [12]:
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
CLUSTER_ANALYSIS_OUTPUT_DIR.mkdir(exist_ok=True)

In [13]:
def run_notebook(input_nb, output_nb, parameters) -> None:
    """Execute a notebook with papermill and save the executed copy.

    Parameters
    ----------
    input_nb
        Path of the notebook to execute (used as a template).
    output_nb
        Path where the executed notebook is written.
    parameters
        Mapping of parameter names to values, passed to papermill to
        parameterize the notebook.

    Returns
    -------
    None
        The result of ``pm.execute_notebook`` is discarded.
    """
    pm.execute_notebook(
        input_path=input_nb,
        output_path=output_nb,
        parameters=parameters,
        progress_bar=False,
    )

In [14]:
# The template notebook is the same for every partition and cluster, so its
# path is resolved once here instead of on every inner-loop iteration.
input_nb = Path(
    conf.RESULTS["CLUSTERING_INTERPRETATION_OUTPUT_DIR"],
    "interpret_cluster.run.ipynb",
).resolve()

for part_k in selected_partition_ks:
    print(f"Partition k:{part_k}", flush=True)

    # Start from an empty folder for this partition so that notebooks from a
    # previous run do not linger.
    output_folder = Path(CLUSTER_ANALYSIS_OUTPUT_DIR, f"part{part_k}").resolve()
    shutil.rmtree(output_folder, ignore_errors=True)
    output_folder.mkdir()

    part = best_partitions.loc[part_k, "partition"]
    # value_counts() orders the cluster ids by size (largest first by default).
    part_clusters = pd.Series(part).value_counts()

    # always skip the biggest cluster in each partition
    for c_size_idx, c in enumerate(part_clusters.index[1:]):
        print(f"  Cluster: {c}", flush=True)

        # The file name is prefixed with the cluster's rank by size (after
        # dropping the biggest one), followed by the partition k and cluster id.
        output_nb = Path(
            output_folder, f"{c_size_idx:02}-part{part_k}_k{c}.ipynb"
        ).resolve()

        parameters = dict(PARTITION_K=part_k, PARTITION_CLUSTER_ID=c)

        run_notebook(input_nb, output_nb, parameters)

Partition k:45
  Cluster: 0
  Cluster: 38
  Cluster: 35
  Cluster: 39
  Cluster: 25
  Cluster: 36
  Cluster: 30
  Cluster: 29
  Cluster: 27
  Cluster: 42
  Cluster: 34
  Cluster: 41
  Cluster: 24
  Cluster: 23
  Cluster: 31
  Cluster: 21
  Cluster: 20
  Cluster: 32
  Cluster: 12
  Cluster: 8
  Cluster: 43
  Cluster: 15
  Cluster: 14
  Cluster: 9
  Cluster: 7
  Cluster: 19
  Cluster: 17
  Cluster: 1
  Cluster: 6
  Cluster: 16
  Cluster: 26
  Cluster: 13
  Cluster: 11
  Cluster: 5
  Cluster: 40
  Cluster: 4
  Cluster: 2
  Cluster: 3
  Cluster: 33
  Cluster: 10
  Cluster: 22
  Cluster: 37
  Cluster: 28
  Cluster: 44
Partition k:41
  Cluster: 35
  Cluster: 38
  Cluster: 36
  Cluster: 37
  Cluster: 32
  Cluster: 34
  Cluster: 28
  Cluster: 39
  Cluster: 25
  Cluster: 33
  Cluster: 26
  Cluster: 20
  Cluster: 23
  Cluster: 24
  Cluster: 17
  Cluster: 22
  Cluster: 19
  Cluster: 12
  Cluster: 16
  Cluster: 29
  Cluster: 21
  Cluster: 10
  Cluster: 9
  Cluster: 7
  Cluster: 11
  Cluster: 8
  C