# Description

TODO

# Environment variables

In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
from IPython.display import display

N_JOBS = 2

In [3]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=2
env: OPEN_BLAS_NUM_THREADS=2
env: NUMEXPR_NUM_THREADS=2
env: OMP_NUM_THREADS=2


# Modules loading

In [31]:
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
from sklearn.cluster import SpectralClustering
from tqdm import tqdm
# import matplotlib.pyplot as plt
# import seaborn as sns

from clustermatch import conf
from clustermatch.utils import simplify_string

# Settings

In [5]:
CORRELATION_METHOD_NAME = "clustermatch"

In [6]:
GENE_SELECTION_STRATEGY = "var_pc_log2"

In [7]:
# Tissues with largest sample size from GTEx (see nbs/05_preprocessing/00-gtex_v8-split_by_tissue.ipynb)
TISSUES = [
    "Muscle - Skeletal",
    "Whole Blood",
    "Skin - Sun Exposed (Lower leg)",
    "Adipose - Subcutaneous",
    "Artery - Tibial",
]

In [8]:
K_RANGE = [2] + np.arange(5, 100 + 1, 5).tolist() + [125, 150, 175, 200]

In [9]:
N_INIT = 50

In [10]:
INITIAL_RANDOM_STATE = 12345

In [36]:
def process_similarity_matrix(similarity_matrix):
    # for clustermatch, negative values are meaningless, so we replace them by zero
    similarity_matrix[similarity_matrix < 0.0] = 0.0
    return similarity_matrix

In [12]:
def get_distance_matrix(similarity_matrix):
    """
    Converts the processed similarity matrix into a distance matrix.
    """
    return 1.0 - similarity_matrix

# Paths

In [13]:
INPUT_DIR = conf.GTEX["SIMILARITY_MATRICES_DIR"]
display(INPUT_DIR)
assert INPUT_DIR.exists()

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/similarity_matrices')

In [14]:
OUTPUT_DIR = conf.GTEX["CLUSTERING_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(INPUT_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/similarity_matrices')

# Setup clustering options

In [15]:
CLUSTERING_OPTIONS = {}

CLUSTERING_OPTIONS["K_RANGE"] = K_RANGE
CLUSTERING_OPTIONS["KMEANS_N_INIT"] = N_INIT

display(CLUSTERING_OPTIONS)

{'K_RANGE': [2,
  5,
  10,
  15,
  20,
  25,
  30,
  35,
  40,
  45,
  50,
  55,
  60,
  65,
  70,
  75,
  80,
  85,
  90,
  95,
  100,
  125,
  150,
  175,
  200],
 'KMEANS_N_INIT': 50}

# Get data files

In [16]:
tissue_names = [simplify_string(t.lower()) for t in TISSUES]
display(tissue_names)

['muscle_skeletal',
 'whole_blood',
 'skin_sun_exposed_lower_leg',
 'adipose_subcutaneous',
 'artery_tibial']

In [17]:
input_files = sorted(list(INPUT_DIR.glob(f"*-{GENE_SELECTION_STRATEGY}-{CORRELATION_METHOD_NAME}.pkl")))
input_files = [
    f for f in input_files if any(f"gtex_v8_data_{tn}-" in f.name for tn in tissue_names)
]
display(len(input_files))
display(input_files)

assert len(input_files) == len(TISSUES), len(TISSUES)
display(input_files[:5])

1

[PosixPath('/home/miltondp/projects/labs/greenelab/clustermatch_repos/clustermatch-gene-expr/base/results/gtex_v8/similarity_matrices/gtex_v8_data_muscle_skeletal-var_pc_log2-clustermatch.pkl')]

AssertionError: 5

## Show the content of one similarity matrix

In [18]:
sim_matrix = pd.read_pickle(input_files[0])

In [19]:
sim_matrix.shape

(5000, 5000)

In [20]:
sim_matrix.head()

gene_ens_id,ENSG00000129824.15,ENSG00000173432.10,ENSG00000134184.12,ENSG00000124107.5,ENSG00000248746.5,ENSG00000160808.9,ENSG00000224114.1,ENSG00000149968.11,ENSG00000134339.8,ENSG00000225972.1,...,ENSG00000176853.15,ENSG00000229108.1,ENSG00000107816.17,ENSG00000203896.9,ENSG00000103047.7,ENSG00000164309.14,ENSG00000122783.16,ENSG00000108622.10,ENSG00000275155.1,ENSG00000176155.18
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000129824.15,1.0,0.077671,0.007732,0.047127,0.024821,0.057628,0.020145,0.039757,0.065109,0.011351,...,0.064277,0.011286,0.063137,0.070143,0.034394,0.012673,0.037288,0.063345,0.016883,0.083314
ENSG00000173432.10,0.077671,1.0,0.006585,0.152826,0.014058,0.150876,0.011295,0.052657,0.740176,0.009023,...,0.112789,0.008191,0.068112,0.078765,0.07607,0.010097,0.096578,0.034157,0.013767,0.212478
ENSG00000134184.12,0.007732,0.006585,1.0,0.008955,0.002264,0.004715,0.003211,0.00811,0.00822,0.004516,...,0.004827,0.00863,0.016618,0.005108,0.008761,0.005632,0.006766,0.008962,0.011051,0.008469
ENSG00000124107.5,0.047127,0.152826,0.008955,1.0,0.098144,0.300738,0.012364,0.113659,0.126653,0.00711,...,0.099721,0.027318,0.026732,0.065606,0.255969,0.044182,0.194441,0.007456,0.004392,0.181435
ENSG00000248746.5,0.024821,0.014058,0.002264,0.098144,1.0,0.07597,0.005325,0.024211,0.01279,0.006844,...,0.024099,0.001094,0.007958,0.014175,0.041099,0.012013,0.021514,0.007689,0.004239,0.032633


# Clustering

In [21]:
# from clustering.methods import DeltaSpectralClustering
# from sklearn.metrics import silhouette_score

## Clusterers

In [22]:
# np.sqrt(sim_matrix.shape[0])

In [23]:
# from clustering.methods import DeltaSpectralClustering

In [38]:
CLUSTERERS = {}

idx = 0
random_state = INITIAL_RANDOM_STATE

for k in CLUSTERING_OPTIONS["K_RANGE"]:
    #     for delta_value in CLUSTERING_OPTIONS["DELTAS"]:
    #         for i in range(CLUSTERING_OPTIONS["N_REPS_PER_K"]):
    clus = SpectralClustering(
        eigen_solver="arpack",
        n_clusters=k,
        n_init=CLUSTERING_OPTIONS["KMEANS_N_INIT"],
        affinity="precomputed",
        random_state=random_state,
    )

    method_name = type(clus).__name__
    CLUSTERERS[f"{method_name} #{idx}"] = clus

    random_state = random_state + 1
    idx = idx + 1

In [39]:
display(len(CLUSTERERS))

25

In [40]:
_iter = iter(CLUSTERERS.items())
display(next(_iter))
display(next(_iter))

('SpectralClustering #0',
 SpectralClustering(affinity='precomputed', eigen_solver='arpack', n_clusters=2,
                    n_init=50, random_state=12345))

('SpectralClustering #1',
 SpectralClustering(affinity='precomputed', eigen_solver='arpack', n_clusters=5,
                    n_init=50, random_state=12346))

In [41]:
clustering_method_name = method_name
display(clustering_method_name)

'SpectralClustering'

## Generate ensemble

In [45]:
from sklearn.metrics import silhouette_score
from clustermatch.clustering import generate_ensemble
# from utils import generate_result_set_name

In [46]:
# # generate a temporary folder where to store the ensemble and avoid computing it again
# ensemble_folder = Path(
#     BASE_FOLDER,
#     "results",
#     METHOD_NAME,
# ).resolve()
# display(ensemble_folder)

# ensemble_folder.mkdir(parents=True, exist_ok=True)

In [47]:
pbar = tqdm(input_files, ncols=100)

for tissue_data_file in pbar:
    pbar.set_description(tissue_data_file.stem)

    # read
    sim_matrix = pd.read_pickle(tissue_data_file)
    sim_matrix = process_similarity_matrix(sim_matrix)

    ensemble = generate_ensemble(
        sim_matrix,
        CLUSTERERS,
        attributes=["n_clusters"],
        tqdm_args={"leave": False, "ncols": 100},
    )
    
    _tmp = ensemble["n_clusters"].value_counts().unique()
    assert _tmp.shape[0] == 1
    assert _tmp[0] == 1
    
    assert not ensemble["n_clusters"].isna().any()
    
    assert ensemble.shape[0] == len(CLUSTERERS)
    
    assert np.all(
        [
            part["partition"].shape[0] == sim_matrix.shape[0]
            for idx, part in ensemble.iterrows()
        ]
    )
    
    # no partition has negative labels or nan
    assert not np.any([np.isnan(part["partition"]).any() for idx, part in ensemble.iterrows()])
    assert not np.any([(part["partition"] < 0).any() for idx, part in ensemble.iterrows()])
    
    _real_k_values = ensemble["partition"].apply(lambda x: np.unique(x).shape[0])
    display(_real_k_values)
    assert np.all(ensemble["n_clusters"].values == _real_k_values.values)
    
    # add clustering quality measures
    dist_matrix = get_distance_matrix(sim_matrix)
    ensemble = ensemble.assign(
        si_score=ensemble["partition"].apply(
            lambda x: silhouette_score(dist_matrix, x, metric="precomputed")
        ),
    )

    # save
    output_filename = f"{tissue_data_file.stem}-{CORRELATION_METHOD_NAME}.pkl"
    ensemble.to_pickle(path=OUTPUT_DIR / output_filename)

gtex_v8_data_muscle_skeletal-var_pc_log2-clustermatch:   0%|                  | 0/1 [00:00<?, ?it/s]
  0%|                                                                        | 0/25 [00:00<?, ?it/s][A
  4%|██▌                                                             | 1/25 [00:02<00:58,  2.44s/it][A
  8%|█████                                                           | 2/25 [00:05<01:00,  2.63s/it][A
 12%|███████▋                                                        | 3/25 [00:08<01:08,  3.10s/it][A
 16%|██████████▏                                                     | 4/25 [00:12<01:12,  3.47s/it][A
 20%|████████████▊                                                   | 5/25 [00:17<01:16,  3.81s/it][A
 24%|███████████████▎                                                | 6/25 [00:23<01:24,  4.47s/it][A
 28%|█████████████████▉                                              | 7/25 [00:31<01:45,  5.87s/it][A
 32%|████████████████████▍                                         

clusterer_id
SpectralClustering #0       2
SpectralClustering #1       5
SpectralClustering #2      10
SpectralClustering #3      15
SpectralClustering #4      20
SpectralClustering #5      25
SpectralClustering #6      30
SpectralClustering #7      35
SpectralClustering #8      40
SpectralClustering #9      45
SpectralClustering #10     50
SpectralClustering #11     55
SpectralClustering #12     60
SpectralClustering #13     65
SpectralClustering #14     70
SpectralClustering #15     75
SpectralClustering #16     80
SpectralClustering #17     85
SpectralClustering #18     90
SpectralClustering #19     95
SpectralClustering #20    100
SpectralClustering #21    125
SpectralClustering #22    150
SpectralClustering #23    175
SpectralClustering #24    200
Name: partition, dtype: int64

gtex_v8_data_muscle_skeletal-var_pc_log2-clustermatch: 100%|█████████| 1/1 [06:30<00:00, 390.85s/it]


# Cluster quality

**TODO**: move this to another notebook

In [None]:
# with pd.option_context("display.max_rows", None, "display.max_columns", None):
#     _df = ensemble.groupby(["n_clusters", "delta"]).mean()
#     display(_df)

In [None]:
# with sns.plotting_context("talk", font_scale=0.75), sns.axes_style(
#     "whitegrid", {"grid.linestyle": "--"}
# ):
#     fig = plt.figure(figsize=(14, 6))
#     ax = sns.pointplot(data=ensemble, x="n_clusters", y="si_score", hue="delta")
#     ax.set_ylabel("Silhouette index\n(higher is better)")
#     ax.set_xlabel("Number of clusters ($k$)")
#     ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
#     plt.grid(True)
#     plt.tight_layout()