# Description

Runs hierarchical clustering on the umap version of the data.

# Environment variables

In [1]:
from IPython.display import display

import conf

N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

2

In [2]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=2
env: OPEN_BLAS_NUM_THREADS=2
env: NUMEXPR_NUM_THREADS=2
env: OMP_NUM_THREADS=2


# Modules loading

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd

from utils import generate_result_set_name

# Settings

In [5]:
np.random.seed(0)

## Input data

In [6]:
INPUT_SUBSET = "umap"

In [7]:
INPUT_STEM = "z_score_std-projection-smultixcan-efo_partial-mashr-zscores"

In [8]:
# parameters of the dimentionality reduction steps
DR_OPTIONS = {
    "n_components": 50,
    "metric": "euclidean",
    "n_neighbors": 15,
    "random_state": 0,
}

In [9]:
input_filepath = Path(
    conf.RESULTS["DATA_TRANSFORMATIONS_DIR"],
    INPUT_SUBSET,
    generate_result_set_name(
        DR_OPTIONS, prefix=f"{INPUT_SUBSET}-{INPUT_STEM}-", suffix=".pkl"
    ),
).resolve()
display(input_filepath)

assert input_filepath.exists(), "Input file does not exist"

input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')

'umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0'

## Clustering

In [10]:
from sklearn.cluster import AgglomerativeClustering

In [11]:
CLUSTERING_ATTRIBUTES_TO_SAVE = ["n_clusters"]

In [12]:
CLUSTERING_OPTIONS = {}

CLUSTERING_OPTIONS["K_MIN"] = 2
CLUSTERING_OPTIONS["K_MAX"] = 75  # sqrt(3749) + some more to get closer to 295
CLUSTERING_OPTIONS["LINKAGE"] = {"ward", "complete", "average", "single"}
CLUSTERING_OPTIONS["AFFINITY"] = "euclidean"

display(CLUSTERING_OPTIONS)

{'K_MIN': 2,
 'K_MAX': 75,
 'LINKAGE': {'average', 'complete', 'single', 'ward'},
 'AFFINITY': 'euclidean'}

In [13]:
CLUSTERERS = {}

idx = 0

for k in range(CLUSTERING_OPTIONS["K_MIN"], CLUSTERING_OPTIONS["K_MAX"] + 1):
    for linkage in CLUSTERING_OPTIONS["LINKAGE"]:
        if linkage == "ward":
            affinity = "euclidean"
        else:
            affinity = "precomputed"

        clus = AgglomerativeClustering(
            n_clusters=k,
            affinity=affinity,
            linkage=linkage,
        )

        method_name = type(clus).__name__
        CLUSTERERS[f"{method_name} #{idx}"] = clus

        idx = idx + 1

In [14]:
display(len(CLUSTERERS))

296

In [15]:
_iter = iter(CLUSTERERS.items())
display(next(_iter))
display(next(_iter))

('AgglomerativeClustering #0',
 AgglomerativeClustering(affinity='precomputed', linkage='average'))

('AgglomerativeClustering #1',
 AgglomerativeClustering(affinity='precomputed', linkage='single'))

In [16]:
clustering_method_name = method_name
display(clustering_method_name)

'AgglomerativeClustering'

## Output directory

In [17]:
# output dir for this notebook
RESULTS_DIR = Path(
    conf.RESULTS["CLUSTERING_RUNS_DIR"],
    f"{INPUT_SUBSET}-{INPUT_STEM}",
).resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/runs/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores')

# Load input file

In [18]:
data = pd.read_pickle(input_filepath)

In [19]:
data.shape

(3752, 50)

In [20]:
data.head()

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5,UMAP6,UMAP7,UMAP8,UMAP9,UMAP10,...,UMAP41,UMAP42,UMAP43,UMAP44,UMAP45,UMAP46,UMAP47,UMAP48,UMAP49,UMAP50
100001_raw-Food_weight,9.58728,8.050184,9.340894,8.884522,4.562011,9.261327,9.078164,8.009141,9.288872,9.827831,...,9.753575,0.440851,-0.077164,-0.184829,9.345852,0.622542,-0.051136,4.266558,8.047482,-0.317385
100002_raw-Energy,9.910512,8.372274,9.605316,9.808598,4.64197,9.360196,8.848985,7.768594,9.231921,9.328465,...,9.647689,0.581584,0.54793,0.600128,8.976416,1.249645,0.234843,4.636343,7.173085,0.137119
100003_raw-Protein,9.923088,8.38064,9.609899,9.836457,4.647365,9.366775,8.835996,7.767549,9.230281,9.314386,...,9.642637,0.580362,0.565898,0.621073,8.967297,1.271435,0.24396,4.648671,7.149967,0.148192
100004_raw-Fat,9.898531,8.375697,9.600443,9.785713,4.639249,9.360721,8.886237,7.783376,9.237095,9.345434,...,9.64827,0.573073,0.521042,0.576926,8.979161,1.227988,0.231848,4.622777,7.205517,0.147385
100005_raw-Carbohydrate,9.895807,8.376662,9.605841,9.791338,4.636903,9.352356,8.867312,7.774787,9.233479,9.344456,...,9.652949,0.575161,0.529565,0.579919,8.988125,1.231771,0.226778,4.62436,7.194818,0.123104


In [21]:
assert not data.isna().any().any()

# Clustering

## Generate ensemble

In [22]:
from sklearn.metrics import pairwise_distances
from clustering.ensembles.utils import generate_ensemble

In [23]:
data_dist = pairwise_distances(data, metric=CLUSTERING_OPTIONS["AFFINITY"])

In [24]:
data_dist.shape

(3752, 3752)

In [25]:
pd.Series(data_dist.flatten()).describe().apply(str)

count            14077504.0
mean      3.086747407913208
std      1.5393855571746826
min                     0.0
25%        2.05721515417099
50%       3.083280563354492
75%      3.9437204003334045
max      15.400805473327637
dtype: object

In [26]:
ensemble = generate_ensemble(
    data_dist,
    CLUSTERERS,
    attributes=CLUSTERING_ATTRIBUTES_TO_SAVE,
    affinity_matrix=data_dist,
)

  0%|          | 0/296 [00:00<?, ?it/s]

  0%|          | 1/296 [00:00<01:27,  3.37it/s]

  1%|          | 2/296 [00:00<01:17,  3.79it/s]

  1%|          | 3/296 [00:21<31:43,  6.50s/it]

  1%|▏         | 4/296 [00:21<22:35,  4.64s/it]

  2%|▏         | 5/296 [00:22<16:10,  3.34s/it]

  2%|▏         | 6/296 [00:22<11:33,  2.39s/it]

  2%|▏         | 7/296 [00:43<38:22,  7.97s/it]

  3%|▎         | 8/296 [00:43<27:13,  5.67s/it]

  3%|▎         | 9/296 [00:43<19:24,  4.06s/it]

  3%|▎         | 10/296 [00:44<13:47,  2.89s/it]

  4%|▎         | 11/296 [01:05<39:31,  8.32s/it]

  4%|▍         | 12/296 [01:05<28:00,  5.92s/it]

  4%|▍         | 13/296 [01:05<19:57,  4.23s/it]

  5%|▍         | 14/296 [01:05<14:10,  3.02s/it]

  5%|▌         | 15/296 [01:26<39:17,  8.39s/it]

  5%|▌         | 16/296 [01:27<27:50,  5.97s/it]

  6%|▌         | 17/296 [01:27<19:49,  4.26s/it]

  6%|▌         | 18/296 [01:27<14:05,  3.04s/it]

  6%|▋         | 19/296 [01:48<38:48,  8.41s/it]

  7%|▋         | 20/296 [01:48<27:29,  5.98s/it]

  7%|▋         | 21/296 [01:49<19:35,  4.27s/it]

  7%|▋         | 22/296 [01:49<13:54,  3.05s/it]

  8%|▊         | 23/296 [02:10<38:11,  8.39s/it]

  8%|▊         | 24/296 [02:10<27:03,  5.97s/it]

  8%|▊         | 25/296 [02:10<19:15,  4.27s/it]

  9%|▉         | 26/296 [02:10<13:41,  3.04s/it]

  9%|▉         | 27/296 [02:31<37:35,  8.39s/it]

  9%|▉         | 28/296 [02:32<26:37,  5.96s/it]

 10%|▉         | 29/296 [02:32<18:57,  4.26s/it]

 10%|█         | 30/296 [02:32<13:27,  3.04s/it]

 10%|█         | 31/296 [02:53<37:12,  8.42s/it]

 11%|█         | 32/296 [02:53<26:21,  5.99s/it]

 11%|█         | 33/296 [02:54<18:45,  4.28s/it]

 11%|█▏        | 34/296 [02:54<13:19,  3.05s/it]

 12%|█▏        | 35/296 [03:15<36:30,  8.39s/it]

 12%|█▏        | 36/296 [03:15<25:51,  5.97s/it]

 12%|█▎        | 37/296 [03:15<18:24,  4.27s/it]

 13%|█▎        | 38/296 [03:15<13:04,  3.04s/it]

 13%|█▎        | 39/296 [03:36<36:02,  8.41s/it]

 14%|█▎        | 40/296 [03:37<25:31,  5.98s/it]

 14%|█▍        | 41/296 [03:37<18:10,  4.28s/it]

 14%|█▍        | 42/296 [03:37<12:54,  3.05s/it]

 15%|█▍        | 43/296 [03:58<35:22,  8.39s/it]

 15%|█▍        | 44/296 [03:58<25:03,  5.97s/it]

 15%|█▌        | 45/296 [03:59<17:50,  4.26s/it]

 16%|█▌        | 46/296 [03:59<12:40,  3.04s/it]

 16%|█▌        | 47/296 [04:20<34:54,  8.41s/it]

 16%|█▌        | 48/296 [04:20<24:42,  5.98s/it]

 17%|█▋        | 49/296 [04:20<17:35,  4.27s/it]

 17%|█▋        | 50/296 [04:21<12:29,  3.05s/it]

 17%|█▋        | 51/296 [04:42<34:19,  8.41s/it]

 18%|█▊        | 52/296 [04:42<24:18,  5.98s/it]

 18%|█▊        | 53/296 [04:42<17:18,  4.27s/it]

 18%|█▊        | 54/296 [04:42<12:16,  3.05s/it]

 19%|█▊        | 55/296 [05:03<33:41,  8.39s/it]

 19%|█▉        | 56/296 [05:03<23:51,  5.97s/it]

 19%|█▉        | 57/296 [05:04<16:59,  4.26s/it]

 20%|█▉        | 58/296 [05:04<12:03,  3.04s/it]

 20%|█▉        | 59/296 [05:25<33:14,  8.42s/it]

 20%|██        | 60/296 [05:25<23:32,  5.98s/it]

 21%|██        | 61/296 [05:25<16:45,  4.28s/it]

 21%|██        | 62/296 [05:26<11:53,  3.05s/it]

 21%|██▏       | 63/296 [05:46<32:31,  8.38s/it]

 22%|██▏       | 64/296 [05:47<23:01,  5.96s/it]

 22%|██▏       | 65/296 [05:47<16:23,  4.26s/it]

 22%|██▏       | 66/296 [05:47<11:37,  3.03s/it]

 23%|██▎       | 67/296 [06:08<32:04,  8.40s/it]

 23%|██▎       | 68/296 [06:09<22:42,  5.97s/it]

 23%|██▎       | 69/296 [06:09<16:09,  4.27s/it]

 24%|██▎       | 70/296 [06:09<11:27,  3.04s/it]

 24%|██▍       | 71/296 [06:30<31:34,  8.42s/it]

 24%|██▍       | 72/296 [06:30<22:21,  5.99s/it]

 25%|██▍       | 73/296 [06:31<15:54,  4.28s/it]

 25%|██▌       | 74/296 [06:31<11:17,  3.05s/it]

 25%|██▌       | 75/296 [06:52<31:05,  8.44s/it]

 26%|██▌       | 76/296 [06:52<22:00,  6.00s/it]

 26%|██▌       | 77/296 [06:52<15:39,  4.29s/it]

 26%|██▋       | 78/296 [06:53<11:06,  3.06s/it]

 27%|██▋       | 79/296 [07:13<30:27,  8.42s/it]

 27%|██▋       | 80/296 [07:14<21:33,  5.99s/it]

 27%|██▋       | 81/296 [07:14<15:19,  4.28s/it]

 28%|██▊       | 82/296 [07:14<10:52,  3.05s/it]

 28%|██▊       | 83/296 [07:35<29:51,  8.41s/it]

 28%|██▊       | 84/296 [07:35<21:07,  5.98s/it]

 29%|██▊       | 85/296 [07:36<15:01,  4.27s/it]

 29%|██▉       | 86/296 [07:36<10:39,  3.05s/it]

 29%|██▉       | 87/296 [07:57<29:13,  8.39s/it]

 30%|██▉       | 88/296 [07:57<20:40,  5.97s/it]

 30%|███       | 89/296 [07:57<14:42,  4.26s/it]

 30%|███       | 90/296 [07:58<10:25,  3.04s/it]

 31%|███       | 91/296 [08:18<28:36,  8.37s/it]

 31%|███       | 92/296 [08:19<20:14,  5.95s/it]

 31%|███▏      | 93/296 [08:19<14:23,  4.26s/it]

 32%|███▏      | 94/296 [08:19<10:12,  3.03s/it]

 32%|███▏      | 95/296 [08:40<28:10,  8.41s/it]

 32%|███▏      | 96/296 [08:40<19:56,  5.98s/it]

 33%|███▎      | 97/296 [08:41<14:10,  4.27s/it]

 33%|███▎      | 98/296 [08:41<10:03,  3.05s/it]

 33%|███▎      | 99/296 [09:02<27:34,  8.40s/it]

 34%|███▍      | 100/296 [09:02<19:30,  5.97s/it]

 34%|███▍      | 101/296 [09:02<13:52,  4.27s/it]

 34%|███▍      | 102/296 [09:03<09:50,  3.04s/it]

 35%|███▍      | 103/296 [09:23<26:59,  8.39s/it]

 35%|███▌      | 104/296 [09:24<19:05,  5.97s/it]

 35%|███▌      | 105/296 [09:24<13:34,  4.26s/it]

 36%|███▌      | 106/296 [09:24<09:37,  3.04s/it]

 36%|███▌      | 107/296 [09:45<26:33,  8.43s/it]

 36%|███▋      | 108/296 [09:46<18:47,  6.00s/it]

 37%|███▋      | 109/296 [09:46<13:21,  4.29s/it]

 37%|███▋      | 110/296 [09:46<09:28,  3.05s/it]

 38%|███▊      | 111/296 [10:07<26:01,  8.44s/it]

 38%|███▊      | 112/296 [10:07<18:24,  6.00s/it]

 38%|███▊      | 113/296 [10:08<13:04,  4.29s/it]

 39%|███▊      | 114/296 [10:08<09:16,  3.06s/it]

 39%|███▉      | 115/296 [10:29<25:21,  8.41s/it]

 39%|███▉      | 116/296 [10:29<17:55,  5.98s/it]

 40%|███▉      | 117/296 [10:29<12:44,  4.27s/it]

 40%|███▉      | 118/296 [10:30<09:02,  3.05s/it]

 40%|████      | 119/296 [10:50<24:47,  8.41s/it]

 41%|████      | 120/296 [10:51<17:31,  5.98s/it]

 41%|████      | 121/296 [10:51<12:27,  4.27s/it]

 41%|████      | 122/296 [10:51<08:49,  3.05s/it]

 42%|████▏     | 123/296 [11:12<24:14,  8.41s/it]

 42%|████▏     | 124/296 [11:12<17:08,  5.98s/it]

 42%|████▏     | 125/296 [11:13<12:10,  4.27s/it]

 43%|████▎     | 126/296 [11:13<08:37,  3.05s/it]

 43%|████▎     | 127/296 [11:34<23:35,  8.38s/it]

 43%|████▎     | 128/296 [11:34<16:40,  5.96s/it]

 44%|████▎     | 129/296 [11:34<11:51,  4.26s/it]

 44%|████▍     | 130/296 [11:35<08:23,  3.03s/it]

 44%|████▍     | 131/296 [11:55<22:58,  8.36s/it]

 45%|████▍     | 132/296 [11:56<16:14,  5.94s/it]

 45%|████▍     | 133/296 [11:56<11:32,  4.25s/it]

 45%|████▌     | 134/296 [11:56<08:10,  3.03s/it]

 46%|████▌     | 135/296 [12:17<22:24,  8.35s/it]

 46%|████▌     | 136/296 [12:17<15:50,  5.94s/it]

 46%|████▋     | 137/296 [12:17<11:14,  4.24s/it]

 47%|████▋     | 138/296 [12:18<07:58,  3.03s/it]

 47%|████▋     | 139/296 [12:39<22:00,  8.41s/it]

 47%|████▋     | 140/296 [12:39<15:33,  5.98s/it]

 48%|████▊     | 141/296 [12:39<11:02,  4.27s/it]

 48%|████▊     | 142/296 [12:39<07:49,  3.05s/it]

 48%|████▊     | 143/296 [13:00<21:22,  8.38s/it]

 49%|████▊     | 144/296 [13:01<15:06,  5.96s/it]

 49%|████▉     | 145/296 [13:01<10:43,  4.26s/it]

 49%|████▉     | 146/296 [13:01<07:35,  3.04s/it]

 50%|████▉     | 147/296 [13:22<20:51,  8.40s/it]

 50%|█████     | 148/296 [13:22<14:44,  5.97s/it]

 50%|█████     | 149/296 [13:23<10:27,  4.27s/it]

 51%|█████     | 150/296 [13:23<07:24,  3.04s/it]

 51%|█████     | 151/296 [13:44<20:16,  8.39s/it]

 51%|█████▏    | 152/296 [13:44<14:19,  5.97s/it]

 52%|█████▏    | 153/296 [13:44<10:09,  4.26s/it]

 52%|█████▏    | 154/296 [13:44<07:11,  3.04s/it]

 52%|█████▏    | 155/296 [14:05<19:44,  8.40s/it]

 53%|█████▎    | 156/296 [14:06<13:56,  5.97s/it]

 53%|█████▎    | 157/296 [14:06<09:53,  4.27s/it]

 53%|█████▎    | 158/296 [14:06<07:00,  3.04s/it]

 54%|█████▎    | 159/296 [14:27<19:08,  8.39s/it]

 54%|█████▍    | 160/296 [14:27<13:31,  5.96s/it]

 54%|█████▍    | 161/296 [14:28<09:35,  4.26s/it]

 55%|█████▍    | 162/296 [14:28<06:47,  3.04s/it]

 55%|█████▌    | 163/296 [14:48<18:30,  8.35s/it]

 55%|█████▌    | 164/296 [14:49<13:04,  5.94s/it]

 56%|█████▌    | 165/296 [14:49<09:16,  4.25s/it]

 56%|█████▌    | 166/296 [14:49<06:33,  3.03s/it]

 56%|█████▋    | 167/296 [15:10<18:00,  8.38s/it]

 57%|█████▋    | 168/296 [15:10<12:42,  5.96s/it]

 57%|█████▋    | 169/296 [15:11<09:00,  4.26s/it]

 57%|█████▋    | 170/296 [15:11<06:22,  3.04s/it]

 58%|█████▊    | 171/296 [15:32<17:26,  8.37s/it]

 58%|█████▊    | 172/296 [15:32<12:17,  5.95s/it]

 58%|█████▊    | 173/296 [15:32<08:43,  4.25s/it]

 59%|█████▉    | 174/296 [15:33<06:09,  3.03s/it]

 59%|█████▉    | 175/296 [15:53<16:56,  8.40s/it]

 59%|█████▉    | 176/296 [15:54<11:57,  5.98s/it]

 60%|█████▉    | 177/296 [15:54<08:28,  4.27s/it]

 60%|██████    | 178/296 [15:54<05:59,  3.04s/it]

 60%|██████    | 179/296 [16:15<16:23,  8.41s/it]

 61%|██████    | 180/296 [16:15<11:33,  5.98s/it]

 61%|██████    | 181/296 [16:16<08:11,  4.27s/it]

 61%|██████▏   | 182/296 [16:16<05:47,  3.05s/it]

 62%|██████▏   | 183/296 [16:37<15:50,  8.41s/it]

 62%|██████▏   | 184/296 [16:37<11:09,  5.98s/it]

 62%|██████▎   | 185/296 [16:37<07:54,  4.27s/it]

 63%|██████▎   | 186/296 [16:38<05:35,  3.05s/it]

 63%|██████▎   | 187/296 [16:59<15:20,  8.44s/it]

 64%|██████▎   | 188/296 [16:59<10:48,  6.00s/it]

 64%|██████▍   | 189/296 [16:59<07:38,  4.29s/it]

 64%|██████▍   | 190/296 [16:59<05:24,  3.06s/it]

 65%|██████▍   | 191/296 [17:20<14:46,  8.44s/it]

 65%|██████▍   | 192/296 [17:21<10:24,  6.00s/it]

 65%|██████▌   | 193/296 [17:21<07:21,  4.29s/it]

 66%|██████▌   | 194/296 [17:21<05:11,  3.06s/it]

 66%|██████▌   | 195/296 [17:42<14:08,  8.40s/it]

 66%|██████▌   | 196/296 [17:42<09:57,  5.97s/it]

 67%|██████▋   | 197/296 [17:43<07:02,  4.27s/it]

 67%|██████▋   | 198/296 [17:43<04:58,  3.04s/it]

 67%|██████▋   | 199/296 [18:04<13:38,  8.43s/it]

 68%|██████▊   | 200/296 [18:04<09:35,  6.00s/it]

 68%|██████▊   | 201/296 [18:05<06:47,  4.29s/it]

 68%|██████▊   | 202/296 [18:05<04:47,  3.05s/it]

 69%|██████▊   | 203/296 [18:26<13:04,  8.43s/it]

 69%|██████▉   | 204/296 [18:26<09:11,  5.99s/it]

 69%|██████▉   | 205/296 [18:26<06:29,  4.28s/it]

 70%|██████▉   | 206/296 [18:26<04:34,  3.05s/it]

 70%|██████▉   | 207/296 [18:47<12:29,  8.42s/it]

 70%|███████   | 208/296 [18:48<08:46,  5.98s/it]

 71%|███████   | 209/296 [18:48<06:12,  4.28s/it]

 71%|███████   | 210/296 [18:48<04:22,  3.05s/it]

 71%|███████▏  | 211/296 [19:09<11:53,  8.40s/it]

 72%|███████▏  | 212/296 [19:09<08:21,  5.97s/it]

 72%|███████▏  | 213/296 [19:10<05:54,  4.27s/it]

 72%|███████▏  | 214/296 [19:10<04:09,  3.04s/it]

 73%|███████▎  | 215/296 [19:31<11:21,  8.41s/it]

 73%|███████▎  | 216/296 [19:31<07:58,  5.98s/it]

 73%|███████▎  | 217/296 [19:31<05:37,  4.27s/it]

 74%|███████▎  | 218/296 [19:32<03:57,  3.05s/it]

 74%|███████▍  | 219/296 [19:53<10:48,  8.43s/it]

 74%|███████▍  | 220/296 [19:53<07:35,  5.99s/it]

 75%|███████▍  | 221/296 [19:53<05:21,  4.28s/it]

 75%|███████▌  | 222/296 [19:53<03:45,  3.05s/it]

 75%|███████▌  | 223/296 [20:14<10:13,  8.40s/it]

 76%|███████▌  | 224/296 [20:15<07:09,  5.97s/it]

 76%|███████▌  | 225/296 [20:15<05:03,  4.27s/it]

 76%|███████▋  | 226/296 [20:15<03:32,  3.04s/it]

 77%|███████▋  | 227/296 [20:36<09:38,  8.39s/it]

 77%|███████▋  | 228/296 [20:36<06:45,  5.96s/it]

 77%|███████▋  | 229/296 [20:36<04:45,  4.26s/it]

 78%|███████▊  | 230/296 [20:37<03:20,  3.04s/it]

 78%|███████▊  | 231/296 [20:58<09:06,  8.41s/it]

 78%|███████▊  | 232/296 [20:58<06:22,  5.98s/it]

 79%|███████▊  | 233/296 [20:58<04:29,  4.27s/it]

 79%|███████▉  | 234/296 [20:58<03:08,  3.05s/it]

 79%|███████▉  | 235/296 [21:19<08:31,  8.39s/it]

 80%|███████▉  | 236/296 [21:20<05:57,  5.96s/it]

 80%|████████  | 237/296 [21:20<04:11,  4.26s/it]

 80%|████████  | 238/296 [21:20<02:56,  3.04s/it]

 81%|████████  | 239/296 [21:41<08:01,  8.44s/it]

 81%|████████  | 240/296 [21:41<05:36,  6.00s/it]

 81%|████████▏ | 241/296 [21:42<03:56,  4.29s/it]

 82%|████████▏ | 242/296 [21:42<02:45,  3.06s/it]

 82%|████████▏ | 243/296 [22:03<07:27,  8.45s/it]

 82%|████████▏ | 244/296 [22:03<05:12,  6.01s/it]

 83%|████████▎ | 245/296 [22:03<03:39,  4.30s/it]

 83%|████████▎ | 246/296 [22:04<02:33,  3.06s/it]

 83%|████████▎ | 247/296 [22:25<06:54,  8.46s/it]

 84%|████████▍ | 248/296 [22:25<04:48,  6.02s/it]

 84%|████████▍ | 249/296 [22:25<03:22,  4.30s/it]

 84%|████████▍ | 250/296 [22:26<02:21,  3.07s/it]

 85%|████████▍ | 251/296 [22:47<06:21,  8.48s/it]

 85%|████████▌ | 252/296 [22:47<04:25,  6.03s/it]

 85%|████████▌ | 253/296 [22:47<03:05,  4.31s/it]

 86%|████████▌ | 254/296 [22:47<02:09,  3.07s/it]

 86%|████████▌ | 255/296 [23:09<05:47,  8.48s/it]

 86%|████████▋ | 256/296 [23:09<04:01,  6.03s/it]

 87%|████████▋ | 257/296 [23:09<02:48,  4.31s/it]

 87%|████████▋ | 258/296 [23:09<01:56,  3.07s/it]

 88%|████████▊ | 259/296 [23:30<05:13,  8.47s/it]

 88%|████████▊ | 260/296 [23:31<03:36,  6.02s/it]

 88%|████████▊ | 261/296 [23:31<02:30,  4.31s/it]

 89%|████████▊ | 262/296 [23:31<01:44,  3.07s/it]

 89%|████████▉ | 263/296 [23:52<04:39,  8.46s/it]

 89%|████████▉ | 264/296 [23:53<03:12,  6.02s/it]

 90%|████████▉ | 265/296 [23:53<02:13,  4.30s/it]

 90%|████████▉ | 266/296 [23:53<01:31,  3.06s/it]

 90%|█████████ | 267/296 [24:14<04:05,  8.45s/it]

 91%|█████████ | 268/296 [24:14<02:48,  6.01s/it]

 91%|█████████ | 269/296 [24:15<01:55,  4.30s/it]

 91%|█████████ | 270/296 [24:15<01:19,  3.06s/it]

 92%|█████████▏| 271/296 [24:36<03:31,  8.45s/it]

 92%|█████████▏| 272/296 [24:36<02:24,  6.01s/it]

 92%|█████████▏| 273/296 [24:36<01:38,  4.30s/it]

 93%|█████████▎| 274/296 [24:37<01:07,  3.06s/it]

 93%|█████████▎| 275/296 [24:58<02:58,  8.48s/it]

 93%|█████████▎| 276/296 [24:58<02:00,  6.03s/it]

 94%|█████████▎| 277/296 [24:58<01:21,  4.31s/it]

 94%|█████████▍| 278/296 [24:59<00:55,  3.07s/it]

 94%|█████████▍| 279/296 [25:20<02:24,  8.47s/it]

 95%|█████████▍| 280/296 [25:20<01:36,  6.02s/it]

 95%|█████████▍| 281/296 [25:20<01:04,  4.31s/it]

 95%|█████████▌| 282/296 [25:20<00:42,  3.07s/it]

 96%|█████████▌| 283/296 [25:42<01:50,  8.48s/it]

 96%|█████████▌| 284/296 [25:42<01:12,  6.03s/it]

 96%|█████████▋| 285/296 [25:42<00:47,  4.31s/it]

 97%|█████████▋| 286/296 [25:42<00:30,  3.07s/it]

 97%|█████████▋| 287/296 [26:03<01:16,  8.46s/it]

 97%|█████████▋| 288/296 [26:04<00:48,  6.01s/it]

 98%|█████████▊| 289/296 [26:04<00:30,  4.30s/it]

 98%|█████████▊| 290/296 [26:04<00:18,  3.06s/it]

 98%|█████████▊| 291/296 [26:25<00:42,  8.46s/it]

 99%|█████████▊| 292/296 [26:25<00:24,  6.01s/it]

 99%|█████████▉| 293/296 [26:26<00:12,  4.30s/it]

 99%|█████████▉| 294/296 [26:26<00:06,  3.06s/it]

100%|█████████▉| 295/296 [26:47<00:08,  8.47s/it]

100%|██████████| 296/296 [26:47<00:00,  6.03s/it]

100%|██████████| 296/296 [26:47<00:00,  5.43s/it]




In [27]:
# the number should be close to 295 (the number of partitions generated by k-means/spectral clustering)
ensemble.shape

(296, 3)

In [28]:
ensemble.head()

Unnamed: 0_level_0,clusterer_params,partition,n_clusters
clusterer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AgglomerativeClustering #0,"{'affinity': 'precomputed', 'compute_full_tree...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
AgglomerativeClustering #1,"{'affinity': 'precomputed', 'compute_full_tree...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
AgglomerativeClustering #2,"{'affinity': 'euclidean', 'compute_full_tree':...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",2
AgglomerativeClustering #3,"{'affinity': 'precomputed', 'compute_full_tree...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
AgglomerativeClustering #4,"{'affinity': 'precomputed', 'compute_full_tree...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3


In [29]:
ensemble["n_clusters"].value_counts().head()

75    4
20    4
22    4
23    4
24    4
Name: n_clusters, dtype: int64

In [30]:
ensemble_stats = ensemble["n_clusters"].describe()
display(ensemble_stats)

count    296.000000
mean      38.500000
std       21.396182
min        2.000000
25%       20.000000
50%       38.500000
75%       57.000000
max       75.000000
Name: n_clusters, dtype: float64

### Testing

In [31]:
assert ensemble_stats["min"] > 1

In [32]:
assert not ensemble["n_clusters"].isna().any()

In [33]:
assert ensemble.shape[0] == len(CLUSTERERS)

In [34]:
# all partitions have the right size
assert np.all(
    [part["partition"].shape[0] == data.shape[0] for idx, part in ensemble.iterrows()]
)

In [35]:
# no partition has negative clusters (noisy points)
assert not np.any([(part["partition"] < 0).any() for idx, part in ensemble.iterrows()])

## Save

In [36]:
del CLUSTERING_OPTIONS["LINKAGE"]

output_filename = Path(
    RESULTS_DIR,
    generate_result_set_name(
        CLUSTERING_OPTIONS,
        prefix=f"{clustering_method_name}-",
        suffix=".pkl",
    ),
).resolve()
display(output_filename)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/runs/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores/AgglomerativeClustering-AFFINITY_euclidean-K_MAX_75-K_MIN_2.pkl')

In [37]:
ensemble.to_pickle(output_filename)