# Description

This notebook combines all the previously generated clustering solutions into a single consolidated solution using consensus clustering.

# Environment variables

In [1]:
from IPython.display import display

# Number of threads handed to the numerical backends through the *_NUM_THREADS
# environment variables set in the next cell. It is kept at 1 because this
# notebook parallelizes across processes later on (conf.GENERAL["N_JOBS"]).
NUMPY_N_JOBS = 1
display(NUMPY_N_JOBS)

1

In [2]:
%env MKL_NUM_THREADS=$NUMPY_N_JOBS
%env OPEN_BLAS_NUM_THREADS=$NUMPY_N_JOBS
%env NUMEXPR_NUM_THREADS=$NUMPY_N_JOBS
%env OMP_NUM_THREADS=$NUMPY_N_JOBS

env: MKL_NUM_THREADS=1
env: OPEN_BLAS_NUM_THREADS=1
env: NUMEXPR_NUM_THREADS=1
env: OMP_NUM_THREADS=1


# Modules loading

In [3]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd

import conf

# Settings

In [5]:
# seeded numpy random generator (fixed seed, so any draw from it is repeatable)
RANDOM_GENERATOR = np.random.default_rng(seed=12345)

## Consensus clustering

In [6]:
# Range of k (number of clusters) to explore; the consensus clustering loop
# below treats both K_MIN and K_MAX as inclusive.
CLUSTERING_OPTIONS = {
    "K_MIN": 2,
    "K_MAX": 40,
}

display(CLUSTERING_OPTIONS)

{'K_MIN': 2, 'K_MAX': 40}

In [7]:
# directory this notebook works in: its inputs are read from it and its
# results are written to it (created on first use)
RESULTS_DIR = (Path(conf.RESULTS["CLUSTERING_DIR"]) / "consensus_clustering").resolve()
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering')

# Load ensemble

In [8]:
# NOTE: despite its name, `output_file` points to an input here; the ensemble
# array is loaded from this path in the next cell.
output_file = (RESULTS_DIR / "ensemble.npy").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering/ensemble.npy')

In [9]:
# Load the ensemble array from `output_file`.
# Presumably one row per partition and one column per clustered object - TODO confirm.
full_ensemble = np.load(output_file)

In [10]:
# shape of the loaded ensemble array
display(full_ensemble.shape)

(4428, 3749)

# Load ensemble coassociation distance matrix

In [11]:
# NOTE: despite its name, `output_file` points to an input here; the
# coassociation matrix is loaded from this path in the next cell.
output_file = RESULTS_DIR.joinpath("ensemble_coassoc_matrix.npy").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering/ensemble_coassoc_matrix.npy')

In [12]:
# Load the ensemble coassociation matrix from `output_file`; the section title
# describes it as a distance matrix.
ensemble_coassoc_matrix = np.load(output_file)

In [13]:
# shape of the loaded coassociation matrix
display(ensemble_coassoc_matrix.shape)

(3749, 3749)

In [14]:
# preview the matrix values (numpy abbreviates large arrays when displaying them)
display(ensemble_coassoc_matrix)

array([[0.        , 0.05004529, 0.04868659, ..., 0.62794384, 0.66445714,
        0.67006341],
       [0.05004529, 0.        , 0.02241848, ..., 0.62771739, 0.66514286,
        0.66779891],
       [0.04868659, 0.02241848, 0.        , ..., 0.62703804, 0.66285714,
        0.66530797],
       ...,
       [0.62794384, 0.62771739, 0.62703804, ..., 0.        , 0.59288808,
        0.66282746],
       [0.66445714, 0.66514286, 0.66285714, ..., 0.59288808, 0.        ,
        0.6127194 ],
       [0.67006341, 0.66779891, 0.66530797, ..., 0.66282746, 0.6127194 ,
        0.        ]])

# Consensus clustering

In [15]:
from concurrent.futures import ProcessPoolExecutor, as_completed

from tqdm import tqdm

from clustering.ensemble import (
    eac_single_coassoc_matrix,
    eac_complete_coassoc_matrix,
    eac_average_coassoc_matrix,
    run_method_and_compute_agreement,
)

In [16]:
# consensus functions to evaluate; each one takes (coassoc_matrix, k)
all_consensus_methods = {
    eac_single_coassoc_matrix,
    eac_complete_coassoc_matrix,
    eac_average_coassoc_matrix,
}
display(all_consensus_methods)

{<function clustering.ensemble.eac_average_coassoc_matrix(coassoc_matrix, k)>,
 <function clustering.ensemble.eac_complete_coassoc_matrix(coassoc_matrix, k)>,
 <function clustering.ensemble.eac_single_coassoc_matrix(coassoc_matrix, k)>}

In [17]:
# one dict per (consensus method, k) run; filled in as the tasks finish
consensus_results = []

k_values = range(CLUSTERING_OPTIONS["K_MIN"], CLUSTERING_OPTIONS["K_MAX"] + 1)

with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
    # submit every (method, k) combination, remembering which future is which
    tasks = {}
    for consensus_method in all_consensus_methods:
        for n_clusters in k_values:
            submitted = executor.submit(
                run_method_and_compute_agreement,
                consensus_method,
                ensemble_coassoc_matrix,
                full_ensemble,
                n_clusters,
            )
            tasks[submitted] = (consensus_method.__name__, n_clusters)

    # results are collected in completion order, not in submission order
    finished_tasks = tqdm(as_completed(tasks), total=len(tasks), disable=False, ncols=100)
    for finished in finished_tasks:
        method_name, k = tasks[finished]
        # each task returns the consensus partition and its agreement statistics
        part, performance_values = finished.result()

        method_results = dict(method=method_name, partition=part, k=k)
        method_results.update(performance_values)

        consensus_results.append(method_results)

  0%|                                                                       | 0/117 [00:00<?, ?it/s]

  1%|▌                                                              | 1/117 [00:23<46:01, 23.80s/it]

  2%|█                                                              | 2/117 [00:24<32:29, 16.95s/it]

  3%|█▌                                                             | 3/117 [00:26<23:31, 12.39s/it]

  3%|██▏                                                            | 4/117 [00:49<29:16, 15.55s/it]

  4%|██▋                                                            | 5/117 [00:52<21:50, 11.70s/it]

  5%|███▏                                                           | 6/117 [00:54<16:28,  8.90s/it]

  6%|███▊                                                           | 7/117 [01:18<24:23, 13.31s/it]

  7%|████▎                                                          | 8/117 [01:21<18:35, 10.23s/it]

  8%|████▊                                                          | 9/117 [01:23<14:24,  8.00s/it]

  9%|█████▎                                                        | 10/117 [01:48<23:03, 12.93s/it]

  9%|█████▊                                                        | 11/117 [01:51<17:42, 10.02s/it]

 10%|██████▎                                                       | 12/117 [01:55<14:03,  8.03s/it]

 11%|██████▉                                                       | 13/117 [02:19<22:41, 13.10s/it]

 12%|███████▍                                                      | 14/117 [02:24<17:50, 10.39s/it]

 13%|███████▉                                                      | 15/117 [02:27<14:14,  8.37s/it]

 14%|████████▍                                                     | 16/117 [02:53<22:46, 13.53s/it]

 15%|█████████                                                     | 17/117 [02:57<17:59, 10.79s/it]

 15%|█████████▌                                                    | 18/117 [03:01<14:30,  8.79s/it]

 16%|██████████                                                    | 19/117 [03:27<22:45, 13.94s/it]

 17%|██████████▌                                                   | 20/117 [03:32<18:02, 11.16s/it]

 18%|███████████▏                                                  | 21/117 [03:36<14:42,  9.19s/it]

 19%|███████████▋                                                  | 22/117 [04:03<22:40, 14.32s/it]

 20%|████████████▏                                                 | 23/117 [04:08<18:09, 11.59s/it]

 21%|████████████▋                                                 | 24/117 [04:13<14:51,  9.59s/it]

 21%|█████████████▏                                                | 25/117 [04:40<22:35, 14.73s/it]

 22%|█████████████▊                                                | 26/117 [04:45<18:09, 11.98s/it]

 23%|██████████████▎                                               | 27/117 [04:50<14:56,  9.96s/it]

 24%|██████████████▊                                               | 28/117 [05:19<22:49, 15.39s/it]

 25%|███████████████▎                                              | 29/117 [05:25<18:27, 12.58s/it]

 26%|███████████████▉                                              | 30/117 [05:30<15:04, 10.39s/it]

 26%|████████████████▍                                             | 31/117 [05:58<22:38, 15.80s/it]

 27%|████████████████▉                                             | 32/117 [06:05<18:26, 13.01s/it]

 28%|█████████████████▍                                            | 33/117 [06:10<15:04, 10.77s/it]

 29%|██████████████████                                            | 34/117 [06:39<22:26, 16.22s/it]

 30%|██████████████████▌                                           | 35/117 [06:46<18:15, 13.36s/it]

 31%|███████████████████                                           | 36/117 [06:52<15:00, 11.12s/it]

 32%|███████████████████▌                                          | 37/117 [07:21<22:14, 16.68s/it]

 32%|████████████████████▏                                         | 38/117 [07:28<18:01, 13.69s/it]

 33%|████████████████████▋                                         | 39/117 [07:34<14:52, 11.44s/it]

 34%|█████████████████████▏                                        | 40/117 [07:44<14:02, 10.95s/it]

 35%|█████████████████████▋                                        | 41/117 [07:51<12:24,  9.79s/it]

 36%|██████████████████████▎                                       | 42/117 [07:59<11:17,  9.04s/it]

 37%|██████████████████████▊                                       | 43/117 [08:09<11:43,  9.51s/it]

 38%|███████████████████████▎                                      | 44/117 [08:17<10:47,  8.88s/it]

 38%|███████████████████████▊                                      | 45/117 [08:24<10:09,  8.47s/it]

 39%|████████████████████████▍                                     | 46/117 [08:35<11:00,  9.30s/it]

 40%|████████████████████████▉                                     | 47/117 [08:43<10:21,  8.88s/it]

 41%|█████████████████████████▍                                    | 48/117 [08:51<09:52,  8.59s/it]

 42%|█████████████████████████▉                                    | 49/117 [09:03<10:48,  9.53s/it]

 43%|██████████████████████████▍                                   | 50/117 [09:11<10:09,  9.10s/it]

 44%|███████████████████████████                                   | 51/117 [09:19<09:44,  8.85s/it]

 44%|███████████████████████████▌                                  | 52/117 [09:31<10:42,  9.88s/it]

 45%|████████████████████████████                                  | 53/117 [09:40<10:01,  9.39s/it]

 46%|████████████████████████████▌                                 | 54/117 [09:48<09:39,  9.19s/it]

 47%|█████████████████████████████▏                                | 55/117 [10:01<10:35, 10.25s/it]

 48%|█████████████████████████████▋                                | 56/117 [10:10<09:57,  9.79s/it]

 49%|██████████████████████████████▏                               | 57/117 [10:19<09:32,  9.54s/it]

 50%|██████████████████████████████▋                               | 58/117 [10:32<10:25, 10.60s/it]

 50%|███████████████████████████████▎                              | 59/117 [10:41<09:48, 10.15s/it]

 51%|███████████████████████████████▊                              | 60/117 [10:51<09:28,  9.97s/it]

 52%|████████████████████████████████▎                             | 61/117 [11:04<10:16, 11.02s/it]

 53%|████████████████████████████████▊                             | 62/117 [11:13<09:38, 10.51s/it]

 54%|█████████████████████████████████▍                            | 63/117 [11:23<09:20, 10.38s/it]

 55%|█████████████████████████████████▉                            | 64/117 [11:37<10:04, 11.40s/it]

 56%|██████████████████████████████████▍                           | 65/117 [11:47<09:23, 10.83s/it]

 56%|██████████████████████████████████▉                           | 66/117 [11:57<09:07, 10.74s/it]

 57%|███████████████████████████████████▌                          | 67/117 [12:11<09:49, 11.78s/it]

 58%|████████████████████████████████████                          | 68/117 [12:21<09:07, 11.17s/it]

 59%|████████████████████████████████████▌                         | 69/117 [12:32<08:49, 11.04s/it]

 60%|█████████████████████████████████████                         | 70/117 [12:47<09:32, 12.17s/it]

 61%|█████████████████████████████████████▌                        | 71/117 [12:57<08:52, 11.58s/it]

 62%|██████████████████████████████████████▏                       | 72/117 [13:08<08:34, 11.43s/it]

 62%|██████████████████████████████████████▋                       | 73/117 [13:23<09:10, 12.51s/it]

 63%|███████████████████████████████████████▏                      | 74/117 [13:34<08:32, 11.93s/it]

 64%|███████████████████████████████████████▋                      | 75/117 [13:45<08:15, 11.79s/it]

 65%|████████████████████████████████████████▎                     | 76/117 [14:00<08:45, 12.83s/it]

 66%|████████████████████████████████████████▊                     | 77/117 [14:11<08:11, 12.29s/it]

 67%|█████████████████████████████████████████▎                    | 78/117 [14:23<07:55, 12.19s/it]

 68%|█████████████████████████████████████████▊                    | 79/117 [14:24<05:36,  8.86s/it]

 68%|██████████████████████████████████████████▍                   | 80/117 [14:36<05:59,  9.72s/it]

 69%|██████████████████████████████████████████▉                   | 81/117 [14:49<06:19, 10.54s/it]

 70%|███████████████████████████████████████████▍                  | 82/117 [14:52<04:51,  8.33s/it]

 71%|███████████████████████████████████████████▉                  | 83/117 [15:04<05:21,  9.46s/it]

 72%|████████████████████████████████████████████▌                 | 84/117 [15:17<05:48, 10.56s/it]

 73%|█████████████████████████████████████████████                 | 85/117 [15:21<04:32,  8.53s/it]

 74%|█████████████████████████████████████████████▌                | 86/117 [15:33<05:01,  9.73s/it]

 74%|██████████████████████████████████████████████                | 87/117 [15:47<05:25, 10.85s/it]

 75%|██████████████████████████████████████████████▋               | 88/117 [15:51<04:16,  8.83s/it]

 76%|███████████████████████████████████████████████▏              | 89/117 [16:04<04:41, 10.05s/it]

 77%|███████████████████████████████████████████████▋              | 90/117 [16:18<05:03, 11.24s/it]

 78%|████████████████████████████████████████████████▏             | 91/117 [16:24<04:08,  9.58s/it]

 79%|████████████████████████████████████████████████▊             | 92/117 [16:37<04:25, 10.63s/it]

 79%|█████████████████████████████████████████████████▎            | 93/117 [16:52<04:46, 11.93s/it]

 80%|█████████████████████████████████████████████████▊            | 94/117 [16:58<03:55, 10.23s/it]

 81%|██████████████████████████████████████████████████▎           | 95/117 [17:12<04:08, 11.27s/it]

 82%|██████████████████████████████████████████████████▊           | 96/117 [17:27<04:22, 12.49s/it]

 83%|███████████████████████████████████████████████████▍          | 97/117 [17:34<03:35, 10.80s/it]

 84%|███████████████████████████████████████████████████▉          | 98/117 [17:48<03:43, 11.78s/it]

 85%|████████████████████████████████████████████████████▍         | 99/117 [18:04<03:54, 13.00s/it]

 85%|████████████████████████████████████████████████████▏        | 100/117 [18:11<03:12, 11.32s/it]

 86%|████████████████████████████████████████████████████▋        | 101/117 [18:26<03:17, 12.32s/it]

 87%|█████████████████████████████████████████████████████▏       | 102/117 [18:42<03:21, 13.43s/it]

 88%|█████████████████████████████████████████████████████▋       | 103/117 [18:49<02:44, 11.73s/it]

 89%|██████████████████████████████████████████████████████▏      | 104/117 [19:05<02:46, 12.79s/it]

 90%|██████████████████████████████████████████████████████▋      | 105/117 [19:21<02:47, 13.94s/it]

 91%|███████████████████████████████████████████████████████▎     | 106/117 [19:30<02:14, 12.20s/it]

 91%|███████████████████████████████████████████████████████▊     | 107/117 [19:45<02:11, 13.13s/it]

 92%|████████████████████████████████████████████████████████▎    | 108/117 [20:02<02:09, 14.42s/it]

 93%|████████████████████████████████████████████████████████▊    | 109/117 [20:11<01:42, 12.87s/it]

 94%|█████████████████████████████████████████████████████████▎   | 110/117 [20:27<01:35, 13.65s/it]

 95%|█████████████████████████████████████████████████████████▊   | 111/117 [20:45<01:29, 14.92s/it]

 96%|██████████████████████████████████████████████████████████▍  | 112/117 [20:55<01:06, 13.36s/it]

 97%|██████████████████████████████████████████████████████████▉  | 113/117 [21:11<00:56, 14.14s/it]

 97%|███████████████████████████████████████████████████████████▍ | 114/117 [21:29<00:45, 15.29s/it]

 98%|███████████████████████████████████████████████████████████▉ | 115/117 [21:39<00:27, 13.76s/it]

 99%|████████████████████████████████████████████████████████████▍| 116/117 [21:54<00:14, 14.34s/it]

100%|█████████████████████████████████████████████████████████████| 117/117 [22:12<00:00, 15.18s/it]

100%|█████████████████████████████████████████████████████████████| 117/117 [22:12<00:00, 11.38s/it]




In [18]:
# NOTE: whether each partition really has k clusters is checked in the "Testing" section below

In [19]:
# Collect the per-run dicts into a table with one row per (method, k) run.
# The rows were appended in task-completion order (as_completed), which can
# change from run to run; sort by method and k so the table that gets saved
# has a reproducible row order and a clean 0..n-1 index.
consensus_results = (
    pd.DataFrame(consensus_results)
    .sort_values(["method", "k"])
    .reset_index(drop=True)
)

In [20]:
# (number of runs, number of columns) of the results table
display(consensus_results.shape)

(117, 12)

In [21]:
# first rows of the results table
consensus_results.head()

Unnamed: 0,method,partition,k,ari_mean,ari_median,ari_std,ami_mean,ami_median,ami_std,nmi_mean,nmi_median,nmi_std
0,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,0.240317,0.037024,0.349342,0.24949,0.080186,0.324666,0.251982,0.082369,0.323813
1,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,0.263578,0.054609,0.356479,0.284536,0.112219,0.32368,0.288405,0.116221,0.322205
2,eac_average_coassoc_matrix,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",4,0.287902,0.124714,0.331105,0.315595,0.187761,0.275006,0.320842,0.192473,0.273039
3,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,0.287923,0.124713,0.331126,0.318194,0.191098,0.27662,0.324061,0.195962,0.274261
4,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6,0.223957,0.147902,0.226714,0.299478,0.262243,0.142417,0.306541,0.268924,0.141986


## Testing

In [22]:
# every cell of the results table must hold a value (no NaN/None anywhere)
assert consensus_results.notna().all().all()

In [23]:
# check that the number of clusters in the partitions are the expected ones
def _count_clusters(labels):
    """Return how many distinct labels the partition `labels` contains."""
    return len(np.unique(labels))


_real_k_values = consensus_results["partition"].apply(_count_clusters)
display(_real_k_values)

_requested_k_values = consensus_results["k"].to_numpy()
assert np.all(_requested_k_values == _real_k_values.to_numpy())

0       2
1       3
2       4
3       5
4       6
       ..
112    36
113    37
114    38
115    39
116    40
Name: partition, Length: 117, dtype: int64

## Save

In [24]:
# destination of the consensus clustering results table
output_file = RESULTS_DIR.joinpath("consensus_clustering_runs.pkl").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering/consensus_clustering_runs.pkl')

In [25]:
# write the results table to `output_file` in pickle format
consensus_results.to_pickle(output_file)