# Description

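This notebook combines all the clustering solutions generated previously (the ensemble) into consolidated solutions using consensus clustering: each consensus method is run for every number of clusters in a configurable range.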

# Environment variables

In [1]:
from IPython.display import display

# Number of threads allowed for numpy's numerical backends. It is kept at 1
# because parallelism is handled later at the process level (see the
# ProcessPoolExecutor in the consensus clustering section).
NUMPY_N_JOBS = 1
display(NUMPY_N_JOBS)

1

In [2]:
%env MKL_NUM_THREADS=$NUMPY_N_JOBS
%env OPEN_BLAS_NUM_THREADS=$NUMPY_N_JOBS
%env NUMEXPR_NUM_THREADS=$NUMPY_N_JOBS
%env OMP_NUM_THREADS=$NUMPY_N_JOBS

env: MKL_NUM_THREADS=1
env: OPEN_BLAS_NUM_THREADS=1
env: NUMEXPR_NUM_THREADS=1
env: OMP_NUM_THREADS=1


# Modules loading

In [3]:
# Automatically reload imported modules before running each cell, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd

import conf

# Settings

In [5]:
# Random number generator created with a fixed seed, for reproducibility.
RANDOM_GENERATOR = np.random.default_rng(seed=12345)

## Consensus clustering

In [6]:
# Range of numbers of clusters (k) explored by the consensus methods.
# Both ends are inclusive (the run loop iterates up to K_MAX + 1).
CLUSTERING_OPTIONS = {
    "K_MIN": 2,
    "K_MAX": 40,
}

display(CLUSTERING_OPTIONS)

{'K_MIN': 2, 'K_MAX': 40}

In [7]:
# Directory used by this notebook for its input and output files; it is
# created (including parents) if it does not exist yet.
RESULTS_DIR = (Path(conf.RESULTS["CLUSTERING_DIR"]) / "consensus_clustering").resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering')

# Load ensemble

In [8]:
# Path of the ensemble file; despite the variable name, it is read (not
# written) by the next cell.
output_file = (RESULTS_DIR / "ensemble.npy").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering/ensemble.npy')

In [9]:
# Read the ensemble array from disk.
full_ensemble = np.load(output_file)

In [10]:
# Dimensions of the ensemble; presumably (n_partitions, n_objects) -- TODO confirm.
display(full_ensemble.shape)

(4428, 3749)

# Load ensemble coassociation distance matrix

In [11]:
# Path of the coassociation matrix file; despite the variable name, it is
# read (not written) by the next cell.
output_file = (RESULTS_DIR / "ensemble_coassoc_matrix.npy").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering/ensemble_coassoc_matrix.npy')

In [12]:
# Read the ensemble coassociation distance matrix from disk.
ensemble_coassoc_matrix = np.load(output_file)

In [13]:
# Dimensions of the coassociation matrix (square in the output shown below).
display(ensemble_coassoc_matrix.shape)

(3749, 3749)

In [14]:
# Quick visual check of the matrix values (numpy abbreviates large arrays).
display(ensemble_coassoc_matrix)

array([[0.        , 0.05615942, 0.05389493, ..., 0.59986413, 0.66171429,
        0.66757246],
       [0.05615942, 0.        , 0.02740036, ..., 0.60122283, 0.65988571,
        0.66530797],
       [0.05389493, 0.02740036, 0.        , ..., 0.59782609, 0.66102857,
        0.66281703],
       ...,
       [0.59986413, 0.60122283, 0.59782609, ..., 0.        , 0.6699339 ,
        0.66531165],
       [0.66171429, 0.65988571, 0.66102857, ..., 0.6699339 , 0.        ,
        0.58650558],
       [0.66757246, 0.66530797, 0.66281703, ..., 0.66531165, 0.58650558,
        0.        ]])

# Consensus clustering

In [15]:
from concurrent.futures import ProcessPoolExecutor, as_completed

from tqdm import tqdm

from clustering.ensemble import (
    eac_single_coassoc_matrix,
    eac_complete_coassoc_matrix,
    eac_average_coassoc_matrix,
    run_method_and_compute_agreement,
)

In [16]:
# Consensus functions to run; each one is later called through
# run_method_and_compute_agreement for every value of k.
all_consensus_methods = {
    eac_single_coassoc_matrix,
    eac_complete_coassoc_matrix,
    eac_average_coassoc_matrix,
}
display(all_consensus_methods)

{<function clustering.ensemble.eac_average_coassoc_matrix(coassoc_matrix, k)>,
 <function clustering.ensemble.eac_complete_coassoc_matrix(coassoc_matrix, k)>,
 <function clustering.ensemble.eac_single_coassoc_matrix(coassoc_matrix, k)>}

In [17]:
# Run every consensus method for every k in [K_MIN, K_MAX] in parallel and
# collect, for each run, the resulting partition and its agreement measures.
consensus_results = []

with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
    # map each submitted future to the (method name, k) pair that produced it
    tasks = {
        executor.submit(
            run_method_and_compute_agreement,
            m,
            ensemble_coassoc_matrix,
            full_ensemble,
            k,
        ): (m.__name__, k)
        for m in all_consensus_methods
        for k in range(CLUSTERING_OPTIONS["K_MIN"], CLUSTERING_OPTIONS["K_MAX"] + 1)
    }

    for future in tqdm(as_completed(tasks), total=len(tasks), disable=False, ncols=100):
        method_name, k = tasks[future]
        # each task returns the partition and a mapping of performance values
        part, performance_values = future.result()

        method_results = {
            "method": method_name,
            "partition": part,
            "k": k,
        }
        method_results.update(performance_values)

        consensus_results.append(method_results)

# as_completed yields futures in completion order, which depends on scheduling
# and changes between runs; sort by (method, k) so the row order of the
# results (and of the file saved later) is reproducible.
consensus_results.sort(key=lambda result: (result["method"], result["k"]))

  0%|                                                                       | 0/117 [00:00<?, ?it/s]

  1%|▌                                                              | 1/117 [00:23<46:00, 23.79s/it]

  2%|█                                                              | 2/117 [00:24<32:34, 17.00s/it]

  3%|█▌                                                             | 3/117 [00:26<23:29, 12.37s/it]

  3%|██▏                                                            | 4/117 [00:49<29:24, 15.61s/it]

  4%|██▋                                                            | 5/117 [00:53<22:17, 11.94s/it]

  5%|███▏                                                           | 6/117 [00:54<16:27,  8.90s/it]

  6%|███▊                                                           | 7/117 [01:19<24:43, 13.49s/it]

  7%|████▎                                                          | 8/117 [01:22<19:05, 10.51s/it]

  8%|████▊                                                          | 9/117 [01:24<14:29,  8.05s/it]

  9%|█████▎                                                        | 10/117 [01:49<23:19, 13.08s/it]

  9%|█████▊                                                        | 11/117 [01:53<18:08, 10.27s/it]

 10%|██████▎                                                       | 12/117 [01:56<14:18,  8.18s/it]

 11%|██████▉                                                       | 13/117 [02:21<23:01, 13.29s/it]

 12%|███████▍                                                      | 14/117 [02:26<18:26, 10.74s/it]

 13%|███████▉                                                      | 15/117 [02:30<14:45,  8.68s/it]

 14%|████████▍                                                     | 16/117 [02:56<23:29, 13.95s/it]

 15%|█████████                                                     | 17/117 [03:01<18:46, 11.27s/it]

 15%|█████████▌                                                    | 18/117 [03:06<15:10,  9.20s/it]

 16%|██████████                                                    | 19/117 [03:33<23:38, 14.48s/it]

 17%|██████████▌                                                   | 20/117 [03:37<18:45, 11.61s/it]

 18%|███████████▏                                                  | 21/117 [03:42<15:14,  9.53s/it]

 19%|███████████▋                                                  | 22/117 [04:10<23:44, 15.00s/it]

 20%|████████████▏                                                 | 23/117 [04:15<18:59, 12.12s/it]

 21%|████████████▋                                                 | 24/117 [04:20<15:29,  9.99s/it]

 21%|█████████████▏                                                | 25/117 [04:49<23:43, 15.47s/it]

 22%|█████████████▊                                                | 26/117 [04:54<18:56, 12.49s/it]

 23%|██████████████▎                                               | 27/117 [04:59<15:31, 10.35s/it]

 24%|██████████████▊                                               | 28/117 [05:29<23:54, 16.12s/it]

 25%|███████████████▎                                              | 29/117 [05:35<19:09, 13.06s/it]

 26%|███████████████▉                                              | 30/117 [05:41<15:43, 10.84s/it]

 26%|████████████████▍                                             | 31/117 [06:11<23:59, 16.74s/it]

 27%|████████████████▉                                             | 32/117 [06:17<19:08, 13.51s/it]

 28%|█████████████████▍                                            | 33/117 [06:23<15:54, 11.37s/it]

 29%|██████████████████                                            | 34/117 [06:55<23:53, 17.27s/it]

 30%|██████████████████▌                                           | 35/117 [07:00<18:57, 13.87s/it]

 31%|███████████████████                                           | 36/117 [07:07<15:54, 11.78s/it]

 32%|███████████████████▌                                          | 37/117 [07:39<23:40, 17.75s/it]

 32%|████████████████████▏                                         | 38/117 [07:45<18:42, 14.21s/it]

 33%|████████████████████▋                                         | 39/117 [07:53<15:55, 12.25s/it]

 34%|█████████████████████▏                                        | 40/117 [08:02<14:32, 11.33s/it]

 35%|█████████████████████▋                                        | 41/117 [08:08<12:27,  9.83s/it]

 36%|██████████████████████▎                                       | 42/117 [08:17<11:57,  9.57s/it]

 37%|██████████████████████▊                                       | 43/117 [08:27<11:56,  9.68s/it]

 38%|███████████████████████▎                                      | 44/117 [08:34<10:37,  8.73s/it]

 38%|███████████████████████▊                                      | 45/117 [08:43<10:44,  8.96s/it]

 39%|████████████████████████▍                                     | 46/117 [08:53<11:01,  9.31s/it]

 40%|████████████████████████▉                                     | 47/117 [09:00<09:59,  8.57s/it]

 41%|█████████████████████████▍                                    | 48/117 [09:10<10:26,  9.07s/it]

 42%|█████████████████████████▉                                    | 49/117 [09:21<10:43,  9.46s/it]

 43%|██████████████████████████▍                                   | 50/117 [09:28<09:46,  8.75s/it]

 44%|███████████████████████████                                   | 51/117 [09:38<10:12,  9.28s/it]

 44%|███████████████████████████▌                                  | 52/117 [09:49<10:31,  9.71s/it]

 45%|████████████████████████████                                  | 53/117 [09:56<09:37,  9.02s/it]

 46%|████████████████████████████▌                                 | 54/117 [10:07<10:03,  9.58s/it]

 47%|█████████████████████████████▏                                | 55/117 [10:18<10:19, 10.00s/it]

 48%|█████████████████████████████▋                                | 56/117 [10:26<09:24,  9.26s/it]

 49%|██████████████████████████████▏                               | 57/117 [10:37<09:51,  9.87s/it]

 50%|██████████████████████████████▋                               | 58/117 [10:49<10:11, 10.36s/it]

 50%|███████████████████████████████▎                              | 59/117 [10:56<09:13,  9.55s/it]

 51%|███████████████████████████████▊                              | 60/117 [11:08<09:43, 10.24s/it]

 52%|████████████████████████████████▎                             | 61/117 [11:20<09:59, 10.71s/it]

 53%|████████████████████████████████▊                             | 62/117 [11:28<09:01,  9.84s/it]

 54%|█████████████████████████████████▍                            | 63/117 [11:40<09:33, 10.62s/it]

 55%|█████████████████████████████████▉                            | 64/117 [11:52<09:47, 11.08s/it]

 56%|██████████████████████████████████▍                           | 65/117 [12:00<08:45, 10.11s/it]

 56%|██████████████████████████████████▉                           | 66/117 [12:13<09:16, 10.91s/it]

 57%|███████████████████████████████████▌                          | 67/117 [12:26<09:34, 11.50s/it]

 58%|████████████████████████████████████                          | 68/117 [12:34<08:28, 10.38s/it]

 59%|████████████████████████████████████▌                         | 69/117 [12:47<08:57, 11.19s/it]

 60%|█████████████████████████████████████                         | 70/117 [13:00<09:18, 11.88s/it]

 61%|█████████████████████████████████████▌                        | 71/117 [13:08<08:12, 10.70s/it]

 62%|██████████████████████████████████████▏                       | 72/117 [13:21<08:37, 11.50s/it]

 62%|██████████████████████████████████████▋                       | 73/117 [13:35<08:58, 12.24s/it]

 63%|███████████████████████████████████████▏                      | 74/117 [13:44<07:54, 11.03s/it]

 64%|███████████████████████████████████████▋                      | 75/117 [13:57<08:16, 11.82s/it]

 65%|████████████████████████████████████████▎                     | 76/117 [14:12<08:36, 12.59s/it]

 66%|████████████████████████████████████████▊                     | 77/117 [14:20<07:33, 11.33s/it]

 67%|█████████████████████████████████████████▎                    | 78/117 [14:34<07:56, 12.22s/it]

 68%|█████████████████████████████████████████▊                    | 79/117 [14:36<05:42,  9.01s/it]

 68%|██████████████████████████████████████████▍                   | 80/117 [14:45<05:34,  9.05s/it]

 69%|██████████████████████████████████████████▉                   | 81/117 [15:00<06:29, 10.82s/it]

 70%|███████████████████████████████████████████▍                  | 82/117 [15:04<05:03,  8.66s/it]

 71%|███████████████████████████████████████████▉                  | 83/117 [15:13<05:01,  8.87s/it]

 72%|████████████████████████████████████████████▌                 | 84/117 [15:29<06:01, 10.95s/it]

 73%|█████████████████████████████████████████████                 | 85/117 [15:33<04:45,  8.91s/it]

 74%|█████████████████████████████████████████████▌                | 86/117 [15:43<04:43,  9.13s/it]

 74%|██████████████████████████████████████████████                | 87/117 [16:00<05:44, 11.50s/it]

 75%|██████████████████████████████████████████████▋               | 88/117 [16:04<04:34,  9.48s/it]

 76%|███████████████████████████████████████████████▏              | 89/117 [16:14<04:27,  9.56s/it]

 77%|███████████████████████████████████████████████▋              | 90/117 [16:31<05:20, 11.88s/it]

 78%|████████████████████████████████████████████████▏             | 91/117 [16:37<04:18,  9.94s/it]

 79%|████████████████████████████████████████████████▊             | 92/117 [16:47<04:09,  9.98s/it]

 79%|█████████████████████████████████████████████████▎            | 93/117 [17:05<04:57, 12.39s/it]

 80%|█████████████████████████████████████████████████▊            | 94/117 [17:12<04:06, 10.72s/it]

 81%|██████████████████████████████████████████████████▎           | 95/117 [17:23<03:56, 10.74s/it]

 82%|██████████████████████████████████████████████████▊           | 96/117 [17:41<04:32, 13.00s/it]

 83%|███████████████████████████████████████████████████▍          | 97/117 [17:48<03:45, 11.26s/it]

 84%|███████████████████████████████████████████████████▉          | 98/117 [17:59<03:33, 11.25s/it]

 85%|████████████████████████████████████████████████████▍         | 99/117 [18:18<04:02, 13.46s/it]

 85%|████████████████████████████████████████████████████▏        | 100/117 [18:26<03:20, 11.81s/it]

 86%|████████████████████████████████████████████████████▋        | 101/117 [18:37<03:06, 11.67s/it]

 87%|█████████████████████████████████████████████████████▏       | 102/117 [18:56<03:28, 13.87s/it]

 88%|█████████████████████████████████████████████████████▋       | 103/117 [19:05<02:52, 12.29s/it]

 89%|██████████████████████████████████████████████████████▏      | 104/117 [19:17<02:38, 12.17s/it]

 90%|██████████████████████████████████████████████████████▋      | 105/117 [19:36<02:50, 14.25s/it]

 91%|███████████████████████████████████████████████████████▎     | 106/117 [19:46<02:22, 12.97s/it]

 91%|███████████████████████████████████████████████████████▊     | 107/117 [19:59<02:10, 13.09s/it]

 92%|████████████████████████████████████████████████████████▎    | 108/117 [20:19<02:14, 15.00s/it]

 93%|████████████████████████████████████████████████████████▊    | 109/117 [20:29<01:50, 13.77s/it]

 94%|█████████████████████████████████████████████████████████▎   | 110/117 [20:44<01:37, 14.00s/it]

 95%|█████████████████████████████████████████████████████████▊   | 111/117 [21:04<01:34, 15.75s/it]

 96%|██████████████████████████████████████████████████████████▍  | 112/117 [21:16<01:12, 14.54s/it]

 97%|██████████████████████████████████████████████████████████▉  | 113/117 [21:30<00:58, 14.60s/it]

 97%|███████████████████████████████████████████████████████████▍ | 114/117 [21:50<00:48, 16.29s/it]

 98%|███████████████████████████████████████████████████████████▉ | 115/117 [22:03<00:30, 15.06s/it]

 99%|████████████████████████████████████████████████████████████▍| 116/117 [22:18<00:15, 15.27s/it]

100%|█████████████████████████████████████████████████████████████| 117/117 [22:38<00:00, 16.50s/it]

100%|█████████████████████████████████████████████████████████████| 117/117 [22:38<00:00, 11.61s/it]




In [18]:
# NOTE: the number of clusters in each partition is checked against k in the "Testing" section below.

In [19]:
# Convert the list of per-run dictionaries into a DataFrame (one row per
# method/k run). Note that this rebinds `consensus_results` from a list to a
# DataFrame.
consensus_results = pd.DataFrame(consensus_results)

In [20]:
# Number of runs (rows) and of fields per run (columns).
display(consensus_results.shape)

(117, 12)

In [21]:
# Preview of the first runs.
consensus_results.head()

Unnamed: 0,method,partition,k,ari_mean,ari_median,ari_std,ami_mean,ami_median,ami_std,nmi_mean,nmi_median,nmi_std
0,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,0.226733,0.010542,0.351673,0.238194,0.055225,0.325239,0.240435,0.057239,0.32451
1,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,0.242829,0.015836,0.359734,0.266882,0.080544,0.325173,0.270462,0.083603,0.323833
2,eac_average_coassoc_matrix,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",4,0.260508,0.032736,0.339536,0.29643,0.131213,0.27979,0.301305,0.135507,0.27792
3,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,0.260519,0.032744,0.339554,0.298287,0.132804,0.281202,0.303842,0.138038,0.278911
4,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6,0.165668,0.09173,0.200208,0.268626,0.236109,0.1288,0.275338,0.237406,0.12818


## Testing

In [22]:
# no run may contain missing values in any of its fields
assert consensus_results.notna().all().all()

In [23]:
# Each partition must contain exactly as many distinct cluster labels as the
# k it was requested with.
_real_k_values = consensus_results["partition"].apply(
    lambda partition: len(np.unique(partition))
)
display(_real_k_values)
assert np.all(_real_k_values.values == consensus_results["k"].values)

0       2
1       3
2       4
3       5
4       6
       ..
112    36
113    37
114    38
115    39
116    40
Name: partition, Length: 117, dtype: int64

## Save

In [24]:
# Path of the pickle file where all consensus clustering runs are saved.
output_file = (RESULTS_DIR / "consensus_clustering_runs.pkl").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering/consensus_clustering_runs.pkl')

In [25]:
# Persist the results DataFrame (including the partitions) as a pickle file.
consensus_results.to_pickle(output_file)