# Description

This notebook combines all the clustering solutions generated previously (the ensemble) into a single consolidated solution using consensus clustering.

# Environment variables

In [1]:
from IPython.display import display

# Number of threads the numeric backends may use internally. It is kept at 1
# because this notebook parallelizes at the process level later on
# (ProcessPoolExecutor), so per-process threading is not wanted here.
NUMPY_N_JOBS = 1
display(NUMPY_N_JOBS)

1

In [2]:
%env MKL_NUM_THREADS=$NUMPY_N_JOBS
%env OPEN_BLAS_NUM_THREADS=$NUMPY_N_JOBS
%env NUMEXPR_NUM_THREADS=$NUMPY_N_JOBS
%env OMP_NUM_THREADS=$NUMPY_N_JOBS

env: MKL_NUM_THREADS=1
env: OPEN_BLAS_NUM_THREADS=1
env: NUMEXPR_NUM_THREADS=1
env: OMP_NUM_THREADS=1


# Modules loading

In [3]:
# Enable the IPython autoreload extension; mode 2 makes edits to imported
# modules get picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd

import conf

# Settings

In [5]:
# Seeded numpy random generator (fixed seed for reproducibility).
RANDOM_GENERATOR = np.random.default_rng(seed=12345)

## Ensemble size

In [6]:
# Ensemble size settings: the expected value and the accepted [min, max] range.
# NOTE(review): none of these is referenced by the cells visible in this notebook.
MIN_ENSEMBLE_SIZE, EXPECTED_ENSEMBLE_SIZE, MAX_ENSEMBLE_SIZE = 290, 295, 300

## Consensus clustering

In [7]:
# Inclusive range of k (number of clusters) for which consensus partitions
# are computed further below.
CLUSTERING_OPTIONS = {
    "K_MIN": 2,
    "K_MAX": 40,
}

display(CLUSTERING_OPTIONS)

{'K_MIN': 2, 'K_MAX': 40}

In [8]:
# Directory used by this notebook for its input and output files;
# it is created (including parents) if it does not exist yet.
RESULTS_DIR = (Path(conf.RESULTS["CLUSTERING_DIR"]) / "consensus_clustering").resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering')

# Get ensemble

In [9]:
# Path of the stored ensemble array. Despite its name, `output_file` is an
# input here: it is read with np.load in the next cell.
output_file = (RESULTS_DIR / "ensemble.npy").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering/ensemble.npy')

In [10]:
# Read the ensemble array from the path built in the previous cell.
full_ensemble = np.load(file=output_file)

In [11]:
# Presumably (number of partitions, number of clustered objects) — TODO confirm.
display(full_ensemble.shape)

(4428, 3749)

# Get ensemble coassociation distance matrix

In [12]:
# Path of the stored coassociation matrix. As with the ensemble, `output_file`
# is an input here: it is read with np.load in the next cell.
output_file = (RESULTS_DIR / "ensemble_coassoc_matrix.npy").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering/ensemble_coassoc_matrix.npy')

In [13]:
# Read the coassociation matrix from the path built in the previous cell.
ensemble_coassoc_matrix = np.load(file=output_file)

In [14]:
# The matrix is expected to be square — TODO confirm it matches the number of
# columns of `full_ensemble`.
display(ensemble_coassoc_matrix.shape)

(3749, 3749)

In [15]:
# Quick visual check of the loaded matrix (numpy abbreviates large arrays).
display(ensemble_coassoc_matrix)

array([[0.        , 0.05004529, 0.04868659, ..., 0.62794384, 0.66445714,
        0.67006341],
       [0.05004529, 0.        , 0.02241848, ..., 0.62771739, 0.66514286,
        0.66779891],
       [0.04868659, 0.02241848, 0.        , ..., 0.62703804, 0.66285714,
        0.66530797],
       ...,
       [0.62794384, 0.62771739, 0.62703804, ..., 0.        , 0.59288808,
        0.66282746],
       [0.66445714, 0.66514286, 0.66285714, ..., 0.59288808, 0.        ,
        0.6127194 ],
       [0.67006341, 0.66779891, 0.66530797, ..., 0.66282746, 0.6127194 ,
        0.        ]])

# Consensus clustering

In [16]:
from concurrent.futures import ProcessPoolExecutor, as_completed

from tqdm import tqdm

from clustering.ensemble import (
    eac_single_coassoc_matrix,
    eac_complete_coassoc_matrix,
    eac_average_coassoc_matrix,
    run_method_and_compute_agreement,
)

In [17]:
# Consensus functions to evaluate; each one is run for every k further below.
all_consensus_methods = {
    eac_single_coassoc_matrix,
    eac_complete_coassoc_matrix,
    eac_average_coassoc_matrix,
}
display(all_consensus_methods)

{<function clustering.ensemble.eac_average_coassoc_matrix(coassoc_matrix, k)>,
 <function clustering.ensemble.eac_complete_coassoc_matrix(coassoc_matrix, k)>,
 <function clustering.ensemble.eac_single_coassoc_matrix(coassoc_matrix, k)>}

In [18]:
# Run every consensus method for every k in [K_MIN, K_MAX] in parallel worker
# processes and collect one result record (dict) per (method, k) pair.
consensus_results = []

with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
    # Map each submitted future to the (method name, k) it was created for, so
    # the result can be labeled when the future completes.
    tasks = {
        executor.submit(
            run_method_and_compute_agreement,
            m,
            ensemble_coassoc_matrix,
            full_ensemble,
            k,
        ): (m.__name__, k)
        for m in all_consensus_methods
        for k in range(CLUSTERING_OPTIONS["K_MIN"], CLUSTERING_OPTIONS["K_MAX"] + 1)
    }

    for future in tqdm(as_completed(tasks), total=len(tasks), disable=False, ncols=100):
        method_name, k = tasks[future]
        # Each task returns the consensus partition and its performance values;
        # future.result() re-raises any exception raised in the worker.
        part, performance_values = future.result()

        method_results = {
            "method": method_name,
            "partition": part,
            "k": k,
        }
        method_results.update(performance_values)

        consensus_results.append(method_results)

# as_completed yields futures in completion order, and the methods come from a
# set, so the collection order depends on timing and on the kernel session.
# Sort by (method, k) to make the results (and the saved file) reproducible.
consensus_results.sort(key=lambda result: (result["method"], result["k"]))

  0%|                                                                       | 0/117 [00:00<?, ?it/s]

  1%|▌                                                              | 1/117 [00:23<45:43, 23.65s/it]

  2%|█                                                              | 2/117 [00:24<32:25, 16.92s/it]

  3%|█▌                                                             | 3/117 [00:26<23:25, 12.33s/it]

  3%|██▏                                                            | 4/117 [00:49<29:12, 15.51s/it]

  4%|██▋                                                            | 5/117 [00:52<21:54, 11.73s/it]

  5%|███▏                                                           | 6/117 [00:54<16:18,  8.82s/it]

  6%|███▊                                                           | 7/117 [01:17<24:13, 13.22s/it]

  7%|████▎                                                          | 8/117 [01:21<18:40, 10.28s/it]

  8%|████▊                                                          | 9/117 [01:23<14:12,  7.89s/it]

  9%|█████▎                                                        | 10/117 [01:47<22:52, 12.83s/it]

  9%|█████▊                                                        | 11/117 [01:51<17:50, 10.10s/it]

 10%|██████▎                                                       | 12/117 [01:54<13:58,  7.99s/it]

 11%|██████▉                                                       | 13/117 [02:19<22:28, 12.96s/it]

 12%|███████▍                                                      | 14/117 [02:23<17:50, 10.39s/it]

 13%|███████▉                                                      | 15/117 [02:27<14:09,  8.32s/it]

 14%|████████▍                                                     | 16/117 [02:52<22:34, 13.41s/it]

 15%|█████████                                                     | 17/117 [02:57<17:56, 10.77s/it]

 15%|█████████▌                                                    | 18/117 [03:00<14:21,  8.71s/it]

 16%|██████████                                                    | 19/117 [03:26<22:28, 13.76s/it]

 17%|██████████▌                                                   | 20/117 [03:31<18:03, 11.17s/it]

 18%|███████████▏                                                  | 21/117 [03:35<14:31,  9.08s/it]

 19%|███████████▋                                                  | 22/117 [04:01<22:18, 14.09s/it]

 20%|████████████▏                                                 | 23/117 [04:07<18:12, 11.62s/it]

 21%|████████████▋                                                 | 24/117 [04:12<14:44,  9.51s/it]

 21%|█████████████▏                                                | 25/117 [04:38<22:11, 14.48s/it]

 22%|█████████████▊                                                | 26/117 [04:44<18:13, 12.02s/it]

 23%|██████████████▎                                               | 27/117 [04:49<14:44,  9.83s/it]

 24%|██████████████▊                                               | 28/117 [05:16<22:30, 15.17s/it]

 25%|███████████████▎                                              | 29/117 [05:23<18:31, 12.63s/it]

 26%|███████████████▉                                              | 30/117 [05:28<14:52, 10.25s/it]

 26%|████████████████▍                                             | 31/117 [05:56<22:20, 15.58s/it]

 27%|████████████████▉                                             | 32/117 [06:03<18:26, 13.02s/it]

 28%|█████████████████▍                                            | 33/117 [06:08<14:50, 10.60s/it]

 29%|██████████████████                                            | 34/117 [06:36<22:03, 15.95s/it]

 30%|██████████████████▌                                           | 35/117 [06:43<18:16, 13.37s/it]

 31%|███████████████████                                           | 36/117 [06:49<14:46, 10.94s/it]

 32%|███████████████████▌                                          | 37/117 [07:18<21:48, 16.35s/it]

 32%|████████████████████▏                                         | 38/117 [07:25<18:03, 13.71s/it]

 33%|████████████████████▋                                         | 39/117 [07:31<14:37, 11.25s/it]

 34%|█████████████████████▏                                        | 40/117 [07:40<13:45, 10.73s/it]

 35%|█████████████████████▋                                        | 41/117 [07:48<12:34,  9.93s/it]

 36%|██████████████████████▎                                       | 42/117 [07:55<11:11,  8.96s/it]

 37%|██████████████████████▊                                       | 43/117 [08:05<11:26,  9.28s/it]

 38%|███████████████████████▎                                      | 44/117 [08:14<10:59,  9.03s/it]

 38%|███████████████████████▊                                      | 45/117 [08:21<10:07,  8.44s/it]

 39%|████████████████████████▍                                     | 46/117 [08:31<10:43,  9.07s/it]

 40%|████████████████████████▉                                     | 47/117 [08:40<10:29,  9.00s/it]

 41%|█████████████████████████▍                                    | 48/117 [08:48<09:54,  8.61s/it]

 42%|█████████████████████████▉                                    | 49/117 [08:59<10:32,  9.30s/it]

 43%|██████████████████████████▍                                   | 50/117 [09:08<10:18,  9.22s/it]

 44%|███████████████████████████                                   | 51/117 [09:16<09:45,  8.88s/it]

 44%|███████████████████████████▌                                  | 52/117 [09:27<10:26,  9.63s/it]

 45%|████████████████████████████                                  | 53/117 [09:36<10:09,  9.52s/it]

 46%|████████████████████████████▌                                 | 54/117 [09:45<09:41,  9.23s/it]

 47%|█████████████████████████████▏                                | 55/117 [09:57<10:22, 10.04s/it]

 48%|█████████████████████████████▋                                | 56/117 [10:06<10:02,  9.87s/it]

 49%|██████████████████████████████▏                               | 57/117 [10:15<09:32,  9.54s/it]

 50%|██████████████████████████████▋                               | 58/117 [10:27<10:11, 10.37s/it]

 50%|███████████████████████████████▎                              | 59/117 [10:37<09:52, 10.22s/it]

 51%|███████████████████████████████▊                              | 60/117 [10:46<09:21,  9.85s/it]

 52%|████████████████████████████████▎                             | 61/117 [10:59<10:04, 10.79s/it]

 53%|████████████████████████████████▊                             | 62/117 [11:09<09:41, 10.57s/it]

 54%|█████████████████████████████████▍                            | 63/117 [11:19<09:10, 10.20s/it]

 55%|█████████████████████████████████▉                            | 64/117 [11:32<09:52, 11.18s/it]

 56%|██████████████████████████████████▍                           | 65/117 [11:42<09:28, 10.93s/it]

 56%|██████████████████████████████████▉                           | 66/117 [11:52<08:57, 10.54s/it]

 57%|███████████████████████████████████▌                          | 67/117 [12:06<09:37, 11.54s/it]

 58%|████████████████████████████████████                          | 68/117 [12:17<09:13, 11.30s/it]

 59%|████████████████████████████████████▌                         | 69/117 [12:27<08:41, 10.87s/it]

 60%|█████████████████████████████████████                         | 70/117 [12:41<09:18, 11.88s/it]

 61%|█████████████████████████████████████▌                        | 71/117 [12:52<08:56, 11.66s/it]

 62%|██████████████████████████████████████▏                       | 72/117 [13:02<08:25, 11.24s/it]

 62%|██████████████████████████████████████▋                       | 73/117 [13:17<08:57, 12.21s/it]

 63%|███████████████████████████████████████▏                      | 74/117 [13:28<08:34, 11.97s/it]

 64%|███████████████████████████████████████▋                      | 75/117 [13:39<08:05, 11.56s/it]

 65%|████████████████████████████████████████▎                     | 76/117 [13:53<08:32, 12.49s/it]

 66%|████████████████████████████████████████▊                     | 77/117 [14:05<08:14, 12.37s/it]

 67%|█████████████████████████████████████████▎                    | 78/117 [14:16<07:45, 11.93s/it]

 68%|█████████████████████████████████████████▊                    | 79/117 [14:17<05:30,  8.69s/it]

 68%|██████████████████████████████████████████▍                   | 80/117 [14:30<06:09,  9.98s/it]

 69%|██████████████████████████████████████████▉                   | 81/117 [14:42<06:12, 10.34s/it]

 70%|███████████████████████████████████████████▍                  | 82/117 [14:45<04:45,  8.15s/it]

 71%|███████████████████████████████████████████▉                  | 83/117 [14:58<05:31,  9.76s/it]

 72%|████████████████████████████████████████████▌                 | 84/117 [15:10<05:43, 10.41s/it]

 73%|█████████████████████████████████████████████                 | 85/117 [15:14<04:26,  8.32s/it]

 74%|█████████████████████████████████████████████▌                | 86/117 [15:28<05:11, 10.04s/it]

 74%|██████████████████████████████████████████████                | 87/117 [15:40<05:20, 10.67s/it]

 75%|██████████████████████████████████████████████▋               | 88/117 [15:44<04:11,  8.66s/it]

 76%|███████████████████████████████████████████████▏              | 89/117 [15:58<04:51, 10.42s/it]

 77%|███████████████████████████████████████████████▋              | 90/117 [16:11<04:56, 11.00s/it]

 78%|████████████████████████████████████████████████▏             | 91/117 [16:16<04:04,  9.40s/it]

 79%|████████████████████████████████████████████████▊             | 92/117 [16:31<04:36, 11.06s/it]

 79%|█████████████████████████████████████████████████▎            | 93/117 [16:44<04:39, 11.65s/it]

 80%|█████████████████████████████████████████████████▊            | 94/117 [16:50<03:50, 10.02s/it]

 81%|██████████████████████████████████████████████████▎           | 95/117 [17:06<04:16, 11.68s/it]

 82%|██████████████████████████████████████████████████▊           | 96/117 [17:19<04:14, 12.10s/it]

 83%|███████████████████████████████████████████████████▍          | 97/117 [17:26<03:30, 10.54s/it]

 84%|███████████████████████████████████████████████████▉          | 98/117 [17:42<03:52, 12.22s/it]

 85%|████████████████████████████████████████████████████▍         | 99/117 [17:56<03:47, 12.64s/it]

 85%|████████████████████████████████████████████████████▏        | 100/117 [18:03<03:08, 11.10s/it]

 86%|████████████████████████████████████████████████████▋        | 101/117 [18:20<03:22, 12.68s/it]

 87%|█████████████████████████████████████████████████████▏       | 102/117 [18:33<03:15, 13.05s/it]

 88%|█████████████████████████████████████████████████████▋       | 103/117 [18:41<02:41, 11.53s/it]

 89%|██████████████████████████████████████████████████████▏      | 104/117 [18:58<02:51, 13.16s/it]

 90%|██████████████████████████████████████████████████████▋      | 105/117 [19:13<02:41, 13.50s/it]

 91%|███████████████████████████████████████████████████████▎     | 106/117 [19:21<02:12, 12.00s/it]

 91%|███████████████████████████████████████████████████████▊     | 107/117 [19:38<02:15, 13.52s/it]

 92%|████████████████████████████████████████████████████████▎    | 108/117 [19:53<02:04, 13.88s/it]

 93%|████████████████████████████████████████████████████████▊    | 109/117 [20:03<01:41, 12.69s/it]

 94%|█████████████████████████████████████████████████████████▎   | 110/117 [20:20<01:38, 14.08s/it]

 95%|█████████████████████████████████████████████████████████▊   | 111/117 [20:35<01:25, 14.31s/it]

 96%|██████████████████████████████████████████████████████████▍  | 112/117 [20:45<01:05, 13.08s/it]

 97%|██████████████████████████████████████████████████████████▉  | 113/117 [21:03<00:58, 14.53s/it]

 97%|███████████████████████████████████████████████████████████▍ | 114/117 [21:18<00:44, 14.72s/it]

 98%|███████████████████████████████████████████████████████████▉ | 115/117 [21:29<00:27, 13.51s/it]

 99%|████████████████████████████████████████████████████████████▍| 116/117 [21:47<00:14, 14.72s/it]

100%|█████████████████████████████████████████████████████████████| 117/117 [22:01<00:00, 14.68s/it]

100%|█████████████████████████████████████████████████████████████| 117/117 [22:01<00:00, 11.30s/it]




In [19]:
# TODO: verify that each consensus partition actually contains exactly k clusters

In [20]:
# Turn the list of per-(method, k) result records into a table.
# NOTE: this rebinds `consensus_results` from a list of dicts to a DataFrame.
consensus_results = pd.DataFrame(data=consensus_results)

In [21]:
# One row is expected per (method, k) combination submitted above.
display(consensus_results.shape)

(117, 12)

In [22]:
# Preview the first rows: method, partition, k and the performance columns.
consensus_results.head()

Unnamed: 0,method,partition,k,ari_mean,ari_median,ari_std,ami_mean,ami_median,ami_std,nmi_mean,nmi_median,nmi_std
0,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,0.240317,0.037024,0.349342,0.24949,0.080186,0.324666,0.251982,0.082369,0.323813
1,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,0.263578,0.054609,0.356479,0.284536,0.112219,0.32368,0.288405,0.116221,0.322205
2,eac_average_coassoc_matrix,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",4,0.287902,0.124714,0.331105,0.315595,0.187761,0.275006,0.320842,0.192473,0.273039
3,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,0.287923,0.124713,0.331126,0.318194,0.191098,0.27662,0.324061,0.195962,0.274261
4,eac_average_coassoc_matrix,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6,0.223957,0.147902,0.226714,0.299478,0.262243,0.142417,0.306541,0.268924,0.141986


## Save

In [23]:
# Destination of the pickled table with all consensus clustering runs.
output_file = (RESULTS_DIR / "consensus_clustering_runs.pkl").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base3/results/clustering/consensus_clustering/consensus_clustering_runs.pkl')

In [24]:
# Persist the results table to the path built in the previous cell.
consensus_results.to_pickle(path=output_file)