# Description

This notebook reads all the clustering partitions obtained with the different algorithms and generates the ensemble by stacking them into a single numpy array. It then derives the coassociation matrix from the ensemble (see more details below).

# Environment variables

In [1]:
from IPython.display import display

import conf

# number of parallel jobs, read from the project configuration; it is reused
# below for the thread-count environment variables
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

10

In [2]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=10
env: OPEN_BLAS_NUM_THREADS=10
env: NUMEXPR_NUM_THREADS=10
env: OMP_NUM_THREADS=10


# Modules loading

In [3]:
# automatically reload edited modules before each cell is executed
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd

# Settings

In [5]:
# seeded NumPy random generator; in the visible part of this notebook it is
# only referenced by the commented-out resampling code
RANDOM_GENERATOR = np.random.default_rng(12345)

In [6]:
# base directory of the "shuffle_genes" null clustering results; both the
# input runs and this notebook's outputs are located below it
NULL_DIR = conf.RESULTS["CLUSTERING_NULL_DIR"] / "shuffle_genes"

## Ensemble size

For some clustering algorithms it is easy to control the number of final partitions to generate: for instance, for k-means, you can generate partitions from k=2 to k=20 (19 partitions with different number of clusters). However, with algorithms such as DBSCAN this is not very easy to achieve, since for some parameter combinations (`eps` and `min_samples`) it generates partitions with one cluster (which is not an actual partition of the data) that are not included here.

The parameters below specify the expected number of partitions for each clustering algorithm, and a range of allowed sizes. Then, the code below checks that each algorithm has the same representation in the ensemble. For example, if `EXPECTED_ENSEMBLE_SIZE=50`, `MIN_ENSEMBLE_SIZE=45` and `MAX_ENSEMBLE_SIZE=55`, the code below will check that k-means, spectral clustering, DBSCAN, etc, generated between 45 and 55 partitions. If not, it resamples the generated partitions to get 50 (the value specified by `EXPECTED_ENSEMBLE_SIZE`), so each algorithm has approximately the same representation in the full ensemble.

**Note:** in this notebook both these parameters and the resampling step are commented out, so every partition found in the input files is included in the ensemble as is.

In [7]:
# EXPECTED_ENSEMBLE_SIZE = 295

# MIN_ENSEMBLE_SIZE = 290
# MAX_ENSEMBLE_SIZE = 300

## Consensus clustering

In [8]:
# everything this notebook saves goes below NULL_DIR/consensus_clustering
RESULTS_DIR = (Path(NULL_DIR) / "consensus_clustering").resolve()
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

display(RESULTS_DIR)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/consensus_clustering')

# Get ensemble

## Load partition files

In [9]:
# the partition files to load are searched for below NULL_DIR/runs
input_dir = (Path(NULL_DIR) / "runs").resolve()
display(input_dir)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/runs')

In [10]:
# Collect every pickle file below input_dir, except the "stability" ones.
# The paths are sorted because rglob() yields them in a filesystem-dependent
# order, which would otherwise make the row order of the ensemble differ
# between machines or runs.
included_pkl_files = sorted(
    pkl_file
    for pkl_file in input_dir.rglob("*.pkl")
    # skip stability pkl files
    if "-stability-" not in str(pkl_file)
)

In [11]:
n_pkl_files = len(included_pkl_files)
display(n_pkl_files)

# 5 algorithms x 3 dataset versions, minus one because dbscan on z-scores
# fails here
assert n_pkl_files == (5 * 3) - 1

14

## Combine partition files to get final ensemble

In [12]:
# running count of the partitions (rows) collected from all the loaded files
n_partitions = 0

In [13]:
# holds one 2-D array (one row per partition) for each loaded pickle file
ensembles_list = []

In [14]:
# Start from empty accumulators so that re-running this cell on its own does
# not append the same partitions twice (the cells above initialize them too).
n_partitions = 0
ensembles_list = []

for ens_file in included_pkl_files:
    # each pickle file is read with pandas; its "partition" column is used below
    ens = pd.read_pickle(ens_file)

    # The per-algorithm resampling step is disabled in this notebook (its
    # parameters are commented out as well); the code is kept for reference.

    #     short_file_path = Path(*ens_file.parts[-2:])

    #     if ens.shape[0] < MIN_ENSEMBLE_SIZE:
    #         print(f"Less partitions than expected in {short_file_path}")

    #         # if less partitions than expected, resample with replacement
    #         ens = ens.sample(
    #             n=EXPECTED_ENSEMBLE_SIZE,
    #             replace=True,
    #             random_state=RANDOM_GENERATOR.bit_generator,
    #         )
    #         assert ens.shape[0] == EXPECTED_ENSEMBLE_SIZE

    #     elif ens.shape[0] > MAX_ENSEMBLE_SIZE:
    #         print(f"More partitions than expected in {short_file_path}")

    #         # if more partitions than expected, take a smaller sample
    #         ens = ens.sample(
    #             n=EXPECTED_ENSEMBLE_SIZE, random_state=RANDOM_GENERATOR.bit_generator
    #         )
    #         assert ens.shape[0] == EXPECTED_ENSEMBLE_SIZE

    # turn every partition into a single row and stack the rows, giving an
    # array with one row per partition stored in this file
    ens_full_format = np.concatenate(
        [partition.reshape(1, -1) for partition in ens["partition"]], axis=0
    )

    n_partitions += ens_full_format.shape[0]

    ensembles_list.append(ens_full_format)

In [15]:
# one array must have been loaded for every included pickle file
n_loaded_ensembles = len(ensembles_list)
display(n_loaded_ensembles)
assert n_loaded_ensembles == len(included_pkl_files)

14

In [16]:
# number of columns of the first loaded array, i.e. the length of each
# partition row
n_data_objects = ensembles_list[0].shape[1]
display(n_data_objects)

3752

In [17]:
# total number of partitions accumulated over all the loaded files
display(n_partitions)

3844

In [18]:
# Stack the per-file arrays row-wise in a single call; concatenating pairwise
# inside a loop re-copies the rows accumulated so far at every iteration.
full_ensemble = np.concatenate(ensembles_list, axis=0)

In [19]:
# the full ensemble must have one row per partition and one column per object
ensemble_shape = full_ensemble.shape
display(ensemble_shape)
assert ensemble_shape == (n_partitions, n_data_objects)

(3844, 3752)

## Save

In [20]:
# destination of the full ensemble inside the results directory
output_file = (Path(RESULTS_DIR) / "ensemble.npy").resolve()
display(output_file)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/consensus_clustering/ensemble.npy')

In [21]:
# quick visual check of the ensemble before it is saved
full_ensemble

array([[ 1.,  1.,  1., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [64., 27., 27., ..., 27., 27., 62.],
       [ 9.,  9.,  9., ...,  9.,  9., 28.]])

In [22]:
# write the full ensemble array to output_file with NumPy's save()
np.save(output_file, full_ensemble)

# Get coassociation matrix from ensemble

The coassociation matrix is a distance matrix derived from the ensemble, where each cell represents the fraction of partitions in which a pair of objects (traits and diseases in this case) was not clustered together. It serves as an input for any consensus function (basically, another clustering algorithm) to derive a consensus partition.

In [23]:
from clustering.ensembles.utils import get_ensemble_distance_matrix

In [24]:
# get_ensemble_distance_matrix is a project helper (not shown here); it is
# given the full ensemble and the same N_JOBS value read from the
# configuration at the top of the notebook
ensemble_coassoc_matrix = get_ensemble_distance_matrix(full_ensemble, n_jobs=N_JOBS)

In [25]:
# the displayed shape is square, with one row and one column per data object
ensemble_coassoc_matrix.shape

(3752, 3752)

In [26]:
# quick visual check of the matrix before it is saved
ensemble_coassoc_matrix

array([[0.        , 0.43731932, 0.45571616, ..., 0.49618922, 0.50591327,
        0.76722777],
       [0.43731932, 0.        , 0.41721333, ..., 0.44581475, 0.48517449,
        0.73476891],
       [0.45571616, 0.41721333, 0.        , ..., 0.45683548, 0.44502755,
        0.72689076],
       ...,
       [0.49618922, 0.44581475, 0.45683548, ..., 0.        , 0.38998164,
        0.74816176],
       [0.50591327, 0.48517449, 0.44502755, ..., 0.38998164, 0.        ,
        0.75157563],
       [0.76722777, 0.73476891, 0.72689076, ..., 0.74816176, 0.75157563,
        0.        ]])

## Save

In [27]:
# destination of the coassociation matrix inside the results directory
output_file = (Path(RESULTS_DIR) / "ensemble_coassoc_matrix.npy").resolve()
display(output_file)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/consensus_clustering/ensemble_coassoc_matrix.npy')

In [28]:
# write the coassociation matrix to output_file with NumPy's save()
np.save(output_file, ensemble_coassoc_matrix)