# Summary

This notebook allows to reproduce the method results on the BRCA dataset.  
We have analyzed the dataset with both GMM and HDBSCAN algorithms.

In [1]:
import sys
sys.path.append("..")

#GPU configuration
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # to log device placement (on which device the operation ran)
sess = tf.Session(config=config)
set_session(sess)  # set this TensorFlow session as the default

import random
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.cluster import adjusted_rand_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scripts.data_generator as data_generator
import scripts.feature_ranking as feature_ranking
import scripts.features_2d as features_2d
import scripts.ga as ga
import scripts.preprocess as preprocess
import scripts.ga_evaluation as ga_evaluation
import scripts.bio_analysis as bio_analysis
import tensorflow as tf
from IPython import get_ipython
from tqdm import tqdm
from collections import Counter

plt.ion()
plt.show()

random_state=1
random.seed( random_state )
np.random.seed(random_state)

%load_ext autoreload
%autoreload 2

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5



Using TensorFlow backend.


# Preprocessing

In [2]:
# truth_column = "tumor_type"
# truth_values = ['type 1', 'type 2']
# filename = "KIRP"

# df = pd.read_csv("../data/rna_data/KIRP.txt", sep = "\t", low_memory=False)
# meta = pd.read_csv("../data/rna_data/KIRP_All_CDEs.txt", sep = "\t", low_memory=False)

# preprocess.preprocess_rna(df,
#                    meta,
#                    truth_column,
#                    truth_values,
#                    filename,
#                    metric='correlation',#'euclidean',
#                    normalize=True)

# Load preprocessed data

## Start here if preprocessing files have been generated

In [3]:
filename = "BRCA"

data = pd.read_pickle(f"../data/rna_data/{filename}.pkl")
z_file =f"../data/rna_data/{filename}_Z_correlation.npy"
additional_df = pd.read_pickle(f"../data/rna_data/{filename}_additional.pkl")

truth = data["y"].values
data = data.drop("y", axis = 1).values
n_clusters = len(np.unique(truth))
Counter(truth), data.shape

# Subspace clustering

(Counter({1: 794, 0: 237}), (1031, 18054))

In [5]:
meta_features = feature_ranking.rank_features(data,
                                              nb_bins=20,
                                              rank_threshold=85,
                                              z_file=z_file,
                                              metric='correlation',
                                              redundant_threshold=0.4)

*** Computing 1D feature ranking ...
Dispersion tests took 5.34 sec
Entropy computation 13.33 sec
KNN computation 192.56 sec
Sorting and thresholds 0.07 sec
Loading clustering from file
Hierarchical clustering 0.06 sec
Handle redundant features 2.88 sec
Returning 3342 redundant features and  1249 important features


In [6]:
model_file = "../models/gmm_arl.h5"
gmm_arl_population, n = features_2d.run(data,
                                n_clusters,
                                meta_features,
                                model_file=model_file,
                                theta=0.1,
                                add_close_population=True)
print(gmm_arl_population.shape, n)

*** Exploring 2D feature space with NN ...
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


trimming 26166 to 7356
handle_close_important (7356, 3), total 32871, 0.23803995549678802
relevant_features 6648 => computing 39840 
trimming 19933 to 3342
handle_important_features (10698, 3),  total 40835, 0.23803995549678802
irrelevant_features 8064 => computing 24182
trimming 9937 to 7356
handle_not_important_features (18054, 3), total 25181, 0.23803995549678802
handle_all_features 14712 => computing 44127
trimming 20339 to 10000
handle_all_features (28054, 3),  total 45125, 0.23803995549678802
Returning (28054, 3), explored a total of 144012 feature pairs
(28054, 4) 144012


In [4]:
globalResults = {} # save results from both runs

In [8]:
method = "adapted_ratkowsky_lance"
threshold=0.09#
score_tolerance=0.009
clustering = "gmm"

round_size = 3
debug = False
ignore_redundant= True
epochs = 10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0.3,
        "max": 0.3 },
    "CLOSE": { 
        "ga": 0.4,
        "max": 0.4 },
    "IMP1D": { 
        "ga": 0.2,
        "max": 0.2 },
    "RANDOM": { 
        "ga": 0.1,
        "max": 0.1},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=gmm_arl_population[gmm_arl_population["pred"] > threshold].iloc[:7000],
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 500

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
display(solutions)
globalResults[f"{clustering}_{method}"] = solutions
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 14712, orig size 18054, nb imp : 1249
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [30 40 20 10] [0.3, 0.4, 0.2, 0.1]
Selecting (50, 4) from archive


 10%|▉         | 3/31 [00:12<02:00,  4.31s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.241855,0.32,"[17460, 17463]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",2.0,"Counter({0: 734, 1: 297})"


Selecting (1, 4) from archive


 19%|█▉        | 6/31 [00:41<03:09,  7.59s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.241855,0.32,"[17460, 17463]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",2.0,"Counter({0: 734, 1: 297})"
1,0.248232,0.66,"[43, 79, 135, 362, 400, 401, 721, 758, 988, 20...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, ...",68.0,"Counter({0: 803, 1: 228})"


Selecting (7, 4) from archive


 29%|██▉       | 9/31 [03:36<10:39, 29.07s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.241855,0.32,"[17460, 17463]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",2.0,"Counter({0: 734, 1: 297})"
1,0.248232,0.66,"[43, 79, 135, 362, 400, 401, 721, 758, 988, 20...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, ...",68.0,"Counter({0: 803, 1: 228})"
2,0.227094,0.1,"[13786, 13787, 13788]","[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, ...",3.0,"Counter({1: 575, 0: 456})"


Selecting (1, 4) from archive


 39%|███▊      | 12/31 [03:56<04:21, 13.74s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.241855,0.32,"[17460, 17463]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",2.0,"Counter({0: 734, 1: 297})"
1,0.248232,0.66,"[43, 79, 135, 362, 400, 401, 721, 758, 988, 20...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, ...",68.0,"Counter({0: 803, 1: 228})"
2,0.227094,0.1,"[13786, 13787, 13788]","[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, ...",3.0,"Counter({1: 575, 0: 456})"
3,0.237427,0.35,"[581, 1247, 1343, 1602, 1964, 2906, 3398, 4178...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, ...",48.0,"Counter({0: 730, 1: 301})"


Selecting (6, 4) from archive


 48%|████▊     | 15/31 [05:42<05:38, 21.16s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.241855,0.32,"[17460, 17463]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",2.0,"Counter({0: 734, 1: 297})"
1,0.248232,0.66,"[43, 79, 135, 362, 400, 401, 721, 758, 988, 20...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, ...",68.0,"Counter({0: 803, 1: 228})"
2,0.227094,0.1,"[13786, 13787, 13788]","[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, ...",3.0,"Counter({1: 575, 0: 456})"
3,0.237427,0.35,"[581, 1247, 1343, 1602, 1964, 2906, 3398, 4178...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, ...",48.0,"Counter({0: 730, 1: 301})"
4,0.221746,-0.03,"[44, 57, 62, 600, 824, 1221, 1971, 1972, 2579,...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, ...",16.0,"Counter({1: 708, 0: 323})"


Selecting (8, 4) from archive


 58%|█████▊    | 18/31 [06:26<03:08, 14.46s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.241855,0.32,"[17460, 17463]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",2.0,"Counter({0: 734, 1: 297})"
1,0.248232,0.66,"[43, 79, 135, 362, 400, 401, 721, 758, 988, 20...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, ...",68.0,"Counter({0: 803, 1: 228})"
2,0.227094,0.1,"[13786, 13787, 13788]","[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, ...",3.0,"Counter({1: 575, 0: 456})"
3,0.237427,0.35,"[581, 1247, 1343, 1602, 1964, 2906, 3398, 4178...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, ...",48.0,"Counter({0: 730, 1: 301})"
4,0.221746,-0.03,"[44, 57, 62, 600, 824, 1221, 1971, 1972, 2579,...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, ...",16.0,"Counter({1: 708, 0: 323})"
5,0.229541,0.16,"[4264, 4739, 10177, 11004, 12171]","[0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, ...",5.0,"Counter({0: 627, 1: 404})"


Selecting (3, 4) from archive


 68%|██████▊   | 21/31 [06:47<01:28,  8.89s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.241855,0.32,"[17460, 17463]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",2.0,"Counter({0: 734, 1: 297})"
1,0.248232,0.66,"[43, 79, 135, 362, 400, 401, 721, 758, 988, 20...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, ...",68.0,"Counter({0: 803, 1: 228})"
2,0.227094,0.1,"[13786, 13787, 13788]","[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, ...",3.0,"Counter({1: 575, 0: 456})"
3,0.237427,0.35,"[581, 1247, 1343, 1602, 1964, 2906, 3398, 4178...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, ...",48.0,"Counter({0: 730, 1: 301})"
4,0.221746,-0.03,"[44, 57, 62, 600, 824, 1221, 1971, 1972, 2579,...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, ...",16.0,"Counter({1: 708, 0: 323})"
5,0.229541,0.16,"[4264, 4739, 10177, 11004, 12171]","[0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, ...",5.0,"Counter({0: 627, 1: 404})"
6,0.216358,0.06,"[7509, 7510, 7512, 11081, 11082, 17200]","[1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",6.0,"Counter({1: 646, 0: 385})"


Selecting (7, 4) from archive


 77%|███████▋  | 24/31 [07:12<00:52,  7.54s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.241855,0.32,"[17460, 17463]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",2.0,"Counter({0: 734, 1: 297})"
1,0.248232,0.66,"[43, 79, 135, 362, 400, 401, 721, 758, 988, 20...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, ...",68.0,"Counter({0: 803, 1: 228})"
2,0.227094,0.1,"[13786, 13787, 13788]","[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, ...",3.0,"Counter({1: 575, 0: 456})"
3,0.237427,0.35,"[581, 1247, 1343, 1602, 1964, 2906, 3398, 4178...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, ...",48.0,"Counter({0: 730, 1: 301})"
4,0.221746,-0.03,"[44, 57, 62, 600, 824, 1221, 1971, 1972, 2579,...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, ...",16.0,"Counter({1: 708, 0: 323})"
5,0.229541,0.16,"[4264, 4739, 10177, 11004, 12171]","[0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, ...",5.0,"Counter({0: 627, 1: 404})"
6,0.216358,0.06,"[7509, 7510, 7512, 11081, 11082, 17200]","[1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",6.0,"Counter({1: 646, 0: 385})"
7,0.222361,0.0,"[9086, 11405]","[0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...",2.0,"Counter({1: 534, 0: 497})"


Selecting (1, 4) from archive


 87%|████████▋ | 27/31 [07:33<00:26,  6.57s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.241855,0.32,"[17460, 17463]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",2.0,"Counter({0: 734, 1: 297})"
1,0.248232,0.66,"[43, 79, 135, 362, 400, 401, 721, 758, 988, 20...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, ...",68.0,"Counter({0: 803, 1: 228})"
2,0.227094,0.1,"[13786, 13787, 13788]","[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, ...",3.0,"Counter({1: 575, 0: 456})"
3,0.237427,0.35,"[581, 1247, 1343, 1602, 1964, 2906, 3398, 4178...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, ...",48.0,"Counter({0: 730, 1: 301})"
4,0.221746,-0.03,"[44, 57, 62, 600, 824, 1221, 1971, 1972, 2579,...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, ...",16.0,"Counter({1: 708, 0: 323})"
5,0.229541,0.16,"[4264, 4739, 10177, 11004, 12171]","[0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, ...",5.0,"Counter({0: 627, 1: 404})"
6,0.216358,0.06,"[7509, 7510, 7512, 11081, 11082, 17200]","[1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",6.0,"Counter({1: 646, 0: 385})"
7,0.222361,0.0,"[9086, 11405]","[0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...",2.0,"Counter({1: 534, 0: 497})"
8,0.215699,0.02,"[2829, 5360, 8875]","[1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, ...",3.0,"Counter({1: 641, 0: 390})"


Selecting (2, 4) from archive


 97%|█████████▋| 30/31 [07:54<00:06,  6.46s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.241855,0.32,"[17460, 17463]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",2.0,"Counter({0: 734, 1: 297})"
1,0.248232,0.66,"[43, 79, 135, 362, 400, 401, 721, 758, 988, 20...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, ...",68.0,"Counter({0: 803, 1: 228})"
2,0.227094,0.1,"[13786, 13787, 13788]","[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, ...",3.0,"Counter({1: 575, 0: 456})"
3,0.237427,0.35,"[581, 1247, 1343, 1602, 1964, 2906, 3398, 4178...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, ...",48.0,"Counter({0: 730, 1: 301})"
4,0.221746,-0.03,"[44, 57, 62, 600, 824, 1221, 1971, 1972, 2579,...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, ...",16.0,"Counter({1: 708, 0: 323})"
5,0.229541,0.16,"[4264, 4739, 10177, 11004, 12171]","[0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, ...",5.0,"Counter({0: 627, 1: 404})"
6,0.216358,0.06,"[7509, 7510, 7512, 11081, 11082, 17200]","[1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",6.0,"Counter({1: 646, 0: 385})"
7,0.222361,0.0,"[9086, 11405]","[0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...",2.0,"Counter({1: 534, 0: 497})"
8,0.215699,0.02,"[2829, 5360, 8875]","[1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, ...",3.0,"Counter({1: 641, 0: 390})"
9,0.218267,0.0,"[1237, 2155]","[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, ...",2.0,"Counter({0: 567, 1: 464})"


Selecting (1, 4) from archive


100%|██████████| 31/31 [08:06<00:00, 15.70s/it]


Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.241855,0.32,"[17460, 17463]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",2.0,"Counter({0: 734, 1: 297})"
1,0.248232,0.66,"[43, 79, 135, 362, 400, 401, 721, 758, 988, 20...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, ...",68.0,"Counter({0: 803, 1: 228})"
2,0.227094,0.1,"[13786, 13787, 13788]","[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, ...",3.0,"Counter({1: 575, 0: 456})"
3,0.237427,0.35,"[581, 1247, 1343, 1602, 1964, 2906, 3398, 4178...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, ...",48.0,"Counter({0: 730, 1: 301})"
4,0.221746,-0.03,"[44, 57, 62, 600, 824, 1221, 1971, 1972, 2579,...","[1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, ...",16.0,"Counter({1: 708, 0: 323})"
5,0.229541,0.16,"[4264, 4739, 10177, 11004, 12171]","[0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, ...",5.0,"Counter({0: 627, 1: 404})"
6,0.216358,0.06,"[7509, 7510, 7512, 11081, 11082, 17200]","[1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",6.0,"Counter({1: 646, 0: 385})"
7,0.222361,0.0,"[9086, 11405]","[0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...",2.0,"Counter({1: 534, 0: 497})"
8,0.215699,0.02,"[2829, 5360, 8875]","[1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, ...",3.0,"Counter({1: 641, 0: 390})"
9,0.218267,0.0,"[1237, 2155]","[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, ...",2.0,"Counter({0: 567, 1: 464})"


In [9]:
method = "adapted_silhouette"
threshold=0.09
score_tolerance=0.009
clustering = "hdbscan"

round_size = 3
debug = False
ignore_redundant= True
epochs = 10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0.3,
        "max": 0.3 },
    "CLOSE": { 
        "ga": 0.4,
        "max": 0.4 },
    "IMP1D": { 
        "ga": 0.2,
        "max": 0.2 },
    "RANDOM": { 
        "ga": 0.1,
        "max": 0.1},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=gmm_arl_population[gmm_arl_population["pred"] > threshold].iloc[:7000],
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 500

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")
display(solutions)
globalResults[f"{clustering}_{method}"] = solutions

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 14712, orig size 18054, nb imp : 1249
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [30 40 20 10] [0.3, 0.4, 0.2, 0.1]
Selecting (50, 4) from archive


 10%|▉         | 3/31 [00:48<07:45, 16.61s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.636636,-0.03,"[894, 3372, 3373, 3829, 5397, 6937, 6938, 8202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18.0,"Counter({0: 998, -1: 23, 1: 10})"


Selecting (3, 4) from archive


 19%|█▉        | 6/31 [03:01<12:09, 29.18s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.636636,-0.03,"[894, 3372, 3373, 3829, 5397, 6937, 6938, 8202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18.0,"Counter({0: 998, -1: 23, 1: 10})"
1,0.52052,0.62,"[120, 401, 768, 2424, 2525, 4631, 6100, 10301]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",8.0,"Counter({0: 803, 1: 207, -1: 21})"


Selecting (32, 4) from archive


 29%|██▉       | 9/31 [05:35<13:30, 36.83s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.636636,-0.03,"[894, 3372, 3373, 3829, 5397, 6937, 6938, 8202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18.0,"Counter({0: 998, -1: 23, 1: 10})"
1,0.52052,0.62,"[120, 401, 768, 2424, 2525, 4631, 6100, 10301]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",8.0,"Counter({0: 803, 1: 207, -1: 21})"
2,0.395957,0.02,"[849, 1445, 1972, 3051, 3693, 3694, 3914, 5874...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",11.0,"Counter({1: 907, 0: 107, -1: 17})"


Selecting (20, 4) from archive


 39%|███▊      | 12/31 [09:37<17:23, 54.94s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.636636,-0.03,"[894, 3372, 3373, 3829, 5397, 6937, 6938, 8202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18.0,"Counter({0: 998, -1: 23, 1: 10})"
1,0.52052,0.62,"[120, 401, 768, 2424, 2525, 4631, 6100, 10301]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",8.0,"Counter({0: 803, 1: 207, -1: 21})"
2,0.395957,0.02,"[849, 1445, 1972, 3051, 3693, 3694, 3914, 5874...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",11.0,"Counter({1: 907, 0: 107, -1: 17})"
3,0.484254,-0.04,"[20, 1510, 3127, 3375, 3424, 4328, 6575, 7231,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",21.0,"Counter({1: 995, 0: 22, -1: 14})"


Selecting (4, 4) from archive


 48%|████▊     | 15/31 [13:11<14:59, 56.19s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.636636,-0.03,"[894, 3372, 3373, 3829, 5397, 6937, 6938, 8202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18.0,"Counter({0: 998, -1: 23, 1: 10})"
1,0.52052,0.62,"[120, 401, 768, 2424, 2525, 4631, 6100, 10301]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",8.0,"Counter({0: 803, 1: 207, -1: 21})"
2,0.395957,0.02,"[849, 1445, 1972, 3051, 3693, 3694, 3914, 5874...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",11.0,"Counter({1: 907, 0: 107, -1: 17})"
3,0.484254,-0.04,"[20, 1510, 3127, 3375, 3424, 4328, 6575, 7231,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",21.0,"Counter({1: 995, 0: 22, -1: 14})"
4,0.493135,0.5,"[124, 400, 994, 2012, 3338, 5087, 5187, 5237, ...","[1, 1, 0, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1...",32.0,"Counter({1: 860, -1: 95, 0: 76})"


Selecting (46, 4) from archive


 58%|█████▊    | 18/31 [16:16<11:03, 51.03s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.636636,-0.03,"[894, 3372, 3373, 3829, 5397, 6937, 6938, 8202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18.0,"Counter({0: 998, -1: 23, 1: 10})"
1,0.52052,0.62,"[120, 401, 768, 2424, 2525, 4631, 6100, 10301]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",8.0,"Counter({0: 803, 1: 207, -1: 21})"
2,0.395957,0.02,"[849, 1445, 1972, 3051, 3693, 3694, 3914, 5874...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",11.0,"Counter({1: 907, 0: 107, -1: 17})"
3,0.484254,-0.04,"[20, 1510, 3127, 3375, 3424, 4328, 6575, 7231,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",21.0,"Counter({1: 995, 0: 22, -1: 14})"
4,0.493135,0.5,"[124, 400, 994, 2012, 3338, 5087, 5187, 5237, ...","[1, 1, 0, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1...",32.0,"Counter({1: 860, -1: 95, 0: 76})"
5,0.395158,0.36,"[1012, 2890, 3052, 4550, 6498, 9810, 15804]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7.0,"Counter({0: 916, 1: 85, -1: 30})"


Selecting (36, 4) from archive


 68%|██████▊   | 21/31 [18:37<07:04, 42.43s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.636636,-0.03,"[894, 3372, 3373, 3829, 5397, 6937, 6938, 8202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18.0,"Counter({0: 998, -1: 23, 1: 10})"
1,0.52052,0.62,"[120, 401, 768, 2424, 2525, 4631, 6100, 10301]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",8.0,"Counter({0: 803, 1: 207, -1: 21})"
2,0.395957,0.02,"[849, 1445, 1972, 3051, 3693, 3694, 3914, 5874...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",11.0,"Counter({1: 907, 0: 107, -1: 17})"
3,0.484254,-0.04,"[20, 1510, 3127, 3375, 3424, 4328, 6575, 7231,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",21.0,"Counter({1: 995, 0: 22, -1: 14})"
4,0.493135,0.5,"[124, 400, 994, 2012, 3338, 5087, 5187, 5237, ...","[1, 1, 0, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1...",32.0,"Counter({1: 860, -1: 95, 0: 76})"
5,0.395158,0.36,"[1012, 2890, 3052, 4550, 6498, 9810, 15804]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7.0,"Counter({0: 916, 1: 85, -1: 30})"
6,0.551471,0.03,"[216, 3992, 5742, 15924]","[0, 1, 1, 0, 0, 1, 0, 1, 0, -1, 1, 1, 0, 0, 1,...",4.0,"Counter({1: 576, 0: 446, -1: 9})"


Selecting (23, 4) from archive


 77%|███████▋  | 24/31 [20:46<04:27, 38.20s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.636636,-0.03,"[894, 3372, 3373, 3829, 5397, 6937, 6938, 8202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18.0,"Counter({0: 998, -1: 23, 1: 10})"
1,0.52052,0.62,"[120, 401, 768, 2424, 2525, 4631, 6100, 10301]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",8.0,"Counter({0: 803, 1: 207, -1: 21})"
2,0.395957,0.02,"[849, 1445, 1972, 3051, 3693, 3694, 3914, 5874...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",11.0,"Counter({1: 907, 0: 107, -1: 17})"
3,0.484254,-0.04,"[20, 1510, 3127, 3375, 3424, 4328, 6575, 7231,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",21.0,"Counter({1: 995, 0: 22, -1: 14})"
4,0.493135,0.5,"[124, 400, 994, 2012, 3338, 5087, 5187, 5237, ...","[1, 1, 0, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1...",32.0,"Counter({1: 860, -1: 95, 0: 76})"
5,0.395158,0.36,"[1012, 2890, 3052, 4550, 6498, 9810, 15804]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7.0,"Counter({0: 916, 1: 85, -1: 30})"
6,0.551471,0.03,"[216, 3992, 5742, 15924]","[0, 1, 1, 0, 0, 1, 0, 1, 0, -1, 1, 1, 0, 0, 1,...",4.0,"Counter({1: 576, 0: 446, -1: 9})"
7,0.48859,0.03,"[413, 1124, 3427, 3484, 4828, 5011, 12200]","[1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...",7.0,"Counter({1: 761, 0: 250, -1: 20})"


Selecting (2, 4) from archive


 87%|████████▋ | 27/31 [22:58<02:26, 36.58s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.636636,-0.03,"[894, 3372, 3373, 3829, 5397, 6937, 6938, 8202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18.0,"Counter({0: 998, -1: 23, 1: 10})"
1,0.52052,0.62,"[120, 401, 768, 2424, 2525, 4631, 6100, 10301]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",8.0,"Counter({0: 803, 1: 207, -1: 21})"
2,0.395957,0.02,"[849, 1445, 1972, 3051, 3693, 3694, 3914, 5874...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",11.0,"Counter({1: 907, 0: 107, -1: 17})"
3,0.484254,-0.04,"[20, 1510, 3127, 3375, 3424, 4328, 6575, 7231,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",21.0,"Counter({1: 995, 0: 22, -1: 14})"
4,0.493135,0.5,"[124, 400, 994, 2012, 3338, 5087, 5187, 5237, ...","[1, 1, 0, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1...",32.0,"Counter({1: 860, -1: 95, 0: 76})"
5,0.395158,0.36,"[1012, 2890, 3052, 4550, 6498, 9810, 15804]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7.0,"Counter({0: 916, 1: 85, -1: 30})"
6,0.551471,0.03,"[216, 3992, 5742, 15924]","[0, 1, 1, 0, 0, 1, 0, 1, 0, -1, 1, 1, 0, 0, 1,...",4.0,"Counter({1: 576, 0: 446, -1: 9})"
7,0.48859,0.03,"[413, 1124, 3427, 3484, 4828, 5011, 12200]","[1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...",7.0,"Counter({1: 761, 0: 250, -1: 20})"
8,0.443046,-0.0,"[312, 337, 2519, 2963, 3113, 3751, 4537, 6078,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",17.0,"Counter({1: 917, 0: 92, -1: 22})"


Selecting (14, 4) from archive


 97%|█████████▋| 30/31 [26:05<00:43, 43.96s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.636636,-0.03,"[894, 3372, 3373, 3829, 5397, 6937, 6938, 8202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18.0,"Counter({0: 998, -1: 23, 1: 10})"
1,0.52052,0.62,"[120, 401, 768, 2424, 2525, 4631, 6100, 10301]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",8.0,"Counter({0: 803, 1: 207, -1: 21})"
2,0.395957,0.02,"[849, 1445, 1972, 3051, 3693, 3694, 3914, 5874...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",11.0,"Counter({1: 907, 0: 107, -1: 17})"
3,0.484254,-0.04,"[20, 1510, 3127, 3375, 3424, 4328, 6575, 7231,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",21.0,"Counter({1: 995, 0: 22, -1: 14})"
4,0.493135,0.5,"[124, 400, 994, 2012, 3338, 5087, 5187, 5237, ...","[1, 1, 0, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1...",32.0,"Counter({1: 860, -1: 95, 0: 76})"
5,0.395158,0.36,"[1012, 2890, 3052, 4550, 6498, 9810, 15804]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7.0,"Counter({0: 916, 1: 85, -1: 30})"
6,0.551471,0.03,"[216, 3992, 5742, 15924]","[0, 1, 1, 0, 0, 1, 0, 1, 0, -1, 1, 1, 0, 0, 1,...",4.0,"Counter({1: 576, 0: 446, -1: 9})"
7,0.48859,0.03,"[413, 1124, 3427, 3484, 4828, 5011, 12200]","[1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...",7.0,"Counter({1: 761, 0: 250, -1: 20})"
8,0.443046,-0.0,"[312, 337, 2519, 2963, 3113, 3751, 4537, 6078,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",17.0,"Counter({1: 917, 0: 92, -1: 22})"
9,0.449962,0.41,"[22, 222, 1169, 2402, 2932, 4239, 6236, 6721, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",13.0,"Counter({0: 903, 1: 108, -1: 20})"


Selecting (21, 4) from archive


100%|██████████| 31/31 [27:34<00:00, 53.37s/it]


Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.636636,-0.03,"[894, 3372, 3373, 3829, 5397, 6937, 6938, 8202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18.0,"Counter({0: 998, -1: 23, 1: 10})"
1,0.52052,0.62,"[120, 401, 768, 2424, 2525, 4631, 6100, 10301]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...",8.0,"Counter({0: 803, 1: 207, -1: 21})"
2,0.395957,0.02,"[849, 1445, 1972, 3051, 3693, 3694, 3914, 5874...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",11.0,"Counter({1: 907, 0: 107, -1: 17})"
3,0.484254,-0.04,"[20, 1510, 3127, 3375, 3424, 4328, 6575, 7231,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",21.0,"Counter({1: 995, 0: 22, -1: 14})"
4,0.493135,0.5,"[124, 400, 994, 2012, 3338, 5087, 5187, 5237, ...","[1, 1, 0, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1...",32.0,"Counter({1: 860, -1: 95, 0: 76})"
5,0.395158,0.36,"[1012, 2890, 3052, 4550, 6498, 9810, 15804]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7.0,"Counter({0: 916, 1: 85, -1: 30})"
6,0.551471,0.03,"[216, 3992, 5742, 15924]","[0, 1, 1, 0, 0, 1, 0, 1, 0, -1, 1, 1, 0, 0, 1,...",4.0,"Counter({1: 576, 0: 446, -1: 9})"
7,0.48859,0.03,"[413, 1124, 3427, 3484, 4828, 5011, 12200]","[1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...",7.0,"Counter({1: 761, 0: 250, -1: 20})"
8,0.443046,-0.0,"[312, 337, 2519, 2963, 3113, 3751, 4537, 6078,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",17.0,"Counter({1: 917, 0: 92, -1: 22})"
9,0.449962,0.41,"[22, 222, 1169, 2402, 2932, 4239, 6236, 6721, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",13.0,"Counter({0: 903, 1: 108, -1: 20})"


# Supervised analysis

In [10]:
from sklearn import mixture
import hdbscan

In [11]:
ranked_features = feature_ranking.supervised_feature_ranking(data, truth, 
                        nbTopFeatures = data.shape[1])
data = data[:, ranked_features]
imp_f = np.arange(50)

In [12]:
gmm_scores = []
hdbscan_scores = []
for i in range(2, 50):
    input_data = data[:, :i]
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
    pred = gmm.fit_predict(input_data)
    ari = adjusted_rand_score(truth, pred)
    gmm_scores.append(ari)

    pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
    ari = adjusted_rand_score(truth, pred)
    hdbscan_scores.append(ari)
print(f" GMM ari = {max(gmm_scores)}, {np.argmax(gmm_scores)}")
print(f" HDBSCAN ari = {max(hdbscan_scores)}, {np.argmax(hdbscan_scores)}")

 GMM ari = 0.7146646841530905, 1
 HDBSCAN ari = 0.3006095266129624, 24


In [30]:
from sklearn.feature_selection import chi2,  mutual_info_classif, SelectKBest
sel = SelectKBest(mutual_info_classif, k=50).fit_transform(data, truth)
gmm_scores = []
hdbscan_scores = []
for i in range(2, 50):
    input_data = sel[:, :i]
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
    pred = gmm.fit_predict(input_data)
    ari = adjusted_rand_score(truth, pred)
    gmm_scores.append(ari)

    pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
    ari = adjusted_rand_score(truth, pred)
    hdbscan_scores.append(ari)
print(f" GMM ari = {max(gmm_scores)}, {np.argmax(gmm_scores)}")
print(f" HDBSCAN ari = {max(hdbscan_scores)}, {np.argmax(hdbscan_scores)}")

 GMM ari = 0.7051014904286836, 4
 HDBSCAN ari = 0.5310723627973412, 2


# Analyze entire dataset

In [None]:
input_data = data
gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
pred = gmm.fit_predict(input_data)
ari = adjusted_rand_score(truth, pred)
print(f"GMM ari = {ari}")

pred = hdbscan.HDBSCAN(min_cluster_size =10).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN ari = {ari}")
pred = KMeans(n_clusters= n_clusters).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"Kmeans ari = {ari}")

# Cluster PCA

In [None]:
# input_data = data
# gmm = mixture.GaussianMixture(n_components=n_clusters,
#                       covariance_type="full", random_state=0)
# pred = gmm.fit_predict(input_data)
# ari = adjusted_rand_score(truth, pred)
# print(f"GMM ari = {ari}")

In [None]:
# Predict on PCA
pca = PCA(2)
pca_data = pca.fit_transform(data)
input_data = pca_data
gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
pred = gmm.fit_predict(input_data)
ari = adjusted_rand_score(truth, pred)
print(f"GMM ari = {ari}")

pred = hdbscan.HDBSCAN(min_cluster_size =10).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN ari = {ari}")