# Summary

This notebook allows to reproduce the method results on the BRCA dataset.  
We have analyzed the dataset with both GMM and HDBSCAN algorithms.

In [1]:
import sys
sys.path.append("..")

#GPU configuration
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # to log device placement (on which device the operation ran)
sess = tf.Session(config=config)
set_session(sess)  # set this TensorFlow session as the default

import random
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.cluster import adjusted_rand_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scripts.data_generator as data_generator
import scripts.feature_ranking as feature_ranking
import scripts.features_2d as features_2d
import scripts.ga as ga
import scripts.preprocess as preprocess
import scripts.ga_evaluation as ga_evaluation
import scripts.bio_analysis as bio_analysis
import tensorflow as tf
from IPython import get_ipython
from tqdm import tqdm
from collections import Counter

plt.ion()
plt.show()

random_state=1
random.seed( random_state )
np.random.seed(random_state)

%load_ext autoreload
%autoreload 2

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5



Using TensorFlow backend.

In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



# Preprocessing

In [2]:
# truth_column = "tumor_type"
# truth_values = ['type 1', 'type 2']
# filename = "KIRP"

# df = pd.read_csv("../data/rna_data/KIRP.txt", sep = "\t", low_memory=False)
# meta = pd.read_csv("../data/rna_data/KIRP_All_CDEs.txt", sep = "\t", low_memory=False)

# preprocess.preprocess_rna(df,
#                    meta,
#                    truth_column,
#                    truth_values,
#                    filename,
#                    metric='correlation',#'euclidean',
#                    normalize=True)

# Load preprocessed data

## Start here if preprocessing files have been generated

In [3]:
filename = "BRCA"

data = pd.read_pickle(f"../data/rna_data/{filename}.pkl")
z_file =f"../data/rna_data/{filename}_Z_correlation.npy"
additional_df = pd.read_pickle(f"../data/rna_data/{filename}_additional.pkl")

truth = data["y"].values
data = data.drop("y", axis = 1).values
n_clusters = len(np.unique(truth))
Counter(truth), data.shape

# Subspace clustering

(Counter({1: 794, 0: 237}), (1031, 18054))

In [4]:
meta_features = feature_ranking.rank_features(data,
                                              nb_bins=20,
                                              rank_threshold=85,
                                              z_file=z_file,
                                              metric='correlation',
                                              redundant_threshold=0.4)

*** Computing 1D feature ranking ...
Dispersion tests took 5.86 sec
Entropy computation 13.18 sec
KNN computation 205.93 sec
Sorting and thresholds 0.06 sec
Loading clustering from file
Hierarchical clustering 0.06 sec
Handle redundant features 2.31 sec
Returning 3342 redundant features and  1249 important features


In [5]:
model_file = "../models/gmm_arl.h5"
gmm_arl_population, n = features_2d.run(data,
                                n_clusters,
                                meta_features,
                                model_file=model_file,
                                theta=0.1,
                                add_close_population=True)
print(gmm_arl_population.shape, n)

*** Exploring 2D feature space with NN ...
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


trimming 26166 to 7356
handle_close_important (7356, 3), total 32871, 0.23803995549678802
relevant_features 6648 => computing 39840 
trimming 19933 to 3342
handle_important_features (10698, 3),  total 40835, 0.23803995549678802
irrelevant_features 8064 => computing 24182
trimming 9937 to 7356
handle_not_important_features (18054, 3), total 25181, 0.23803995549678802
handle_all_features 14712 => computing 44127
trimming 20339 to 10000
handle_all_features (28054, 3),  total 45125, 0.23803995549678802
Returning (28054, 3), explored a total of 144012 feature pairs
(28054, 4) 144012


In [5]:
globalResults = {} # save results from both runs

In [7]:
method = "adapted_ratkowsky_lance"
threshold=0.09#
score_tolerance=0.009
clustering = "gmm"

round_size = 3
debug = False
ignore_redundant= True
epochs = 10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0,
        "max": 0 },
    "CLOSE": { 
        "ga": 0.35,
        "max": 0.35 },
    "IMP1D": { 
        "ga": 0.35,
        "max": 0.35 },
    "RANDOM": { 
        "ga": 0.3,
        "max": 0.3},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=None,
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 500

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
display(solutions)
globalResults[f"{clustering}_{method}"] = solutions
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 14712, orig size 18054, nb imp : 1249
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [ 0 35 35 30] [0, 0.35, 0.35, 0.3]
adding 50 random population


 10%|▉         | 3/31 [00:14<02:17,  4.91s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.21,0.07,"[11363, 14739, 14740, 14741]",0.04,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",0.17,0.43,4.0,"Counter({1: 623, 0: 408})"


adding 4 random population


 19%|█▉        | 6/31 [00:47<03:28,  8.36s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.21,0.07,"[11363, 14739, 14740, 14741]",0.04,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",0.17,0.43,4.0,"Counter({1: 623, 0: 408})"
1,0.24,0.66,"[43, 362, 400, 401, 464, 721, 994, 1026, 1330,...",0.49,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.72,0.5,54.0,"Counter({1: 812, 0: 219})"


adding 11 random population


 29%|██▉       | 9/31 [05:44<18:14, 49.75s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.21,0.07,"[11363, 14739, 14740, 14741]",0.04,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",0.17,0.43,4.0,"Counter({1: 623, 0: 408})"
1,0.24,0.66,"[43, 362, 400, 401, 464, 721, 994, 1026, 1330,...",0.49,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.72,0.5,54.0,"Counter({1: 812, 0: 219})"
2,0.19,0.28,"[473, 474, 581, 1247, 1343, 1437, 1460, 1602, ...",0.18,"[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...",0.73,0.36,147.0,"Counter({1: 688, 0: 343})"


adding 16 random population


 39%|███▊      | 12/31 [27:25<1:07:06, 211.93s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.21,0.07,"[11363, 14739, 14740, 14741]",0.04,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",0.17,0.43,4.0,"Counter({1: 623, 0: 408})"
1,0.24,0.66,"[43, 362, 400, 401, 464, 721, 994, 1026, 1330,...",0.49,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.72,0.5,54.0,"Counter({1: 812, 0: 219})"
2,0.19,0.28,"[473, 474, 581, 1247, 1343, 1437, 1460, 1602, ...",0.18,"[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...",0.73,0.36,147.0,"Counter({1: 688, 0: 343})"
3,0.21,-0.02,"[2676, 3503, 3509, 6756, 7609, 9093, 11332, 14...",0.01,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",0.23,0.42,8.0,"Counter({1: 689, 0: 342})"


adding 7 random population


 48%|████▊     | 15/31 [28:25<22:01, 82.60s/it]   

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.21,0.07,"[11363, 14739, 14740, 14741]",0.04,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",0.17,0.43,4.0,"Counter({1: 623, 0: 408})"
1,0.24,0.66,"[43, 362, 400, 401, 464, 721, 994, 1026, 1330,...",0.49,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.72,0.5,54.0,"Counter({1: 812, 0: 219})"
2,0.19,0.28,"[473, 474, 581, 1247, 1343, 1437, 1460, 1602, ...",0.18,"[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...",0.73,0.36,147.0,"Counter({1: 688, 0: 343})"
3,0.21,-0.02,"[2676, 3503, 3509, 6756, 7609, 9093, 11332, 14...",0.01,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",0.23,0.42,8.0,"Counter({1: 689, 0: 342})"
4,0.21,-0.02,"[44, 56, 57, 60, 62, 254, 306, 1221, 1452, 174...",0.03,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",0.41,0.36,40.0,"Counter({0: 658, 1: 373})"


adding 4 random population


 58%|█████▊    | 18/31 [32:40<14:51, 68.60s/it] 

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.21,0.07,"[11363, 14739, 14740, 14741]",0.04,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",0.17,0.43,4.0,"Counter({1: 623, 0: 408})"
1,0.24,0.66,"[43, 362, 400, 401, 464, 721, 994, 1026, 1330,...",0.49,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.72,0.5,54.0,"Counter({1: 812, 0: 219})"
2,0.19,0.28,"[473, 474, 581, 1247, 1343, 1437, 1460, 1602, ...",0.18,"[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...",0.73,0.36,147.0,"Counter({1: 688, 0: 343})"
3,0.21,-0.02,"[2676, 3503, 3509, 6756, 7609, 9093, 11332, 14...",0.01,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",0.23,0.42,8.0,"Counter({1: 689, 0: 342})"
4,0.21,-0.02,"[44, 56, 57, 60, 62, 254, 306, 1221, 1452, 174...",0.03,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",0.41,0.36,40.0,"Counter({0: 658, 1: 373})"
5,0.2,0.58,"[22, 47, 68, 79, 758, 1001, 2114, 2813, 2970, ...",0.43,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, ...",0.56,0.47,45.0,"Counter({1: 856, 0: 175})"


adding 11 random population


 68%|██████▊   | 21/31 [43:24<20:06, 120.65s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.21,0.07,"[11363, 14739, 14740, 14741]",0.04,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",0.17,0.43,4.0,"Counter({1: 623, 0: 408})"
1,0.24,0.66,"[43, 362, 400, 401, 464, 721, 994, 1026, 1330,...",0.49,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.72,0.5,54.0,"Counter({1: 812, 0: 219})"
2,0.19,0.28,"[473, 474, 581, 1247, 1343, 1437, 1460, 1602, ...",0.18,"[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...",0.73,0.36,147.0,"Counter({1: 688, 0: 343})"
3,0.21,-0.02,"[2676, 3503, 3509, 6756, 7609, 9093, 11332, 14...",0.01,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",0.23,0.42,8.0,"Counter({1: 689, 0: 342})"
4,0.21,-0.02,"[44, 56, 57, 60, 62, 254, 306, 1221, 1452, 174...",0.03,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",0.41,0.36,40.0,"Counter({0: 658, 1: 373})"
5,0.2,0.58,"[22, 47, 68, 79, 758, 1001, 2114, 2813, 2970, ...",0.43,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, ...",0.56,0.47,45.0,"Counter({1: 856, 0: 175})"
6,0.19,0.59,"[650, 2686, 2792, 2873, 3262, 4580, 5587, 6579...",0.42,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.28,0.42,14.0,"Counter({1: 789, 0: 242})"


adding 34 random population


 77%|███████▋  | 24/31 [45:10<06:49, 58.45s/it] 

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.21,0.07,"[11363, 14739, 14740, 14741]",0.04,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",0.17,0.43,4.0,"Counter({1: 623, 0: 408})"
1,0.24,0.66,"[43, 362, 400, 401, 464, 721, 994, 1026, 1330,...",0.49,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.72,0.5,54.0,"Counter({1: 812, 0: 219})"
2,0.19,0.28,"[473, 474, 581, 1247, 1343, 1437, 1460, 1602, ...",0.18,"[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...",0.73,0.36,147.0,"Counter({1: 688, 0: 343})"
3,0.21,-0.02,"[2676, 3503, 3509, 6756, 7609, 9093, 11332, 14...",0.01,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",0.23,0.42,8.0,"Counter({1: 689, 0: 342})"
4,0.21,-0.02,"[44, 56, 57, 60, 62, 254, 306, 1221, 1452, 174...",0.03,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",0.41,0.36,40.0,"Counter({0: 658, 1: 373})"
5,0.2,0.58,"[22, 47, 68, 79, 758, 1001, 2114, 2813, 2970, ...",0.43,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, ...",0.56,0.47,45.0,"Counter({1: 856, 0: 175})"
6,0.19,0.59,"[650, 2686, 2792, 2873, 3262, 4580, 5587, 6579...",0.42,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.28,0.42,14.0,"Counter({1: 789, 0: 242})"
7,0.23,-0.03,"[687, 797, 1048, 1213, 1418, 1547, 1548, 1814,...",0.05,"[1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",0.54,0.36,94.0,"Counter({0: 685, 1: 346})"


adding 3 random population


 87%|████████▋ | 27/31 [50:17<04:26, 66.66s/it] 

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.21,0.07,"[11363, 14739, 14740, 14741]",0.04,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",0.17,0.43,4.0,"Counter({1: 623, 0: 408})"
1,0.24,0.66,"[43, 362, 400, 401, 464, 721, 994, 1026, 1330,...",0.49,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.72,0.5,54.0,"Counter({1: 812, 0: 219})"
2,0.19,0.28,"[473, 474, 581, 1247, 1343, 1437, 1460, 1602, ...",0.18,"[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...",0.73,0.36,147.0,"Counter({1: 688, 0: 343})"
3,0.21,-0.02,"[2676, 3503, 3509, 6756, 7609, 9093, 11332, 14...",0.01,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",0.23,0.42,8.0,"Counter({1: 689, 0: 342})"
4,0.21,-0.02,"[44, 56, 57, 60, 62, 254, 306, 1221, 1452, 174...",0.03,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",0.41,0.36,40.0,"Counter({0: 658, 1: 373})"
5,0.2,0.58,"[22, 47, 68, 79, 758, 1001, 2114, 2813, 2970, ...",0.43,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, ...",0.56,0.47,45.0,"Counter({1: 856, 0: 175})"
6,0.19,0.59,"[650, 2686, 2792, 2873, 3262, 4580, 5587, 6579...",0.42,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.28,0.42,14.0,"Counter({1: 789, 0: 242})"
7,0.23,-0.03,"[687, 797, 1048, 1213, 1418, 1547, 1548, 1814,...",0.05,"[1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",0.54,0.36,94.0,"Counter({0: 685, 1: 346})"
8,0.19,0.07,"[1826, 3450, 3465, 3969, 10802, 16603]",0.13,"[1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, ...",0.1,0.36,6.0,"Counter({0: 557, 1: 474})"


adding 4 random population


 97%|█████████▋| 30/31 [50:50<00:29, 29.41s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.21,0.07,"[11363, 14739, 14740, 14741]",0.04,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",0.17,0.43,4.0,"Counter({1: 623, 0: 408})"
1,0.24,0.66,"[43, 362, 400, 401, 464, 721, 994, 1026, 1330,...",0.49,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.72,0.5,54.0,"Counter({1: 812, 0: 219})"
2,0.19,0.28,"[473, 474, 581, 1247, 1343, 1437, 1460, 1602, ...",0.18,"[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...",0.73,0.36,147.0,"Counter({1: 688, 0: 343})"
3,0.21,-0.02,"[2676, 3503, 3509, 6756, 7609, 9093, 11332, 14...",0.01,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",0.23,0.42,8.0,"Counter({1: 689, 0: 342})"
4,0.21,-0.02,"[44, 56, 57, 60, 62, 254, 306, 1221, 1452, 174...",0.03,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",0.41,0.36,40.0,"Counter({0: 658, 1: 373})"
5,0.2,0.58,"[22, 47, 68, 79, 758, 1001, 2114, 2813, 2970, ...",0.43,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, ...",0.56,0.47,45.0,"Counter({1: 856, 0: 175})"
6,0.19,0.59,"[650, 2686, 2792, 2873, 3262, 4580, 5587, 6579...",0.42,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.28,0.42,14.0,"Counter({1: 789, 0: 242})"
7,0.23,-0.03,"[687, 797, 1048, 1213, 1418, 1547, 1548, 1814,...",0.05,"[1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",0.54,0.36,94.0,"Counter({0: 685, 1: 346})"
8,0.19,0.07,"[1826, 3450, 3465, 3969, 10802, 16603]",0.13,"[1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, ...",0.1,0.36,6.0,"Counter({0: 557, 1: 474})"
9,0.18,0.41,"[363, 2330, 4762, 5378, 15337, 15340, 17443]",0.26,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, ...",0.18,0.43,7.0,"Counter({1: 764, 0: 267})"


adding 13 random population


100%|██████████| 31/31 [51:12<00:00, 99.12s/it]


Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.21,0.07,"[11363, 14739, 14740, 14741]",0.04,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",0.17,0.43,4.0,"Counter({1: 623, 0: 408})"
1,0.24,0.66,"[43, 362, 400, 401, 464, 721, 994, 1026, 1330,...",0.49,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.72,0.5,54.0,"Counter({1: 812, 0: 219})"
2,0.19,0.28,"[473, 474, 581, 1247, 1343, 1437, 1460, 1602, ...",0.18,"[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, ...",0.73,0.36,147.0,"Counter({1: 688, 0: 343})"
3,0.21,-0.02,"[2676, 3503, 3509, 6756, 7609, 9093, 11332, 14...",0.01,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",0.23,0.42,8.0,"Counter({1: 689, 0: 342})"
4,0.21,-0.02,"[44, 56, 57, 60, 62, 254, 306, 1221, 1452, 174...",0.03,"[0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",0.41,0.36,40.0,"Counter({0: 658, 1: 373})"
5,0.2,0.58,"[22, 47, 68, 79, 758, 1001, 2114, 2813, 2970, ...",0.43,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, ...",0.56,0.47,45.0,"Counter({1: 856, 0: 175})"
6,0.19,0.59,"[650, 2686, 2792, 2873, 3262, 4580, 5587, 6579...",0.42,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.28,0.42,14.0,"Counter({1: 789, 0: 242})"
7,0.23,-0.03,"[687, 797, 1048, 1213, 1418, 1547, 1548, 1814,...",0.05,"[1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",0.54,0.36,94.0,"Counter({0: 685, 1: 346})"
8,0.19,0.07,"[1826, 3450, 3465, 3969, 10802, 16603]",0.13,"[1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, ...",0.1,0.36,6.0,"Counter({0: 557, 1: 474})"
9,0.18,0.41,"[363, 2330, 4762, 5378, 15337, 15340, 17443]",0.26,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, ...",0.18,0.43,7.0,"Counter({1: 764, 0: 267})"


In [3]:
method = "adapted_ratkowsky_lance"
threshold=0.09#
score_tolerance=0.009
clustering = "gmm"

solutions = pd.read_pickle(f"../data/{filename}_{clustering}_{method}.pkl")

In [4]:
solutions = ga.rank_solutions(solutions, data)

In [5]:
solutions.head()

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure,davies_bouldin,davies_bouldin_silhouette,rank_silhouette,rank_point_biserial,rank_ari,rank_nmi
0,0.24,0.32,"[17460, 17463]",0.19,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",0.23,0.65,2.0,"Counter({0: 734, 1: 297})",0.50551,10.0,1.0,4.0,3.0,3.0
1,0.25,0.68,"[362, 400, 401, 721, 988, 1330, 2062, 2295, 24...",0.52,"[1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, ...",0.61,0.5,40.0,"Counter({1: 780, 0: 251})",0.834492,7.0,4.0,1.0,1.0,1.0
2,0.23,0.09,"[13786, 13787, 13788]",0.07,"[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, ...",0.17,0.49,3.0,"Counter({1: 571, 0: 460})",0.704748,6.0,5.0,5.0,5.0,5.0
3,0.24,0.4,"[1247, 1343, 1962, 2394, 3398, 3617, 5153, 524...",0.25,"[1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, ...",0.52,0.46,35.0,"Counter({1: 751, 0: 280})",0.874517,3.0,7.0,2.0,2.0,2.0
4,0.22,-0.02,"[44, 56, 57, 600, 824, 1103, 2579, 3051, 5874,...",0.02,"[1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, ...",0.3,0.39,15.0,"Counter({0: 647, 1: 384})",0.96199,1.0,10.0,3.0,10.0,6.0


In [13]:
method = "adapted_ratkowsky_lance"
threshold=0.09#
score_tolerance=0.009
clustering = "gmm"
solutions = globalResults[f"{clustering}_{method}"] 
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")

In [8]:
method = "adapted_silhouette"
threshold=0.09
score_tolerance=0.009
clustering = "hdbscan"

round_size = 3
debug = False
ignore_redundant= True
epochs = 10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0,
        "max": 0 },
    "CLOSE": { 
        "ga": 0.35,
        "max": 0.35 },
    "IMP1D": { 
        "ga": 0.35,
        "max": 0.35 },
    "RANDOM": { 
        "ga": 0.3,
        "max": 0.3},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=None,
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 500

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")
display(solutions)
globalResults[f"{clustering}_{method}"] = solutions
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 14712, orig size 18054, nb imp : 1249
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [ 0 35 35 30] [0, 0.35, 0.35, 0.3]
adding 50 random population


 10%|▉         | 3/31 [00:33<05:12, 11.15s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.07,0.453068,0.12,"[1458, 1829, 3937, 3938, 4257, 4435, 9412, 941...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.05,0.5,10.0,"Counter({0: 893, -1: 95, 1: 43})"


adding 12 random population


 19%|█▉        | 6/31 [02:01<08:45, 21.03s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.07,0.453068,0.12,"[1458, 1829, 3937, 3938, 4257, 4435, 9412, 941...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.05,0.5,10.0,"Counter({0: 893, -1: 95, 1: 43})"
1,0.07,0.57487,-0.03,"[1164, 1384, 3829, 4412, 5397, 7317, 10944, 11...",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.01,0.62,14.0,"Counter({1: 998, -1: 23, 0: 10})"


adding 1 random population


 29%|██▉       | 9/31 [03:34<08:52, 24.22s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.07,0.453068,0.12,"[1458, 1829, 3937, 3938, 4257, 4435, 9412, 941...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.05,0.5,10.0,"Counter({0: 893, -1: 95, 1: 43})"
1,0.07,0.57487,-0.03,"[1164, 1384, 3829, 4412, 5397, 7317, 10944, 11...",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.01,0.62,14.0,"Counter({1: 998, -1: 23, 0: 10})"
2,0.08,0.540217,-0.0,"[3003, 6828, 8391, 11038, 17251]",0.0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.2,0.65,5.0,"Counter({0: 826, 1: 184, -1: 21})"


adding 6 random population


 39%|███▊      | 12/31 [05:24<08:53, 28.09s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.07,0.453068,0.12,"[1458, 1829, 3937, 3938, 4257, 4435, 9412, 941...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.05,0.5,10.0,"Counter({0: 893, -1: 95, 1: 43})"
1,0.07,0.57487,-0.03,"[1164, 1384, 3829, 4412, 5397, 7317, 10944, 11...",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.01,0.62,14.0,"Counter({1: 998, -1: 23, 0: 10})"
2,0.08,0.540217,-0.0,"[3003, 6828, 8391, 11038, 17251]",0.0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.2,0.65,5.0,"Counter({0: 826, 1: 184, -1: 21})"
3,0.08,0.447223,0.02,"[1577, 9216, 11185]",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.14,0.6,3.0,"Counter({1: 894, 0: 122, -1: 15})"


adding 19 random population


 48%|████▊     | 15/31 [07:00<07:16, 27.29s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.07,0.453068,0.12,"[1458, 1829, 3937, 3938, 4257, 4435, 9412, 941...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.05,0.5,10.0,"Counter({0: 893, -1: 95, 1: 43})"
1,0.07,0.57487,-0.03,"[1164, 1384, 3829, 4412, 5397, 7317, 10944, 11...",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.01,0.62,14.0,"Counter({1: 998, -1: 23, 0: 10})"
2,0.08,0.540217,-0.0,"[3003, 6828, 8391, 11038, 17251]",0.0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.2,0.65,5.0,"Counter({0: 826, 1: 184, -1: 21})"
3,0.08,0.447223,0.02,"[1577, 9216, 11185]",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.14,0.6,3.0,"Counter({1: 894, 0: 122, -1: 15})"
4,0.07,0.442585,0.3,"[130, 4549, 4550, 9061, 10288, 10297, 16223]",0.17,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.07,0.51,7.0,"Counter({0: 915, 1: 76, -1: 40})"


adding 32 random population


 58%|█████▊    | 18/31 [08:48<06:12, 28.63s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.07,0.453068,0.12,"[1458, 1829, 3937, 3938, 4257, 4435, 9412, 941...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.05,0.5,10.0,"Counter({0: 893, -1: 95, 1: 43})"
1,0.07,0.57487,-0.03,"[1164, 1384, 3829, 4412, 5397, 7317, 10944, 11...",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.01,0.62,14.0,"Counter({1: 998, -1: 23, 0: 10})"
2,0.08,0.540217,-0.0,"[3003, 6828, 8391, 11038, 17251]",0.0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.2,0.65,5.0,"Counter({0: 826, 1: 184, -1: 21})"
3,0.08,0.447223,0.02,"[1577, 9216, 11185]",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.14,0.6,3.0,"Counter({1: 894, 0: 122, -1: 15})"
4,0.07,0.442585,0.3,"[130, 4549, 4550, 9061, 10288, 10297, 16223]",0.17,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.07,0.51,7.0,"Counter({0: 915, 1: 76, -1: 40})"
5,0.09,0.67226,0.0,"[732, 1930, 3872, 4747, 6826, 11250, 12188, 14...",0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",0.13,0.74,10.0,"Counter({1: 1016, 0: 11, -1: 4})"


adding 2 random population


 68%|██████▊   | 21/31 [10:27<04:38, 27.88s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.07,0.453068,0.12,"[1458, 1829, 3937, 3938, 4257, 4435, 9412, 941...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.05,0.5,10.0,"Counter({0: 893, -1: 95, 1: 43})"
1,0.07,0.57487,-0.03,"[1164, 1384, 3829, 4412, 5397, 7317, 10944, 11...",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.01,0.62,14.0,"Counter({1: 998, -1: 23, 0: 10})"
2,0.08,0.540217,-0.0,"[3003, 6828, 8391, 11038, 17251]",0.0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.2,0.65,5.0,"Counter({0: 826, 1: 184, -1: 21})"
3,0.08,0.447223,0.02,"[1577, 9216, 11185]",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.14,0.6,3.0,"Counter({1: 894, 0: 122, -1: 15})"
4,0.07,0.442585,0.3,"[130, 4549, 4550, 9061, 10288, 10297, 16223]",0.17,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.07,0.51,7.0,"Counter({0: 915, 1: 76, -1: 40})"
5,0.09,0.67226,0.0,"[732, 1930, 3872, 4747, 6826, 11250, 12188, 14...",0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",0.13,0.74,10.0,"Counter({1: 1016, 0: 11, -1: 4})"
6,0.13,0.430934,0.57,"[401, 994, 1001, 1579, 2523, 2574, 3103, 4265,...",0.37,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 1, ...",0.09,0.44,36.0,"Counter({0: 836, 1: 117, -1: 78})"


adding 45 random population


 77%|███████▋  | 24/31 [12:52<03:52, 33.21s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.07,0.453068,0.12,"[1458, 1829, 3937, 3938, 4257, 4435, 9412, 941...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.05,0.5,10.0,"Counter({0: 893, -1: 95, 1: 43})"
1,0.07,0.57487,-0.03,"[1164, 1384, 3829, 4412, 5397, 7317, 10944, 11...",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.01,0.62,14.0,"Counter({1: 998, -1: 23, 0: 10})"
2,0.08,0.540217,-0.0,"[3003, 6828, 8391, 11038, 17251]",0.0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.2,0.65,5.0,"Counter({0: 826, 1: 184, -1: 21})"
3,0.08,0.447223,0.02,"[1577, 9216, 11185]",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.14,0.6,3.0,"Counter({1: 894, 0: 122, -1: 15})"
4,0.07,0.442585,0.3,"[130, 4549, 4550, 9061, 10288, 10297, 16223]",0.17,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.07,0.51,7.0,"Counter({0: 915, 1: 76, -1: 40})"
5,0.09,0.67226,0.0,"[732, 1930, 3872, 4747, 6826, 11250, 12188, 14...",0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",0.13,0.74,10.0,"Counter({1: 1016, 0: 11, -1: 4})"
6,0.13,0.430934,0.57,"[401, 994, 1001, 1579, 2523, 2574, 3103, 4265,...",0.37,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 1, ...",0.09,0.44,36.0,"Counter({0: 836, 1: 117, -1: 78})"
7,0.15,0.555145,0.01,"[2012, 3516, 5029, 8202, 12788, 14942, 15824]",0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.1,0.63,7.0,"Counter({1: 961, 0: 53, -1: 17})"


adding 16 random population


 87%|████████▋ | 27/31 [14:11<01:42, 25.56s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.07,0.453068,0.12,"[1458, 1829, 3937, 3938, 4257, 4435, 9412, 941...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.05,0.5,10.0,"Counter({0: 893, -1: 95, 1: 43})"
1,0.07,0.57487,-0.03,"[1164, 1384, 3829, 4412, 5397, 7317, 10944, 11...",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.01,0.62,14.0,"Counter({1: 998, -1: 23, 0: 10})"
2,0.08,0.540217,-0.0,"[3003, 6828, 8391, 11038, 17251]",0.0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.2,0.65,5.0,"Counter({0: 826, 1: 184, -1: 21})"
3,0.08,0.447223,0.02,"[1577, 9216, 11185]",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.14,0.6,3.0,"Counter({1: 894, 0: 122, -1: 15})"
4,0.07,0.442585,0.3,"[130, 4549, 4550, 9061, 10288, 10297, 16223]",0.17,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.07,0.51,7.0,"Counter({0: 915, 1: 76, -1: 40})"
5,0.09,0.67226,0.0,"[732, 1930, 3872, 4747, 6826, 11250, 12188, 14...",0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",0.13,0.74,10.0,"Counter({1: 1016, 0: 11, -1: 4})"
6,0.13,0.430934,0.57,"[401, 994, 1001, 1579, 2523, 2574, 3103, 4265,...",0.37,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 1, ...",0.09,0.44,36.0,"Counter({0: 836, 1: 117, -1: 78})"
7,0.15,0.555145,0.01,"[2012, 3516, 5029, 8202, 12788, 14942, 15824]",0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.1,0.63,7.0,"Counter({1: 961, 0: 53, -1: 17})"
8,0.05,0.466103,0.19,"[3968, 5124, 5602, 8670, 9415, 9418, 17056]",0.09,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1,...",-0.07,0.53,7.0,"Counter({1: 943, -1: 73, 0: 15})"


adding 12 random population


 97%|█████████▋| 30/31 [15:27<00:22, 22.52s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.07,0.453068,0.12,"[1458, 1829, 3937, 3938, 4257, 4435, 9412, 941...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.05,0.5,10.0,"Counter({0: 893, -1: 95, 1: 43})"
1,0.07,0.57487,-0.03,"[1164, 1384, 3829, 4412, 5397, 7317, 10944, 11...",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.01,0.62,14.0,"Counter({1: 998, -1: 23, 0: 10})"
2,0.08,0.540217,-0.0,"[3003, 6828, 8391, 11038, 17251]",0.0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.2,0.65,5.0,"Counter({0: 826, 1: 184, -1: 21})"
3,0.08,0.447223,0.02,"[1577, 9216, 11185]",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.14,0.6,3.0,"Counter({1: 894, 0: 122, -1: 15})"
4,0.07,0.442585,0.3,"[130, 4549, 4550, 9061, 10288, 10297, 16223]",0.17,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.07,0.51,7.0,"Counter({0: 915, 1: 76, -1: 40})"
5,0.09,0.67226,0.0,"[732, 1930, 3872, 4747, 6826, 11250, 12188, 14...",0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",0.13,0.74,10.0,"Counter({1: 1016, 0: 11, -1: 4})"
6,0.13,0.430934,0.57,"[401, 994, 1001, 1579, 2523, 2574, 3103, 4265,...",0.37,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 1, ...",0.09,0.44,36.0,"Counter({0: 836, 1: 117, -1: 78})"
7,0.15,0.555145,0.01,"[2012, 3516, 5029, 8202, 12788, 14942, 15824]",0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.1,0.63,7.0,"Counter({1: 961, 0: 53, -1: 17})"
8,0.05,0.466103,0.19,"[3968, 5124, 5602, 8670, 9415, 9418, 17056]",0.09,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1,...",-0.07,0.53,7.0,"Counter({1: 943, -1: 73, 0: 15})"
9,0.06,0.419176,0.01,"[9499, 9869, 10299, 13525]",0.0,"[1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, ...",0.11,0.52,4.0,"Counter({1: 655, 0: 348, -1: 28})"


adding 4 random population


100%|██████████| 31/31 [16:28<00:00, 31.87s/it]


Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.07,0.453068,0.12,"[1458, 1829, 3937, 3938, 4257, 4435, 9412, 941...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-0.05,0.5,10.0,"Counter({0: 893, -1: 95, 1: 43})"
1,0.07,0.57487,-0.03,"[1164, 1384, 3829, 4412, 5397, 7317, 10944, 11...",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",-0.01,0.62,14.0,"Counter({1: 998, -1: 23, 0: 10})"
2,0.08,0.540217,-0.0,"[3003, 6828, 8391, 11038, 17251]",0.0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.2,0.65,5.0,"Counter({0: 826, 1: 184, -1: 21})"
3,0.08,0.447223,0.02,"[1577, 9216, 11185]",0.01,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.14,0.6,3.0,"Counter({1: 894, 0: 122, -1: 15})"
4,0.07,0.442585,0.3,"[130, 4549, 4550, 9061, 10288, 10297, 16223]",0.17,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.07,0.51,7.0,"Counter({0: 915, 1: 76, -1: 40})"
5,0.09,0.67226,0.0,"[732, 1930, 3872, 4747, 6826, 11250, 12188, 14...",0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...",0.13,0.74,10.0,"Counter({1: 1016, 0: 11, -1: 4})"
6,0.13,0.430934,0.57,"[401, 994, 1001, 1579, 2523, 2574, 3103, 4265,...",0.37,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 1, ...",0.09,0.44,36.0,"Counter({0: 836, 1: 117, -1: 78})"
7,0.15,0.555145,0.01,"[2012, 3516, 5029, 8202, 12788, 14942, 15824]",0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1,...",0.1,0.63,7.0,"Counter({1: 961, 0: 53, -1: 17})"
8,0.05,0.466103,0.19,"[3968, 5124, 5602, 8670, 9415, 9418, 17056]",0.09,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1,...",-0.07,0.53,7.0,"Counter({1: 943, -1: 73, 0: 15})"
9,0.06,0.419176,0.01,"[9499, 9869, 10299, 13525]",0.0,"[1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, ...",0.11,0.52,4.0,"Counter({1: 655, 0: 348, -1: 28})"


In [11]:
method = "adapted_silhouette"
threshold=0.09
score_tolerance=0.009
clustering = "hdbscan"
solutions = globalResults[f"{clustering}_{method}"] 
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")

In [6]:
method = "adapted_ratkowsky_lance"
threshold=0.09
score_tolerance=0.009
clustering = "hdbscan"

round_size = 3
debug = False
ignore_redundant= True
epochs = 10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0,
        "max": 0 },
    "CLOSE": { 
        "ga": 0.35,
        "max": 0.35 },
    "IMP1D": { 
        "ga": 0.35,
        "max": 0.35 },
    "RANDOM": { 
        "ga": 0.3,
        "max": 0.3},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=None,
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 500

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")
display(solutions)
globalResults[f"{clustering}_{method}"] = solutions
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 14712, orig size 18054, nb imp : 1249
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [ 0 35 35 30] [0, 0.35, 0.35, 0.3]
adding 50 random population


 10%|▉         | 3/31 [00:23<03:31,  7.55s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.41,"[1969, 2582, 2972, 4094, 5519, 6746, 10507, 13...",0.26,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1,...",-0.13,0.34,8.0,"Counter({1: 718, -1: 234, 0: 79})"


adding 5 random population


 19%|█▉        | 6/31 [01:24<05:51, 14.07s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.41,"[1969, 2582, 2972, 4094, 5519, 6746, 10507, 13...",0.26,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1,...",-0.13,0.34,8.0,"Counter({1: 718, -1: 234, 0: 79})"
1,0.14,0.57,"[400, 401, 824, 2686, 4129, 5235, 6261, 6644, ...",0.38,"[0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 1, 0,...",0.04,0.38,13.0,"Counter({0: 753, 1: 153, -1: 125})"


adding 15 random population


 29%|██▉       | 9/31 [02:35<06:17, 17.15s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.41,"[1969, 2582, 2972, 4094, 5519, 6746, 10507, 13...",0.26,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1,...",-0.13,0.34,8.0,"Counter({1: 718, -1: 234, 0: 79})"
1,0.14,0.57,"[400, 401, 824, 2686, 4129, 5235, 6261, 6644, ...",0.38,"[0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 1, 0,...",0.04,0.38,13.0,"Counter({0: 753, 1: 153, -1: 125})"
2,0.11,0.36,"[552, 607, 1602, 2222, 3490, 5153, 5431, 6099,...",0.23,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1,...",-0.13,0.3,14.0,"Counter({0: 718, -1: 213, 1: 100})"


adding 17 random population


 39%|███▊      | 12/31 [03:39<05:24, 17.07s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.41,"[1969, 2582, 2972, 4094, 5519, 6746, 10507, 13...",0.26,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1,...",-0.13,0.34,8.0,"Counter({1: 718, -1: 234, 0: 79})"
1,0.14,0.57,"[400, 401, 824, 2686, 4129, 5235, 6261, 6644, ...",0.38,"[0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 1, 0,...",0.04,0.38,13.0,"Counter({0: 753, 1: 153, -1: 125})"
2,0.11,0.36,"[552, 607, 1602, 2222, 3490, 5153, 5431, 6099,...",0.23,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1,...",-0.13,0.3,14.0,"Counter({0: 718, -1: 213, 1: 100})"
3,0.13,0.0,"[1090, 6828, 8978, 13615]",0.0,"[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...",0.24,0.63,4.0,"Counter({1: 834, 0: 186, -1: 11})"


adding 34 random population


 48%|████▊     | 15/31 [04:29<04:00, 15.04s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.41,"[1969, 2582, 2972, 4094, 5519, 6746, 10507, 13...",0.26,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1,...",-0.13,0.34,8.0,"Counter({1: 718, -1: 234, 0: 79})"
1,0.14,0.57,"[400, 401, 824, 2686, 4129, 5235, 6261, 6644, ...",0.38,"[0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 1, 0,...",0.04,0.38,13.0,"Counter({0: 753, 1: 153, -1: 125})"
2,0.11,0.36,"[552, 607, 1602, 2222, 3490, 5153, 5431, 6099,...",0.23,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1,...",-0.13,0.3,14.0,"Counter({0: 718, -1: 213, 1: 100})"
3,0.13,0.0,"[1090, 6828, 8978, 13615]",0.0,"[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...",0.24,0.63,4.0,"Counter({1: 834, 0: 186, -1: 11})"
4,0.1,0.59,"[18, 22, 362, 721, 1169, 1247, 1742, 2523, 257...",0.38,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 0, ...",-0.37,0.31,56.0,"Counter({1: 814, -1: 153, 0: 64})"


adding 32 random population


 58%|█████▊    | 18/31 [07:04<06:26, 29.72s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.41,"[1969, 2582, 2972, 4094, 5519, 6746, 10507, 13...",0.26,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1,...",-0.13,0.34,8.0,"Counter({1: 718, -1: 234, 0: 79})"
1,0.14,0.57,"[400, 401, 824, 2686, 4129, 5235, 6261, 6644, ...",0.38,"[0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 1, 0,...",0.04,0.38,13.0,"Counter({0: 753, 1: 153, -1: 125})"
2,0.11,0.36,"[552, 607, 1602, 2222, 3490, 5153, 5431, 6099,...",0.23,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1,...",-0.13,0.3,14.0,"Counter({0: 718, -1: 213, 1: 100})"
3,0.13,0.0,"[1090, 6828, 8978, 13615]",0.0,"[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...",0.24,0.63,4.0,"Counter({1: 834, 0: 186, -1: 11})"
4,0.1,0.59,"[18, 22, 362, 721, 1169, 1247, 1742, 2523, 257...",0.38,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 0, ...",-0.37,0.31,56.0,"Counter({1: 814, -1: 153, 0: 64})"
5,0.11,0.0,"[9740, 13525, 17440]",0.0,"[1, 1, 1, 1, 1, 0, 0, 1, 1, 0, -1, -1, -1, 0, ...",0.02,0.42,3.0,"Counter({1: 610, 0: 296, -1: 125})"


adding 8 random population


 68%|██████▊   | 21/31 [07:54<03:15, 19.51s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.41,"[1969, 2582, 2972, 4094, 5519, 6746, 10507, 13...",0.26,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1,...",-0.13,0.34,8.0,"Counter({1: 718, -1: 234, 0: 79})"
1,0.14,0.57,"[400, 401, 824, 2686, 4129, 5235, 6261, 6644, ...",0.38,"[0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 1, 0,...",0.04,0.38,13.0,"Counter({0: 753, 1: 153, -1: 125})"
2,0.11,0.36,"[552, 607, 1602, 2222, 3490, 5153, 5431, 6099,...",0.23,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1,...",-0.13,0.3,14.0,"Counter({0: 718, -1: 213, 1: 100})"
3,0.13,0.0,"[1090, 6828, 8978, 13615]",0.0,"[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...",0.24,0.63,4.0,"Counter({1: 834, 0: 186, -1: 11})"
4,0.1,0.59,"[18, 22, 362, 721, 1169, 1247, 1742, 2523, 257...",0.38,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 0, ...",-0.37,0.31,56.0,"Counter({1: 814, -1: 153, 0: 64})"
5,0.11,0.0,"[9740, 13525, 17440]",0.0,"[1, 1, 1, 1, 1, 0, 0, 1, 1, 0, -1, -1, -1, 0, ...",0.02,0.42,3.0,"Counter({1: 610, 0: 296, -1: 125})"
6,0.1,0.41,"[671, 1234, 1240, 1819, 1911, 1962, 4680, 4820...",0.3,"[1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -...",-0.3,0.26,30.0,"Counter({1: 646, -1: 290, 0: 95})"


adding 9 random population


 77%|███████▋  | 24/31 [09:20<02:29, 21.40s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.41,"[1969, 2582, 2972, 4094, 5519, 6746, 10507, 13...",0.26,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1,...",-0.13,0.34,8.0,"Counter({1: 718, -1: 234, 0: 79})"
1,0.14,0.57,"[400, 401, 824, 2686, 4129, 5235, 6261, 6644, ...",0.38,"[0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 1, 0,...",0.04,0.38,13.0,"Counter({0: 753, 1: 153, -1: 125})"
2,0.11,0.36,"[552, 607, 1602, 2222, 3490, 5153, 5431, 6099,...",0.23,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1,...",-0.13,0.3,14.0,"Counter({0: 718, -1: 213, 1: 100})"
3,0.13,0.0,"[1090, 6828, 8978, 13615]",0.0,"[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...",0.24,0.63,4.0,"Counter({1: 834, 0: 186, -1: 11})"
4,0.1,0.59,"[18, 22, 362, 721, 1169, 1247, 1742, 2523, 257...",0.38,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 0, ...",-0.37,0.31,56.0,"Counter({1: 814, -1: 153, 0: 64})"
5,0.11,0.0,"[9740, 13525, 17440]",0.0,"[1, 1, 1, 1, 1, 0, 0, 1, 1, 0, -1, -1, -1, 0, ...",0.02,0.42,3.0,"Counter({1: 610, 0: 296, -1: 125})"
6,0.1,0.41,"[671, 1234, 1240, 1819, 1911, 1962, 4680, 4820...",0.3,"[1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -...",-0.3,0.26,30.0,"Counter({1: 646, -1: 290, 0: 95})"
7,0.1,0.07,"[44, 54, 57, 543, 724, 822, 1221, 2225, 2676, ...",0.02,"[0, 0, 0, 0, 0, 0, -1, 0, -1, 1, 0, 0, 0, -1, ...",-0.47,0.16,60.0,"Counter({0: 634, -1: 302, 1: 95})"


adding 14 random population


 87%|████████▋ | 27/31 [11:23<01:49, 27.47s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.41,"[1969, 2582, 2972, 4094, 5519, 6746, 10507, 13...",0.26,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1,...",-0.13,0.34,8.0,"Counter({1: 718, -1: 234, 0: 79})"
1,0.14,0.57,"[400, 401, 824, 2686, 4129, 5235, 6261, 6644, ...",0.38,"[0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 1, 0,...",0.04,0.38,13.0,"Counter({0: 753, 1: 153, -1: 125})"
2,0.11,0.36,"[552, 607, 1602, 2222, 3490, 5153, 5431, 6099,...",0.23,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1,...",-0.13,0.3,14.0,"Counter({0: 718, -1: 213, 1: 100})"
3,0.13,0.0,"[1090, 6828, 8978, 13615]",0.0,"[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...",0.24,0.63,4.0,"Counter({1: 834, 0: 186, -1: 11})"
4,0.1,0.59,"[18, 22, 362, 721, 1169, 1247, 1742, 2523, 257...",0.38,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 0, ...",-0.37,0.31,56.0,"Counter({1: 814, -1: 153, 0: 64})"
5,0.11,0.0,"[9740, 13525, 17440]",0.0,"[1, 1, 1, 1, 1, 0, 0, 1, 1, 0, -1, -1, -1, 0, ...",0.02,0.42,3.0,"Counter({1: 610, 0: 296, -1: 125})"
6,0.1,0.41,"[671, 1234, 1240, 1819, 1911, 1962, 4680, 4820...",0.3,"[1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -...",-0.3,0.26,30.0,"Counter({1: 646, -1: 290, 0: 95})"
7,0.1,0.07,"[44, 54, 57, 543, 724, 822, 1221, 2225, 2676, ...",0.02,"[0, 0, 0, 0, 0, 0, -1, 0, -1, 1, 0, 0, 0, -1, ...",-0.47,0.16,60.0,"Counter({0: 634, -1: 302, 1: 95})"
8,0.11,0.57,"[43, 810, 994, 1042, 1659, 2062, 2424, 6180, 6...",0.37,"[0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, ...",0.06,0.39,26.0,"Counter({0: 843, 1: 113, -1: 75})"


adding 37 random population


 97%|█████████▋| 30/31 [12:40<00:22, 22.39s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.41,"[1969, 2582, 2972, 4094, 5519, 6746, 10507, 13...",0.26,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1,...",-0.13,0.34,8.0,"Counter({1: 718, -1: 234, 0: 79})"
1,0.14,0.57,"[400, 401, 824, 2686, 4129, 5235, 6261, 6644, ...",0.38,"[0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 1, 0,...",0.04,0.38,13.0,"Counter({0: 753, 1: 153, -1: 125})"
2,0.11,0.36,"[552, 607, 1602, 2222, 3490, 5153, 5431, 6099,...",0.23,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1,...",-0.13,0.3,14.0,"Counter({0: 718, -1: 213, 1: 100})"
3,0.13,0.0,"[1090, 6828, 8978, 13615]",0.0,"[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...",0.24,0.63,4.0,"Counter({1: 834, 0: 186, -1: 11})"
4,0.1,0.59,"[18, 22, 362, 721, 1169, 1247, 1742, 2523, 257...",0.38,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 0, ...",-0.37,0.31,56.0,"Counter({1: 814, -1: 153, 0: 64})"
5,0.11,0.0,"[9740, 13525, 17440]",0.0,"[1, 1, 1, 1, 1, 0, 0, 1, 1, 0, -1, -1, -1, 0, ...",0.02,0.42,3.0,"Counter({1: 610, 0: 296, -1: 125})"
6,0.1,0.41,"[671, 1234, 1240, 1819, 1911, 1962, 4680, 4820...",0.3,"[1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -...",-0.3,0.26,30.0,"Counter({1: 646, -1: 290, 0: 95})"
7,0.1,0.07,"[44, 54, 57, 543, 724, 822, 1221, 2225, 2676, ...",0.02,"[0, 0, 0, 0, 0, 0, -1, 0, -1, 1, 0, 0, 0, -1, ...",-0.47,0.16,60.0,"Counter({0: 634, -1: 302, 1: 95})"
8,0.11,0.57,"[43, 810, 994, 1042, 1659, 2062, 2424, 6180, 6...",0.37,"[0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, ...",0.06,0.39,26.0,"Counter({0: 843, 1: 113, -1: 75})"
9,0.13,-0.04,"[742, 1510, 3424, 3829, 5397, 10417, 13629, 14...",0.02,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0.16,0.66,10.0,"Counter({1: 993, 0: 22, -1: 16})"


adding 1 random population


100%|██████████| 31/31 [13:09<00:00, 25.46s/it]


Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.41,"[1969, 2582, 2972, 4094, 5519, 6746, 10507, 13...",0.26,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1,...",-0.13,0.34,8.0,"Counter({1: 718, -1: 234, 0: 79})"
1,0.14,0.57,"[400, 401, 824, 2686, 4129, 5235, 6261, 6644, ...",0.38,"[0, 0, 1, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 1, 0,...",0.04,0.38,13.0,"Counter({0: 753, 1: 153, -1: 125})"
2,0.11,0.36,"[552, 607, 1602, 2222, 3490, 5153, 5431, 6099,...",0.23,"[0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1,...",-0.13,0.3,14.0,"Counter({0: 718, -1: 213, 1: 100})"
3,0.13,0.0,"[1090, 6828, 8978, 13615]",0.0,"[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...",0.24,0.63,4.0,"Counter({1: 834, 0: 186, -1: 11})"
4,0.1,0.59,"[18, 22, 362, 721, 1169, 1247, 1742, 2523, 257...",0.38,"[1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 0, ...",-0.37,0.31,56.0,"Counter({1: 814, -1: 153, 0: 64})"
5,0.11,0.0,"[9740, 13525, 17440]",0.0,"[1, 1, 1, 1, 1, 0, 0, 1, 1, 0, -1, -1, -1, 0, ...",0.02,0.42,3.0,"Counter({1: 610, 0: 296, -1: 125})"
6,0.1,0.41,"[671, 1234, 1240, 1819, 1911, 1962, 4680, 4820...",0.3,"[1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -...",-0.3,0.26,30.0,"Counter({1: 646, -1: 290, 0: 95})"
7,0.1,0.07,"[44, 54, 57, 543, 724, 822, 1221, 2225, 2676, ...",0.02,"[0, 0, 0, 0, 0, 0, -1, 0, -1, 1, 0, 0, 0, -1, ...",-0.47,0.16,60.0,"Counter({0: 634, -1: 302, 1: 95})"
8,0.11,0.57,"[43, 810, 994, 1042, 1659, 2062, 2424, 6180, 6...",0.37,"[0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, ...",0.06,0.39,26.0,"Counter({0: 843, 1: 113, -1: 75})"
9,0.13,-0.04,"[742, 1510, 3424, 3829, 5397, 10417, 13629, 14...",0.02,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0.16,0.66,10.0,"Counter({1: 993, 0: 22, -1: 16})"


# Supervised analysis

In [4]:
from sklearn import mixture
import hdbscan

In [16]:
ranked_features = feature_ranking.supervised_feature_ranking(data, truth, 
                        nbTopFeatures = data.shape[1])
data = data[:, ranked_features]
imp_f = np.arange(50)



In [17]:
gmm_scores = []
hdbscan_scores = []
for i in range(2, 50):
    input_data = data[:, :i]
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
    pred = gmm.fit_predict(input_data)
    ari = adjusted_rand_score(truth, pred)
    gmm_scores.append(ari)

    pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
    ari = adjusted_rand_score(truth, pred)
    hdbscan_scores.append(ari)
print(f" GMM ari = {max(gmm_scores)}, {np.argmax(gmm_scores)}")
print(f" HDBSCAN ari = {max(hdbscan_scores)}, {np.argmax(hdbscan_scores)}")

 GMM ari = 0.7146646841530905, 1
 HDBSCAN ari = 0.3006095266129624, 24


In [18]:
from sklearn.feature_selection import chi2,  mutual_info_classif, SelectKBest
sel = SelectKBest(mutual_info_classif, k=50).fit_transform(data, truth)
gmm_scores = []
hdbscan_scores = []
for i in range(2, 50):
    input_data = sel[:, :i]
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
    pred = gmm.fit_predict(input_data)
    ari = adjusted_rand_score(truth, pred)
    gmm_scores.append(ari)

    pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
    ari = adjusted_rand_score(truth, pred)
    hdbscan_scores.append(ari)
print(f" GMM ari = {max(gmm_scores)}, {np.argmax(gmm_scores)}")
print(f" HDBSCAN ari = {max(hdbscan_scores)}, {np.argmax(hdbscan_scores)}")

 GMM ari = 0.7051014904286836, 4
 HDBSCAN ari = 0.531072362797341, 2


# Analyze entire dataset

In [19]:
input_data = data
gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
pred = gmm.fit_predict(input_data)
ari = adjusted_rand_score(truth, pred)
print(f"GMM ari = {ari}")

pred = hdbscan.HDBSCAN(min_cluster_size =10).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN ari = {ari}")
pred = KMeans(n_clusters= n_clusters).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"Kmeans ari = {ari}")

GMM ari = 0.4000460768857944
HDBSCAN ari = 0.0
Kmeans ari = 0.3553886143251339


# Cluster PCA

In [20]:
# input_data = data
# gmm = mixture.GaussianMixture(n_components=n_clusters,
#                       covariance_type="full", random_state=0)
# pred = gmm.fit_predict(input_data)
# ari = adjusted_rand_score(truth, pred)
# print(f"GMM ari = {ari}")

In [5]:
# Predict on PCA
pca = PCA(50)
pca_data = pca.fit_transform(data)
input_data = pca_data
gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
pred = gmm.fit_predict(input_data)
ari = adjusted_rand_score(truth, pred)
print(f"GMM ari = {ari}")

pred = hdbscan.HDBSCAN(min_cluster_size =10).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN ari = {ari}")

GMM ari = 0.3554584934266965
HDBSCAN ari = 0.0


In [6]:
pred = KMeans(n_clusters= n_clusters).fit(pca_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"Kmeans ari = {ari}")

Kmeans ari = 0.3531755968149013


# Other methods

In [4]:
from sklearn import mixture
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
import hdbscan

row = {}
clustering = AffinityPropagation(random_state=5).fit(data)
ari = adjusted_rand_score(truth, clustering.labels_)
print(f"Affinity {ari}")
row["AffinityPropagation"] = ari

clustering = SpectralClustering(n_clusters=n_clusters, assign_labels='discretize',random_state=5).fit(data)
ari = adjusted_rand_score(truth, clustering.labels_)
print(f"Spectral {ari}")
row["Spectral"] = ari

clustering = KMeans(n_clusters=n_clusters,random_state=5).fit(data)
ari = adjusted_rand_score(truth, clustering.labels_)
print(f"KMeans {ari}")
row["KMeans"] = ari

gmm = mixture.GaussianMixture(n_components=n_clusters,
              covariance_type="full", random_state=0)
pred = gmm.fit_predict(data[:, :8000])
ari = adjusted_rand_score(truth, pred)
print(f"GMM {ari}")
row["GMM"] = ari

pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN {ari}")
row["HDBSCAN"] = ari

pca = PCA(2)
pca_data = pca.fit_transform(data)

clustering = KMeans(n_clusters=n_clusters,random_state=5).fit(pca_data)
ari = adjusted_rand_score(truth, clustering.labels_)
print(f"PCA KMeans {ari}")
row["PCA_KMeans"] = ari

gmm = mixture.GaussianMixture(n_components=n_clusters,
              covariance_type="full", random_state=0)
pred = gmm.fit_predict(pca_data)
ari = adjusted_rand_score(truth, pred)
print(f"PCA GMM {ari}")
row["PCA_GMM"] = ari

pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(pca_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"PCAHDBSCAN {ari}")
row["PCA_HDBSCAN"] = ari


Affinity 0.02137021506019116




Spectral -0.0007126956350564706
KMeans 0.39365823679629736
GMM 0.6556457380427297
HDBSCAN 0.11368384984106271
PCA KMeans 0.2702244179367132
PCA GMM 0.3553886143251339
PCAHDBSCAN -0.0025788411103115


NameError: name 'results' is not defined