# Summary

This notebook allows to reproduce the method results on the KIRP dataset.  
We have analyzed the dataset with both GMM and HDBSCAN algorithms.

In [1]:
import sys
sys.path.append("..")

#GPU configuration
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # to log device placement (on which device the operation ran)
sess = tf.Session(config=config)
set_session(sess)  # set this TensorFlow session as the default

import random
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.cluster import adjusted_rand_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scripts.data_generator as data_generator
import scripts.feature_ranking as feature_ranking
import scripts.features_2d as features_2d
import scripts.ga as ga
import scripts.preprocess as preprocess
import scripts.ga_evaluation as ga_evaluation
import scripts.bio_analysis as bio_analysis
import tensorflow as tf
from IPython import get_ipython
from tqdm import tqdm
from collections import Counter

plt.ion()
plt.show()

random_state=2
random.seed( random_state )
np.random.seed(random_state)

%load_ext autoreload
%autoreload 2

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5



Using TensorFlow backend.

In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



# Preprocessing

In [2]:
# truth_column = "tumor_type"
# truth_values = ['type 1', 'type 2']
# filename = "KIRP"

# df = pd.read_csv("../data/rna_data/KIRP.txt", sep = "\t", low_memory=False)
# meta = pd.read_csv("../data/rna_data/KIRP_All_CDEs.txt", sep = "\t", low_memory=False)

# preprocess.preprocess_rna(df,
#                    meta,
#                    truth_column,
#                    truth_values,
#                    filename,
#                    metric='correlation',#'euclidean',
#                    normalize=True)

# Load preprocessed data

## Start here if preprocessing files have been generated

In [3]:
filename = "KIRP"

data = pd.read_pickle(f"../data/rna_data/{filename}.pkl")
# z_file =f"../data/rna_data/{filename}_Z_correlation.npy"
# additional_df = pd.read_pickle(f"../data/rna_data/{filename}_additional.pkl")

truth = data["y"].values
data = data.drop("y", axis = 1).values
n_clusters = len(np.unique(truth))
Counter(truth), data.shape

(Counter({0: 77, 1: 86}), (163, 17938))

# Subspace clustering

In [4]:
meta_features = feature_ranking.rank_features(data,
                                              nb_bins=20,
                                              rank_threshold=90,
                                              z_file=None,
                                              metric='correlation',
                                              redundant_threshold=0.6)


*** Computing 1D feature ranking ...
Dispersion tests took 1.0 sec
Entropy computation 3.49 sec
KNN computation 32.98 sec
Sorting and thresholds 0.06 sec
Performing hierarchical clustering...
Hierarchical clustering 20.98 sec
Handle redundant features 2.75 sec
Returning 9769 redundant features and  614 important features


In [5]:
# model_file = "../models/gmm_arl.h5"
# gmm_arl_population, n = features_2d.run(data,
#                                 n_clusters,
#                                 meta_features,
#                                 model_file=model_file,
#                                 theta=0.1,
#                                 add_close_population=False,
#                                 exploration_factor = 5)
# print(gmm_arl_population.shape, n)

In [6]:
globalResults = {} # Save results for both runs

In [7]:
method = "adapted_ratkowsky_lance"
score_tolerance=0.009
clustering = "gmm"

round_size = 3
debug = False
ignore_redundant= True
epochs = 10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0,
        "max": 0 },
    "CLOSE": { 
        "ga": 0.35,
        "max": 0.35 },
    "IMP1D": { 
        "ga": 0.35,
        "max": 0.35 },
    "RANDOM": { 
        "ga": 0.3,
        "max": 0.3},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=None,
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 400

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")
display(solutions)
# globalResults[f"{clustering}_{method}"] = solutions

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 8169, orig size 17938, nb imp : 614
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [ 0 35 35 30] [0, 0.35, 0.35, 0.3]
adding 50 random population


 10%|▉         | 3/31 [00:11<01:42,  3.65s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.24,0.0,"[251, 3014, 3075, 5621, 5764, 6439, 7511, 8036...",0.01,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",0.38,0.48,15.0,"Counter({0: 122, 1: 41})"


adding 1 random population


 19%|█▉        | 6/31 [00:38<02:40,  6.43s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.24,0.0,"[251, 3014, 3075, 5621, 5764, 6439, 7511, 8036...",0.01,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",0.38,0.48,15.0,"Counter({0: 122, 1: 41})"
1,0.31,-0.01,"[4187, 4188, 4329, 4935, 6967, 8012, 12291, 13...",0.0,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",0.63,0.63,14.0,"Counter({0: 116, 1: 47})"


adding 2 random population


 29%|██▉       | 9/31 [01:01<02:25,  6.61s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.24,0.0,"[251, 3014, 3075, 5621, 5764, 6439, 7511, 8036...",0.01,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",0.38,0.48,15.0,"Counter({0: 122, 1: 41})"
1,0.31,-0.01,"[4187, 4188, 4329, 4935, 6967, 8012, 12291, 13...",0.0,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",0.63,0.63,14.0,"Counter({0: 116, 1: 47})"
2,0.29,-0.0,"[181, 686, 755, 812, 816, 1021, 1105, 1162, 12...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.04,0.71,153.0,"Counter({0: 150, 1: 13})"


adding 19 random population


 39%|███▊      | 12/31 [03:37<08:19, 26.28s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.24,0.0,"[251, 3014, 3075, 5621, 5764, 6439, 7511, 8036...",0.01,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",0.38,0.48,15.0,"Counter({0: 122, 1: 41})"
1,0.31,-0.01,"[4187, 4188, 4329, 4935, 6967, 8012, 12291, 13...",0.0,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",0.63,0.63,14.0,"Counter({0: 116, 1: 47})"
2,0.29,-0.0,"[181, 686, 755, 812, 816, 1021, 1105, 1162, 12...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.04,0.71,153.0,"Counter({0: 150, 1: 13})"
3,0.23,-0.0,"[72, 73, 74, 151, 196, 232, 405, 425, 559, 571...",0.0,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...",0.72,0.47,55.0,"Counter({0: 128, 1: 35})"


adding 4 random population


 48%|████▊     | 15/31 [04:22<04:25, 16.58s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.24,0.0,"[251, 3014, 3075, 5621, 5764, 6439, 7511, 8036...",0.01,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",0.38,0.48,15.0,"Counter({0: 122, 1: 41})"
1,0.31,-0.01,"[4187, 4188, 4329, 4935, 6967, 8012, 12291, 13...",0.0,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",0.63,0.63,14.0,"Counter({0: 116, 1: 47})"
2,0.29,-0.0,"[181, 686, 755, 812, 816, 1021, 1105, 1162, 12...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.04,0.71,153.0,"Counter({0: 150, 1: 13})"
3,0.23,-0.0,"[72, 73, 74, 151, 196, 232, 405, 425, 559, 571...",0.0,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...",0.72,0.47,55.0,"Counter({0: 128, 1: 35})"
4,0.23,-0.01,"[11305, 11306]",0.0,"[1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, ...",0.21,0.59,2.0,"Counter({1: 110, 0: 53})"


adding 2 random population


 58%|█████▊    | 18/31 [04:39<01:58,  9.12s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.24,0.0,"[251, 3014, 3075, 5621, 5764, 6439, 7511, 8036...",0.01,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",0.38,0.48,15.0,"Counter({0: 122, 1: 41})"
1,0.31,-0.01,"[4187, 4188, 4329, 4935, 6967, 8012, 12291, 13...",0.0,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",0.63,0.63,14.0,"Counter({0: 116, 1: 47})"
2,0.29,-0.0,"[181, 686, 755, 812, 816, 1021, 1105, 1162, 12...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.04,0.71,153.0,"Counter({0: 150, 1: 13})"
3,0.23,-0.0,"[72, 73, 74, 151, 196, 232, 405, 425, 559, 571...",0.0,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...",0.72,0.47,55.0,"Counter({0: 128, 1: 35})"
4,0.23,-0.01,"[11305, 11306]",0.0,"[1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, ...",0.21,0.59,2.0,"Counter({1: 110, 0: 53})"
5,0.23,0.01,"[292, 295, 335, 351, 719, 799, 827, 885, 905, ...",0.04,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.17,0.47,157.0,"Counter({0: 132, 1: 31})"


adding 3 random population


 68%|██████▊   | 21/31 [07:05<04:16, 25.60s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.24,0.0,"[251, 3014, 3075, 5621, 5764, 6439, 7511, 8036...",0.01,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",0.38,0.48,15.0,"Counter({0: 122, 1: 41})"
1,0.31,-0.01,"[4187, 4188, 4329, 4935, 6967, 8012, 12291, 13...",0.0,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",0.63,0.63,14.0,"Counter({0: 116, 1: 47})"
2,0.29,-0.0,"[181, 686, 755, 812, 816, 1021, 1105, 1162, 12...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.04,0.71,153.0,"Counter({0: 150, 1: 13})"
3,0.23,-0.0,"[72, 73, 74, 151, 196, 232, 405, 425, 559, 571...",0.0,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...",0.72,0.47,55.0,"Counter({0: 128, 1: 35})"
4,0.23,-0.01,"[11305, 11306]",0.0,"[1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, ...",0.21,0.59,2.0,"Counter({1: 110, 0: 53})"
5,0.23,0.01,"[292, 295, 335, 351, 719, 799, 827, 885, 905, ...",0.04,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.17,0.47,157.0,"Counter({0: 132, 1: 31})"
6,0.21,-0.0,"[367, 459, 775, 783, 1338, 1956, 2043, 2984, 2...",0.0,"[1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, ...",0.51,0.31,107.0,"Counter({1: 89, 0: 74})"


adding 5 random population


 77%|███████▋  | 24/31 [09:18<03:26, 29.55s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.24,0.0,"[251, 3014, 3075, 5621, 5764, 6439, 7511, 8036...",0.01,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",0.38,0.48,15.0,"Counter({0: 122, 1: 41})"
1,0.31,-0.01,"[4187, 4188, 4329, 4935, 6967, 8012, 12291, 13...",0.0,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",0.63,0.63,14.0,"Counter({0: 116, 1: 47})"
2,0.29,-0.0,"[181, 686, 755, 812, 816, 1021, 1105, 1162, 12...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.04,0.71,153.0,"Counter({0: 150, 1: 13})"
3,0.23,-0.0,"[72, 73, 74, 151, 196, 232, 405, 425, 559, 571...",0.0,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...",0.72,0.47,55.0,"Counter({0: 128, 1: 35})"
4,0.23,-0.01,"[11305, 11306]",0.0,"[1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, ...",0.21,0.59,2.0,"Counter({1: 110, 0: 53})"
5,0.23,0.01,"[292, 295, 335, 351, 719, 799, 827, 885, 905, ...",0.04,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.17,0.47,157.0,"Counter({0: 132, 1: 31})"
6,0.21,-0.0,"[367, 459, 775, 783, 1338, 1956, 2043, 2984, 2...",0.0,"[1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, ...",0.51,0.31,107.0,"Counter({1: 89, 0: 74})"
7,0.21,0.17,"[17, 68, 91, 107, 195, 258, 293, 416, 555, 121...",0.16,"[0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, ...",0.88,0.37,152.0,"Counter({1: 102, 0: 61})"


adding 41 random population


 87%|████████▋ | 27/31 [13:27<03:09, 47.48s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.24,0.0,"[251, 3014, 3075, 5621, 5764, 6439, 7511, 8036...",0.01,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",0.38,0.48,15.0,"Counter({0: 122, 1: 41})"
1,0.31,-0.01,"[4187, 4188, 4329, 4935, 6967, 8012, 12291, 13...",0.0,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",0.63,0.63,14.0,"Counter({0: 116, 1: 47})"
2,0.29,-0.0,"[181, 686, 755, 812, 816, 1021, 1105, 1162, 12...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.04,0.71,153.0,"Counter({0: 150, 1: 13})"
3,0.23,-0.0,"[72, 73, 74, 151, 196, 232, 405, 425, 559, 571...",0.0,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...",0.72,0.47,55.0,"Counter({0: 128, 1: 35})"
4,0.23,-0.01,"[11305, 11306]",0.0,"[1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, ...",0.21,0.59,2.0,"Counter({1: 110, 0: 53})"
5,0.23,0.01,"[292, 295, 335, 351, 719, 799, 827, 885, 905, ...",0.04,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.17,0.47,157.0,"Counter({0: 132, 1: 31})"
6,0.21,-0.0,"[367, 459, 775, 783, 1338, 1956, 2043, 2984, 2...",0.0,"[1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, ...",0.51,0.31,107.0,"Counter({1: 89, 0: 74})"
7,0.21,0.17,"[17, 68, 91, 107, 195, 258, 293, 416, 555, 121...",0.16,"[0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, ...",0.88,0.37,152.0,"Counter({1: 102, 0: 61})"
8,0.19,0.03,"[8668, 9853, 10200, 16345]",0.03,"[0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, ...",0.18,0.43,4.0,"Counter({1: 108, 0: 55})"


adding 3 random population


 97%|█████████▋| 30/31 [13:41<00:19, 19.25s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.24,0.0,"[251, 3014, 3075, 5621, 5764, 6439, 7511, 8036...",0.01,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",0.38,0.48,15.0,"Counter({0: 122, 1: 41})"
1,0.31,-0.01,"[4187, 4188, 4329, 4935, 6967, 8012, 12291, 13...",0.0,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",0.63,0.63,14.0,"Counter({0: 116, 1: 47})"
2,0.29,-0.0,"[181, 686, 755, 812, 816, 1021, 1105, 1162, 12...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.04,0.71,153.0,"Counter({0: 150, 1: 13})"
3,0.23,-0.0,"[72, 73, 74, 151, 196, 232, 405, 425, 559, 571...",0.0,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...",0.72,0.47,55.0,"Counter({0: 128, 1: 35})"
4,0.23,-0.01,"[11305, 11306]",0.0,"[1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, ...",0.21,0.59,2.0,"Counter({1: 110, 0: 53})"
5,0.23,0.01,"[292, 295, 335, 351, 719, 799, 827, 885, 905, ...",0.04,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.17,0.47,157.0,"Counter({0: 132, 1: 31})"
6,0.21,-0.0,"[367, 459, 775, 783, 1338, 1956, 2043, 2984, 2...",0.0,"[1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, ...",0.51,0.31,107.0,"Counter({1: 89, 0: 74})"
7,0.21,0.17,"[17, 68, 91, 107, 195, 258, 293, 416, 555, 121...",0.16,"[0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, ...",0.88,0.37,152.0,"Counter({1: 102, 0: 61})"
8,0.19,0.03,"[8668, 9853, 10200, 16345]",0.03,"[0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, ...",0.18,0.43,4.0,"Counter({1: 108, 0: 55})"
9,0.2,0.06,"[65, 284, 370, 375, 453, 639, 679, 1058, 1197,...",0.05,"[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...",0.64,0.31,152.0,"Counter({0: 92, 1: 71})"


adding 3 random population


100%|██████████| 31/31 [15:40<00:00, 30.33s/it]


Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.24,0.0,"[251, 3014, 3075, 5621, 5764, 6439, 7511, 8036...",0.01,"[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...",0.38,0.48,15.0,"Counter({0: 122, 1: 41})"
1,0.31,-0.01,"[4187, 4188, 4329, 4935, 6967, 8012, 12291, 13...",0.0,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",0.63,0.63,14.0,"Counter({0: 116, 1: 47})"
2,0.29,-0.0,"[181, 686, 755, 812, 816, 1021, 1105, 1162, 12...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.04,0.71,153.0,"Counter({0: 150, 1: 13})"
3,0.23,-0.0,"[72, 73, 74, 151, 196, 232, 405, 425, 559, 571...",0.0,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...",0.72,0.47,55.0,"Counter({0: 128, 1: 35})"
4,0.23,-0.01,"[11305, 11306]",0.0,"[1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, ...",0.21,0.59,2.0,"Counter({1: 110, 0: 53})"
5,0.23,0.01,"[292, 295, 335, 351, 719, 799, 827, 885, 905, ...",0.04,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.17,0.47,157.0,"Counter({0: 132, 1: 31})"
6,0.21,-0.0,"[367, 459, 775, 783, 1338, 1956, 2043, 2984, 2...",0.0,"[1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, ...",0.51,0.31,107.0,"Counter({1: 89, 0: 74})"
7,0.21,0.17,"[17, 68, 91, 107, 195, 258, 293, 416, 555, 121...",0.16,"[0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, ...",0.88,0.37,152.0,"Counter({1: 102, 0: 61})"
8,0.19,0.03,"[8668, 9853, 10200, 16345]",0.03,"[0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, ...",0.18,0.43,4.0,"Counter({1: 108, 0: 55})"
9,0.2,0.06,"[65, 284, 370, 375, 453, 639, 679, 1058, 1197,...",0.05,"[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...",0.64,0.31,152.0,"Counter({0: 92, 1: 71})"


In [8]:
method = "adapted_silhouette"
threshold=0.09
score_tolerance=0.009
clustering = "gmm"

round_size = 1#3
debug = False
ignore_redundant= True
epochs = 1#10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0,
        "max": 0 },
    "CLOSE": { 
        "ga": 0.35,
        "max": 0.35 },
    "IMP1D": { 
        "ga": 0.35,
        "max": 0.35 },
    "RANDOM": { 
        "ga": 0.3,
        "max": 0.3},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=None,
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 400

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")
display(solutions)
# globalResults[f"{clustering}_{method}"] = solutions

  0%|          | 0/2 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 10944, orig size 17938, nb imp : 1149
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [ 0 35 35 30] [0, 0.35, 0.35, 0.3]
adding 50 random population


 50%|█████     | 1/2 [00:04<00:04,  4.00s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.17,0.660271,-0.0,"[181, 312, 361, 479, 894, 1677, 2289, 2383, 23...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.79,0.67,51.0,"Counter({0: 156, 1: 7})"


adding 1 random population


100%|██████████| 2/2 [00:36<00:00, 18.49s/it]


Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.17,0.660271,-0.0,"[181, 312, 361, 479, 894, 1677, 2289, 2383, 23...",0.03,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.79,0.67,51.0,"Counter({0: 156, 1: 7})"


In [9]:
method = "adapted_silhouette"
threshold=0.09
score_tolerance=0.009
clustering = "hdbscan"

round_size = 3
debug = False
ignore_redundant= True
epochs = 10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0,
        "max": 0 },
    "CLOSE": { 
        "ga": 0.35,
        "max": 0.35 },
    "IMP1D": { 
        "ga": 0.35,
        "max": 0.35 },
    "RANDOM": { 
        "ga": 0.3,
        "max": 0.3},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=None,
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 500

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")
display(solutions)
globalResults[f"{clustering}_{method}"] = solutions

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 10944, orig size 17938, nb imp : 1149
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [ 0 35 35 30] [0, 0.35, 0.35, 0.3]
adding 50 random population


 10%|▉         | 3/31 [00:08<01:19,  2.84s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.533624,-0.0,"[17, 73, 77, 106, 151, 158, 181, 308, 323, 351...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.66,0.54,166.0,"Counter({0: 147, 1: 12, -1: 4})"


adding 5 random population


 19%|█▉        | 6/31 [00:41<02:51,  6.86s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.533624,-0.0,"[17, 73, 77, 106, 151, 158, 181, 308, 323, 351...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.66,0.54,166.0,"Counter({0: 147, 1: 12, -1: 4})"
1,0.25,0.679969,-0.0,"[51, 786, 1604, 1793, 2035, 2093, 2289, 2369, ...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.57,0.69,114.0,"Counter({0: 150, 1: 13})"


adding 19 random population


 29%|██▉       | 9/31 [01:09<02:33,  6.98s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.533624,-0.0,"[17, 73, 77, 106, 151, 158, 181, 308, 323, 351...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.66,0.54,166.0,"Counter({0: 147, 1: 12, -1: 4})"
1,0.25,0.679969,-0.0,"[51, 786, 1604, 1793, 2035, 2093, 2289, 2369, ...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.57,0.69,114.0,"Counter({0: 150, 1: 13})"
2,0.13,0.536353,-0.0,"[29, 163, 197, 232, 318, 413, 561, 571, 704, 7...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.44,0.54,157.0,"Counter({0: 143, 1: 13, -1: 7})"


adding 20 random population


 39%|███▊      | 12/31 [01:48<02:46,  8.75s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.533624,-0.0,"[17, 73, 77, 106, 151, 158, 181, 308, 323, 351...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.66,0.54,166.0,"Counter({0: 147, 1: 12, -1: 4})"
1,0.25,0.679969,-0.0,"[51, 786, 1604, 1793, 2035, 2093, 2289, 2369, ...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.57,0.69,114.0,"Counter({0: 150, 1: 13})"
2,0.13,0.536353,-0.0,"[29, 163, 197, 232, 318, 413, 561, 571, 704, 7...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.44,0.54,157.0,"Counter({0: 143, 1: 13, -1: 7})"
3,0.18,0.454544,-0.01,"[980, 13397]",0.0,"[-1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...",0.11,0.68,2.0,"Counter({0: 98, 1: 51, -1: 14})"


adding 9 random population


 48%|████▊     | 15/31 [01:57<01:19,  4.99s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.533624,-0.0,"[17, 73, 77, 106, 151, 158, 181, 308, 323, 351...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.66,0.54,166.0,"Counter({0: 147, 1: 12, -1: 4})"
1,0.25,0.679969,-0.0,"[51, 786, 1604, 1793, 2035, 2093, 2289, 2369, ...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.57,0.69,114.0,"Counter({0: 150, 1: 13})"
2,0.13,0.536353,-0.0,"[29, 163, 197, 232, 318, 413, 561, 571, 704, 7...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.44,0.54,157.0,"Counter({0: 143, 1: 13, -1: 7})"
3,0.18,0.454544,-0.01,"[980, 13397]",0.0,"[-1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...",0.11,0.68,2.0,"Counter({0: 98, 1: 51, -1: 14})"
4,0.1,0.386948,0.0,"[2008, 15043]",0.03,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, ...",0.05,0.58,2.0,"Counter({1: 112, 0: 28, -1: 23})"


adding 2 random population


 58%|█████▊    | 18/31 [02:07<00:48,  3.72s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.533624,-0.0,"[17, 73, 77, 106, 151, 158, 181, 308, 323, 351...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.66,0.54,166.0,"Counter({0: 147, 1: 12, -1: 4})"
1,0.25,0.679969,-0.0,"[51, 786, 1604, 1793, 2035, 2093, 2289, 2369, ...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.57,0.69,114.0,"Counter({0: 150, 1: 13})"
2,0.13,0.536353,-0.0,"[29, 163, 197, 232, 318, 413, 561, 571, 704, 7...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.44,0.54,157.0,"Counter({0: 143, 1: 13, -1: 7})"
3,0.18,0.454544,-0.01,"[980, 13397]",0.0,"[-1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...",0.11,0.68,2.0,"Counter({0: 98, 1: 51, -1: 14})"
4,0.1,0.386948,0.0,"[2008, 15043]",0.03,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, ...",0.05,0.58,2.0,"Counter({1: 112, 0: 28, -1: 23})"
5,0.09,0.435602,-0.01,"[9, 186, 395, 2383, 2414, 4627, 5156, 5601, 81...",0.01,"[-1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",-0.02,0.46,19.0,"Counter({0: 140, -1: 12, 1: 11})"


adding 4 random population


 68%|██████▊   | 21/31 [02:22<00:40,  4.03s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.533624,-0.0,"[17, 73, 77, 106, 151, 158, 181, 308, 323, 351...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.66,0.54,166.0,"Counter({0: 147, 1: 12, -1: 4})"
1,0.25,0.679969,-0.0,"[51, 786, 1604, 1793, 2035, 2093, 2289, 2369, ...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.57,0.69,114.0,"Counter({0: 150, 1: 13})"
2,0.13,0.536353,-0.0,"[29, 163, 197, 232, 318, 413, 561, 571, 704, 7...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.44,0.54,157.0,"Counter({0: 143, 1: 13, -1: 7})"
3,0.18,0.454544,-0.01,"[980, 13397]",0.0,"[-1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...",0.11,0.68,2.0,"Counter({0: 98, 1: 51, -1: 14})"
4,0.1,0.386948,0.0,"[2008, 15043]",0.03,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, ...",0.05,0.58,2.0,"Counter({1: 112, 0: 28, -1: 23})"
5,0.09,0.435602,-0.01,"[9, 186, 395, 2383, 2414, 4627, 5156, 5601, 81...",0.01,"[-1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",-0.02,0.46,19.0,"Counter({0: 140, -1: 12, 1: 11})"
6,0.09,0.482375,-0.0,"[440, 663, 1105, 1845, 1938, 2568, 2967, 5132,...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.21,0.5,33.0,"Counter({0: 149, 1: 10, -1: 4})"


adding 17 random population


 77%|███████▋  | 24/31 [02:38<00:30,  4.41s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.533624,-0.0,"[17, 73, 77, 106, 151, 158, 181, 308, 323, 351...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.66,0.54,166.0,"Counter({0: 147, 1: 12, -1: 4})"
1,0.25,0.679969,-0.0,"[51, 786, 1604, 1793, 2035, 2093, 2289, 2369, ...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.57,0.69,114.0,"Counter({0: 150, 1: 13})"
2,0.13,0.536353,-0.0,"[29, 163, 197, 232, 318, 413, 561, 571, 704, 7...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.44,0.54,157.0,"Counter({0: 143, 1: 13, -1: 7})"
3,0.18,0.454544,-0.01,"[980, 13397]",0.0,"[-1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...",0.11,0.68,2.0,"Counter({0: 98, 1: 51, -1: 14})"
4,0.1,0.386948,0.0,"[2008, 15043]",0.03,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, ...",0.05,0.58,2.0,"Counter({1: 112, 0: 28, -1: 23})"
5,0.09,0.435602,-0.01,"[9, 186, 395, 2383, 2414, 4627, 5156, 5601, 81...",0.01,"[-1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",-0.02,0.46,19.0,"Counter({0: 140, -1: 12, 1: 11})"
6,0.09,0.482375,-0.0,"[440, 663, 1105, 1845, 1938, 2568, 2967, 5132,...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.21,0.5,33.0,"Counter({0: 149, 1: 10, -1: 4})"
7,0.11,0.470955,-0.0,"[1866, 3694, 8917]",0.02,"[0, 1, 1, 1, 0, 0, 1, 1, 1, 1, -1, 0, 0, 0, 1,...",0.16,0.63,3.0,"Counter({1: 115, 0: 39, -1: 9})"


adding 7 random population


 87%|████████▋ | 27/31 [02:51<00:16,  4.01s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.533624,-0.0,"[17, 73, 77, 106, 151, 158, 181, 308, 323, 351...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.66,0.54,166.0,"Counter({0: 147, 1: 12, -1: 4})"
1,0.25,0.679969,-0.0,"[51, 786, 1604, 1793, 2035, 2093, 2289, 2369, ...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.57,0.69,114.0,"Counter({0: 150, 1: 13})"
2,0.13,0.536353,-0.0,"[29, 163, 197, 232, 318, 413, 561, 571, 704, 7...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.44,0.54,157.0,"Counter({0: 143, 1: 13, -1: 7})"
3,0.18,0.454544,-0.01,"[980, 13397]",0.0,"[-1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...",0.11,0.68,2.0,"Counter({0: 98, 1: 51, -1: 14})"
4,0.1,0.386948,0.0,"[2008, 15043]",0.03,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, ...",0.05,0.58,2.0,"Counter({1: 112, 0: 28, -1: 23})"
5,0.09,0.435602,-0.01,"[9, 186, 395, 2383, 2414, 4627, 5156, 5601, 81...",0.01,"[-1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",-0.02,0.46,19.0,"Counter({0: 140, -1: 12, 1: 11})"
6,0.09,0.482375,-0.0,"[440, 663, 1105, 1845, 1938, 2568, 2967, 5132,...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.21,0.5,33.0,"Counter({0: 149, 1: 10, -1: 4})"
7,0.11,0.470955,-0.0,"[1866, 3694, 8917]",0.02,"[0, 1, 1, 1, 0, 0, 1, 1, 1, 1, -1, 0, 0, 0, 1,...",0.16,0.63,3.0,"Counter({1: 115, 0: 39, -1: 9})"
8,0.12,0.528193,-0.0,"[432, 4332, 4624, 11017]",0.0,"[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",0.27,0.66,4.0,"Counter({1: 131, 0: 27, -1: 5})"


adding 22 random population


 97%|█████████▋| 30/31 [03:02<00:03,  3.76s/it]

Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.533624,-0.0,"[17, 73, 77, 106, 151, 158, 181, 308, 323, 351...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.66,0.54,166.0,"Counter({0: 147, 1: 12, -1: 4})"
1,0.25,0.679969,-0.0,"[51, 786, 1604, 1793, 2035, 2093, 2289, 2369, ...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.57,0.69,114.0,"Counter({0: 150, 1: 13})"
2,0.13,0.536353,-0.0,"[29, 163, 197, 232, 318, 413, 561, 571, 704, 7...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.44,0.54,157.0,"Counter({0: 143, 1: 13, -1: 7})"
3,0.18,0.454544,-0.01,"[980, 13397]",0.0,"[-1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...",0.11,0.68,2.0,"Counter({0: 98, 1: 51, -1: 14})"
4,0.1,0.386948,0.0,"[2008, 15043]",0.03,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, ...",0.05,0.58,2.0,"Counter({1: 112, 0: 28, -1: 23})"
5,0.09,0.435602,-0.01,"[9, 186, 395, 2383, 2414, 4627, 5156, 5601, 81...",0.01,"[-1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",-0.02,0.46,19.0,"Counter({0: 140, -1: 12, 1: 11})"
6,0.09,0.482375,-0.0,"[440, 663, 1105, 1845, 1938, 2568, 2967, 5132,...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.21,0.5,33.0,"Counter({0: 149, 1: 10, -1: 4})"
7,0.11,0.470955,-0.0,"[1866, 3694, 8917]",0.02,"[0, 1, 1, 1, 0, 0, 1, 1, 1, 1, -1, 0, 0, 0, 1,...",0.16,0.63,3.0,"Counter({1: 115, 0: 39, -1: 9})"
8,0.12,0.528193,-0.0,"[432, 4332, 4624, 11017]",0.0,"[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",0.27,0.66,4.0,"Counter({1: 131, 0: 27, -1: 5})"
9,0.08,0.432158,-0.0,"[2642, 3661, 6784, 7352, 8892, 9865, 9907, 11457]",0.01,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 1,...",0.14,0.49,8.0,"Counter({0: 123, 1: 30, -1: 10})"


adding 6 random population


100%|██████████| 31/31 [03:08<00:00,  6.10s/it]


Unnamed: 0,adapted_ratkowsky_lance,adapted_silhouette,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.11,0.533624,-0.0,"[17, 73, 77, 106, 151, 158, 181, 308, 323, 351...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.66,0.54,166.0,"Counter({0: 147, 1: 12, -1: 4})"
1,0.25,0.679969,-0.0,"[51, 786, 1604, 1793, 2035, 2093, 2289, 2369, ...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.57,0.69,114.0,"Counter({0: 150, 1: 13})"
2,0.13,0.536353,-0.0,"[29, 163, 197, 232, 318, 413, 561, 571, 704, 7...",0.01,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.44,0.54,157.0,"Counter({0: 143, 1: 13, -1: 7})"
3,0.18,0.454544,-0.01,"[980, 13397]",0.0,"[-1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,...",0.11,0.68,2.0,"Counter({0: 98, 1: 51, -1: 14})"
4,0.1,0.386948,0.0,"[2008, 15043]",0.03,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, ...",0.05,0.58,2.0,"Counter({1: 112, 0: 28, -1: 23})"
5,0.09,0.435602,-0.01,"[9, 186, 395, 2383, 2414, 4627, 5156, 5601, 81...",0.01,"[-1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",-0.02,0.46,19.0,"Counter({0: 140, -1: 12, 1: 11})"
6,0.09,0.482375,-0.0,"[440, 663, 1105, 1845, 1938, 2568, 2967, 5132,...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.21,0.5,33.0,"Counter({0: 149, 1: 10, -1: 4})"
7,0.11,0.470955,-0.0,"[1866, 3694, 8917]",0.02,"[0, 1, 1, 1, 0, 0, 1, 1, 1, 1, -1, 0, 0, 0, 1,...",0.16,0.63,3.0,"Counter({1: 115, 0: 39, -1: 9})"
8,0.12,0.528193,-0.0,"[432, 4332, 4624, 11017]",0.0,"[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",0.27,0.66,4.0,"Counter({1: 131, 0: 27, -1: 5})"
9,0.08,0.432158,-0.0,"[2642, 3661, 6784, 7352, 8892, 9865, 9907, 11457]",0.01,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 1,...",0.14,0.49,8.0,"Counter({0: 123, 1: 30, -1: 10})"


In [13]:
method = "adapted_ratkowsky_lance"
threshold=0.09
score_tolerance=0.009
clustering = "hdbscan"

round_size = 3
debug = False
ignore_redundant= True
epochs = 10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0,
        "max": 0 },
    "CLOSE": { 
        "ga": 0.35,
        "max": 0.35 },
    "IMP1D": { 
        "ga": 0.35,
        "max": 0.35 },
    "RANDOM": { 
        "ga": 0.3,
        "max": 0.3},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=None,
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 500

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")
display(solutions)
globalResults[f"{clustering}_{method}"] = solutions

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 10944, orig size 17938, nb imp : 1149
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [ 0 35 35 30] [0, 0.35, 0.35, 0.3]
adding 50 random population


 10%|▉         | 3/31 [00:08<01:16,  2.74s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.15,-0.0,"[6039, 8917, 9334]",0.02,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...",0.15,0.64,3.0,"Counter({0: 116, 1: 35, -1: 12})"


adding 5 random population


 19%|█▉        | 6/31 [00:22<01:33,  3.75s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.15,-0.0,"[6039, 8917, 9334]",0.02,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...",0.15,0.64,3.0,"Counter({0: 116, 1: 35, -1: 12})"
1,0.21,0.0,"[6784, 7902, 8892]",0.02,"[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...",0.35,0.7,3.0,"Counter({1: 125, 0: 38})"


adding 11 random population


 29%|██▉       | 9/31 [00:37<01:29,  4.08s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.15,-0.0,"[6039, 8917, 9334]",0.02,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...",0.15,0.64,3.0,"Counter({0: 116, 1: 35, -1: 12})"
1,0.21,0.0,"[6784, 7902, 8892]",0.02,"[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...",0.35,0.7,3.0,"Counter({1: 125, 0: 38})"
2,0.14,0.01,"[2369, 11938, 14426, 15429]",0.01,"[-1, 1, 1, 1, 0, 1, 1, 0, 0, 1, -1, 1, -1, 1, ...",-0.03,0.5,4.0,"Counter({1: 112, -1: 33, 0: 18})"


adding 1 random population


 39%|███▊      | 12/31 [00:52<01:20,  4.25s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.15,-0.0,"[6039, 8917, 9334]",0.02,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...",0.15,0.64,3.0,"Counter({0: 116, 1: 35, -1: 12})"
1,0.21,0.0,"[6784, 7902, 8892]",0.02,"[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...",0.35,0.7,3.0,"Counter({1: 125, 0: 38})"
2,0.14,0.01,"[2369, 11938, 14426, 15429]",0.01,"[-1, 1, 1, 1, 0, 1, 1, 0, 0, 1, -1, 1, -1, 1, ...",-0.03,0.5,4.0,"Counter({1: 112, -1: 33, 0: 18})"
3,0.16,0.03,"[555, 3653, 4508, 4878, 6188, 14750, 14823, 16...",0.06,"[1, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 0, -1, 0, ...",0.04,0.43,8.0,"Counter({0: 118, -1: 24, 1: 21})"


adding 23 random population


 48%|████▊     | 15/31 [01:08<01:10,  4.41s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.15,-0.0,"[6039, 8917, 9334]",0.02,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...",0.15,0.64,3.0,"Counter({0: 116, 1: 35, -1: 12})"
1,0.21,0.0,"[6784, 7902, 8892]",0.02,"[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...",0.35,0.7,3.0,"Counter({1: 125, 0: 38})"
2,0.14,0.01,"[2369, 11938, 14426, 15429]",0.01,"[-1, 1, 1, 1, 0, 1, 1, 0, 0, 1, -1, 1, -1, 1, ...",-0.03,0.5,4.0,"Counter({1: 112, -1: 33, 0: 18})"
3,0.16,0.03,"[555, 3653, 4508, 4878, 6188, 14750, 14823, 16...",0.06,"[1, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 0, -1, 0, ...",0.04,0.43,8.0,"Counter({0: 118, -1: 24, 1: 21})"
4,0.17,-0.0,"[308, 827, 1162, 1503, 1658, 1965, 2383, 3181,...",0.02,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0...",0.58,0.45,40.0,"Counter({0: 145, 1: 13, -1: 5})"


adding 2 random population


 58%|█████▊    | 18/31 [01:32<01:14,  5.71s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.15,-0.0,"[6039, 8917, 9334]",0.02,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...",0.15,0.64,3.0,"Counter({0: 116, 1: 35, -1: 12})"
1,0.21,0.0,"[6784, 7902, 8892]",0.02,"[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...",0.35,0.7,3.0,"Counter({1: 125, 0: 38})"
2,0.14,0.01,"[2369, 11938, 14426, 15429]",0.01,"[-1, 1, 1, 1, 0, 1, 1, 0, 0, 1, -1, 1, -1, 1, ...",-0.03,0.5,4.0,"Counter({1: 112, -1: 33, 0: 18})"
3,0.16,0.03,"[555, 3653, 4508, 4878, 6188, 14750, 14823, 16...",0.06,"[1, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 0, -1, 0, ...",0.04,0.43,8.0,"Counter({0: 118, -1: 24, 1: 21})"
4,0.17,-0.0,"[308, 827, 1162, 1503, 1658, 1965, 2383, 3181,...",0.02,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0...",0.58,0.45,40.0,"Counter({0: 145, 1: 13, -1: 5})"
5,0.2,-0.0,"[51, 181, 273, 492, 578, 703, 755, 799, 885, 1...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.42,0.58,163.0,"Counter({0: 150, 1: 13})"


adding 30 random population


 68%|██████▊   | 21/31 [02:34<01:57, 11.78s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.15,-0.0,"[6039, 8917, 9334]",0.02,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...",0.15,0.64,3.0,"Counter({0: 116, 1: 35, -1: 12})"
1,0.21,0.0,"[6784, 7902, 8892]",0.02,"[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...",0.35,0.7,3.0,"Counter({1: 125, 0: 38})"
2,0.14,0.01,"[2369, 11938, 14426, 15429]",0.01,"[-1, 1, 1, 1, 0, 1, 1, 0, 0, 1, -1, 1, -1, 1, ...",-0.03,0.5,4.0,"Counter({1: 112, -1: 33, 0: 18})"
3,0.16,0.03,"[555, 3653, 4508, 4878, 6188, 14750, 14823, 16...",0.06,"[1, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 0, -1, 0, ...",0.04,0.43,8.0,"Counter({0: 118, -1: 24, 1: 21})"
4,0.17,-0.0,"[308, 827, 1162, 1503, 1658, 1965, 2383, 3181,...",0.02,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0...",0.58,0.45,40.0,"Counter({0: 145, 1: 13, -1: 5})"
5,0.2,-0.0,"[51, 181, 273, 492, 578, 703, 755, 799, 885, 1...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.42,0.58,163.0,"Counter({0: 150, 1: 13})"
6,0.14,0.11,"[17, 91, 195, 293, 306, 442, 864, 1248, 1283, ...",0.13,"[-1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, -1, ...",-0.53,0.35,82.0,"Counter({1: 108, -1: 42, 0: 13})"


adding 37 random population


 77%|███████▋  | 24/31 [03:32<01:31, 13.12s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.15,-0.0,"[6039, 8917, 9334]",0.02,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...",0.15,0.64,3.0,"Counter({0: 116, 1: 35, -1: 12})"
1,0.21,0.0,"[6784, 7902, 8892]",0.02,"[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...",0.35,0.7,3.0,"Counter({1: 125, 0: 38})"
2,0.14,0.01,"[2369, 11938, 14426, 15429]",0.01,"[-1, 1, 1, 1, 0, 1, 1, 0, 0, 1, -1, 1, -1, 1, ...",-0.03,0.5,4.0,"Counter({1: 112, -1: 33, 0: 18})"
3,0.16,0.03,"[555, 3653, 4508, 4878, 6188, 14750, 14823, 16...",0.06,"[1, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 0, -1, 0, ...",0.04,0.43,8.0,"Counter({0: 118, -1: 24, 1: 21})"
4,0.17,-0.0,"[308, 827, 1162, 1503, 1658, 1965, 2383, 3181,...",0.02,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0...",0.58,0.45,40.0,"Counter({0: 145, 1: 13, -1: 5})"
5,0.2,-0.0,"[51, 181, 273, 492, 578, 703, 755, 799, 885, 1...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.42,0.58,163.0,"Counter({0: 150, 1: 13})"
6,0.14,0.11,"[17, 91, 195, 293, 306, 442, 864, 1248, 1283, ...",0.13,"[-1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, -1, ...",-0.53,0.35,82.0,"Counter({1: 108, -1: 42, 0: 13})"
7,0.15,0.15,"[163, 462, 845, 1370, 2995, 3219, 4308, 4625, ...",0.14,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, -1, ...",-0.29,0.37,28.0,"Counter({1: 100, -1: 50, 0: 13})"


adding 8 random population


 87%|████████▋ | 27/31 [03:51<00:32,  8.02s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.15,-0.0,"[6039, 8917, 9334]",0.02,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...",0.15,0.64,3.0,"Counter({0: 116, 1: 35, -1: 12})"
1,0.21,0.0,"[6784, 7902, 8892]",0.02,"[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...",0.35,0.7,3.0,"Counter({1: 125, 0: 38})"
2,0.14,0.01,"[2369, 11938, 14426, 15429]",0.01,"[-1, 1, 1, 1, 0, 1, 1, 0, 0, 1, -1, 1, -1, 1, ...",-0.03,0.5,4.0,"Counter({1: 112, -1: 33, 0: 18})"
3,0.16,0.03,"[555, 3653, 4508, 4878, 6188, 14750, 14823, 16...",0.06,"[1, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 0, -1, 0, ...",0.04,0.43,8.0,"Counter({0: 118, -1: 24, 1: 21})"
4,0.17,-0.0,"[308, 827, 1162, 1503, 1658, 1965, 2383, 3181,...",0.02,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0...",0.58,0.45,40.0,"Counter({0: 145, 1: 13, -1: 5})"
5,0.2,-0.0,"[51, 181, 273, 492, 578, 703, 755, 799, 885, 1...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.42,0.58,163.0,"Counter({0: 150, 1: 13})"
6,0.14,0.11,"[17, 91, 195, 293, 306, 442, 864, 1248, 1283, ...",0.13,"[-1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, -1, ...",-0.53,0.35,82.0,"Counter({1: 108, -1: 42, 0: 13})"
7,0.15,0.15,"[163, 462, 845, 1370, 2995, 3219, 4308, 4625, ...",0.14,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, -1, ...",-0.29,0.37,28.0,"Counter({1: 100, -1: 50, 0: 13})"
8,0.17,-0.01,"[2936, 8860]",0.0,"[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, ...",0.13,0.66,2.0,"Counter({0: 96, 1: 51, -1: 16})"


adding 16 random population


 97%|█████████▋| 30/31 [04:03<00:05,  5.12s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.15,-0.0,"[6039, 8917, 9334]",0.02,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...",0.15,0.64,3.0,"Counter({0: 116, 1: 35, -1: 12})"
1,0.21,0.0,"[6784, 7902, 8892]",0.02,"[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...",0.35,0.7,3.0,"Counter({1: 125, 0: 38})"
2,0.14,0.01,"[2369, 11938, 14426, 15429]",0.01,"[-1, 1, 1, 1, 0, 1, 1, 0, 0, 1, -1, 1, -1, 1, ...",-0.03,0.5,4.0,"Counter({1: 112, -1: 33, 0: 18})"
3,0.16,0.03,"[555, 3653, 4508, 4878, 6188, 14750, 14823, 16...",0.06,"[1, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 0, -1, 0, ...",0.04,0.43,8.0,"Counter({0: 118, -1: 24, 1: 21})"
4,0.17,-0.0,"[308, 827, 1162, 1503, 1658, 1965, 2383, 3181,...",0.02,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0...",0.58,0.45,40.0,"Counter({0: 145, 1: 13, -1: 5})"
5,0.2,-0.0,"[51, 181, 273, 492, 578, 703, 755, 799, 885, 1...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.42,0.58,163.0,"Counter({0: 150, 1: 13})"
6,0.14,0.11,"[17, 91, 195, 293, 306, 442, 864, 1248, 1283, ...",0.13,"[-1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, -1, ...",-0.53,0.35,82.0,"Counter({1: 108, -1: 42, 0: 13})"
7,0.15,0.15,"[163, 462, 845, 1370, 2995, 3219, 4308, 4625, ...",0.14,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, -1, ...",-0.29,0.37,28.0,"Counter({1: 100, -1: 50, 0: 13})"
8,0.17,-0.01,"[2936, 8860]",0.0,"[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, ...",0.13,0.66,2.0,"Counter({0: 96, 1: 51, -1: 16})"
9,0.14,-0.0,"[9128, 10645, 10649]",0.0,"[-1, 1, -1, 0, 0, 1, 1, 0, 1, 1, 1, 1, -1, 1, ...",0.03,0.44,3.0,"Counter({1: 95, 0: 38, -1: 30})"


adding 2 random population


100%|██████████| 31/31 [04:09<00:00,  8.04s/it]


Unnamed: 0,adapted_ratkowsky_lance,ari,features,nmi,partition,point_biserial,silhouette,size,structure
0,0.15,-0.0,"[6039, 8917, 9334]",0.02,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...",0.15,0.64,3.0,"Counter({0: 116, 1: 35, -1: 12})"
1,0.21,0.0,"[6784, 7902, 8892]",0.02,"[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...",0.35,0.7,3.0,"Counter({1: 125, 0: 38})"
2,0.14,0.01,"[2369, 11938, 14426, 15429]",0.01,"[-1, 1, 1, 1, 0, 1, 1, 0, 0, 1, -1, 1, -1, 1, ...",-0.03,0.5,4.0,"Counter({1: 112, -1: 33, 0: 18})"
3,0.16,0.03,"[555, 3653, 4508, 4878, 6188, 14750, 14823, 16...",0.06,"[1, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 0, -1, 0, ...",0.04,0.43,8.0,"Counter({0: 118, -1: 24, 1: 21})"
4,0.17,-0.0,"[308, 827, 1162, 1503, 1658, 1965, 2383, 3181,...",0.02,"[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0...",0.58,0.45,40.0,"Counter({0: 145, 1: 13, -1: 5})"
5,0.2,-0.0,"[51, 181, 273, 492, 578, 703, 755, 799, 885, 1...",0.01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.42,0.58,163.0,"Counter({0: 150, 1: 13})"
6,0.14,0.11,"[17, 91, 195, 293, 306, 442, 864, 1248, 1283, ...",0.13,"[-1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, -1, ...",-0.53,0.35,82.0,"Counter({1: 108, -1: 42, 0: 13})"
7,0.15,0.15,"[163, 462, 845, 1370, 2995, 3219, 4308, 4625, ...",0.14,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, -1, ...",-0.29,0.37,28.0,"Counter({1: 100, -1: 50, 0: 13})"
8,0.17,-0.01,"[2936, 8860]",0.0,"[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, ...",0.13,0.66,2.0,"Counter({0: 96, 1: 51, -1: 16})"
9,0.14,-0.0,"[9128, 10645, 10649]",0.0,"[-1, 1, -1, 0, 0, 1, 1, 0, 1, 1, 1, 1, -1, 1, ...",0.03,0.44,3.0,"Counter({1: 95, 0: 38, -1: 30})"


NameError: name 'globalResults' is not defined

# Interpret results

In [12]:
additional_results, best_subspace_match, best_meta_subspace = bio_analysis.clinical_data_analysis(
    additional_df, solutions, n_clusters)

best_subspace_match

Found 2 values for vital_status
Clustering numeric values for days_to_death
Clustering numeric values for days_to_last_followup
No more than 1 class found for additional_studies
Clustering numeric values for age_at_initial_pathologic_diagnosis
No more than 1 class found for b_symptoms
No more than 1 class found for bcr
Found 3 values for bcr_canonical_reason-2
No more than 1 class found for bcr_canonical_reason-3
Found 4 values for bcr_canonical_reason
Found 144 values for bcr_followup_barcode, skipping
Found 144 values for bcr_followup_uuid, skipping
Found 2 values for bcr_patient_canonical_status
Found 4 values for clinical_m
Found 5 values for clinical_n
Found 5 values for clinical_stage
Found 12 values for clinical_t
No more than 1 class found for clinical_trail_drug_classification
No more than 1 class found for day_of_dcc_upload
Clustering numeric values for day_of_form_completion
Clustering numeric values for days_to_birth
No more than 1 class found for days_to_index
No more than

Unnamed: 0,subspace,ari,additional_data,n
0,0,0.31,bcr_canonical_reason,31
1,0,0.31,bcr_patient_canonical_status,163
2,1,0.36,bcr_patient_canonical_status,163
3,2,0.34,bcr_canonical_reason,31
4,3,0.17,race,155
5,4,0.19,bcr_canonical_reason-2,12
6,5,0.34,bcr_canonical_reason,31
7,6,0.34,bcr_patient_canonical_status,163
8,7,0.23,followup_treatment_success,93
9,8,0.19,primary_therapy_outcome_success,116


In [10]:
method = "adapted_silhouette"
threshold=0.1
score_tolerance=0.01
clustering = "hdbscan"

round_size = 3
debug = False
ignore_redundant= True
epochs = 10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0.3,
        "max": 0.3 },
    "CLOSE": { 
        "ga": 0.4,
        "max": 0.4 },
    "IMP1D": { 
        "ga": 0.2,
        "max": 0.2 },
    "RANDOM": { 
        "ga": 0.1,
        "max": 0.1},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=gmm_arl_population[gmm_arl_population["pred"] > threshold].iloc[:7000],
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 500

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")
display(solutions)
globalResults[f"{clustering}_{method}"] = solutions

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 10944, orig size 17938, nb imp : 1149
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [30 40 20 10] [0.3, 0.4, 0.2, 0.1]
Selecting (50, 4) from archive


 10%|▉         | 3/31 [00:13<02:06,  4.50s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"


Selecting (27, 4) from archive


 19%|█▉        | 6/31 [00:59<03:52,  9.30s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"


Selecting (13, 4) from archive


 29%|██▉       | 9/31 [01:21<02:42,  7.38s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"


Selecting (39, 4) from archive


 39%|███▊      | 12/31 [02:04<03:11, 10.06s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"


Selecting (1, 4) from archive


 48%|████▊     | 15/31 [02:32<02:23,  8.99s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"


Selecting (30, 4) from archive


 58%|█████▊    | 18/31 [02:55<01:41,  7.78s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"


Selecting (49, 4) from archive


 68%|██████▊   | 21/31 [03:49<01:57, 11.77s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"
6,0.459385,0.0,"[885, 1176, 1367, 3514, 5156, 5278, 6672, 7220...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",14.0,"Counter({1: 140, 0: 15, -1: 8})"


Selecting (7, 4) from archive


 77%|███████▋  | 24/31 [04:22<01:10, 10.10s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"
6,0.459385,0.0,"[885, 1176, 1367, 3514, 5156, 5278, 6672, 7220...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",14.0,"Counter({1: 140, 0: 15, -1: 8})"
7,0.40389,-0.01,"[1064, 4319, 6789, 7597, 9692, 11178, 11699, 1...","[1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, ...",13.0,"Counter({1: 129, -1: 17, 0: 17})"


Selecting (9, 4) from archive


 87%|████████▋ | 27/31 [04:49<00:34,  8.69s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"
6,0.459385,0.0,"[885, 1176, 1367, 3514, 5156, 5278, 6672, 7220...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",14.0,"Counter({1: 140, 0: 15, -1: 8})"
7,0.40389,-0.01,"[1064, 4319, 6789, 7597, 9692, 11178, 11699, 1...","[1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, ...",13.0,"Counter({1: 129, -1: 17, 0: 17})"
8,0.396975,0.0,"[6286, 8434, 8949]","[1, 0, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...",3.0,"Counter({1: 132, -1: 19, 0: 12})"


Selecting (1, 4) from archive


 97%|█████████▋| 30/31 [05:11<00:07,  7.43s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"
6,0.459385,0.0,"[885, 1176, 1367, 3514, 5156, 5278, 6672, 7220...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",14.0,"Counter({1: 140, 0: 15, -1: 8})"
7,0.40389,-0.01,"[1064, 4319, 6789, 7597, 9692, 11178, 11699, 1...","[1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, ...",13.0,"Counter({1: 129, -1: 17, 0: 17})"
8,0.396975,0.0,"[6286, 8434, 8949]","[1, 0, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...",3.0,"Counter({1: 132, -1: 19, 0: 12})"
9,0.430161,-0.0,"[11, 272, 279, 538, 786, 1122, 1658, 1806, 187...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0,...",60.0,"Counter({0: 147, 1: 12, -1: 4})"


Selecting (44, 4) from archive


100%|██████████| 31/31 [05:30<00:00, 10.65s/it]


Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"
6,0.459385,0.0,"[885, 1176, 1367, 3514, 5156, 5278, 6672, 7220...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",14.0,"Counter({1: 140, 0: 15, -1: 8})"
7,0.40389,-0.01,"[1064, 4319, 6789, 7597, 9692, 11178, 11699, 1...","[1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, ...",13.0,"Counter({1: 129, -1: 17, 0: 17})"
8,0.396975,0.0,"[6286, 8434, 8949]","[1, 0, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...",3.0,"Counter({1: 132, -1: 19, 0: 12})"
9,0.430161,-0.0,"[11, 272, 279, 538, 786, 1122, 1658, 1806, 187...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0,...",60.0,"Counter({0: 147, 1: 12, -1: 4})"


# Interpret results

In [11]:
additional_results, best_subspace_match, best_meta_subspace = bio_analysis.clinical_data_analysis(
    additional_df, solutions, n_clusters)

best_subspace_match

Found 2 values for vital_status
Clustering numeric values for days_to_death
Clustering numeric values for days_to_last_followup
No more than 1 class found for additional_studies
Clustering numeric values for age_at_initial_pathologic_diagnosis
No more than 1 class found for b_symptoms
No more than 1 class found for bcr
Found 3 values for bcr_canonical_reason-2
No more than 1 class found for bcr_canonical_reason-3
Found 4 values for bcr_canonical_reason
Found 144 values for bcr_followup_barcode, skipping
Found 144 values for bcr_followup_uuid, skipping
Found 2 values for bcr_patient_canonical_status
Found 4 values for clinical_m
Found 5 values for clinical_n
Found 5 values for clinical_stage
Found 12 values for clinical_t
No more than 1 class found for clinical_trail_drug_classification
No more than 1 class found for day_of_dcc_upload
Clustering numeric values for day_of_form_completion
Clustering numeric values for days_to_birth
No more than 1 class found for days_to_index
No more than

Unnamed: 0,subspace,ari,additional_data,n
0,0,0.32,eastern_cancer_oncology_group,51
1,1,0.25,followup_treatment_success,93
2,2,0.36,bcr_patient_canonical_status,163
3,3,0.97,gender,163
4,4,0.34,bcr_canonical_reason-2,12
5,5,0.37,bcr_canonical_reason,31
6,6,0.34,bcr_canonical_reason,31
7,7,0.24,performance_status_scale_timing,47
8,8,0.45,performance_status_scale_timing,47
9,9,0.31,bcr_canonical_reason,31


# Supervised analysis

In [13]:
from sklearn import mixture
import hdbscan

In [14]:
ranked_features = feature_ranking.supervised_feature_ranking(data, truth, 
                        nbTopFeatures = data.shape[1])
data = data[:, ranked_features]
imp_f = np.arange(50)



In [15]:
gmm_scores = []
hdbscan_scores = []
for i in range(2, 50):
    input_data = data[:, :i]
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
    pred = gmm.fit_predict(input_data)
    ari = adjusted_rand_score(truth, pred)
    gmm_scores.append(ari)

    pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
    ari = adjusted_rand_score(truth, pred)
    hdbscan_scores.append(ari)
print(f" GMM ari = {max(gmm_scores)}, {np.argmax(gmm_scores)}")
print(f" HDBSCAN ari = {max(hdbscan_scores)}, {np.argmax(hdbscan_scores)}")

 GMM ari = 0.38019029638363355, 13
 HDBSCAN ari = 0.1330631539832034, 29


In [16]:
from sklearn.feature_selection import chi2,  mutual_info_classif, SelectKBest
sel = SelectKBest(mutual_info_classif, k=50).fit_transform(data, truth)
gmm_scores = []
hdbscan_scores = []
for i in range(2, 50):
    input_data = sel[:, :i]
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
    pred = gmm.fit_predict(input_data)
    ari = adjusted_rand_score(truth, pred)
    gmm_scores.append(ari)

    pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
    ari = adjusted_rand_score(truth, pred)
    hdbscan_scores.append(ari)
print(f" GMM ari = {max(gmm_scores)}, {np.argmax(gmm_scores)}")
print(f" HDBSCAN ari = {max(hdbscan_scores)}, {np.argmax(hdbscan_scores)}")

 GMM ari = 0.3214204613388755, 25
 HDBSCAN ari = 0.12387058928552755, 46


In [17]:
input_data = data
gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
pred = gmm.fit_predict(input_data)
ari = adjusted_rand_score(truth, pred)
print(f"GMM ari = {ari}")


pred = KMeans(n_clusters= n_clusters).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"Kmeans ari = {ari}")

GMM ari = -0.001600206262677153
Kmeans ari = -0.004262637679583187


In [18]:
input_data = data
pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN ari {ari}")

HDBSCAN ari 0.07843553290200272


In [19]:
# Predict on PCA
pca = PCA(2)
pca_data = pca.fit_transform(data)
input_data = pca_data
gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
pred = gmm.fit_predict(input_data)
ari = adjusted_rand_score(truth, pred)
print(f"GMM ari = {ari}")

pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN ari = {ari}")

GMM ari = 0.04654132015355863
HDBSCAN ari = 0.002398858462670687


# Other methods

In [10]:
from sklearn import mixture
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
import hdbscan

row = {}
clustering = AffinityPropagation(random_state=5).fit(data)
ari = adjusted_rand_score(truth, clustering.labels_)
print(f"Affinity {ari}")
row["AffinityPropagation"] = ari

clustering = SpectralClustering(n_clusters=n_clusters, random_state=0).fit(data)
ari = adjusted_rand_score(truth, clustering.labels_)
print(f"Spectral {ari}")
row["Spectral"] = ari

clustering = KMeans(n_clusters=n_clusters,random_state=5).fit(data)
ari = adjusted_rand_score(truth, clustering.labels_)
print(f"KMeans {ari}")
row["KMeans"] = ari

gmm = mixture.GaussianMixture(n_components=n_clusters,
              covariance_type="full", random_state=0)
pred = gmm.fit_predict(data[:, :8000])
ari = adjusted_rand_score(truth, pred)
print(f"GMM {ari}")
row["GMM"] = ari

pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN {ari}")
row["HDBSCAN"] = ari

pca = PCA(2)
pca_data = pca.fit_transform(data)

clustering = KMeans(n_clusters=n_clusters,random_state=5).fit(pca_data)
ari = adjusted_rand_score(truth, clustering.labels_)
print(f"PCA KMeans {ari}")
row["PCA_KMeans"] = ari

gmm = mixture.GaussianMixture(n_components=n_clusters,
              covariance_type="full", random_state=0)
pred = gmm.fit_predict(pca_data)
ari = adjusted_rand_score(truth, pred)
print(f"PCA GMM {ari}")
row["PCA_GMM"] = ari

pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(pca_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"PCAHDBSCAN {ari}")
row["PCA_HDBSCAN"] = ari


Affinity 0.027159549508245145


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  labels, center_shift, n_threads)
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  labels, center_shift, n_threads)
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  labels, center_shift, n_threads)
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  labels, center_shift, n_threads)
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  labels, center_shift, n_threads)
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  ret = a @ b
  distances += XX
  labels, c

Spectral 0.0
KMeans -0.0061531912161480115
GMM -0.005742530639915901
HDBSCAN 0.07843553290200272
PCA KMeans -0.006151298001943363
PCA GMM 0.04654132015355863
PCAHDBSCAN 0.002398858462670687
