In [None]:
# !pip install scanpy==1.5.1
#https://bioconductor.org/packages/devel/bioc/vignettes/splatter/inst/doc/splatter.html

In [1]:
import random
import sys
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.cluster import adjusted_rand_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scripts.data_generator as data_generator
import scripts.feature_ranking as feature_ranking
import scripts.features_2d as features_2d
import scripts.ga as ga
import scripts.preprocess as preprocess
import scripts.ga_evaluation as ga_evaluation
import scripts.bio_analysis as bio_analysis
from sklearn import preprocessing
import tensorflow as tf
from IPython import get_ipython
from keras.backend.tensorflow_backend import set_session
from tqdm import tqdm
from collections import Counter
import h5py
# TensorFlow 1.x style session setup: build a session config, open a session
# with it and register that session with the Keras backend.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # to log device placement (on which device the operation ran)
sess = tf.Session(config=config)
set_session(sess)  # set this TensorFlow session as the default

# Switch matplotlib to interactive mode.
plt.ion()
plt.show()
# NOTE(review): this runs after the `scripts.*` imports earlier in this cell,
# so it cannot help resolve them on a fresh kernel -- confirm it is still needed.
sys.path.append("..")

# Seed Python's and NumPy's global RNGs with one shared value.
random_state=0
random.seed( random_state )
np.random.seed(random_state)
# Initial scoring-method name; the GA cells further down reassign it.
method = "adapted_ratkowsky_lance"
from warnings import filterwarnings
# Suppress every Python warning for the rest of the session.
filterwarnings('ignore')

import h5py
import scanpy.api as sc
import scipy as sp
# from preprocess import read_dataset, normalize

%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5



In [2]:

def read_dataset(adata, transpose=False, test_split=False, copy=False):
    """Validate a raw-count dataset and tag every cell with a train/test split.

    Parameters
    ----------
    adata : sc.AnnData or str
        An AnnData object, or a path that is handed to ``sc.read``.
    transpose : bool, default False
        Transpose the dataset after the count checks.
    test_split : bool, default False
        When True, 10% of the cells (fixed seed 42) are labelled ``'test'``
        and the rest ``'train'``; when False every cell is ``'train'``.
    copy : bool, default False
        When True and ``adata`` is an AnnData object, work on a copy.

    Returns
    -------
    The AnnData object with a categorical ``obs['DCA_split']`` column.

    Raises
    ------
    NotImplementedError
        If ``adata`` is neither an AnnData object nor a string.
    AssertionError
        If the data does not look like unnormalized integer counts.
    """
    if isinstance(adata, sc.AnnData):
        if copy:
            adata = adata.copy()
    elif isinstance(adata, str):
        adata = sc.read(adata)
    else:
        raise NotImplementedError

    norm_error = 'Make sure that the dataset (adata.X) contains unnormalized count data.'
    # NOTE(review): the key checked here is 'n_count', while normalize() below
    # reads `adata.obs.n_counts` -- confirm which spelling is intended.
    assert 'n_count' not in adata.obs, norm_error

    if adata.X.size < 50e6: # check if adata.X is integer only if array is small
        if sp.sparse.issparse(adata.X):
            assert (adata.X.astype(int) != adata.X).nnz == 0, norm_error
        else:
            assert np.all(adata.X.astype(int) == adata.X), norm_error

    if transpose: adata = adata.transpose()

    if test_split:
        # Imported here because the notebook never imports it at the top;
        # without this the test_split=True path fails with a NameError.
        from sklearn.model_selection import train_test_split

        train_idx, test_idx = train_test_split(np.arange(adata.n_obs), test_size=0.1, random_state=42)
        spl = pd.Series(['train'] * adata.n_obs)
        spl.iloc[test_idx] = 'test'
        adata.obs['DCA_split'] = spl.values
    else:
        adata.obs['DCA_split'] = 'train'

    adata.obs['DCA_split'] = adata.obs['DCA_split'].astype('category')
    print('### Autoencoder: Successfully preprocessed {} genes and {} cells.'.format(adata.n_vars, adata.n_obs))

    return adata


def normalize(adata, filter_min_counts=True, size_factors=True, normalize_input=True, logtrans_input=True):
    """Run the count-normalisation pipeline on ``adata`` in place and return it.

    Each stage is controlled by its flag and the stages run in this order:

    1. ``filter_min_counts`` -- ``sc.pp.filter_genes`` then
       ``sc.pp.filter_cells``, both with ``min_counts=1``.
    2. ``adata.raw`` is set to a copy of ``adata`` when any of the three
       transform flags is on, otherwise to ``adata`` itself.
    3. ``size_factors`` -- ``sc.pp.normalize_per_cell``; ``obs['size_factors']``
       becomes ``n_counts / median(n_counts)`` (or ``1.0`` when disabled).
    4. ``logtrans_input`` -- ``sc.pp.log1p``.
    5. ``normalize_input`` -- ``sc.pp.scale``.
    """
    if filter_min_counts:
        for apply_filter in (sc.pp.filter_genes, sc.pp.filter_cells):
            apply_filter(adata, min_counts=1)

    # Snapshot the data before any transform touches it.
    any_transform = size_factors or normalize_input or logtrans_input
    adata.raw = adata.copy() if any_transform else adata

    if not size_factors:
        adata.obs['size_factors'] = 1.0
    else:
        sc.pp.normalize_per_cell(adata)
        per_cell_counts = adata.obs.n_counts
        adata.obs['size_factors'] = per_cell_counts / np.median(per_cell_counts)

    if logtrans_input:
        sc.pp.log1p(adata)

    if normalize_input:
        sc.pp.scale(adata)

    return adata

In [3]:
# Path stems of the two count datasets combined in this notebook; the ".h5"
# extension is appended where the files are opened.
filename1 = "data/scRNAseq/1/dt3dropout_0"
filename2 = "data/scRNAseq/2/dt3dropout_0"

In [4]:
# ---- Dataset 1 ---------------------------------------------------------
# The context manager closes the HDF5 handle once both arrays are in memory.
with h5py.File(f"{filename1}.h5", "r") as data_mat:
    x = np.array(data_mat['X'])
    y = np.array(data_mat['Y'])

# preprocessing scRNA-seq read counts matrix
adata = sc.AnnData(x)
adata.obs['Group'] = y

adata = read_dataset(adata,
                 transpose=False,
                 test_split=False,
                 copy=True)

adata = normalize(adata,
                  size_factors=True,
                  normalize_input=True,
                  logtrans_input=True)

# Rows of dataset 1 are shuffled; dataset 2 below keeps its original order.
idx = np.arange(adata.X.shape[0])
np.random.shuffle(idx)

data1 = adata.X[idx]
# Labels are read back from the AnnData object rather than from `y`:
# normalize() may filter cells, and obs stays aligned with the rows of X.
truth1 = adata.obs['Group'].values[idx]


# ---- Dataset 2 ---------------------------------------------------------
with h5py.File(f"{filename2}.h5", "r") as data_mat:
    x = np.array(data_mat['X'])
    y = np.array(data_mat['Y'])

# preprocessing scRNA-seq read counts matrix
adata = sc.AnnData(x)
adata.obs['Group'] = y

adata = read_dataset(adata,
                 transpose=False,
                 test_split=False,
                 copy=True)

adata = normalize(adata,
                  size_factors=True,
                  normalize_input=True,
                  logtrans_input=True)


data2 = adata.X
truth2 = adata.obs['Group'].values
n_clusters = len(np.unique(truth1))

# The two blocks are stacked column-wise later on, so they must describe the
# same number of cells.
assert data1.shape[0] == data2.shape[0], "datasets 1 and 2 have different numbers of cells"

Counter(truth1), data1.shape, Counter(truth2), data2.shape

### Autoencoder: Successfully preprocessed 300 genes and 400 cells.
### Autoencoder: Successfully preprocessed 300 genes and 400 cells.


(Counter({1.0: 141, 0.0: 139, 2.0: 120}),
 (400, 300),
 Counter({2.0: 149, 0.0: 129, 1.0: 122}),
 (400, 300))

In [5]:
# 1000 pure-noise features drawn from U(0, 1). The row count is taken from
# data1 (instead of a hard-coded 400) so the block stays stackable with the
# real data if the input datasets change size.
uniform_data = np.random.uniform(size=(data1.shape[0], 1000))

In [6]:
# Join the three feature blocks side by side (dataset 1, dataset 2, noise);
# the reference labels are those of dataset 1.
data = np.concatenate((data1, data2, uniform_data), axis=1)
truth = truth1
data.shape

(400, 1600)

In [7]:
# Unsupervised per-feature ranking of the combined matrix (no labels are
# passed). The result is reused as `meta_features` by the 2D exploration and
# GA cells below.
# NOTE(review): the exact meaning of nb_bins / rank_threshold /
# redundant_threshold lives in scripts.feature_ranking -- confirm before tuning.
meta_features = feature_ranking.rank_features(data,
                                              nb_bins=20,
                                              rank_threshold=85,
                                              z_file=None,
                                              metric='euclidean',
                                              redundant_threshold=1)

*** Computing 1D feature ranking ...
Dispersion tests took 0.1 sec
Entropy computation 0.79 sec
KNN computation 0.08 sec
Sorting and thresholds 0.03 sec
Performing hierarchical clustering...
Hierarchical clustering 0.37 sec
Handle redundant features 0.01 sec
Returning 0 redundant features and  39 important features


In [8]:
# Explore pairs of features with the model stored at `model_file`.
# `gmm_arl_population` is later filtered on its "pred" column to seed the GA
# archive; `n` is printed next to its shape.
# NOTE(review): theta / add_close_population / exploration_factor are defined
# in scripts.features_2d -- confirm their semantics there.
model_file = "models/gmm_arl.h5"
gmm_arl_population, n = features_2d.run(data,
                                n_clusters,
                                meta_features,
                                model_file=model_file,
                                theta=0.1,
                                add_close_population=True,
                                exploration_factor = 3)
print(gmm_arl_population.shape, n)

*** Exploring 2D feature space with NN ...
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


trimming 5200 to 800
handle_close_important (800, 3), total 5219, 0.1854020059108734
relevant_features 1211 => computing 7219 
trimming 8070 to 0
handle_important_features (800, 3),  total 8203, 0.1854020059108734
irrelevant_features 389 => computing 1144
trimming 2028 to 800
handle_not_important_features (1600, 3), total 2042, 0.1854020059108734
handle_all_features 1600 => computing 4788
trimming 5555 to 1600
handle_all_features (3200, 3),  total 5773, 0.1854020059108734
Returning (3200, 3), explored a total of 21237 feature pairs
(3200, 4) 21237


In [9]:
# Collects the solutions of each GA run below, keyed by a label built from
# the clustering algorithm and scoring method.
globalResults = {}

In [10]:
# ---- GA run: GMM clustering scored with "adapted_ratkowsky_lance" ----

# What is optimised and how candidates are clustered.
method, clustering = "adapted_ratkowsky_lance", "gmm"
threshold = 0.09         # 2D-archive rows must have "pred" above this
score_tolerance = 0.009

# Search schedule.
round_size = 4
epochs = round_size * 10
debug = False
ignore_redundant = True
pca = False

# Each sampling source uses the same weight for its "ga" and "max" entries.
sampling = {
    source: {"ga": weight, "max": weight}
    for source, weight in (
        ("ARCHIVE2D", 0.4),
        ("CLOSE", 0.3),
        ("IMP1D", 0.2),
        ("RANDOM", 0.1),
    )
}

params = ga.ga_parameters(
    n_clusters, data.shape[1], truth, meta_features,
    method=method,
    truth_methods=['ari'],
    # First 600 rows of the 2D population that pass the "pred" threshold.
    archive_2d=gmm_arl_population.loc[gmm_arl_population["pred"] > threshold].head(600),
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling=sampling,
    ignore_redundant=ignore_redundant,
    allow_subspace_overlap=False,
    improvement_per_mutation_report=True,
    score_tolerance=score_tolerance,
    clustering=clustering,
    total_maximisation_exploration=300,
    max_subspace_size=80,
    maximisation_size=100,
    min_cluster_size=70,
    pca=pca,
)
print(params["sampling_actions"], params["maximisation_sizes"], params["sampling_prob"])

solutions, archive = ga.run(data, params)
display(solutions)
globalResults[f"{clustering}_{method}_{pca}"] = solutions

  0%|          | 0/41 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 1600, orig size 1600, nb imp : 39
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [40 30 20 10] [0.4, 0.3, 0.2, 0.1]
Selecting (50, 4) from archive


 10%|▉         | 4/41 [00:18<02:55,  4.75s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.221771,0.0,"[316, 378, 418, 461, 503, 530, 546, 555, 564, ...","[2, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, ...",11.0,"Counter({2: 149, 0: 129, 1: 122})"


Selecting (26, 4) from archive


 20%|█▉        | 8/41 [00:46<03:04,  5.61s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.221771,0.0,"[316, 378, 418, 461, 503, 530, 546, 555, 564, ...","[2, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, ...",11.0,"Counter({2: 149, 0: 129, 1: 122})"
1,0.204868,1.0,"[46, 63, 65, 88, 90, 91, 94, 103, 146, 198, 23...","[1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, ...",13.0,"Counter({1: 141, 0: 139, 2: 120})"


Selecting (47, 4) from archive


 29%|██▉       | 12/41 [01:14<02:55,  6.04s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.221771,0.0,"[316, 378, 418, 461, 503, 530, 546, 555, 564, ...","[2, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, ...",11.0,"Counter({2: 149, 0: 129, 1: 122})"
1,0.204868,1.0,"[46, 63, 65, 88, 90, 91, 94, 103, 146, 198, 23...","[1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, ...",13.0,"Counter({1: 141, 0: 139, 2: 120})"
2,0.157856,0.0,"[306, 409, 447, 489, 497, 501, 553, 567]","[1, 1, 0, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1, 1, ...",8.0,"Counter({1: 147, 2: 133, 0: 120})"


Selecting (14, 4) from archive


 39%|███▉      | 16/41 [01:41<02:27,  5.92s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.221771,0.0,"[316, 378, 418, 461, 503, 530, 546, 555, 564, ...","[2, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, ...",11.0,"Counter({2: 149, 0: 129, 1: 122})"
1,0.204868,1.0,"[46, 63, 65, 88, 90, 91, 94, 103, 146, 198, 23...","[1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, ...",13.0,"Counter({1: 141, 0: 139, 2: 120})"
2,0.157856,0.0,"[306, 409, 447, 489, 497, 501, 553, 567]","[1, 1, 0, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1, 1, ...",8.0,"Counter({1: 147, 2: 133, 0: 120})"
3,0.152133,0.0,"[265, 585]","[0, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 1, 1, 2, ...",2.0,"Counter({1: 141, 0: 139, 2: 120})"


Selecting (3, 4) from archive


 49%|████▉     | 20/41 [02:06<01:58,  5.66s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.221771,0.0,"[316, 378, 418, 461, 503, 530, 546, 555, 564, ...","[2, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, ...",11.0,"Counter({2: 149, 0: 129, 1: 122})"
1,0.204868,1.0,"[46, 63, 65, 88, 90, 91, 94, 103, 146, 198, 23...","[1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, ...",13.0,"Counter({1: 141, 0: 139, 2: 120})"
2,0.157856,0.0,"[306, 409, 447, 489, 497, 501, 553, 567]","[1, 1, 0, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1, 1, ...",8.0,"Counter({1: 147, 2: 133, 0: 120})"
3,0.152133,0.0,"[265, 585]","[0, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 1, 1, 2, ...",2.0,"Counter({1: 141, 0: 139, 2: 120})"
4,0.15017,-0.0,"[205, 384]","[1, 2, 0, 1, 1, 2, 0, 2, 0, 0, 2, 1, 0, 0, 2, ...",2.0,"Counter({2: 168, 0: 141, 1: 91})"


Selecting (3, 4) from archive


 59%|█████▊    | 24/41 [02:30<01:29,  5.28s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.221771,0.0,"[316, 378, 418, 461, 503, 530, 546, 555, 564, ...","[2, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, ...",11.0,"Counter({2: 149, 0: 129, 1: 122})"
1,0.204868,1.0,"[46, 63, 65, 88, 90, 91, 94, 103, 146, 198, 23...","[1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, ...",13.0,"Counter({1: 141, 0: 139, 2: 120})"
2,0.157856,0.0,"[306, 409, 447, 489, 497, 501, 553, 567]","[1, 1, 0, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1, 1, ...",8.0,"Counter({1: 147, 2: 133, 0: 120})"
3,0.152133,0.0,"[265, 585]","[0, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 1, 1, 2, ...",2.0,"Counter({1: 141, 0: 139, 2: 120})"
4,0.15017,-0.0,"[205, 384]","[1, 2, 0, 1, 1, 2, 0, 2, 0, 0, 2, 1, 0, 0, 2, ...",2.0,"Counter({2: 168, 0: 141, 1: 91})"
5,0.145689,-0.0,"[465, 568]","[0, 2, 1, 2, 0, 1, 2, 2, 1, 2, 1, 1, 1, 0, 0, ...",2.0,"Counter({2: 155, 1: 141, 0: 104})"


Selecting (1, 4) from archive


 68%|██████▊   | 28/41 [02:53<01:05,  5.03s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.221771,0.0,"[316, 378, 418, 461, 503, 530, 546, 555, 564, ...","[2, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, ...",11.0,"Counter({2: 149, 0: 129, 1: 122})"
1,0.204868,1.0,"[46, 63, 65, 88, 90, 91, 94, 103, 146, 198, 23...","[1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, ...",13.0,"Counter({1: 141, 0: 139, 2: 120})"
2,0.157856,0.0,"[306, 409, 447, 489, 497, 501, 553, 567]","[1, 1, 0, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1, 1, ...",8.0,"Counter({1: 147, 2: 133, 0: 120})"
3,0.152133,0.0,"[265, 585]","[0, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 1, 1, 2, ...",2.0,"Counter({1: 141, 0: 139, 2: 120})"
4,0.15017,-0.0,"[205, 384]","[1, 2, 0, 1, 1, 2, 0, 2, 0, 0, 2, 1, 0, 0, 2, ...",2.0,"Counter({2: 168, 0: 141, 1: 91})"
5,0.145689,-0.0,"[465, 568]","[0, 2, 1, 2, 0, 1, 2, 2, 1, 2, 1, 1, 1, 0, 0, ...",2.0,"Counter({2: 155, 1: 141, 0: 104})"
6,0.143988,-0.0,"[249, 722]","[1, 1, 2, 1, 2, 0, 0, 1, 2, 1, 0, 2, 1, 0, 1, ...",2.0,"Counter({2: 156, 1: 149, 0: 95})"


Selecting (2, 4) from archive


 78%|███████▊  | 32/41 [03:14<00:42,  4.75s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.221771,0.0,"[316, 378, 418, 461, 503, 530, 546, 555, 564, ...","[2, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, ...",11.0,"Counter({2: 149, 0: 129, 1: 122})"
1,0.204868,1.0,"[46, 63, 65, 88, 90, 91, 94, 103, 146, 198, 23...","[1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, ...",13.0,"Counter({1: 141, 0: 139, 2: 120})"
2,0.157856,0.0,"[306, 409, 447, 489, 497, 501, 553, 567]","[1, 1, 0, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1, 1, ...",8.0,"Counter({1: 147, 2: 133, 0: 120})"
3,0.152133,0.0,"[265, 585]","[0, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 1, 1, 2, ...",2.0,"Counter({1: 141, 0: 139, 2: 120})"
4,0.15017,-0.0,"[205, 384]","[1, 2, 0, 1, 1, 2, 0, 2, 0, 0, 2, 1, 0, 0, 2, ...",2.0,"Counter({2: 168, 0: 141, 1: 91})"
5,0.145689,-0.0,"[465, 568]","[0, 2, 1, 2, 0, 1, 2, 2, 1, 2, 1, 1, 1, 0, 0, ...",2.0,"Counter({2: 155, 1: 141, 0: 104})"
6,0.143988,-0.0,"[249, 722]","[1, 1, 2, 1, 2, 0, 0, 1, 2, 1, 0, 2, 1, 0, 1, ...",2.0,"Counter({2: 156, 1: 149, 0: 95})"
7,0.142802,-0.0,"[680, 766]","[0, 1, 1, 1, 2, 2, 1, 1, 2, 0, 2, 2, 0, 2, 0, ...",2.0,"Counter({1: 142, 0: 133, 2: 125})"


Selecting (3, 4) from archive


 88%|████████▊ | 36/41 [03:37<00:24,  4.95s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.221771,0.0,"[316, 378, 418, 461, 503, 530, 546, 555, 564, ...","[2, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, ...",11.0,"Counter({2: 149, 0: 129, 1: 122})"
1,0.204868,1.0,"[46, 63, 65, 88, 90, 91, 94, 103, 146, 198, 23...","[1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, ...",13.0,"Counter({1: 141, 0: 139, 2: 120})"
2,0.157856,0.0,"[306, 409, 447, 489, 497, 501, 553, 567]","[1, 1, 0, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1, 1, ...",8.0,"Counter({1: 147, 2: 133, 0: 120})"
3,0.152133,0.0,"[265, 585]","[0, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 1, 1, 2, ...",2.0,"Counter({1: 141, 0: 139, 2: 120})"
4,0.15017,-0.0,"[205, 384]","[1, 2, 0, 1, 1, 2, 0, 2, 0, 0, 2, 1, 0, 0, 2, ...",2.0,"Counter({2: 168, 0: 141, 1: 91})"
5,0.145689,-0.0,"[465, 568]","[0, 2, 1, 2, 0, 1, 2, 2, 1, 2, 1, 1, 1, 0, 0, ...",2.0,"Counter({2: 155, 1: 141, 0: 104})"
6,0.143988,-0.0,"[249, 722]","[1, 1, 2, 1, 2, 0, 0, 1, 2, 1, 0, 2, 1, 0, 1, ...",2.0,"Counter({2: 156, 1: 149, 0: 95})"
7,0.142802,-0.0,"[680, 766]","[0, 1, 1, 1, 2, 2, 1, 1, 2, 0, 2, 2, 0, 2, 0, ...",2.0,"Counter({1: 142, 0: 133, 2: 125})"
8,0.142156,-0.0,"[745, 1210]","[1, 0, 0, 2, 2, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, ...",2.0,"Counter({1: 150, 0: 127, 2: 123})"


Selecting (4, 4) from archive


 98%|█████████▊| 40/41 [03:59<00:04,  4.95s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.221771,0.0,"[316, 378, 418, 461, 503, 530, 546, 555, 564, ...","[2, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, ...",11.0,"Counter({2: 149, 0: 129, 1: 122})"
1,0.204868,1.0,"[46, 63, 65, 88, 90, 91, 94, 103, 146, 198, 23...","[1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, ...",13.0,"Counter({1: 141, 0: 139, 2: 120})"
2,0.157856,0.0,"[306, 409, 447, 489, 497, 501, 553, 567]","[1, 1, 0, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1, 1, ...",8.0,"Counter({1: 147, 2: 133, 0: 120})"
3,0.152133,0.0,"[265, 585]","[0, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 1, 1, 2, ...",2.0,"Counter({1: 141, 0: 139, 2: 120})"
4,0.15017,-0.0,"[205, 384]","[1, 2, 0, 1, 1, 2, 0, 2, 0, 0, 2, 1, 0, 0, 2, ...",2.0,"Counter({2: 168, 0: 141, 1: 91})"
5,0.145689,-0.0,"[465, 568]","[0, 2, 1, 2, 0, 1, 2, 2, 1, 2, 1, 1, 1, 0, 0, ...",2.0,"Counter({2: 155, 1: 141, 0: 104})"
6,0.143988,-0.0,"[249, 722]","[1, 1, 2, 1, 2, 0, 0, 1, 2, 1, 0, 2, 1, 0, 1, ...",2.0,"Counter({2: 156, 1: 149, 0: 95})"
7,0.142802,-0.0,"[680, 766]","[0, 1, 1, 1, 2, 2, 1, 1, 2, 0, 2, 2, 0, 2, 0, ...",2.0,"Counter({1: 142, 0: 133, 2: 125})"
8,0.142156,-0.0,"[745, 1210]","[1, 0, 0, 2, 2, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, ...",2.0,"Counter({1: 150, 0: 127, 2: 123})"
9,0.14102,-0.0,"[1028, 1291]","[1, 0, 0, 0, 2, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0, ...",2.0,"Counter({2: 150, 1: 143, 0: 107})"


Selecting (4, 4) from archive


100%|██████████| 41/41 [04:11<00:00,  6.12s/it]


Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.221771,0.0,"[316, 378, 418, 461, 503, 530, 546, 555, 564, ...","[2, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 0, 2, 2, ...",11.0,"Counter({2: 149, 0: 129, 1: 122})"
1,0.204868,1.0,"[46, 63, 65, 88, 90, 91, 94, 103, 146, 198, 23...","[1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, ...",13.0,"Counter({1: 141, 0: 139, 2: 120})"
2,0.157856,0.0,"[306, 409, 447, 489, 497, 501, 553, 567]","[1, 1, 0, 1, 2, 1, 0, 2, 0, 1, 0, 0, 0, 1, 1, ...",8.0,"Counter({1: 147, 2: 133, 0: 120})"
3,0.152133,0.0,"[265, 585]","[0, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 1, 1, 2, ...",2.0,"Counter({1: 141, 0: 139, 2: 120})"
4,0.15017,-0.0,"[205, 384]","[1, 2, 0, 1, 1, 2, 0, 2, 0, 0, 2, 1, 0, 0, 2, ...",2.0,"Counter({2: 168, 0: 141, 1: 91})"
5,0.145689,-0.0,"[465, 568]","[0, 2, 1, 2, 0, 1, 2, 2, 1, 2, 1, 1, 1, 0, 0, ...",2.0,"Counter({2: 155, 1: 141, 0: 104})"
6,0.143988,-0.0,"[249, 722]","[1, 1, 2, 1, 2, 0, 0, 1, 2, 1, 0, 2, 1, 0, 1, ...",2.0,"Counter({2: 156, 1: 149, 0: 95})"
7,0.142802,-0.0,"[680, 766]","[0, 1, 1, 1, 2, 2, 1, 1, 2, 0, 2, 2, 0, 2, 0, ...",2.0,"Counter({1: 142, 0: 133, 2: 125})"
8,0.142156,-0.0,"[745, 1210]","[1, 0, 0, 2, 2, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, ...",2.0,"Counter({1: 150, 0: 127, 2: 123})"
9,0.14102,-0.0,"[1028, 1291]","[1, 0, 0, 0, 2, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0, ...",2.0,"Counter({2: 150, 1: 143, 0: 107})"


In [11]:
# ---- GA run: HDBSCAN clustering scored with "adapted_silhouette" ----

# Scoring method / clustering algorithm for this run.
clustering = "hdbscan"
method = "adapted_silhouette"
score_tolerance = 0.009
threshold = 0.09  # 0.3 was an earlier value

# Search schedule: ten rounds of `round_size` epochs.
round_size = 3
epochs = round_size * 10
ignore_redundant = True
debug = False

# Sampling weights per source; "ga" and "max" share the same value.
sampling = {
    "ARCHIVE2D": dict(ga=0.3, max=0.3),
    "CLOSE": dict(ga=0.4, max=0.4),
    "IMP1D": dict(ga=0.2, max=0.2),
    "RANDOM": dict(ga=0.1, max=0.1),
}

# Seed archive: first 600 rows of the 2D population above the threshold.
params = ga.ga_parameters(
    n_clusters, data.shape[1], truth, meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=gmm_arl_population.loc[gmm_arl_population["pred"] > threshold].head(600),
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling=sampling,
    ignore_redundant=ignore_redundant,
    allow_subspace_overlap=False,
    improvement_per_mutation_report=True,
    score_tolerance=score_tolerance,
    clustering=clustering,
    total_maximisation_exploration=300,
    max_subspace_size=100,
    pca=False,
    hdbscan_min_cluster_size=20,
)
print(params["sampling_actions"], params["maximisation_sizes"], params["sampling_prob"])

solutions, archive = ga.run(data, params)
# solutions.to_pickle(f"data/{filename}_{clustering}_{method}.pkl")
display(solutions)
globalResults[f"{clustering}_{method}"] = solutions

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 1600, orig size 1600, nb imp : 39
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [30 40 20 10] [0.3, 0.4, 0.2, 0.1]
Selecting (50, 4) from archive


 10%|▉         | 3/31 [00:25<04:11,  8.98s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.5442,0.0,"[418, 430, 503, 530, 555, 564, 603, 615, 643, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",12.0,"Counter({1: 278, 0: 122})"


Selecting (23, 4) from archive


 19%|█▉        | 6/31 [01:05<04:17, 10.31s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.5442,0.0,"[418, 430, 503, 530, 555, 564, 603, 615, 643, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",12.0,"Counter({1: 278, 0: 122})"
1,0.486536,-0.0,"[316, 461, 518, 610, 735, 861]","[0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...",6.0,"Counter({0: 256, 1: 123, -1: 21})"


Selecting (11, 4) from archive


 29%|██▉       | 9/31 [01:36<03:33,  9.71s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.5442,0.0,"[418, 430, 503, 530, 555, 564, 603, 615, 643, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",12.0,"Counter({1: 278, 0: 122})"
1,0.486536,-0.0,"[316, 461, 518, 610, 735, 861]","[0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...",6.0,"Counter({0: 256, 1: 123, -1: 21})"
2,0.576358,0.0,"[2, 26, 239, 388, 473]","[0, 0, 0, 2, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",5.0,"Counter({0: 235, -1: 69, 2: 60, 1: 36})"


Selecting (19, 4) from archive


 39%|███▊      | 12/31 [02:13<03:11, 10.08s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.5442,0.0,"[418, 430, 503, 530, 555, 564, 603, 615, 643, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",12.0,"Counter({1: 278, 0: 122})"
1,0.486536,-0.0,"[316, 461, 518, 610, 735, 861]","[0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...",6.0,"Counter({0: 256, 1: 123, -1: 21})"
2,0.576358,0.0,"[2, 26, 239, 388, 473]","[0, 0, 0, 2, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",5.0,"Counter({0: 235, -1: 69, 2: 60, 1: 36})"
3,0.514481,0.59,"[88, 103, 198, 284, 602, 637, 650, 973, 1025]","[0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, ...",9.0,"Counter({0: 261, 1: 139})"


Selecting (12, 4) from archive


 48%|████▊     | 15/31 [02:47<02:41, 10.07s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.5442,0.0,"[418, 430, 503, 530, 555, 564, 603, 615, 643, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",12.0,"Counter({1: 278, 0: 122})"
1,0.486536,-0.0,"[316, 461, 518, 610, 735, 861]","[0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...",6.0,"Counter({0: 256, 1: 123, -1: 21})"
2,0.576358,0.0,"[2, 26, 239, 388, 473]","[0, 0, 0, 2, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",5.0,"Counter({0: 235, -1: 69, 2: 60, 1: 36})"
3,0.514481,0.59,"[88, 103, 198, 284, 602, 637, 650, 973, 1025]","[0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, ...",9.0,"Counter({0: 261, 1: 139})"
4,0.600035,0.0,"[45, 472]","[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, ...",2.0,"Counter({1: 325, 0: 51, -1: 24})"


 58%|█████▊    | 18/31 [03:21<02:10, 10.02s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.5442,0.0,"[418, 430, 503, 530, 555, 564, 603, 615, 643, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",12.0,"Counter({1: 278, 0: 122})"
1,0.486536,-0.0,"[316, 461, 518, 610, 735, 861]","[0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...",6.0,"Counter({0: 256, 1: 123, -1: 21})"
2,0.576358,0.0,"[2, 26, 239, 388, 473]","[0, 0, 0, 2, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",5.0,"Counter({0: 235, -1: 69, 2: 60, 1: 36})"
3,0.514481,0.59,"[88, 103, 198, 284, 602, 637, 650, 973, 1025]","[0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, ...",9.0,"Counter({0: 261, 1: 139})"
4,0.600035,0.0,"[45, 472]","[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, ...",2.0,"Counter({1: 325, 0: 51, -1: 24})"
5,0.507601,0.01,"[136, 585]","[1, 0, 1, 1, -1, 2, 1, 1, 2, 1, 1, 0, 1, 1, 2,...",2.0,"Counter({1: 216, 2: 87, 0: 69, -1: 28})"


Selecting (21, 4) from archive


 68%|██████▊   | 21/31 [03:52<01:36,  9.65s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.5442,0.0,"[418, 430, 503, 530, 555, 564, 603, 615, 643, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",12.0,"Counter({1: 278, 0: 122})"
1,0.486536,-0.0,"[316, 461, 518, 610, 735, 861]","[0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...",6.0,"Counter({0: 256, 1: 123, -1: 21})"
2,0.576358,0.0,"[2, 26, 239, 388, 473]","[0, 0, 0, 2, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",5.0,"Counter({0: 235, -1: 69, 2: 60, 1: 36})"
3,0.514481,0.59,"[88, 103, 198, 284, 602, 637, 650, 973, 1025]","[0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, ...",9.0,"Counter({0: 261, 1: 139})"
4,0.600035,0.0,"[45, 472]","[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, ...",2.0,"Counter({1: 325, 0: 51, -1: 24})"
5,0.507601,0.01,"[136, 585]","[1, 0, 1, 1, -1, 2, 1, 1, 2, 1, 1, 0, 1, 1, 2,...",2.0,"Counter({1: 216, 2: 87, 0: 69, -1: 28})"
6,0.515782,0.01,"[229, 676, 901, 1448]","[1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4.0,"Counter({1: 317, 0: 79, -1: 4})"


Selecting (4, 4) from archive


 77%|███████▋  | 24/31 [04:31<01:15, 10.85s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.5442,0.0,"[418, 430, 503, 530, 555, 564, 603, 615, 643, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",12.0,"Counter({1: 278, 0: 122})"
1,0.486536,-0.0,"[316, 461, 518, 610, 735, 861]","[0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...",6.0,"Counter({0: 256, 1: 123, -1: 21})"
2,0.576358,0.0,"[2, 26, 239, 388, 473]","[0, 0, 0, 2, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",5.0,"Counter({0: 235, -1: 69, 2: 60, 1: 36})"
3,0.514481,0.59,"[88, 103, 198, 284, 602, 637, 650, 973, 1025]","[0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, ...",9.0,"Counter({0: 261, 1: 139})"
4,0.600035,0.0,"[45, 472]","[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, ...",2.0,"Counter({1: 325, 0: 51, -1: 24})"
5,0.507601,0.01,"[136, 585]","[1, 0, 1, 1, -1, 2, 1, 1, 2, 1, 1, 0, 1, 1, 2,...",2.0,"Counter({1: 216, 2: 87, 0: 69, -1: 28})"
6,0.515782,0.01,"[229, 676, 901, 1448]","[1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4.0,"Counter({1: 317, 0: 79, -1: 4})"
7,0.557753,-0.0,"[205, 428]","[1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, -1, 1, 1, 0,...",2.0,"Counter({0: 241, 1: 146, -1: 13})"


Selecting (26, 4) from archive


 87%|████████▋ | 27/31 [05:07<00:43, 10.99s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.5442,0.0,"[418, 430, 503, 530, 555, 564, 603, 615, 643, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",12.0,"Counter({1: 278, 0: 122})"
1,0.486536,-0.0,"[316, 461, 518, 610, 735, 861]","[0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...",6.0,"Counter({0: 256, 1: 123, -1: 21})"
2,0.576358,0.0,"[2, 26, 239, 388, 473]","[0, 0, 0, 2, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",5.0,"Counter({0: 235, -1: 69, 2: 60, 1: 36})"
3,0.514481,0.59,"[88, 103, 198, 284, 602, 637, 650, 973, 1025]","[0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, ...",9.0,"Counter({0: 261, 1: 139})"
4,0.600035,0.0,"[45, 472]","[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, ...",2.0,"Counter({1: 325, 0: 51, -1: 24})"
5,0.507601,0.01,"[136, 585]","[1, 0, 1, 1, -1, 2, 1, 1, 2, 1, 1, 0, 1, 1, 2,...",2.0,"Counter({1: 216, 2: 87, 0: 69, -1: 28})"
6,0.515782,0.01,"[229, 676, 901, 1448]","[1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4.0,"Counter({1: 317, 0: 79, -1: 4})"
7,0.557753,-0.0,"[205, 428]","[1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, -1, 1, 1, 0,...",2.0,"Counter({0: 241, 1: 146, -1: 13})"
8,0.58006,0.0,"[20, 612, 618, 622, 1224]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5.0,"Counter({1: 362, 0: 28, -1: 10})"


Selecting (1, 4) from archive


 97%|█████████▋| 30/31 [05:54<00:12, 12.56s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.5442,0.0,"[418, 430, 503, 530, 555, 564, 603, 615, 643, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",12.0,"Counter({1: 278, 0: 122})"
1,0.486536,-0.0,"[316, 461, 518, 610, 735, 861]","[0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...",6.0,"Counter({0: 256, 1: 123, -1: 21})"
2,0.576358,0.0,"[2, 26, 239, 388, 473]","[0, 0, 0, 2, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",5.0,"Counter({0: 235, -1: 69, 2: 60, 1: 36})"
3,0.514481,0.59,"[88, 103, 198, 284, 602, 637, 650, 973, 1025]","[0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, ...",9.0,"Counter({0: 261, 1: 139})"
4,0.600035,0.0,"[45, 472]","[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, ...",2.0,"Counter({1: 325, 0: 51, -1: 24})"
5,0.507601,0.01,"[136, 585]","[1, 0, 1, 1, -1, 2, 1, 1, 2, 1, 1, 0, 1, 1, 2,...",2.0,"Counter({1: 216, 2: 87, 0: 69, -1: 28})"
6,0.515782,0.01,"[229, 676, 901, 1448]","[1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4.0,"Counter({1: 317, 0: 79, -1: 4})"
7,0.557753,-0.0,"[205, 428]","[1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, -1, 1, 1, 0,...",2.0,"Counter({0: 241, 1: 146, -1: 13})"
8,0.58006,0.0,"[20, 612, 618, 622, 1224]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5.0,"Counter({1: 362, 0: 28, -1: 10})"
9,0.571852,0.0,"[257, 374]","[1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 270, 2: 73, 0: 42, -1: 15})"


100%|██████████| 31/31 [06:23<00:00, 12.37s/it]


Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.5442,0.0,"[418, 430, 503, 530, 555, 564, 603, 615, 643, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...",12.0,"Counter({1: 278, 0: 122})"
1,0.486536,-0.0,"[316, 461, 518, 610, 735, 861]","[0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, ...",6.0,"Counter({0: 256, 1: 123, -1: 21})"
2,0.576358,0.0,"[2, 26, 239, 388, 473]","[0, 0, 0, 2, 0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",5.0,"Counter({0: 235, -1: 69, 2: 60, 1: 36})"
3,0.514481,0.59,"[88, 103, 198, 284, 602, 637, 650, 973, 1025]","[0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, ...",9.0,"Counter({0: 261, 1: 139})"
4,0.600035,0.0,"[45, 472]","[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, ...",2.0,"Counter({1: 325, 0: 51, -1: 24})"
5,0.507601,0.01,"[136, 585]","[1, 0, 1, 1, -1, 2, 1, 1, 2, 1, 1, 0, 1, 1, 2,...",2.0,"Counter({1: 216, 2: 87, 0: 69, -1: 28})"
6,0.515782,0.01,"[229, 676, 901, 1448]","[1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4.0,"Counter({1: 317, 0: 79, -1: 4})"
7,0.557753,-0.0,"[205, 428]","[1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, -1, 1, 1, 0,...",2.0,"Counter({0: 241, 1: 146, -1: 13})"
8,0.58006,0.0,"[20, 612, 618, 622, 1224]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5.0,"Counter({1: 362, 0: 28, -1: 10})"
9,0.571852,0.0,"[257, 374]","[1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 270, 2: 73, 0: 42, -1: 15})"


# Supervised analysis

In [12]:
from sklearn import mixture
import hdbscan

In [15]:
# Rank all columns using the ground-truth labels, then reorder the columns of
# `data` by that ranking.
# NOTE(review): `data` is overwritten in place, so re-running this cell
# re-ranks and re-permutes the already permuted matrix, and every later cell
# sees the reordered columns.
ranked_features = feature_ranking.supervised_feature_ranking(data, truth, 
                        nbTopFeatures = data.shape[1])
data = data[:, ranked_features]
# NOTE(review): `imp_f` is not referenced by any cell visible in this notebook.
imp_f = np.arange(50)

In [16]:
# ARI of GMM and HDBSCAN on the first i columns of `data`, for i = 2..49.
gmm_scores, hdbscan_scores = [], []
for i in range(2, 50):
    input_data = data[:, :i]

    gmm = mixture.GaussianMixture(
        n_components=n_clusters, covariance_type="full", random_state=0
    )
    pred = gmm.fit_predict(input_data)
    gmm_scores.append(adjusted_rand_score(truth, pred))

    pred = hdbscan.HDBSCAN(min_cluster_size=2).fit(input_data).labels_
    hdbscan_scores.append(adjusted_rand_score(truth, pred))

# The second printed value is the position in the score list, i.e. the number
# of columns used minus 2.
print(f" GMM ari = {max(gmm_scores)}, {np.argmax(gmm_scores)}")
print(f" HDBSCAN ari = {max(hdbscan_scores)}, {np.argmax(hdbscan_scores)}")

 GMM ari = 1.0, 2
 HDBSCAN ari = 0.9921412702468273, 12


In [14]:
from sklearn.feature_selection import chi2,  mutual_info_classif, SelectKBest
# Keep the 50 columns chosen by mutual information with the labels, then
# score GMM and HDBSCAN on growing prefixes (2..49 columns) of that selection.
selector = SelectKBest(mutual_info_classif, k=50)
sel = selector.fit_transform(data, truth)

gmm_scores, hdbscan_scores = [], []
for i in range(2, 50):
    input_data = sel[:, :i]

    gmm = mixture.GaussianMixture(
        n_components=n_clusters, covariance_type="full", random_state=0
    )
    pred = gmm.fit_predict(input_data)
    gmm_scores.append(adjusted_rand_score(truth, pred))

    pred = hdbscan.HDBSCAN(min_cluster_size=2).fit(input_data).labels_
    hdbscan_scores.append(adjusted_rand_score(truth, pred))

# Second value printed: index of the best score in the list (columns used - 2).
print(f" GMM ari = {max(gmm_scores)}, {np.argmax(gmm_scores)}")
print(f" HDBSCAN ari = {max(hdbscan_scores)}, {np.argmax(hdbscan_scores)}")

 GMM ari = 1.0, 2
 HDBSCAN ari = 0.9921412702468273, 12


In [None]:
# Baselines: cluster on every column of `data` and report the ARI against
# `truth` for GMM, HDBSCAN and KMeans.
input_data = data
gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
pred = gmm.fit_predict(input_data)
ari = adjusted_rand_score(truth, pred)
print(f"GMM ari = {ari}")

pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN ari = {ari}")

# Seeded like the GMM above: without random_state the KMeans initialisation
# depends on whatever the global NumPy RNG state happens to be, so the
# reported ARI would change between runs.
pred = KMeans(n_clusters=n_clusters, random_state=random_state).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"Kmeans ari = {ari}")

In [None]:
# Predict on PCA: project `data` onto its first two principal components and
# report the ARI of GMM and HDBSCAN on that projection.
# random_state is passed because PCA may pick its randomized SVD solver for an
# input of this size, which would make the projection vary between runs.
# NOTE(review): this rebinds `pca`, which the first GA cell uses as a boolean
# flag; that cell resets it to False before use.
pca = PCA(2, random_state=random_state)
pca_data = pca.fit_transform(data)
input_data = pca_data
gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
pred = gmm.fit_predict(input_data)
ari = adjusted_rand_score(truth, pred)
print(f"GMM ari = {ari}")

pred = hdbscan.HDBSCAN(min_cluster_size =10).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN ari = {ari}")

In [None]:
# import scripts.ga_evaluation as ga_evaluation

# r1 = ga_evaluation.random_sampling(data, truth, n_clusters, algo = "gmm")
# r2 = ga_evaluation.random_sampling(data, truth, n_clusters, algo = "hdbscan")
# print(f"Random sampling GMM {r1}, HDBSCAN {r2}")