# Summary

This notebook allows to reproduce the method results on the KIRP dataset.  
We have analyzed the dataset with both GMM and HDBSCAN algorithms.

In [1]:
import sys
sys.path.append("..")

#GPU configuration
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # to log device placement (on which device the operation ran)
sess = tf.Session(config=config)
set_session(sess)  # set this TensorFlow session as the default

import random
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.cluster import adjusted_rand_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scripts.data_generator as data_generator
import scripts.feature_ranking as feature_ranking
import scripts.features_2d as features_2d
import scripts.ga as ga
import scripts.preprocess as preprocess
import scripts.ga_evaluation as ga_evaluation
import scripts.bio_analysis as bio_analysis
import tensorflow as tf
from IPython import get_ipython
from tqdm import tqdm
from collections import Counter

plt.ion()
plt.show()

random_state=1
random.seed( random_state )
np.random.seed(random_state)

%load_ext autoreload
%autoreload 2

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5



Using TensorFlow backend.


# Preprocessing

In [2]:
# truth_column = "tumor_type"
# truth_values = ['type 1', 'type 2']
# filename = "KIRP"

# df = pd.read_csv("../data/rna_data/KIRP.txt", sep = "\t", low_memory=False)
# meta = pd.read_csv("../data/rna_data/KIRP_All_CDEs.txt", sep = "\t", low_memory=False)

# preprocess.preprocess_rna(df,
#                    meta,
#                    truth_column,
#                    truth_values,
#                    filename,
#                    metric='correlation',#'euclidean',
#                    normalize=True)

# Load preprocessed data

## Start here if preprocessing files have been generated

In [3]:
filename = "KIRP"

data = pd.read_pickle(f"../data/rna_data/{filename}.pkl")
z_file =f"../data/rna_data/{filename}_Z_correlation.npy"
additional_df = pd.read_pickle(f"../data/rna_data/{filename}_additional.pkl")

truth = data["y"].values
data = data.drop("y", axis = 1).values
n_clusters = len(np.unique(truth))
Counter(truth), data.shape

(Counter({0: 77, 1: 86}), (163, 17938))

# Subspace clustering

In [4]:
meta_features = feature_ranking.rank_features(data,
                                              nb_bins=20,
                                              rank_threshold=85,
                                              z_file=z_file,
                                              metric='correlation',
                                              redundant_threshold=0.5)


*** Computing 1D feature ranking ...
Dispersion tests took 0.98 sec
Entropy computation 3.47 sec
KNN computation 35.12 sec
Sorting and thresholds 0.1 sec
Loading clustering from file
Hierarchical clustering 0.05 sec
Handle redundant features 2.92 sec
Returning 6994 redundant features and  1149 important features


In [6]:
model_file = "../models/gmm_arl.h5"
gmm_arl_population, n = features_2d.run(data,
                                n_clusters,
                                meta_features,
                                model_file=model_file,
                                theta=0.1,
                                add_close_population=False,
                                exploration_factor = 5)
print(gmm_arl_population.shape, n)

*** Exploring 2D feature space with NN ...
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


trimming 14781 to 5472
handle_close_important (5472, 3), total 16287, 0.29897671937942505
relevant_features 5644 => computing 56334 
trimming 36955 to 6994
handle_important_features (12466, 3),  total 56334, 0.29897671937942505
irrelevant_features 5300 => computing 26468
trimming 20893 to 5472
handle_not_important_features (17938, 3), total 26468, 0.29897671937942505
handle_all_features 10944 => computing 54688
trimming 38342 to 10000
handle_all_features (27938, 3),  total 54688, 0.29897671937942505
Returning (27938, 3), explored a total of 153777 feature pairs
(27938, 4) 153777


In [7]:
globalResults = {} # Save results for both runs

In [8]:
method = "adapted_ratkowsky_lance"
threshold=0.09
score_tolerance=0.009
clustering = "gmm"

round_size = 3
debug = False
ignore_redundant= True
epochs = 10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0.3,
        "max": 0.3 },
    "CLOSE": { 
        "ga": 0.4,
        "max": 0.4 },
    "IMP1D": { 
        "ga": 0.2,
        "max": 0.2 },
    "RANDOM": { 
        "ga": 0.1,
        "max": 0.1},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=gmm_arl_population[gmm_arl_population["pred"] > threshold].iloc[:7000],
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 400

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")
display(solutions)
globalResults[f"{clustering}_{method}"] = solutions

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 10944, orig size 17938, nb imp : 1149
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [30 40 20 10] [0.3, 0.4, 0.2, 0.1]
Selecting (50, 4) from archive


 10%|▉         | 3/31 [00:19<03:10,  6.79s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.278408,-0.0,"[4332, 11017]","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 126, 0: 37})"


Selecting (1, 4) from archive


 19%|█▉        | 6/31 [00:48<03:16,  7.84s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.278408,-0.0,"[4332, 11017]","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 126, 0: 37})"
1,0.278181,0.0,"[6784, 8892]","[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",2.0,"Counter({0: 125, 1: 38})"


Selecting (1, 4) from archive


 29%|██▉       | 9/31 [01:13<02:48,  7.64s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.278408,-0.0,"[4332, 11017]","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 126, 0: 37})"
1,0.278181,0.0,"[6784, 8892]","[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",2.0,"Counter({0: 125, 1: 38})"
2,0.268626,-0.0,"[181, 351, 555, 750, 812, 816, 1105, 1215, 124...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",65.0,"Counter({0: 143, 1: 20})"


Selecting (34, 4) from archive


 39%|███▊      | 12/31 [02:49<05:41, 17.97s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.278408,-0.0,"[4332, 11017]","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 126, 0: 37})"
1,0.278181,0.0,"[6784, 8892]","[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",2.0,"Counter({0: 125, 1: 38})"
2,0.268626,-0.0,"[181, 351, 555, 750, 812, 816, 1105, 1215, 124...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",65.0,"Counter({0: 143, 1: 20})"
3,0.310809,-0.01,"[4187, 4188, 4329, 4935, 8012, 12291, 13404, 1...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",11.0,"Counter({0: 116, 1: 47})"


Selecting (1, 4) from archive


 48%|████▊     | 15/31 [03:18<03:07, 11.72s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.278408,-0.0,"[4332, 11017]","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 126, 0: 37})"
1,0.278181,0.0,"[6784, 8892]","[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",2.0,"Counter({0: 125, 1: 38})"
2,0.268626,-0.0,"[181, 351, 555, 750, 812, 816, 1105, 1215, 124...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",65.0,"Counter({0: 143, 1: 20})"
3,0.310809,-0.01,"[4187, 4188, 4329, 4935, 8012, 12291, 13404, 1...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",11.0,"Counter({0: 116, 1: 47})"
4,0.271371,-0.0,"[51, 163, 308, 561, 1159, 1162, 1176, 1604, 16...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",151.0,"Counter({0: 150, 1: 13})"


Selecting (11, 4) from archive


 58%|█████▊    | 18/31 [06:05<06:24, 29.60s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.278408,-0.0,"[4332, 11017]","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 126, 0: 37})"
1,0.278181,0.0,"[6784, 8892]","[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",2.0,"Counter({0: 125, 1: 38})"
2,0.268626,-0.0,"[181, 351, 555, 750, 812, 816, 1105, 1215, 124...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",65.0,"Counter({0: 143, 1: 20})"
3,0.310809,-0.01,"[4187, 4188, 4329, 4935, 8012, 12291, 13404, 1...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",11.0,"Counter({0: 116, 1: 47})"
4,0.271371,-0.0,"[51, 163, 308, 561, 1159, 1162, 1176, 1604, 16...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",151.0,"Counter({0: 150, 1: 13})"
5,0.256949,0.01,"[7220, 7230, 7231]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3.0,"Counter({0: 150, 1: 13})"


Selecting (1, 4) from archive


 68%|██████▊   | 21/31 [06:24<02:19, 13.95s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.278408,-0.0,"[4332, 11017]","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 126, 0: 37})"
1,0.278181,0.0,"[6784, 8892]","[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",2.0,"Counter({0: 125, 1: 38})"
2,0.268626,-0.0,"[181, 351, 555, 750, 812, 816, 1105, 1215, 124...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",65.0,"Counter({0: 143, 1: 20})"
3,0.310809,-0.01,"[4187, 4188, 4329, 4935, 8012, 12291, 13404, 1...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",11.0,"Counter({0: 116, 1: 47})"
4,0.271371,-0.0,"[51, 163, 308, 561, 1159, 1162, 1176, 1604, 16...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",151.0,"Counter({0: 150, 1: 13})"
5,0.256949,0.01,"[7220, 7230, 7231]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3.0,"Counter({0: 150, 1: 13})"
6,0.234115,-0.01,"[594, 1469, 3627, 4375, 4943, 10703, 11155, 12...","[0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, ...",8.0,"Counter({0: 92, 1: 71})"


Selecting (2, 4) from archive


 77%|███████▋  | 24/31 [06:43<01:00,  8.67s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.278408,-0.0,"[4332, 11017]","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 126, 0: 37})"
1,0.278181,0.0,"[6784, 8892]","[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",2.0,"Counter({0: 125, 1: 38})"
2,0.268626,-0.0,"[181, 351, 555, 750, 812, 816, 1105, 1215, 124...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",65.0,"Counter({0: 143, 1: 20})"
3,0.310809,-0.01,"[4187, 4188, 4329, 4935, 8012, 12291, 13404, 1...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",11.0,"Counter({0: 116, 1: 47})"
4,0.271371,-0.0,"[51, 163, 308, 561, 1159, 1162, 1176, 1604, 16...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",151.0,"Counter({0: 150, 1: 13})"
5,0.256949,0.01,"[7220, 7230, 7231]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3.0,"Counter({0: 150, 1: 13})"
6,0.234115,-0.01,"[594, 1469, 3627, 4375, 4943, 10703, 11155, 12...","[0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, ...",8.0,"Counter({0: 92, 1: 71})"
7,0.237996,0.01,"[6286, 8434]","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",2.0,"Counter({0: 125, 1: 38})"


Selecting (1, 4) from archive


 87%|████████▋ | 27/31 [07:02<00:27,  6.76s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.278408,-0.0,"[4332, 11017]","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 126, 0: 37})"
1,0.278181,0.0,"[6784, 8892]","[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",2.0,"Counter({0: 125, 1: 38})"
2,0.268626,-0.0,"[181, 351, 555, 750, 812, 816, 1105, 1215, 124...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",65.0,"Counter({0: 143, 1: 20})"
3,0.310809,-0.01,"[4187, 4188, 4329, 4935, 8012, 12291, 13404, 1...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",11.0,"Counter({0: 116, 1: 47})"
4,0.271371,-0.0,"[51, 163, 308, 561, 1159, 1162, 1176, 1604, 16...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",151.0,"Counter({0: 150, 1: 13})"
5,0.256949,0.01,"[7220, 7230, 7231]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3.0,"Counter({0: 150, 1: 13})"
6,0.234115,-0.01,"[594, 1469, 3627, 4375, 4943, 10703, 11155, 12...","[0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, ...",8.0,"Counter({0: 92, 1: 71})"
7,0.237996,0.01,"[6286, 8434]","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",2.0,"Counter({0: 125, 1: 38})"
8,0.263167,0.09,"[461, 462, 466, 4625, 5126, 6138, 6267, 10770,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, ...",22.0,"Counter({0: 120, 1: 43})"


Selecting (4, 4) from archive


 97%|█████████▋| 30/31 [07:25<00:06,  6.85s/it]

Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.278408,-0.0,"[4332, 11017]","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 126, 0: 37})"
1,0.278181,0.0,"[6784, 8892]","[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",2.0,"Counter({0: 125, 1: 38})"
2,0.268626,-0.0,"[181, 351, 555, 750, 812, 816, 1105, 1215, 124...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",65.0,"Counter({0: 143, 1: 20})"
3,0.310809,-0.01,"[4187, 4188, 4329, 4935, 8012, 12291, 13404, 1...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",11.0,"Counter({0: 116, 1: 47})"
4,0.271371,-0.0,"[51, 163, 308, 561, 1159, 1162, 1176, 1604, 16...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",151.0,"Counter({0: 150, 1: 13})"
5,0.256949,0.01,"[7220, 7230, 7231]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3.0,"Counter({0: 150, 1: 13})"
6,0.234115,-0.01,"[594, 1469, 3627, 4375, 4943, 10703, 11155, 12...","[0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, ...",8.0,"Counter({0: 92, 1: 71})"
7,0.237996,0.01,"[6286, 8434]","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",2.0,"Counter({0: 125, 1: 38})"
8,0.263167,0.09,"[461, 462, 466, 4625, 5126, 6138, 6267, 10770,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, ...",22.0,"Counter({0: 120, 1: 43})"
9,0.22836,0.16,"[17, 91, 943, 1884, 1994, 2016, 2454, 2813, 28...","[1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...",83.0,"Counter({0: 103, 1: 60})"


Selecting (34, 4) from archive


100%|██████████| 31/31 [08:35<00:00, 16.62s/it]


Unnamed: 0,adapted_ratkowsky_lance,ari,features,partition,size,structure
0,0.278408,-0.0,"[4332, 11017]","[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...",2.0,"Counter({1: 126, 0: 37})"
1,0.278181,0.0,"[6784, 8892]","[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",2.0,"Counter({0: 125, 1: 38})"
2,0.268626,-0.0,"[181, 351, 555, 750, 812, 816, 1105, 1215, 124...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",65.0,"Counter({0: 143, 1: 20})"
3,0.310809,-0.01,"[4187, 4188, 4329, 4935, 8012, 12291, 13404, 1...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",11.0,"Counter({0: 116, 1: 47})"
4,0.271371,-0.0,"[51, 163, 308, 561, 1159, 1162, 1176, 1604, 16...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",151.0,"Counter({0: 150, 1: 13})"
5,0.256949,0.01,"[7220, 7230, 7231]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3.0,"Counter({0: 150, 1: 13})"
6,0.234115,-0.01,"[594, 1469, 3627, 4375, 4943, 10703, 11155, 12...","[0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, ...",8.0,"Counter({0: 92, 1: 71})"
7,0.237996,0.01,"[6286, 8434]","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",2.0,"Counter({0: 125, 1: 38})"
8,0.263167,0.09,"[461, 462, 466, 4625, 5126, 6138, 6267, 10770,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, ...",22.0,"Counter({0: 120, 1: 43})"
9,0.22836,0.16,"[17, 91, 943, 1884, 1994, 2016, 2454, 2813, 28...","[1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...",83.0,"Counter({0: 103, 1: 60})"


# Interpret results

In [9]:
additional_results, best_subspace_match, best_meta_subspace = bio_analysis.clinical_data_analysis(
    additional_df, solutions, n_clusters)

best_subspace_match

Found 2 values for vital_status
Clustering numeric values for days_to_death
Clustering numeric values for days_to_last_followup
No more than 1 class found for additional_studies
Clustering numeric values for age_at_initial_pathologic_diagnosis
No more than 1 class found for b_symptoms
No more than 1 class found for bcr
Found 3 values for bcr_canonical_reason-2
No more than 1 class found for bcr_canonical_reason-3
Found 4 values for bcr_canonical_reason
Found 144 values for bcr_followup_barcode, skipping
Found 144 values for bcr_followup_uuid, skipping
Found 2 values for bcr_patient_canonical_status
Found 4 values for clinical_m
Found 5 values for clinical_n
Found 5 values for clinical_stage
Found 12 values for clinical_t
No more than 1 class found for clinical_trail_drug_classification
No more than 1 class found for day_of_dcc_upload
Clustering numeric values for day_of_form_completion
Clustering numeric values for days_to_birth
No more than 1 class found for days_to_index
No more than

Unnamed: 0,subspace,ari,additional_data,n
0,0,0.26,erythrocyte_sedimentation_rate_result,13
1,1,0.26,erythrocyte_sedimentation_rate_result,13
2,2,0.34,bcr_canonical_reason,31
3,3,0.97,gender,163
4,4,0.36,bcr_patient_canonical_status,163
5,5,0.12,serum_calcium_result,95
6,6,0.07,days_to_death,21
7,6,0.07,bcr_canonical_reason-2,12
8,7,0.33,performance_status_scale_timing,47
9,8,0.23,followup_treatment_success,93


In [10]:
method = "adapted_silhouette"
threshold=0.1
score_tolerance=0.01
clustering = "hdbscan"

round_size = 3
debug = False
ignore_redundant= True
epochs = 10*round_size

sampling = {
    "ARCHIVE2D": { 
        "ga": 0.3,
        "max": 0.3 },
    "CLOSE": { 
        "ga": 0.4,
        "max": 0.4 },
    "IMP1D": { 
        "ga": 0.2,
        "max": 0.2 },
    "RANDOM": { 
        "ga": 0.1,
        "max": 0.1},
}
params = ga.ga_parameters(
    n_clusters,
    data.shape[1],
    truth,
    meta_features,
    method=method,
    truth_methods=['ari'],
    archive_2d=gmm_arl_population[gmm_arl_population["pred"] > threshold].iloc[:7000],
    debug=debug,
    epochs=epochs,
    round_size=round_size,
    sampling = sampling,
    ignore_redundant = ignore_redundant,
    allow_subspace_overlap = False,
    improvement_per_mutation_report = True,
    score_tolerance=score_tolerance,
    clustering = clustering,
    total_maximisation_exploration = 500

)
print(params["sampling_actions"], params["maximisation_sizes"] , params["sampling_prob"])
params

solutions, archive= ga.run(data, params)
solutions.to_pickle(f"../data/{filename}_{clustering}_{method}.pkl")
display(solutions)
globalResults[f"{clustering}_{method}"] = solutions

  0%|          | 0/31 [00:00<?, ?it/s]

*** Optimization algorithm 
Non redundant features 10944, orig size 17938, nb imp : 1149
['ARCHIVE2D', 'CLOSE', 'IMP1D', 'RANDOM'] [30 40 20 10] [0.3, 0.4, 0.2, 0.1]
Selecting (50, 4) from archive


 10%|▉         | 3/31 [00:13<02:06,  4.50s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"


Selecting (27, 4) from archive


 19%|█▉        | 6/31 [00:59<03:52,  9.30s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"


Selecting (13, 4) from archive


 29%|██▉       | 9/31 [01:21<02:42,  7.38s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"


Selecting (39, 4) from archive


 39%|███▊      | 12/31 [02:04<03:11, 10.06s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"


Selecting (1, 4) from archive


 48%|████▊     | 15/31 [02:32<02:23,  8.99s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"


Selecting (30, 4) from archive


 58%|█████▊    | 18/31 [02:55<01:41,  7.78s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"


Selecting (49, 4) from archive


 68%|██████▊   | 21/31 [03:49<01:57, 11.77s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"
6,0.459385,0.0,"[885, 1176, 1367, 3514, 5156, 5278, 6672, 7220...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",14.0,"Counter({1: 140, 0: 15, -1: 8})"


Selecting (7, 4) from archive


 77%|███████▋  | 24/31 [04:22<01:10, 10.10s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"
6,0.459385,0.0,"[885, 1176, 1367, 3514, 5156, 5278, 6672, 7220...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",14.0,"Counter({1: 140, 0: 15, -1: 8})"
7,0.40389,-0.01,"[1064, 4319, 6789, 7597, 9692, 11178, 11699, 1...","[1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, ...",13.0,"Counter({1: 129, -1: 17, 0: 17})"


Selecting (9, 4) from archive


 87%|████████▋ | 27/31 [04:49<00:34,  8.69s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"
6,0.459385,0.0,"[885, 1176, 1367, 3514, 5156, 5278, 6672, 7220...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",14.0,"Counter({1: 140, 0: 15, -1: 8})"
7,0.40389,-0.01,"[1064, 4319, 6789, 7597, 9692, 11178, 11699, 1...","[1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, ...",13.0,"Counter({1: 129, -1: 17, 0: 17})"
8,0.396975,0.0,"[6286, 8434, 8949]","[1, 0, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...",3.0,"Counter({1: 132, -1: 19, 0: 12})"


Selecting (1, 4) from archive


 97%|█████████▋| 30/31 [05:11<00:07,  7.43s/it]

Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"
6,0.459385,0.0,"[885, 1176, 1367, 3514, 5156, 5278, 6672, 7220...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",14.0,"Counter({1: 140, 0: 15, -1: 8})"
7,0.40389,-0.01,"[1064, 4319, 6789, 7597, 9692, 11178, 11699, 1...","[1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, ...",13.0,"Counter({1: 129, -1: 17, 0: 17})"
8,0.396975,0.0,"[6286, 8434, 8949]","[1, 0, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...",3.0,"Counter({1: 132, -1: 19, 0: 12})"
9,0.430161,-0.0,"[11, 272, 279, 538, 786, 1122, 1658, 1806, 187...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0,...",60.0,"Counter({0: 147, 1: 12, -1: 4})"


Selecting (44, 4) from archive


100%|██████████| 31/31 [05:30<00:00, 10.65s/it]


Unnamed: 0,adapted_silhouette,ari,features,partition,size,structure
0,0.592186,-0.01,"[72, 74, 199, 318, 413, 492, 555, 750, 816, 12...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",160.0,"Counter({0: 143, 1: 12, -1: 8})"
1,0.530758,-0.0,"[500, 1401, 2218, 6104, 6130, 6784, 8892, 9386]","[0, 1, 1, 0, 1, 1, 1, 1, 1, 0, -1, 1, 1, 1, 0,...",8.0,"Counter({1: 125, 0: 31, -1: 7})"
2,0.623562,-0.0,"[40, 163, 181, 187, 273, 308, 578, 663, 755, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",164.0,"Counter({0: 150, 1: 13})"
3,0.580966,-0.01,"[4329, 6967, 8011, 8012, 12291, 13404, 16295, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, ...",13.0,"Counter({1: 116, 0: 47})"
4,0.551196,-0.0,"[4332, 6594, 7575, 11017, 16416, 16417]","[1, 0, -1, 1, 1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1...",6.0,"Counter({1: 123, 0: 29, -1: 11})"
5,0.475349,0.0,"[107, 158, 295, 337, 351, 827, 1149, 1249, 136...","[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",166.0,"Counter({0: 147, 1: 12, -1: 4})"
6,0.459385,0.0,"[885, 1176, 1367, 3514, 5156, 5278, 6672, 7220...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",14.0,"Counter({1: 140, 0: 15, -1: 8})"
7,0.40389,-0.01,"[1064, 4319, 6789, 7597, 9692, 11178, 11699, 1...","[1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, ...",13.0,"Counter({1: 129, -1: 17, 0: 17})"
8,0.396975,0.0,"[6286, 8434, 8949]","[1, 0, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...",3.0,"Counter({1: 132, -1: 19, 0: 12})"
9,0.430161,-0.0,"[11, 272, 279, 538, 786, 1122, 1658, 1806, 187...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0,...",60.0,"Counter({0: 147, 1: 12, -1: 4})"


# Interpret results

In [11]:
additional_results, best_subspace_match, best_meta_subspace = bio_analysis.clinical_data_analysis(
    additional_df, solutions, n_clusters)

best_subspace_match

Found 2 values for vital_status
Clustering numeric values for days_to_death
Clustering numeric values for days_to_last_followup
No more than 1 class found for additional_studies
Clustering numeric values for age_at_initial_pathologic_diagnosis
No more than 1 class found for b_symptoms
No more than 1 class found for bcr
Found 3 values for bcr_canonical_reason-2
No more than 1 class found for bcr_canonical_reason-3
Found 4 values for bcr_canonical_reason
Found 144 values for bcr_followup_barcode, skipping
Found 144 values for bcr_followup_uuid, skipping
Found 2 values for bcr_patient_canonical_status
Found 4 values for clinical_m
Found 5 values for clinical_n
Found 5 values for clinical_stage
Found 12 values for clinical_t
No more than 1 class found for clinical_trail_drug_classification
No more than 1 class found for day_of_dcc_upload
Clustering numeric values for day_of_form_completion
Clustering numeric values for days_to_birth
No more than 1 class found for days_to_index
No more than

Unnamed: 0,subspace,ari,additional_data,n
0,0,0.32,eastern_cancer_oncology_group,51
1,1,0.25,followup_treatment_success,93
2,2,0.36,bcr_patient_canonical_status,163
3,3,0.97,gender,163
4,4,0.34,bcr_canonical_reason-2,12
5,5,0.37,bcr_canonical_reason,31
6,6,0.34,bcr_canonical_reason,31
7,7,0.24,performance_status_scale_timing,47
8,8,0.45,performance_status_scale_timing,47
9,9,0.31,bcr_canonical_reason,31


# Supervised analysis

In [6]:
from sklearn import mixture
import hdbscan

In [13]:
ranked_features = feature_ranking.supervised_feature_ranking(data, truth, 
                        nbTopFeatures = data.shape[1])
data = data[:, ranked_features]
imp_f = np.arange(50)

In [14]:
gmm_scores = []
hdbscan_scores = []
for i in range(2, 50):
    input_data = data[:, :i]
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
    pred = gmm.fit_predict(input_data)
    ari = adjusted_rand_score(truth, pred)
    gmm_scores.append(ari)

    pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
    ari = adjusted_rand_score(truth, pred)
    hdbscan_scores.append(ari)
print(f" GMM ari = {max(gmm_scores)}, {np.argmax(gmm_scores)}")
print(f" HDBSCAN ari = {max(hdbscan_scores)}, {np.argmax(hdbscan_scores)}")

 GMM ari = 0.38019029638363355, 13
 HDBSCAN ari = 0.1330631539832034, 29


In [7]:
from sklearn.feature_selection import chi2,  mutual_info_classif, SelectKBest
sel = SelectKBest(mutual_info_classif, k=50).fit_transform(data, truth)
gmm_scores = []
hdbscan_scores = []
for i in range(2, 50):
    input_data = sel[:, :i]
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
    pred = gmm.fit_predict(input_data)
    ari = adjusted_rand_score(truth, pred)
    gmm_scores.append(ari)

    pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
    ari = adjusted_rand_score(truth, pred)
    hdbscan_scores.append(ari)
print(f" GMM ari = {max(gmm_scores)}, {np.argmax(gmm_scores)}")
print(f" HDBSCAN ari = {max(hdbscan_scores)}, {np.argmax(hdbscan_scores)}")

 GMM ari = 0.30754071614679707, 6
 HDBSCAN ari = 0.17022685470051432, 12


In [None]:
input_data = data
gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
pred = gmm.fit_predict(input_data)
ari = adjusted_rand_score(truth, pred)
print(f"GMM ari = {ari}")


pred = KMeans(n_clusters= n_clusters).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"Kmeans ari = {ari}")

In [None]:
input_data = data
pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN ari {ari}")

In [None]:
# Predict on PCA
pca = PCA(2)
pca_data = pca.fit_transform(data)
input_data = pca_data
gmm = mixture.GaussianMixture(n_components=n_clusters,
                      covariance_type="full", random_state=0)
pred = gmm.fit_predict(input_data)
ari = adjusted_rand_score(truth, pred)
print(f"GMM ari = {ari}")

pred = hdbscan.HDBSCAN(min_cluster_size =2).fit(input_data).labels_
ari = adjusted_rand_score(truth, pred)
print(f"HDBSCAN ari = {ari}")