# Summary
This notebook applies the proposed method (GMM clustering) to a set of microarray datasets.

In [1]:
import sys
sys.path.append("..")

# GPU configuration
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# log_device_placement=True logs on which device each operation ran;
# allow_growth lets the memory used on the GPU grow dynamically.
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
# Make this TensorFlow session the default one for the Keras backend.
set_session(sess)


import random
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scripts.data_generator as data_generator
import scripts.feature_ranking as feature_ranking
import scripts.features_2d as features_2d
import scripts.ga as ga
import scripts.preprocess as preprocess
import scripts.ga_evaluation as ga_evaluation
import scripts.bio_analysis as bio_analysis
import tensorflow as tf
from IPython import get_ipython
from tqdm import tqdm
from collections import Counter
import seaborn as sns
import time


# Switch matplotlib to interactive mode.
plt.ion()
plt.show()

# Seed the stdlib and the NumPy global generators with the same value.
random_state = 1
for seed_generator in (random.seed, np.random.seed):
    seed_generator(random_state)


%load_ext autoreload
%autoreload 2

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5



Using TensorFlow backend.

In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



# Steps
- 1D clustering to select the most discriminant features

- 2D clustering to select redundant, close and outlier features

In [2]:
# Re-seed both global generators; this replaces the seed chosen in the setup cell.
random_state = 0
np.random.seed(random_state)
random.seed(random_state)

In [None]:
# Run the GA subspace search with GMM clustering on every microarray dataset
# and accumulate the returned solutions (plus timings and counts) in `results`.
results = None
filenames = np.array([
    'alon', 'borovecki', 'chiaretti', 'christensen', 'golub', 'gordon',
    'khan', 'sorlie', 'su', 'yeoh', 'west',
])
clustering = "gmm"
path = '../data/microarray/'
method = "adapted_ratkowsky_lance"
imp_f = np.arange(20)

for name in filenames:
    t1 = time.time()
    data = pd.read_pickle(f"{path}{name}.pkl")
    truth = data["truth"].values
    data = data.drop(columns="truth").values

    n_clusters = len(np.unique(truth))
    z_file = f"../data/microarray/Z_{name}_correlation.npy"
    print(f"\n##########  {name}, {data.shape}")

    # Clustering 1D: rank the individual features.
    meta_features = feature_ranking.rank_features(
        data,
        nb_bins=20,
        rank_threshold=90,
        z_file=z_file,
        metric='correlation',
        redundant_threshold=0.4,
    )
    t2 = time.time()
    print(f"TIME: 1d Features : {(t2-t1)/60} min")

    # The 2D scoring step (features_2d.run) is disabled in this run: t3 and t4
    # are taken back to back and no 2D archive is passed to the GA.
    t3 = time.time()
    t4 = time.time()
    print(f"TIME: 2d scores: {(t4-t3)/60} min")

    round_size = 3
    epochs = 10 * round_size

    # Every sampling key gets the same value for its "ga" and "max" entries.
    shares = {"ARCHIVE2D": 0, "CLOSE": 0.35, "IMP1D": 0.35, "RANDOM": 0.3}
    sampling = {source: {"ga": share, "max": share} for source, share in shares.items()}

    params = ga.ga_parameters(
        n_clusters,
        data.shape[1],
        truth,
        meta_features,
        method=method,
        truth_methods=['ari'],
        archive_2d=None,
        epochs=epochs,
        sampling=sampling,
        round_size=round_size,
        allow_subspace_overlap=True,
        improvement_per_mutation_report=False,
        clustering=clustering,
    )
    solutions, archive = ga.run(data, params)
    solutions["dataset_name"] = name

    t5 = time.time()
    n_total = t5-t1
    print(f"TIME: GA: {(t5-t4)/60} min")
    print(f"TIME: Total: {(t5-t1)/60} min")

    # Stage durations in minutes, rounded to one decimal.
    stage_spans = {
        "total_time": (t1, t5),
        "t(feature_sel)": (t1, t2),
        "t(cnn)": (t2, t3),
        "t(clustering2d)": (t3, t4),
        "t(ga)": (t4, t5),
    }
    for column, (start, end) in stage_spans.items():
        solutions[column] = round((end - start) / 60, 1)

    solutions["n_ga"] = archive.shape[0]
    # NOTE(review): n_cnn receives n_total, i.e. the elapsed seconds t5-t1 — confirm this is intended.
    solutions["n_cnn"] = n_total
    solutions["input_size"] = data.shape[1]

    relevance = meta_features["relevance"]
    solutions["n_non_redundant"] = meta_features[meta_features["redundant"] == 1].shape[0]
    solutions["n_imp"] = meta_features[relevance != 0].shape[0]
    solutions["n_imp4"] = meta_features[relevance == 4].shape[0]
    solutions["n_imp3"] = meta_features[relevance == 3].shape[0]

    # Extend the running table and write it out after every dataset.
    results = solutions if results is None else pd.concat([results, solutions], ignore_index=True)
    results.to_excel(f"../reports/microarray_{clustering}_{method}.xlsx")
    

In [None]:
# Reload the GMM report and show, per dataset, the best ARI together with the
# largest silhouette among the solutions reaching that ARI.
results = pd.read_excel("../reports/microarray_gmm_adapted_ratkowsky_lance.xlsx")

max_ari = results.groupby("dataset_name", as_index=False)["ari"].max()

best_rows = pd.merge(
    results[["dataset_name", "ari", "silhouette"]],
    max_ari,
    on=["dataset_name", "ari"],
)
best_rows.groupby("dataset_name").max()

In [None]:
# Run the GA subspace search with HDBSCAN clustering on every microarray dataset
# and accumulate the returned solutions (plus timings and counts) in `results`.
results = None
filenames = np.array([
    'alon', 'borovecki', 'chiaretti', 'christensen', 'golub', 'gordon',
    'khan', 'sorlie', 'su', 'yeoh', 'west',
])
clustering = "hdbscan"
path = '../data/microarray/'
method = "adapted_ratkowsky_lance"
imp_f = np.arange(20)

for name in filenames:
    t1 = time.time()
    data = pd.read_pickle(f"{path}{name}.pkl")
    truth = data["truth"].values
    data = data.drop(columns="truth").values

    n_clusters = len(np.unique(truth))
    z_file = f"../data/microarray/Z_{name}_correlation.npy"
    print(f"\n##########  {name}, {data.shape}")

    # Clustering 1D: rank the individual features.
    meta_features = feature_ranking.rank_features(
        data,
        nb_bins=20,
        rank_threshold=90,
        z_file=z_file,
        metric='correlation',
        redundant_threshold=0.4,
    )
    t2 = time.time()
    print(f"TIME: 1d Features : {(t2-t1)/60} min")

    # No 2D scoring step runs here: t3 and t4 are taken back to back and no
    # 2D archive is passed to the GA.
    t3 = time.time()
    t4 = time.time()
    print(f"TIME: 2d scores: {(t4-t3)/60} min")

    round_size = 3
    epochs = 10 * round_size

    # Every sampling key gets the same value for its "ga" and "max" entries.
    shares = {"ARCHIVE2D": 0, "CLOSE": 0.35, "IMP1D": 0.35, "RANDOM": 0.3}
    sampling = {source: {"ga": share, "max": share} for source, share in shares.items()}

    params = ga.ga_parameters(
        n_clusters,
        data.shape[1],
        truth,
        meta_features,
        method=method,
        truth_methods=['ari'],
        archive_2d=None,
        epochs=epochs,
        sampling=sampling,
        round_size=round_size,
        allow_subspace_overlap=True,
        improvement_per_mutation_report=False,
        clustering=clustering,
    )
    solutions, archive = ga.run(data, params)
    solutions["dataset_name"] = name

    t5 = time.time()
    n_total = t5-t1
    print(f"TIME: GA: {(t5-t4)/60} min")
    print(f"TIME: Total: {(t5-t1)/60} min")

    # Stage durations in minutes, rounded to one decimal.
    stage_spans = {
        "total_time": (t1, t5),
        "t(feature_sel)": (t1, t2),
        "t(cnn)": (t2, t3),
        "t(clustering2d)": (t3, t4),
        "t(ga)": (t4, t5),
    }
    for column, (start, end) in stage_spans.items():
        solutions[column] = round((end - start) / 60, 1)

    solutions["n_ga"] = archive.shape[0]
    # NOTE(review): n_cnn receives n_total, i.e. the elapsed seconds t5-t1 — confirm this is intended.
    solutions["n_cnn"] = n_total
    solutions["input_size"] = data.shape[1]

    relevance = meta_features["relevance"]
    solutions["n_non_redundant"] = meta_features[meta_features["redundant"] == 1].shape[0]
    solutions["n_imp"] = meta_features[relevance != 0].shape[0]
    solutions["n_imp4"] = meta_features[relevance == 4].shape[0]
    solutions["n_imp3"] = meta_features[relevance == 3].shape[0]

    # Extend the running table and write it out after every dataset.
    results = solutions if results is None else pd.concat([results, solutions], ignore_index=True)
    results.to_excel(f"../reports/microarray_{clustering}_{method}.xlsx")

# Best ARI reached on each dataset.
results.groupby("dataset_name")[["ari"]].max()

In [None]:
# Reload the HDBSCAN report and show, per dataset, the best ARI together with
# the largest silhouette among the solutions reaching that ARI.
results = pd.read_excel("../reports/microarray_hdbscan_adapted_ratkowsky_lance.xlsx")

max_ari = results.groupby("dataset_name", as_index=False)["ari"].max()

best_rows = pd.merge(
    results[["dataset_name", "ari", "silhouette"]],
    max_ari,
    on=["dataset_name", "ari"],
)
best_rows.groupby("dataset_name").max()

In [None]:
# Run the GA subspace search with Leiden clustering and accumulate the returned
# solutions (plus timings and counts) in `results`.
# NOTE(review): unlike the GMM and HDBSCAN runs, 'west' is not in the dataset
# list and the report file name does not include `method` — confirm both are intended.
results = None
filenames = np.array([
    'alon', 'borovecki', 'chiaretti', 'christensen', 'golub', 'gordon',
    'khan', 'sorlie', 'su', 'yeoh',
])
clustering = "leiden"
path = '../data/microarray/'
method = "adapted_ratkowsky_lance"
imp_f = np.arange(20)

for name in filenames:
    t1 = time.time()
    data = pd.read_pickle(f"{path}{name}.pkl")
    truth = data["truth"].values
    data = data.drop(columns="truth").values

    n_clusters = len(np.unique(truth))
    z_file = f"../data/microarray/Z_{name}_correlation.npy"
    print(f"\n##########  {name}, {data.shape}")

    # Clustering 1D: rank the individual features.
    meta_features = feature_ranking.rank_features(
        data,
        nb_bins=20,
        rank_threshold=90,
        z_file=z_file,
        metric='correlation',
        redundant_threshold=0.4,
    )
    t2 = time.time()
    print(f"TIME: 1d Features : {(t2-t1)/60} min")

    # No 2D scoring step runs here: t3 and t4 are taken back to back and no
    # 2D archive is passed to the GA.
    t3 = time.time()
    t4 = time.time()
    print(f"TIME: 2d scores: {(t4-t3)/60} min")

    round_size = 3
    epochs = 10 * round_size

    # Every sampling key gets the same value for its "ga" and "max" entries.
    shares = {"ARCHIVE2D": 0, "CLOSE": 0.35, "IMP1D": 0.35, "RANDOM": 0.3}
    sampling = {source: {"ga": share, "max": share} for source, share in shares.items()}

    params = ga.ga_parameters(
        n_clusters,
        data.shape[1],
        truth,
        meta_features,
        method=method,
        truth_methods=['ari'],
        archive_2d=None,
        epochs=epochs,
        sampling=sampling,
        round_size=round_size,
        allow_subspace_overlap=True,
        improvement_per_mutation_report=False,
        clustering=clustering,
    )
    solutions, archive = ga.run(data, params)
    solutions["dataset_name"] = name

    t5 = time.time()
    n_total = t5-t1
    print(f"TIME: GA: {(t5-t4)/60} min")
    print(f"TIME: Total: {(t5-t1)/60} min")

    # Stage durations in minutes, rounded to one decimal.
    stage_spans = {
        "total_time": (t1, t5),
        "t(feature_sel)": (t1, t2),
        "t(cnn)": (t2, t3),
        "t(clustering2d)": (t3, t4),
        "t(ga)": (t4, t5),
    }
    for column, (start, end) in stage_spans.items():
        solutions[column] = round((end - start) / 60, 1)

    solutions["n_ga"] = archive.shape[0]
    # NOTE(review): n_cnn receives n_total, i.e. the elapsed seconds t5-t1 — confirm this is intended.
    solutions["n_cnn"] = n_total
    solutions["input_size"] = data.shape[1]

    relevance = meta_features["relevance"]
    solutions["n_non_redundant"] = meta_features[meta_features["redundant"] == 1].shape[0]
    solutions["n_imp"] = meta_features[relevance != 0].shape[0]
    solutions["n_imp4"] = meta_features[relevance == 4].shape[0]
    solutions["n_imp3"] = meta_features[relevance == 3].shape[0]

    # Extend the running table and write it out after every dataset.
    results = solutions if results is None else pd.concat([results, solutions], ignore_index=True)
    results.to_excel("../reports/microarray_leiden.xlsx")
    

In [None]:
# Best ARI reached on each dataset.
results.groupby("dataset_name")[["ari"]].max()

# Other methods

In [None]:
from sklearn import mixture
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
import hdbscan

## Silhouette analysis

In [None]:
# Silhouette score of several baseline clusterings, computed on the features
# (at most the first 8000 columns) and on a PCA projection of them.
#
# Changes from the previous version of this cell:
# - rows are collected in a list and the table is rebuilt from it, because
#   DataFrame.append was removed in pandas 2.0;
# - the score is no longer stored in a variable called `ari`, and the name
#   `clustering` is no longer rebound to estimator objects.


def _silhouette_entry(row, column, tag, features, labels):
    """Score `labels` on `features`, print "<tag> <score>" and store it in row[column]."""
    score = silhouette_score(features, labels)
    print(f"{tag} {score}")
    row[column] = score


silhouette_rows = []
results = pd.DataFrame()
filenames = np.array([
    'alon', 'borovecki', 'chiaretti', 'christensen', 'golub', 'gordon',
    'khan', 'sorlie', 'su', 'yeoh', 'west',
])
path = '../data/microarray/'

for name in filenames:
    print(name)
    data = pd.read_pickle(f'{path}' + name + '.pkl')
    truth = data["truth"].values
    data = data.drop("truth", axis=1).values
    # Keep at most the first 8000 columns.
    if data.shape[1] > 8000:
        data = data[:, :8000]
    n_clusters = len(np.unique(truth))
    row = {"dataset": name}

    # Baselines on the (possibly truncated) features.
    _silhouette_entry(row, "AffinityPropagation", "Affinity", data,
                      AffinityPropagation(random_state=5).fit(data).labels_)
    _silhouette_entry(row, "Spectral", "Spectral", data,
                      SpectralClustering(n_clusters=n_clusters, assign_labels='discretize',
                                         random_state=5).fit(data).labels_)
    _silhouette_entry(row, "KMeans", "KMeans", data,
                      KMeans(n_clusters=n_clusters, random_state=5).fit(data).labels_)
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                                  covariance_type="full", random_state=0)
    _silhouette_entry(row, "GMM", "GMM", data, gmm.fit_predict(data))
    _silhouette_entry(row, "HDBSCAN", "HDBSCAN", data,
                      hdbscan.HDBSCAN(min_cluster_size=2).fit(data).labels_)

    # Same baselines on a PCA projection (at most 50 components).
    n_comp = min(data.shape[1], data.shape[0]) - 1
    pca = PCA(min(50, n_comp))
    pca_data = pca.fit_transform(data)

    _silhouette_entry(row, "PCA_KMeans", "PCA KMeans", pca_data,
                      KMeans(n_clusters=n_clusters, random_state=5).fit(pca_data).labels_)
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                                  covariance_type="full", random_state=0)
    _silhouette_entry(row, "PCA_GMM", "PCA GMM", pca_data, gmm.fit_predict(pca_data))
    _silhouette_entry(row, "PCA_HDBSCAN", "PCAHDBSCAN", pca_data,
                      hdbscan.HDBSCAN(min_cluster_size=2).fit(pca_data).labels_)

    # Rebuild the table and write it out after every dataset, as before.
    silhouette_rows.append(row)
    results = pd.DataFrame(silhouette_rows)
    results.to_pickle("../data/microarray_others_silhouette.pkl")

## ARI analysis

In [None]:
# ARI of baseline clusterings computed on a PCA projection of each dataset
# (at most the first 8000 columns, at most 50 components).
#
# Only the PCA variants run in this cell; the baselines on the unprojected
# features (AffinityPropagation, Spectral, KMeans, GMM, HDBSCAN) are disabled,
# and 'alon' and 'borovecki' are excluded from the dataset list.
#
# Rows are collected in a list and the table is rebuilt from it, because
# DataFrame.append was removed in pandas 2.0. The name `clustering` is no
# longer rebound to estimator objects.


def _ari_entry(row, column, tag, truth_labels, predicted_labels):
    """Score the prediction against the truth, print "<tag> <ari>" and store it in row[column]."""
    ari = adjusted_rand_score(truth_labels, predicted_labels)
    print(f"{tag} {ari}")
    row[column] = ari


ari_rows = []
results = pd.DataFrame()
filenames = np.array([
    'chiaretti', 'christensen', 'golub', 'gordon',
    'khan', 'sorlie', 'su', 'yeoh', 'west',
])
path = '../data/microarray/'
method = "adapted_ratkowsky_lance"

for name in filenames:
    print(name)
    data = pd.read_pickle(f'{path}' + name + '.pkl')
    truth = data["truth"].values
    data = data.drop("truth", axis=1).values
    # Keep at most the first 8000 columns.
    if data.shape[1] > 8000:
        data = data[:, :8000]
    n_clusters = len(np.unique(truth))
    row = {"dataset": name}

    n_comp = min(data.shape[0], data.shape[1]) - 1
    pca = PCA(min(n_comp, 50))
    pca_data = pca.fit_transform(data)

    _ari_entry(row, "PCA_KMeans", "PCA KMeans", truth,
               KMeans(n_clusters=n_clusters, random_state=5).fit(pca_data).labels_)
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                                  covariance_type="full", random_state=0)
    _ari_entry(row, "PCA_GMM", "PCA GMM", truth, gmm.fit_predict(pca_data))
    _ari_entry(row, "PCA_HDBSCAN", "PCAHDBSCAN", truth,
               hdbscan.HDBSCAN(min_cluster_size=2).fit(pca_data).labels_)

    # Rebuild the table and write it out after every dataset, as before.
    ari_rows.append(row)
    results = pd.DataFrame(ari_rows)
    results.to_pickle("../data/microarray_others.pkl")
    print(results.shape)

In [None]:
# Display the table rounded to two decimals.
results.round(decimals=2)

# Supervised analysis of datasets

In [None]:
import scripts.ga_evaluation as ga_evaluation

# Per-dataset summary: dimensions, cluster sizes, and the ARI of KMeans on all
# columns, on the first N_TOP_FEATURES columns and on a 2-component PCA, plus
# the scores returned by ga_evaluation.random_sampling for GMM and HDBSCAN.
#
# Fixes:
# - the "top 10 features" score used to cluster on data[:, :5]; it now uses
#   the first N_TOP_FEATURES (= 10) columns, matching its name and label;
# - rows are collected in a list, because DataFrame.append was removed in
#   pandas 2.0.
N_TOP_FEATURES = 10

filenames = np.array(['alon', 'borovecki', 'chiaretti', 'christensen', 'golub', 'gordon',
                      'khan', 'sorlie', 'su', 'yeoh'])
path = '../data/microarray/'
imp_f = np.arange(20)
summary_rows = []

for name in filenames:
    data = pd.read_pickle(f'{path}' + name + '.pkl')
    truth = data["truth"].values
    data = data.drop("truth", axis=1).values

    n_clusters = len(np.unique(truth))

    result = {
        "Dataset": name,
        "Original Dimensions": f"{data.shape[0]} x {data.shape[1]}",
        "Cluster sizes": ", ".join(str(size) for size in Counter(truth).values()),
    }

    predK = KMeans(n_clusters=n_clusters, random_state=2).fit(data).labels_
    ari_all = adjusted_rand_score(truth, predK)

    predK = KMeans(n_clusters=n_clusters, random_state=2).fit(data[:, :N_TOP_FEATURES]).labels_
    ari_top10 = adjusted_rand_score(truth, predK)

    pca_data = PCA(2).fit_transform(data)
    predK = KMeans(n_clusters=n_clusters, random_state=2).fit(pca_data).labels_
    ari_pca = adjusted_rand_score(truth, predK)

    r1 = ga_evaluation.random_sampling(data, truth, n_clusters, algo="gmm")
    r2 = ga_evaluation.random_sampling(data, truth, n_clusters, algo="hdbscan")

    result["ARI all dataset"] = round(ari_all, 2)
    result["ARI PCA dataset"] = round(ari_pca, 2)
    result["ARI top 10 features"] = round(ari_top10, 2)
    result["Random GMM"] = round(r1, 2)
    result["Random HDBSCAN"] = round(r2, 2)
    summary_rows.append(result)

result_df = pd.DataFrame(summary_rows)
result_df

# Run time analysis

In [None]:
# Load the timing report; the first spreadsheet column becomes the index.
# NOTE(review): this path is relative to the notebook directory, whereas the
# other report paths in this notebook start with "../reports/" — confirm.
run_time_report = "reports/r_cnn.xlsx"
results = pd.read_excel(run_time_report, index_col=0)

In [None]:
# Label of the form "<dataset_name> (<input_size> features) ".
feature_counts = results["input_size"].astype(str)
results["label"] = results["dataset_name"] + " (" + feature_counts + " features) "

In [None]:
# Column-wise minimum per input size, restricted to the three stage timings.
timing_columns = ['t(feature_sel)', 't(cnn)', 't(ga)']
perf = results.groupby("input_size").min()[timing_columns]

In [None]:
# Replace the timing column names with the labels shown in the figure legend.
stage_labels = {
    't(feature_sel)': '1D Feature ranking',
    't(cnn)': '2D Feature ranking with NN',
    't(ga)': 'Optimization algorithm for top 10 subspaces',
}
perf = perf.rename(columns=stage_labels)

In [None]:
# Stacked bar chart of the stage timings in `perf`, one bar per input size,
# saved as a PDF.
# Fixes: the dataset name in the title was misspelled ("Boroveki"), the file
# name was an f-string without placeholders, and the figure/axes are now
# addressed explicitly instead of through the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 3))
perf.plot(kind='bar', stacked=True, ax=ax, rot=0)  # rot=0 keeps the x tick labels horizontal
ax.set_ylabel("time (min)")
ax.set_title("Run times on West, Khan, Gordon and Borovecki datasets")
# Set after plotting so it replaces the x label pandas derives from the index.
ax.set_xlabel("Number of dimensions in the input dataset")
sns.despine(ax=ax)
fig.savefig("images/run_times.pdf", bbox_inches='tight')

# Best scores using supervised feature selection

In [None]:
import scripts.ga_evaluation as ga_evaluation
from sklearn import mixture
import hdbscan
# Datasets evaluated by the cells below, and the directory holding their pickles.
dataset_names = (
    'alon', 'borovecki', 'chiaretti', 'christensen', 'golub', 'gordon',
    'khan', 'sorlie', 'su', 'yeoh', 'west',
)
filenames = np.array(dataset_names)
path = '../data/microarray/'

In [None]:
# For every dataset, cluster on the first i columns (i = 2..49) with GMM and
# HDBSCAN and print the highest ARI each algorithm reaches.
for name in filenames:
    t1 = time.time()
    data = pd.read_pickle(f"{path}{name}.pkl")
    truth = data["truth"].values
    data = data.drop(columns="truth").values
    print(Counter(truth))
    n_clusters = len(np.unique(truth))

    gmm_scores, hdbscan_scores = [], []
    for i in range(2, 50):
        input_data = data[:, :i]

        gmm = mixture.GaussianMixture(
            n_components=n_clusters, covariance_type="full", random_state=0
        )
        gmm_scores.append(adjusted_rand_score(truth, gmm.fit_predict(input_data)))

        hdbscan_labels = hdbscan.HDBSCAN(min_cluster_size=2).fit(input_data).labels_
        hdbscan_scores.append(adjusted_rand_score(truth, hdbscan_labels))

    print(f"\n\n\n{name} GMM ari = {max(gmm_scores)}, ")
    print(f"{name} HDBSCAN ari = {max(hdbscan_scores)}, ")

# Mutual Information

In [None]:
from sklearn.feature_selection import chi2,  mutual_info_classif, SelectKBest

In [None]:
# For every dataset, cluster on the i features with the highest mutual
# information with the labels (i = 2..49) and print the highest ARI reached
# by GMM and by HDBSCAN.
#
# Fix: SelectKBest(...).fit_transform keeps the selected columns in their
# original column order, so slicing its output with [:, :i] did not give the
# i best-scoring features. The features are now ranked explicitly by score.
# The scoring is also seeded so that repeated runs pick the same columns.
for name in filenames:
    data = pd.read_pickle(f'{path}' + name + '.pkl')
    truth = data["truth"].values
    data = data.drop("truth", axis=1).values
    print(Counter(truth))
    n_clusters = len(np.unique(truth))
    gmm_scores = []
    hdbscan_scores = []

    mi_scores = mutual_info_classif(data, truth, random_state=0)
    # Column indices of the 50 highest scores, best first.
    top_columns = np.argsort(-mi_scores, kind="stable")[:50]
    sel = data[:, top_columns]

    for i in range(2, 50):
        input_data = sel[:, :i]
        gmm = mixture.GaussianMixture(n_components=n_clusters,
                                      covariance_type="full", random_state=0)
        pred = gmm.fit_predict(input_data)
        gmm_scores.append(adjusted_rand_score(truth, pred))

        pred = hdbscan.HDBSCAN(min_cluster_size=2).fit(input_data).labels_
        hdbscan_scores.append(adjusted_rand_score(truth, pred))

    print(f"\n\n\n{name} GMM ari = {max(gmm_scores)}, ")
    print(f"{name} HDBSCAN ari = {max(hdbscan_scores)}, ")

In [None]:
import scripts.ga_evaluation as ga_evaluation

# ARI of GMM and HDBSCAN on each dataset, using at most the first
# MAX_FEATURES columns.
#
# Fix: the old guard tested len(input_data) > 5000, i.e. the number of rows,
# while the slice truncates columns, so the truncation did not depend on the
# number of features. Slicing unconditionally gives the intended cap, because
# a slice past the last column simply returns all columns.
MAX_FEATURES = 5000

filenames = np.array(['alon', 'borovecki', 'chiaretti', 'christensen', 'golub', 'gordon',
                      'khan', 'sorlie', 'su', 'yeoh', 'west'])
path = '../data/microarray/'

for name in filenames:
    data = pd.read_pickle(f'{path}' + name + '.pkl')
    truth = data["truth"].values
    input_data = data.drop("truth", axis=1).values[:, :MAX_FEATURES]

    n_clusters = len(np.unique(truth))
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                                  covariance_type="full", random_state=0)
    pred = gmm.fit_predict(input_data)
    ari = adjusted_rand_score(truth, pred)
    print(f"{name} GMM ari = {ari}")

    pred = hdbscan.HDBSCAN(min_cluster_size=2).fit(input_data).labels_
    ari = adjusted_rand_score(truth, pred)
    print(f"{name} HDBSCAN ari = {ari}")

In [None]:
import scripts.ga_evaluation as ga_evaluation

# ARI of HDBSCAN, then GMM, on the first 10000 columns of the borovecki dataset.
filenames = np.array(['borovecki'])
path = '../data/microarray/'

for name in filenames:
    t1 = time.time()
    data = pd.read_pickle(f"{path}{name}.pkl")
    truth = data["truth"].values
    input_data = data.drop(columns="truth").values[:, :10000]

    n_clusters = len(np.unique(truth))

    pred = hdbscan.HDBSCAN(min_cluster_size=2).fit(input_data).labels_
    ari = adjusted_rand_score(truth, pred)
    print(f"{name} HDBSCAN ari = {ari}")

    gmm = mixture.GaussianMixture(
        n_components=n_clusters, covariance_type="full", random_state=0
    )
    pred = gmm.fit_predict(input_data)
    ari = adjusted_rand_score(truth, pred)
    print(f"{name} GMM ari = {ari}")




# Execution time analysis


In [None]:
# Execution-time figure built in a single cell: load the timing report, take
# the column-wise minimum per input size, and draw the three stage timings as
# a stacked bar chart saved to a PDF. This repeats the steps of the
# "Run time analysis" section.
# Fixes: the dataset name in the title was misspelled ("Boroveki"), the file
# name was an f-string without placeholders, and the figure/axes are now
# addressed explicitly instead of through the implicit pyplot state.
# NOTE(review): both paths are relative to the notebook directory, whereas
# the other report paths in this notebook start with "../" — confirm.
results = pd.read_excel("reports/r_cnn.xlsx", index_col=0)

results["label"] = results["dataset_name"] + " (" + results["input_size"].astype(str) + " features) "

perf = results.groupby("input_size").min()[['t(feature_sel)', 't(cnn)', 't(ga)']]

perf = perf.rename(columns={
    't(feature_sel)': '1D Feature ranking',
    't(cnn)': '2D Feature ranking with NN',
    't(ga)': 'Optimization algorithm for top 10 subspaces',
})

fig, ax = plt.subplots(figsize=(10, 3))
perf.plot(kind='bar', stacked=True, ax=ax, rot=0)  # rot=0 keeps the x tick labels horizontal
ax.set_ylabel("time (min)")
ax.set_title("Run times on West, Khan, Gordon and Borovecki datasets")
# Set after plotting so it replaces the x label pandas derives from the index.
ax.set_xlabel("Number of dimensions in the input dataset")
sns.despine(ax=ax)
fig.savefig("images/run_times.pdf", bbox_inches='tight')