# Summary 
This notebook reproduces the generation of the data used to train the neural network.

In [2]:
import sys
sys.path.append("..")

# GPU configuration: build a TensorFlow session and register it with Keras.
# NOTE(review): tf.ConfigProto / tf.Session and keras.backend.tensorflow_backend
# appear to be TensorFlow 1.x / standalone-Keras APIs - confirm the pinned
# versions before running this cell on a newer stack.
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # to log device placement (on which device the operation ran)
sess = tf.Session(config=config)
set_session(sess)  # set this TensorFlow session as the default

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.cluster import  KMeans, MeanShift
from collections import Counter
from sklearn import preprocessing
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.datasets.samples_generator import make_blobs
import itertools
import random

import scipy
from tqdm import tqdm
import pickle
import sys
from keras.utils import to_categorical
import keras
from keras.models import load_model
from datetime import datetime
import time
from keras.preprocessing.image import ImageDataGenerator
import scripts.data_generator as data_generator
import scripts.internal_scores as validation
import scripts.cnn_models as cnn_models
import hdbscan
from sklearn import mixture
import scripts.plot_losses as plot_losses
# Seed Python's `random` module and NumPy's global RNG with the same value so
# that the np.random / random calls below are repeatable from a fresh kernel.
# No TensorFlow seed is set in this cell.
random_state=0
random.seed( random_state )
np.random.seed(random_state)

%load_ext autoreload
%autoreload 2

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5



# Data simulation methods

In [None]:
def evaluate_subspace(subspace,
                      all_x,
                      all_y,
                      all_original_input,
                      return_data=True,
                      methods=["adapted_ratkowsky_lance", "adapted_silhouette"],
                      nb_bins=20,
                      n_clusters=2):
    """
    Score one subspace with three clusterers and append the result to the accumulators.

    The subspace is min-max scaled, then clustered with KMeans, HDBSCAN and a
    Gaussian mixture. Every partition is scored with each internal validation
    index named in `methods`. The scaled subspace is also digitized and turned
    into an image through `cnn_models`.

    Parameters
    ----------
    subspace : array-like of shape (n_samples, n_features)
        Data to evaluate (the callers in this notebook pass two columns).
    all_x : list
        Accumulator; the generated image is appended to it.
    all_y : pandas.DataFrame
        Accumulator; one row of scores is appended at index `all_y.shape[0]`.
        It is expected to have 3 * len(methods) columns, ordered
        KMeans scores, HDBSCAN scores, GMM scores.
    all_original_input : list
        Accumulator; receives the scaled subspace when `return_data` is true.
    return_data : bool
        Whether to keep the scaled subspace in `all_original_input`.
    methods : list of str
        Names of methods looked up on `validation.validation()`. Not mutated.
    nb_bins : int
        Number of bins used for both the digitization and the image conversion.
    n_clusters : int
        Number of clusters / mixture components for KMeans and the GMM.

    Returns
    -------
    tuple
        `(all_x, all_y, all_original_input)` - the same objects, updated in place.
    """
    subspace = preprocessing.MinMaxScaler().fit_transform(subspace)
    if return_data:
        all_original_input.append(subspace)

    def _score_partition(labels):
        """Return one rounded score per method, or zeros for a single-label partition."""
        # A partition with one distinct label has nothing to separate. The
        # original code applied this guard to HDBSCAN and GMM only; it is now
        # applied to KMeans as well so all three clusterers behave alike.
        if len(np.unique(labels)) == 1:
            return [0] * len(methods)
        return [
            round(getattr(validation.validation(), method)(subspace, labels), 4)
            for method in methods
        ]

    # Fit-then-score order is kept per clusterer (KMeans, HDBSCAN, GMM) so the
    # score columns line up with the layout documented above.
    scores = []
    scores += _score_partition(
        KMeans(n_clusters=n_clusters, random_state=0).fit(subspace).labels_)
    scores += _score_partition(
        hdbscan.HDBSCAN(min_cluster_size=10).fit(subspace).labels_)
    gmm = mixture.GaussianMixture(n_components=n_clusters,
                                  covariance_type="full", random_state=0)
    scores += _score_partition(gmm.fit_predict(subspace))

    # Fix: digitize used a hard-coded 20 bins while the image conversion used
    # `nb_bins`; both now honour the parameter.
    digitized_subspace = cnn_models.digitize(subspace, nb_bins=nb_bins)
    img = cnn_models.digitized_subspace_to_img(digitized_subspace, nb_bins)

    all_x.append(img)
    all_y.loc[all_y.shape[0]] = scores

    return all_x, all_y, all_original_input

def generate_blobs(size,
                   all_n_clusters=[3],
                   nb_bins=20,
                   return_data=True,
                   methods=["adapted_ratkowsky_lance", "adapted_silhouette", "Wemmert_Gancarski"],
                   n_clusters=2):
    """
    Generate blob datasets and evaluate every 2-feature subspace of each one.

    For every value in `all_n_clusters`, `size` datasets are drawn with
    `data_generator.make_blob_data`; each pair of columns is then passed to
    `evaluate_subspace`.

    Parameters
    ----------
    size : int
        Number of datasets generated per entry of `all_n_clusters`.
    all_n_clusters : iterable of int
        Numbers of blobs to simulate. Not mutated.
    nb_bins : int
        Forwarded to `evaluate_subspace`.
    return_data : bool
        Forwarded to `evaluate_subspace`; when false the third return value is empty.
    methods : list of str
        Forwarded to `evaluate_subspace`. The score table has nine fixed columns,
        so three methods are expected (presumably in arl / as / wg order - confirm).
    n_clusters : int
        Number of clusters used by the clusterers inside `evaluate_subspace`.

    Returns
    -------
    tuple
        `(images, scores, subspaces)`: an array of images, the score DataFrame
        with an added "id" column, and an array of the scaled subspaces.
    """
    all_img = []
    all_y = pd.DataFrame(columns = ["km_arl", "km_as",
                                    "km_wg", "h_arl", "h_as","h_wg", "gmm_arl", "gmm_as", "gmm_wg"])

    all_original_input = []

    # 1. Generate blob subspaces
    for nc in all_n_clusters:
        print(f"Generating {size} gaussian blobs with {nc} clusters")

        for _dataset in range(size):
            # Sample count scales with the number of blobs, clamped to [150, 1500].
            n_samples = max(np.random.randint(15, 200) * nc, 150)
            n_samples = min(n_samples, 1500)
            # NOTE(review): the meaning of the positional arguments None, 2, 7
            # is defined in scripts/data_generator.py - confirm there.
            data, _, _ = data_generator.make_blob_data(n_samples,
                                                  [nc],
                                                  None,
                                                  2, 7, plot = False)

            # Distinct names for the column pair: the original reused `i`,
            # shadowing the dataset loop variable.
            for first, second in itertools.combinations(range(data.shape[1]), 2):
                all_img, all_y, all_original_input = evaluate_subspace(
                    data[:, [first, second]],
                    all_img,
                    all_y,
                    all_original_input,
                    return_data=return_data,
                    nb_bins=nb_bins,
                    methods=methods,
                    n_clusters=n_clusters)

    all_y["id"] = np.arange(all_y.shape[0])

    # Subspaces have different sample counts, so the list is ragged. Recent
    # NumPy refuses to build an array from ragged input implicitly; pack it
    # into a 1-D object array explicitly, and keep the plain conversion when
    # every subspace has the same shape.
    if len({np.shape(sub) for sub in all_original_input}) > 1:
        original_input = np.empty(len(all_original_input), dtype=object)
        for idx, sub in enumerate(all_original_input):
            original_input[idx] = sub
    else:
        original_input = np.array(all_original_input)

    return np.array(all_img), all_y, original_input


def generate_mixed_data(n_experiments=20,
                        nb_bins=20,
                        return_data=True,
                        n_clusters_subspace=[2, 3],
                        n_obs_mixed_distribution=3,
                        methods=["adapted_ratkowsky_lance", "adapted_silhouette", "Wemmert_Gancarski"],
                        data_file="../data/mixed_train_img_2d.npy",
                        score_file="../data/mixed_train_2d.pkl",
                        orig_file = "",
                        n_clusters=2):
    """
    Generate mixed-distribution datasets, evaluate all 2-feature subspaces and save them.

    Each experiment draws a dataset with `data_generator.make_data_for_ga`
    (two clustered subspaces plus the fixed feature counts listed below) and
    passes every pair of columns to `evaluate_subspace`.

    Parameters
    ----------
    n_experiments : int
        Number of datasets to generate.
    nb_bins, return_data, methods, n_clusters
        Forwarded to `evaluate_subspace`. The score table has nine fixed
        columns, so three methods are expected.
    n_clusters_subspace, n_obs_mixed_distribution
        Accepted for backward compatibility; not used by this function.
    data_file : str
        Path of the `.npy` file receiving the images.
    score_file : str
        Path of the pickle file receiving the score DataFrame.
    orig_file : str
        Path of the `.npy` file receiving the scaled subspaces. When empty
        (the default) the subspaces are not written.

    Returns
    -------
    tuple
        `(images, scores, subspaces)`, the same data that is written to disk.
    """
    all_img = []
    all_y = pd.DataFrame(columns = ["km_arl", "km_as",
                                    "km_wg", "h_arl", "h_as","h_wg", "gmm_arl", "gmm_as", "gmm_wg"])
    all_original_input = []

    for _ in range(n_experiments):
        # Two distinct cluster counts in [2, 8], one per clustered subspace.
        subspace_clusters = np.random.choice(np.arange(2, 9), 2, replace=False)
        # Sample count scales with the larger cluster count, clamped to [150, 1500].
        n_samples = max(150, np.random.randint(10, 150) * max(subspace_clusters))
        n_samples = min(n_samples, 1500)
        mixed_data, _, _ = data_generator.make_data_for_ga(
                     subspace_clusters,
                     cluster_std=None,
                     n_uniform_features=3,
                     n_normal_features=3,
                     n_neg_binomial=4,
                     n_gamma=3,
                     n_beta=7,
                     random_redundant=True,
                     n_redundant=5,
                     n_outlier_features=2,
                     n_cutoff=4,
                     n_bimodal_features=2,
                     min_subspace_features=2,
                     max_subspace_features=6,
                     n_samples= n_samples,
                     plot=False)
        print(
            f"Mixing subpaces of {mixed_data.shape}, subspaces with {subspace_clusters} clusters"
        )
        for i, j in tqdm(
                itertools.combinations(np.arange(mixed_data.shape[1]), 2)):
            all_img, all_y, all_original_input = evaluate_subspace(
                mixed_data[:, [i, j]],
                all_img,
                all_y,
                all_original_input,
                return_data=return_data,
                nb_bins=nb_bins,
                methods=methods,
                n_clusters=n_clusters)

    all_y["id"] = np.arange(all_y.shape[0])
    img_array = np.array(all_img)

    # Experiments have different sample counts, so the subspace list is
    # ragged. Recent NumPy refuses to build an array from ragged input
    # implicitly; pack it into a 1-D object array explicitly, and keep the
    # plain conversion when every subspace has the same shape.
    if len({np.shape(sub) for sub in all_original_input}) > 1:
        original_input = np.empty(len(all_original_input), dtype=object)
        for idx, sub in enumerate(all_original_input):
            original_input[idx] = sub
    else:
        original_input = np.array(all_original_input)

    np.save(data_file, img_array)
    # np.save appends ".npy" to a path lacking it, so the empty default used
    # to create a stray ".npy" file in the working directory. Skip it instead.
    if orig_file:
        np.save(orig_file, original_input)
    all_y.to_pickle(score_file)
    return img_array, all_y, original_input


def generate_biological_subspaces(filenames,
                                  methods=["adapted_ratkowsky_lance", "adapted_silhouette", "Wemmert_Gancarski"],
                                  n_subspaces=1000,
                                  nb_bins=20,
                                  return_data = True,
                                  truth_column="truth",
                                  n_clusters=2):
    """
    Sample random 2-feature subspaces from pickled datasets and evaluate them.

    For every file, `n_subspaces` pairs of column indices are drawn and each
    pair is passed to `evaluate_subspace`.

    Parameters
    ----------
    filenames : iterable of str
        Paths of pickled DataFrames. They are read with `pd.read_pickle`, so
        only trusted files must be passed.
    methods, nb_bins, return_data, n_clusters
        Forwarded to `evaluate_subspace`. The score table has nine fixed
        columns, so three methods are expected.
    n_subspaces : int
        Number of column pairs drawn per file.
    truth_column : str
        Column dropped from each DataFrame before sampling.

    Returns
    -------
    tuple
        `(images, scores, subspaces)`. When nothing was evaluated, `images`
        is an empty array of shape (0, 21, 21, 1).
    """
    all_img = []
    all_y = pd.DataFrame(columns = ["km_arl", "km_as",
                                    "km_wg", "h_arl", "h_as","h_wg", "gmm_arl", "gmm_as", "gmm_wg"])
    all_original_input = []
    for filename in filenames:
        print(filename)
        data = pd.read_pickle(filename)
        # Only the feature matrix is needed; the unused `truth` local was removed.
        data = data.drop(truth_column, axis=1).values
        # Skip datasets with too few rows for the requested number of clusters.
        if n_clusters>=data.shape[0]/2:
            continue
        # NOTE(review): np.random.choice samples with replacement by default,
        # so a pair may contain the same column twice - confirm this is intended.
        subspaces = np.random.choice(np.arange(data.shape[1]),
                                     size=(n_subspaces, 2))

        for subspace in subspaces:
            all_img, all_y, all_original_input = evaluate_subspace(
                data[:, subspace],
                all_img,
                all_y,
                all_original_input,
                return_data=return_data,
                nb_bins=nb_bins,
                methods=methods,
                n_clusters=n_clusters)

    all_y["id"] = np.arange(all_y.shape[0])

    # Files have different row counts, so the subspace list is ragged. Recent
    # NumPy refuses to build an array from ragged input implicitly; pack it
    # into a 1-D object array explicitly, and keep the plain conversion when
    # every subspace has the same shape.
    if len({np.shape(sub) for sub in all_original_input}) > 1:
        original_input = np.empty(len(all_original_input), dtype=object)
        for idx, sub in enumerate(all_original_input):
            original_input[idx] = sub
    else:
        original_input = np.array(all_original_input)

    if len(all_img) == 0:
        # NOTE(review): 21 presumably equals nb_bins + 1 for the default
        # nb_bins=20 - confirm against cnn_models.digitized_subspace_to_img.
        return np.zeros((0, 21, 21, 1)), all_y, original_input
    return np.array(all_img), all_y, original_input

## Data generation loop

In [None]:
nb_bins = 20
all_n_clusters = np.arange(2,30)

# (file prefix, datasets generated per blob count)
BLOB_SPLITS = [
    ("blob_train", 7),
    ("blob_val", 1),
]

# (split name, number of experiments, output directory)
# NOTE(review): the training split is written to nn_data1, while the combine
# step later in this notebook reads mixed_train files from both nn_data and
# nn_data1 - the nn_data copies must come from another run; confirm.
MIXED_SPLITS = [
    ("train", 10, "../data/nn_data1"),
    ("val", 1, "../data/nn_data"),
]

# (file prefix, source files, subspaces sampled per file, truth column)
BIO_SPLITS = [
    ("ma_train",
     ['../data/microarray/alon.pkl', '../data/microarray/nakayama.pkl',
      '../data/microarray/christensen.pkl', '../data/microarray/west.pkl'],
     100, "truth"),
    ("ma_val",
     ['../data/microarray/alon.pkl', '../data/microarray/nakayama.pkl'],
     10, "truth"),
    ("ma_test",
     ['../data/microarray/su.pkl', '../data/microarray/chin.pkl',
      '../data/microarray/tian.pkl'],
     50, "truth"),
    # RNA seq data
    ("rna_test",
     ['../data/rna_data/BRCA.pkl', '../data/rna_data/KIRP.pkl'],
     50, "y"),
]


def _save_split(prefix, k, img, orig, scores):
    """Write the images, scaled subspaces and scores of one split for cluster count k."""
    np.save(f"../data/nn_data/{prefix}_img_{k}.npy", img)
    np.save(f"../data/nn_data/{prefix}_orig_{k}.npy", orig)
    scores.to_pickle(f"../data/nn_data/{prefix}_y_{k}.pkl")


# One full pass per number of clusters used by the clusterers. The order
# (blobs, mixed, microarray, RNA-seq) is kept so that the global random
# stream is consumed in the same sequence as before.
for n_clusters in all_n_clusters:
    print(">>", n_clusters)

    # Blob datasets
    for prefix, size in BLOB_SPLITS:
        blob_img, blob_scores, blob_orig = generate_blobs(
            size, all_n_clusters=all_n_clusters, nb_bins=nb_bins,
            n_clusters=n_clusters, return_data=True)
        _save_split(prefix, n_clusters, blob_img, blob_orig, blob_scores)

    # Mixed features (generate_mixed_data writes its own files)
    for split, n_experiments, mixed_dir in MIXED_SPLITS:
        generate_mixed_data(
            n_experiments=n_experiments,
            nb_bins=nb_bins,
            n_clusters_subspace=all_n_clusters,
            n_obs_mixed_distribution=2,
            return_data=True,
            data_file=f"{mixed_dir}/mixed_{split}_img_{n_clusters}.npy",
            score_file=f"{mixed_dir}/mixed_{split}_y_{n_clusters}.pkl",
            orig_file=f"{mixed_dir}/mixed_{split}_orig_{n_clusters}.npy",
            n_clusters=n_clusters)

    # Biological datasets (microarray and RNA-seq)
    for prefix, filenames, n_subspaces, truth_column in BIO_SPLITS:
        bio_img, bio_scores, bio_orig = generate_biological_subspaces(
            filenames,
            n_subspaces=n_subspaces,
            nb_bins=nb_bins,
            truth_column=truth_column,
            n_clusters=n_clusters)
        _save_split(prefix, n_clusters, bio_img, bio_orig, bio_scores)

# Combine data 

In [None]:
# Cluster counts whose generated files are merged below; `num_classes` is the
# width of the one-hot encoding of the cluster count.
all_n_clusters = np.arange(2, 20)
num_classes = all_n_clusters.size

# Handle train


In [None]:
# (data directory, file prefix) of every training source, in concatenation order.
TRAIN_SOURCES = [
    ("../data/nn_data", "mixed_train"),
    ("../data/nn_data1", "mixed_train"),
    ("../data/nn_data", "blob_train"),
    ("../data/nn_data", "ma_train"),
]

x_train_img = []
x_train_orig = []
x_train_k = []
y_train = []

for n_clusters in tqdm(all_n_clusters):
    k_imgs = []
    k_scores = []
    for data_dir, prefix in TRAIN_SOURCES:
        k_imgs.append(np.load(f"{data_dir}/{prefix}_img_{n_clusters}.npy"))
        # The subspace files hold object arrays, hence allow_pickle.
        x_train_orig.extend(
            np.load(f"{data_dir}/{prefix}_orig_{n_clusters}.npy", allow_pickle=True))
        k_scores.append(pd.read_pickle(f"{data_dir}/{prefix}_y_{n_clusters}.pkl"))

    train_scores = pd.concat(k_scores)
    x_train_img.append(np.concatenate(k_imgs))
    # Class index of this cluster count: all_n_clusters starts at 2.
    x_train_k.append([n_clusters-2] * train_scores.shape[0])
    y_train.append(train_scores)

x_train_img = np.concatenate(x_train_img)

# The subspaces have different sample counts, so the list is ragged. Recent
# NumPy refuses to build an array from ragged input implicitly; pack it into
# a 1-D object array explicitly, and keep the plain conversion otherwise.
if len({np.shape(sub) for sub in x_train_orig}) > 1:
    _packed = np.empty(len(x_train_orig), dtype=object)
    for _idx, _sub in enumerate(x_train_orig):
        _packed[_idx] = _sub
    x_train_orig = _packed
else:
    x_train_orig = np.array(x_train_orig)

y_train = pd.concat(y_train)
y_train["id"] = np.arange(y_train.shape[0])
x_train_k = to_categorical(np.concatenate(x_train_k), num_classes=num_classes)
print (x_train_img.shape, len(x_train_orig), y_train.shape, x_train_k.shape)
np.save("../data/nn_data/img_x_train_img.npy", x_train_img)
np.save("../data/nn_data/img_x_train_k.npy", x_train_k)
# NOTE: this file is a pandas pickle despite the .npy suffix; read it back
# with pd.read_pickle. The name is kept so existing consumers still find it.
y_train.to_pickle("../data/nn_data/img_y_train.npy")

In [None]:
# Release the combined training arrays; they are already saved to disk.
del x_train_img
del x_train_orig
del y_train
del x_train_k

# Handle validation

In [None]:
# (data directory, file prefix) of every validation source, in concatenation order.
VAL_SOURCES = [
    ("../data/nn_data", "mixed_val"),
    ("../data/nn_data", "blob_val"),
    ("../data/nn_data", "ma_val"),
]

x_val_img = []
x_val_orig = []
x_val_k = []
y_val = []

for n_clusters in tqdm(all_n_clusters):
    k_imgs = []
    k_scores = []
    for data_dir, prefix in VAL_SOURCES:
        k_imgs.append(np.load(f"{data_dir}/{prefix}_img_{n_clusters}.npy"))
        # The subspace files hold object arrays, hence allow_pickle.
        x_val_orig.extend(
            np.load(f"{data_dir}/{prefix}_orig_{n_clusters}.npy", allow_pickle=True))
        k_scores.append(pd.read_pickle(f"{data_dir}/{prefix}_y_{n_clusters}.pkl"))

    val_scores = pd.concat(k_scores)
    x_val_img.append(np.concatenate(k_imgs))
    # Class index of this cluster count: all_n_clusters starts at 2.
    x_val_k.append([n_clusters-2] * val_scores.shape[0])
    y_val.append(val_scores)

x_val_img = np.concatenate(x_val_img)

# The subspaces have different sample counts, so the list is ragged. Recent
# NumPy refuses to build an array from ragged input implicitly; pack it into
# a 1-D object array explicitly, and keep the plain conversion otherwise.
if len({np.shape(sub) for sub in x_val_orig}) > 1:
    _packed = np.empty(len(x_val_orig), dtype=object)
    for _idx, _sub in enumerate(x_val_orig):
        _packed[_idx] = _sub
    x_val_orig = _packed
else:
    x_val_orig = np.array(x_val_orig)

y_val = pd.concat(y_val)
y_val["id"] = np.arange(y_val.shape[0])
x_val_k = to_categorical(np.concatenate(x_val_k), num_classes=num_classes)
print (x_val_img.shape, len(x_val_orig), y_val.shape, x_val_k.shape)
np.save("../data/nn_data/img_x_val_img.npy", x_val_img)
np.save("../data/nn_data/img_x_val_k.npy", x_val_k)
# NOTE: this file is a pandas pickle despite the .npy suffix; read it back
# with pd.read_pickle. The name is kept so existing consumers still find it.
y_val.to_pickle("../data/nn_data/img_y_val.npy")

# Test data

In [None]:
def _collect_test_split(img_path, score_path, k_index, img_parts, k_parts, score_parts):
    """Load one image/score file pair and append it to the given accumulators."""
    split_img = np.load(img_path)
    split_scores = pd.read_pickle(score_path)

    img_parts.append(split_img)
    k_parts.append([k_index] * split_scores.shape[0])
    score_parts.append(split_scores)


# Accumulators for the microarray ("ma") and RNA-seq ("rna") test sets.
x_test_ma_img, x_test_ma_orig, x_test_ma_k, y_test_ma = [], [], [], []
x_test_rna_img, x_test_rna_orig, x_test_rna_k, y_test_rna = [], [], [], []

# Only cluster counts 2..9 are used for the test sets (not all_n_clusters).
for n_clusters in tqdm(np.arange(2, 10)):
    _collect_test_split(
        f"../data/nn_data/ma_test_img_{n_clusters}.npy",
        f"../data/nn_data/ma_test_y_{n_clusters}.pkl",
        n_clusters-2,
        x_test_ma_img, x_test_ma_k, y_test_ma)
    _collect_test_split(
        f"../data/nn_data/rna_test_img_{n_clusters}.npy",
        f"../data/nn_data/rna_test_y_{n_clusters}.pkl",
        n_clusters-2,
        x_test_rna_img, x_test_rna_k, y_test_rna)

# Microarray test set
x_test_ma_img = np.concatenate(x_test_ma_img)
y_test_ma = pd.concat(y_test_ma)
y_test_ma["id"] = np.arange(y_test_ma.shape[0])
x_test_ma_k = to_categorical(np.concatenate(x_test_ma_k), num_classes=num_classes)
print(x_test_ma_img.shape, y_test_ma.shape, x_test_ma_k.shape)

# RNA-seq test set
x_test_rna_img = np.concatenate(x_test_rna_img)
y_test_rna = pd.concat(y_test_rna)
y_test_rna["id"] = np.arange(y_test_rna.shape[0])
x_test_rna_k = to_categorical(np.concatenate(x_test_rna_k), num_classes=num_classes)
print (x_test_rna_img.shape, y_test_rna.shape, x_test_rna_k.shape)

# Persist both test sets; the score files are pandas pickles despite the .npy suffix.
np.save("../data/nn_data/img_x_test_ma_img.npy", x_test_ma_img)
np.save("../data/nn_data/img_x_test_ma_k.npy", x_test_ma_k)
y_test_ma.to_pickle("../data/nn_data/img_y_test_ma.npy")

np.save("../data/nn_data/img_x_test_rna_img.npy", x_test_rna_img)
np.save("../data/nn_data/img_x_test_rna_k.npy", x_test_rna_k)
y_test_rna.to_pickle("../data/nn_data/img_y_test_rna.npy")