# Summary

This notebook trains a neural network on simulated data to approximate either the Silhouette or the Ratkowsky-Lance score obtained when clustering feature subspaces with a Gaussian mixture model (GMM).

In [None]:
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
# Build the TensorFlow session that Keras will run on.
config = tf.ConfigProto(log_device_placement=True)  # log on which device each operation ran
config.gpu_options.allow_growth = True  # grow GPU memory use dynamically
sess = tf.Session(config=config)
set_session(sess)  # make this session the Keras default

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.cluster import  KMeans
from collections import Counter
from sklearn import preprocessing
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.datasets.samples_generator import make_blobs
import itertools
import random

import scipy
from tqdm import tqdm
import pickle
import sys
from keras.utils import to_categorical
import keras
from keras.models import load_model
from datetime import datetime
import time
from keras.preprocessing.image import ImageDataGenerator#, array_to_img, img_to_array, load_img
import scripts.data_generator as data_generator
import scripts.internal_scores as validation
import scripts.cnn_models as cnn_models
import scripts.plot_losses as plot_losses
import hdbscan
# Fixed seed shared by the NumPy and stdlib random number generators.
random_state = 0
np.random.seed(random_state)
random.seed(random_state)

%load_ext autoreload
%autoreload 2

# Define score for the analysis

In [None]:
# Name of the score to learn. It is used further down both as the target
# column of the y_* tables and as the file name of the saved model.
score = "gmm_arl" # GMM + Ratkowsky-Lance
# score = "gmm_as" # GMM + Silhouette

# Define the candidate numbers of clusters

In [None]:
# Candidate numbers of clusters: 2, 3, ..., 19 (the stop value is excluded).
all_n_clusters = np.arange(2, 20)
# One class per candidate; also the default width of the model's second input.
num_classes = all_n_clusters.size
num_classes

# Load simulated data

In [None]:
# First model input: the image-like arrays.
x_train_img = np.load("../data/nn_data/img_x_train_img.npy")
x_val_img = np.load("../data/nn_data/img_x_val_img.npy")

# Second model input: the per-sample "k" arrays.
x_train_k = np.load("../data/nn_data/img_x_train_k.npy")
x_val_k = np.load("../data/nn_data/img_x_val_k.npy")

# Targets. Despite the .npy suffix these files are read as pickles and are
# indexed by column name later on. read_pickle can execute arbitrary code,
# so only load files from a trusted source.
y_train = pd.read_pickle("../data/nn_data/img_y_train.npy")
y_val = pd.read_pickle("../data/nn_data/img_y_val.npy")

x_train_img.shape, x_val_img.shape

In [None]:
# Sanity check: show one randomly chosen training sample, titled with its target score.
idx = np.random.randint(0, len(y_train))
plt.imshow(x_train_img[idx, :, :, 0])  # channel 0 of the selected sample
plt.title(y_train[score].iloc[idx]);
# Position of the largest entry in this sample's k vector
# (presumably a one-hot encoding of the number of clusters - confirm).
print(np.argmax(x_train_k[idx]))

## Define model


In [None]:
def conditional_img_based_model(
        input_shape1=(21, 21, 1),
        input_shape2=(num_classes,),
        filter_size=16,
        dropout=0.2,
        noise=0.005,
        firstKernelSize=5,
        secondKernelSize=3):
    """Build and compile the two-input regression network.

    The image input goes through two convolutional stages and a dense layer;
    the second input goes through a single linear dense layer. Both branches
    are concatenated and reduced by dense layers to one linear output unit.

    Parameters
    ----------
    input_shape1 : tuple
        Shape of one image sample (without the batch axis).
    input_shape2 : tuple
        Shape of one sample of the second input (without the batch axis).
    filter_size : int
        Number of filters in the first convolutional stage; the second stage
        uses twice as many.
    dropout : float
        Dropout rate applied at the end of each convolutional stage.
    noise : float
        Currently unused by the function body.
    firstKernelSize, secondKernelSize : int
        Side length of the square kernels in the first / second stage.

    Returns
    -------
    keras.models.Model
        Model compiled with mean-squared-error loss, the "adam" optimizer and
        the metrics mean absolute error and MSE.
    """
    layers = keras.layers

    def conv_stage(tensor, filters, kernel):
        """Two same-padded convolutions, then ReLU, 2x2 max-pooling and dropout."""
        for _ in range(2):
            tensor = layers.Conv2D(filters=filters,
                                   kernel_size=(kernel, kernel),
                                   padding='same')(tensor)
        tensor = layers.Activation(activation="relu")(tensor)
        tensor = layers.MaxPooling2D(pool_size=(2, 2))(tensor)
        return layers.Dropout(dropout)(tensor)

    image_in = layers.Input(input_shape1)
    k_in = layers.Input(input_shape2)
    # The second input is only projected linearly (no activation) before merging.
    k_branch = layers.Dense(50)(k_in)

    features = conv_stage(image_in, filter_size, firstKernelSize)
    features = conv_stage(features, filter_size * 2, secondKernelSize)
    features = layers.Flatten()(features)
    features = layers.Dense(50, activation="relu")(features)

    merged = layers.Concatenate()([features, k_branch])
    for units in (50, 20):
        merged = layers.Dense(units, activation="relu")(merged)
    prediction = layers.Dense(1)(merged)  # single linear unit: the predicted score

    model = keras.models.Model(inputs=[image_in, k_in], outputs=prediction)
    model.compile(loss='mean_squared_error',
                  optimizer="adam",
                  metrics=['mean_absolute_error', 'mse'])
    return model
# Instantiate the network with its default hyper-parameters and print its layer summary.
model = conditional_img_based_model()
model.summary()

## Train

In [None]:
# Training hyper-parameters.
batch_size = 200
epochs = 1500

# Project callback from scripts.plot_losses (presumably plots the named metric
# while training - confirm against that module).
pl = plot_losses.PlotLosses(printInterval=1, name='mean_absolute_error')

# Stop once the validation loss has not improved for 20 epochs.
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=20,
    verbose=0,
    mode='auto',
)

# Multiply the learning rate by 0.2 after 5 epochs without improvement of the
# training loss, never going below 1e-4.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=5,
    min_lr=0.0001,
)

# NOTE(review): the training call in this notebook passes epochs=65 and
# callbacks=[pl, reduce_lr], so `epochs` and `es` are currently unused.

In [None]:
# Augmentation applied to the training images: rotations of up to 2 degrees
# and random horizontal / vertical flips; pixels outside the original image
# are filled from the nearest border value.
datagen = ImageDataGenerator(rotation_range=2,
                             horizontal_flip=True,
                             vertical_flip=True,
                             fill_mode='nearest')

def custom_generator(datagen, X1, X2, y, batch_size):
    """Yield ``([image_batch, X2_batch], y_batch)`` forever for a two-input model.

    Two flows are opened over the same images ``X1``: one paired with the
    targets ``y`` and one paired with the second input ``X2``. Both use
    ``seed=1`` so that they are expected to produce their batches in the same
    order, which keeps ``X2`` aligned with the images and targets.

    Parameters
    ----------
    datagen : object exposing ``flow(x, y, batch_size=..., seed=...)``
    X1 : images to augment
    X2 : second model input, one entry per image
    y : targets, one entry per image
    batch_size : int
    """
    target_flow = datagen.flow(X1, y, batch_size=batch_size, seed=1)
    extra_flow = datagen.flow(X1, X2, batch_size=batch_size, seed=1)
    while True:
        images, targets = target_flow.next()
        # The images of the second flow are discarded; only its X2 batch is needed.
        _, extras = extra_flow.next()
        yield [images, extras], targets

# Augmented training batches and the (un-augmented) validation set.
train_batches = custom_generator(
    datagen, x_train_img, x_train_k, y_train[[score]].values, batch_size)
validation_set = ([x_val_img, x_val_k], y_val[[score]].values)

# NOTE(review): the epoch count is hard-coded to 65 here rather than taken from `epochs`.
model.fit_generator(
    train_batches,
    validation_data=validation_set,
    steps_per_epoch=len(x_train_img) // batch_size,
    callbacks=[pl, reduce_lr],
    epochs=65,
    verbose=True)

# Persist the trained model under the name of the score it approximates.
model.save(f'../models/{score}.h5')

# Test

In [None]:
# Choose which trained model to evaluate. The model file and the target column
# used by the metrics below are both derived from `score`, so switching the
# score here cannot leave the two out of sync.
score = 'gmm_as'
# score = "gmm_arl"
model = load_model(f'../models/{score}.h5')

In [None]:
# Predict the score for the validation set and store it next to the true values.
pred = model.predict([x_val_img, x_val_k])
y_val["pred"] = np.ravel(pred)

# Report both error measures of the prediction against the selected score column.
for label, metric in (("MSE ", sklearn.metrics.mean_squared_error),
                      ("MAE ", sklearn.metrics.mean_absolute_error)):
    print(label, metric(y_val[score], y_val["pred"]))

In [None]:
# "ma" test set: target table first, then the two model inputs.
# read_pickle can execute arbitrary code - only load trusted files.
y_test_ma = pd.read_pickle("../data/nn_data/img_y_test_ma.npy")
x_test_ma_img = np.load("../data/nn_data/img_x_test_ma_img.npy")
x_test_ma_k = np.load("../data/nn_data/img_x_test_ma_k.npy")

In [None]:
# Predict the score for the "ma" test set and store it next to the true values.
pred = model.predict([x_test_ma_img, x_test_ma_k])
y_test_ma["pred"] = np.ravel(pred)
# A bare `.shape` expression in the middle of a cell is never displayed, so print it.
print("shape ", y_test_ma.shape)

print("MSE ", sklearn.metrics.mean_squared_error(y_test_ma[score], y_test_ma["pred"]))
print("MAE ", sklearn.metrics.mean_absolute_error(y_test_ma[score], y_test_ma["pred"]))

In [None]:
# "rna" test set: target table first, then the two model inputs.
# read_pickle can execute arbitrary code - only load trusted files.
y_test_rna = pd.read_pickle("../data/nn_data/img_y_test_rna.npy")
x_test_rna_img = np.load("../data/nn_data/img_x_test_rna_img.npy")
x_test_rna_k = np.load("../data/nn_data/img_x_test_rna_k.npy")

In [None]:
# Predict the score for the "rna" test set and store it next to the true values.
pred = model.predict([x_test_rna_img, x_test_rna_k])
y_test_rna["pred"] = np.ravel(pred)
# A bare `.shape` expression in the middle of a cell is never displayed, so print it.
print("shape ", y_test_rna.shape)

print("MSE ", sklearn.metrics.mean_squared_error(y_test_rna[score], y_test_rna["pred"]))
print("MAE ", sklearn.metrics.mean_absolute_error(y_test_rna[score], y_test_rna["pred"]))
