## DNN Model for binary classification Cancer or No Cancer

## Imports

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import SGD, Adam, Adamax, RMSprop, Adagrad
from tensorflow.keras import layers, activations
import keras_tuner as kt
from statistics import mean
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.metrics import AUC, Precision, Accuracy
import keras.backend as K


### Useful functions

In [2]:
# https://medium.com/@mostafa.m.ayoub/customize-your-keras-metrics-44ac2e2980bd
def specificity(y_true, y_pred):
    true_negatives = K.sum(K.round(K.clip((1 - y_true) * (1 - y_pred), 0, 1)))
    possible_negatives = K.sum(K.round(K.clip(1 - y_true, 0, 1)))
    return true_negatives / (possible_negatives + K.epsilon())

# https://aakashgoel12.medium.com/how-to-add-user-defined-function-get-f1-score-in-keras-metrics-3013f979ce0d
def get_f1_score(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

## Load Data

In [2]:
#load dataframe
#outlier_folder = "without_outliers"
outlier_folder = "with_outliers"
df_genes_labels_file = f'Data/{outlier_folder}/df_genes_labels.pkl'
if os.path.exists(df_genes_labels_file):
    df_genes_labels = pd.read_pickle(df_genes_labels_file)
else:
    raise Exception("You need to run the notebook \"PrepareDataExploratoryAnalysis\".")

df_genes_labels_file = f'Data/{outlier_folder}/df_expression_data_lasso.pkl'
if os.path.exists(df_genes_labels_file):
    df_genes_labels_selected = pd.read_pickle(df_genes_labels_file)
else:
    raise Exception("You need to run the notebook \"ReduceDimensionality\".")

labels = df_genes_labels.iloc[:, 5440:5441]

le = LabelEncoder()
le.fit(labels['Classification'])
labels_enc = le.transform(labels['Classification'])
labels_enc = pd.DataFrame(labels_enc, columns=['Classification'])

X_train, X_test, y_train, y_test = train_test_split(df_genes_labels_selected, labels_enc['Classification'], test_size=0.2, random_state=42, shuffle=True)

print(f'Cantidad de elementos por clase en TRAIN: \n {y_train.value_counts()} \n')
print(f'Cantidad de elementos por clase en TEST: \n {y_test.value_counts()}')

Cantidad de elementos por clase en TRAIN: 
 Classification
0    1309
1     569
Name: count, dtype: int64 

Cantidad de elementos por clase en TEST: 
 Classification
0    338
1    132
Name: count, dtype: int64


In [4]:
def model_builder(hp):
    classifier = Sequential()
    hp_units_first_layer = hp.Int('units', min_value = 64, max_value = 1024, default = 64, step = 32)
    classifier.add(Dense(units = hp_units_first_layer, activation = 'relu', input_dim = 50))
    
    for i in range(hp.Int('units_layers', min_value = 1, max_value = 5, step = 1)):
        classifier.add(Dense(units = hp.Int(f'units_{i}', min_value=128, max_value=1024, step=32),
                              activation ='relu'))
        if hp.Boolean("dropout_layer"):
            classifier.add(Dropout(rate=hp.Float(f'dropout_rate_{i}', min_value=0.2, max_value=0.5, default=0.25, step=0.05,)))
            classifier.add(BatchNormalization())
    classifier.add(Dense(1, activation = 'sigmoid'))
    
    hp_learning_rate = hp.Float('lr', min_value=1e-8, max_value=1e-2, sampling="log")
    optimizers_dict = {
            "Adam": Adam(learning_rate=hp_learning_rate),
            "Adamax": Adamax(learning_rate=hp_learning_rate),
            "SGD": SGD(learning_rate=hp_learning_rate),
            "Adagrad": Adagrad(learning_rate=hp_learning_rate),
            "RMSprop": RMSprop(learning_rate=hp_learning_rate)
            }
    
    hp_optimizers = hp.Choice(name = 'optimizer', 
        values=['Adam', 'Adamax', 'SGD', 'Adagrad', 'RMSprop']
        )
        
        
    metrics=['accuracy', AUC(curve="ROC", name="roc"), AUC(curve="PR", name="pr"), Precision(name='precision'), get_f1_score, specificity]
    classifier.compile(optimizer=optimizers_dict[hp_optimizers],
                loss = 'binary_crossentropy', 
                metrics = metrics)

    return classifier

In [5]:
#tuner = kt.Hyperband(model_builder,
#                    objective= kt.Objective('val_accuracy', direction='max'),
#                    max_epochs = 50,
#                    directory = 'Data/model_tests',
#                    project_name = 'cancer_classif_Hyp',
#                    overwrite = True)

#tuner.search_space_summary()  

In [6]:
tuner = kt.BayesianOptimization(
    hypermodel=model_builder,
    objective= kt.Objective('val_accuracy', direction='max'),
    max_trials=40,    
    directory = 'Data/model_tests',
    project_name = 'cancer_classif_ByOpt',
    overwrite = True
)
tuner.search_space_summary()

Search space summary
Default search space size: 6
units (Int)
{'default': 64, 'conditions': [], 'min_value': 64, 'max_value': 1024, 'step': 32, 'sampling': 'linear'}
units_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 5, 'step': 1, 'sampling': 'linear'}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 128, 'max_value': 1024, 'step': 32, 'sampling': 'linear'}
dropout_layer (Boolean)
{'default': False, 'conditions': []}
lr (Float)
{'default': 1e-08, 'conditions': [], 'min_value': 1e-08, 'max_value': 0.01, 'step': None, 'sampling': 'log'}
optimizer (Choice)
{'default': 'Adam', 'conditions': [], 'values': ['Adam', 'Adamax', 'SGD', 'Adagrad', 'RMSprop'], 'ordered': False}


In [12]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
checkpoint = ModelCheckpoint('Data/models/best_model.h5', monitor='val_loss', save_best_only=True, mode='min', verbose=1)
tuner.search(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[stop_early, checkpoint])

INFO:tensorflow:Oracle triggered exit


In [8]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the model with the optimal hyperparameters and train it on the data for 50 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [9]:
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model
hypermodel.fit(X_train, y_train, epochs=best_epoch, validation_split=0.2)

Epoch 1/56
Epoch 2/56
Epoch 3/56
Epoch 4/56
Epoch 5/56
Epoch 6/56
Epoch 7/56
Epoch 8/56
Epoch 9/56
Epoch 10/56
Epoch 11/56
Epoch 12/56
Epoch 13/56
Epoch 14/56
Epoch 15/56
Epoch 16/56
Epoch 17/56
Epoch 18/56
Epoch 19/56
Epoch 20/56
Epoch 21/56
Epoch 22/56
Epoch 23/56
Epoch 24/56
Epoch 25/56
Epoch 26/56
Epoch 27/56
Epoch 28/56
Epoch 29/56
Epoch 30/56
Epoch 31/56
Epoch 32/56
Epoch 33/56
Epoch 34/56
Epoch 35/56
Epoch 36/56
Epoch 37/56
Epoch 38/56
Epoch 39/56
Epoch 40/56
Epoch 41/56
Epoch 42/56
Epoch 43/56
Epoch 44/56
Epoch 45/56
Epoch 46/56
Epoch 47/56
Epoch 48/56
Epoch 49/56
Epoch 50/56
Epoch 51/56
Epoch 52/56
Epoch 53/56
Epoch 54/56
Epoch 55/56
Epoch 56/56


<keras.callbacks.History at 0x25d9ff3a260>

In [10]:
# Evaluate the result
eval_result = model.evaluate(X_test, y_test)
print(f"test loss: {eval_result[0]}, test accuracy: {eval_result[1]}")

test loss: 1.1258351802825928, test accuracy: 0.8127659559249878
