## DNN Model for binary classification Cancer or No Cancer

## Imports

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import SGD, Adam, Adamax, RMSprop
from tensorflow.keras import layers, activations
import keras_tuner as kt
from statistics import mean
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.callbacks import ModelCheckpoint


## Load Data

In [2]:
#load dataframe
outlier_folder = "without_outliers"
#outlier_folder = "with_outliers"
df_genes_labels_file = f'Data/{outlier_folder}/df_genes_labels.pkl'
if os.path.exists(df_genes_labels_file):
    df_genes_labels = pd.read_pickle(df_genes_labels_file)
else:
    raise Exception("You need to run the notebook \"PrepareDataExploratoryAnalysis\".")

df_genes_labels_file = f'Data/{outlier_folder}/df_expression_data_lasso.pkl'
if os.path.exists(df_genes_labels_file):
    df_genes_labels_selected = pd.read_pickle(df_genes_labels_file)
else:
    raise Exception("You need to run the notebook \"ReduceDimensionality\".")

labels = df_genes_labels.iloc[:, 5440:5441]

X_train, X_test, y_train, y_test = train_test_split(df_genes_labels_selected, labels, test_size=0.2, random_state=42, shuffle=True)

print(f'Cantidad de elementos por clase en TRAIN: \n {y_train.value_counts()} \n')
print(f'Cantidad de elementos por clase en TEST: \n {y_test.value_counts()}')

Cantidad de elementos por clase en TRAIN: 
 Classification
1                 1050
0                  520
Name: count, dtype: int64 

Cantidad de elementos por clase en TEST: 
 Classification
1                 276
0                 117
Name: count, dtype: int64


In [23]:
def model_builder(hp):
    classifier = Sequential()
    hp_units_first_layer = hp.Int('units', min_value = 32, max_value = 128, default = 64, step = 32)
    classifier.add(Dense(units = hp_units_first_layer, activation = 'relu', input_dim = 50))
    
    for i in range(hp.Int('units_layers', min_value = 1, max_value = 5, step = 1)):
        classifier.add(Dense(units = hp.Int(f"units_{i}", min_value=32, max_value=512, step=32),
                              activation = hp.Choice("activation", ["relu", "elu"])))
        if hp.Boolean("dropout_layer"):
            classifier.add(Dropout(rate=hp.Float(f"dropout_rate_{i}", min_value=0.2, max_value=0.5, default=0.25, step=0.05,)))
    classifier.add(Dense(1, activation = 'sigmoid'))

    hp_learning_rate = hp.Float("lr", min_value=1e-8, max_value=1e-2, sampling="log")
    classifier.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss = 'binary_crossentropy', 
                metrics = ['accuracy'])

    return classifier

In [25]:
tuner = kt.RandomSearch(
    hypermodel=model_builder,
    objective="val_accuracy",
    max_trials=50,
    executions_per_trial=5,
    overwrite=True,
    directory="Data/model_tests",
    project_name="cancer_classif_RS",
)

In [26]:
tuner.search_space_summary()

Search space summary
Default search space size: 6
units (Int)
{'default': 64, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': 'linear'}
units_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 5, 'step': 1, 'sampling': 'linear'}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
activation (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'elu'], 'ordered': False}
dropout_layer (Boolean)
{'default': False, 'conditions': []}
lr (Float)
{'default': 1e-08, 'conditions': [], 'min_value': 1e-08, 'max_value': 0.01, 'step': None, 'sampling': 'log'}


In [27]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
tuner.search(df_genes_labels_selected, labels, epochs=500, validation_split=0.2, callbacks=[stop_early])

Trial 5 Complete [00h 01m 41s]
val_accuracy: 0.7180661559104919

Best val_accuracy So Far: 0.736386775970459
Total elapsed time: 00h 06m 07s
INFO:tensorflow:Oracle triggered exit


In [28]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the model with the optimal hyperparameters and train it on the data for 50 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(df_genes_labels_selected, labels_enc, epochs=50, validation_split=0.2)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

NameError: name 'labels_enc' is not defined

In [None]:
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model
hypermodel.fit(df_genes_labels_selected, labels_enc, epochs=best_epoch, validation_split=0.2)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x1b6e1494c40>