# **Import the dataset and SMOTE**

In [13]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.activations import elu
from keras import backend as K
import pandas as pd
from imblearn.over_sampling import SMOTE 
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
from sklearn.model_selection import train_test_split
from focal_loss import BinaryFocalLoss

In [2]:
# Read the training and test CSV files, keeping every column except the first.
# NOTE(review): the dropped first column is presumably a saved row index — confirm.
train = pd.read_csv('../../dataset/lung_training.csv').iloc[:, 1:]
test = pd.read_csv('../../dataset/lung_test.csv').iloc[:, 1:]

In [3]:
# Separate predictors from the target: the last column of each frame is used
# as the label and all remaining columns as the input features.
train_X, train_y = train.iloc[:, :-1], train.iloc[:, -1]
test_X, test_y = test.iloc[:, :-1], test.iloc[:, -1]

# No validation set is carved out here; a hold-out variant was left disabled:
# train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.25, shuffle=False)

# **Ensemble model of NN**

The idea is to fit $k + 1$ NN models:

1. the first one will be the one trained on all lung obs and an equal number of non-lung obs randomly selected
2. ...
3. ...

And so forth up to the $k$-th model. 
We then define the prediction in a voting fashion, possibly with a weight (i.e. a vote for lung could count more than a vote for non-lung).

As loss function we use the Focal Loss. Here’s the main idea: in our dataset, we will naturally have some training examples that are easier to classify than others (non-lung). During training, these examples will be classified with 99% accuracy, while other more challenging ones may still exhibit poor performance. The problem is that those easily classified training examples are still contributing to the loss. Why are we still giving them equal weight when there are other more challenging data points that if correctly classified can contribute much more to our overall accuracy?

$$FL(p_t) = -\alpha \cdot (1-p_t)^\gamma \cdot \log{p_t}$$

Focal loss down-weights the well-classified examples. This has the net effect of putting more training emphasis on the data that is hard to classify! In a practical setting where we have a data imbalance, our majority class will quickly become well-classified since we have much more data for it. Thus, in order to ensure that we also achieve high accuracy on our minority class, we can use the focal loss to give those minority class examples more relative weight during training. 

**$k$ undersampling NN models**

In [23]:
# Train an ensemble of 30 identically-structured networks. Each member is
# fitted on its own random undersample of the training data, so the members
# differ in which rows they see (and in their seeded weight initialisation).
models = []

for i in range(30):
    # Random undersampling of the training set; using the loop index as the
    # seed gives every ensemble member a different, repeatable subsample.
    rus = RandomUnderSampler(random_state=i)
    X_res, y_res = rus.fit_resample(train_X, train_y)

    # Seed TensorFlow as well, so weight initialisation and dropout masks are
    # repeatable across "Restart & Run All" (previously only the resampling
    # was seeded). GPU kernels may still add small run-to-run differences.
    tf.random.set_seed(i)

    # Two hidden ReLU layers with dropout and a single sigmoid output unit.
    # The input width is taken from the data rather than hard-coded, so the
    # cell keeps working if the number of feature columns changes.
    model_i = keras.Sequential(
        [
            keras.layers.Dense(500, activation="relu", input_shape=(X_res.shape[1],)),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(300, activation="relu"),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(1, activation="sigmoid"),
        ]
    )

    # Fresh metric objects are created for every model, so no metric state is
    # shared between ensemble members.
    metrics = [
        keras.metrics.FalseNegatives(name="fn"),
        keras.metrics.FalsePositives(name="fp"),
        keras.metrics.TrueNegatives(name="tn"),
        keras.metrics.TruePositives(name="tp"),
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
    ]

    model_i.compile(
        optimizer='adam', loss=BinaryFocalLoss(gamma=2), metrics=metrics
    )

    # batch_size equals the number of resampled rows: one full-batch
    # gradient update per epoch.
    model_i.fit(X_res, y_res, batch_size=len(X_res), epochs=40, verbose=0)

    models.append(model_i)
        

2021-11-27 11:14:51.855061: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:14:54.800870: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:14:57.506237: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:15:00.203026: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:15:02.969297: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:15:05.675282: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:15:08.383919: I tensorflow/core/grappler/optimizers/cust

In [24]:
# Combine the ensemble: add up every model's sigmoid output on the test set,
# average over the number of models, and flag a row as True when the mean
# output is above 0.5.
sum_pred = sum(member.predict(test_X) for member in models)
ensemble_prediction = sum_pred / len(models) > 0.5

# Confusion matrix of the true test labels against the ensemble decision.
conf_matr = confusion_matrix(test_y, ensemble_prediction)
conf_matr

2021-11-27 11:16:35.210976: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:16:35.383967: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:16:35.567069: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:16:35.740877: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:16:35.916717: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:16:36.099589: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-27 11:16:36.258769: I tensorflow/core/grappler/optimizers/cust

array([[109,  66],
       [  5,  25]])