# **Import the dataset and SMOTE**

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.activations import elu
from keras import backend as K
import pandas as pd
from imblearn.over_sampling import SMOTE 
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the training and test tables and discard the first column of each
# (everything from the second column onward is kept).
train = pd.read_csv('../dataset/lung_training.csv').iloc[:, 1:]
test = pd.read_csv('../dataset/lung_test.csv').iloc[:, 1:]

In [4]:
# Features are every column except the last; the label is the last column.
X, y = train.iloc[:, :-1], train.iloc[:, -1]
test_X, test_y = test.iloc[:, :-1], test.iloc[:, -1]

# Hold out 25% of the training rows for validation. With shuffle=False the
# rows keep their file order, so no random seed is involved in this split.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=False,
)

# **Ensemble model of NN**

The idea is to fit $k + 1$ NNs models:

0. number zero will be the one trained on the balanced dataset obtained with SMOTE
1. the first one will be the one trained on all lung obs and an equal number of non-lung obs randomly selected
2. ...
3. ...

And so forth up to the $k$-th model.
We then define the prediction in voting fashion, possibly with a weight (i.e. a vote for lung could count more than a vote for non-lung).

As loss function we use the Focal Loss. Here’s the main idea: in our dataset, we will naturally have some training examples that are easier to classify than others (non-lung). During training, these examples will be classified with 99% accuracy, while other more challenging ones may still exhibit poor performance. The problem is that those easily classified training examples are still contributing to the loss. Why are we still giving them equal weight when there are other more challenging data points that if correctly classified can contribute much more to our overall accuracy?

$$FL(p_t) = -\alpha \cdot (1-p_t)^\gamma \cdot \log(p_t)$$

Focal loss down-weights the well-classified examples. This has the net effect of putting more training emphasis on the data that is hard to classify! In a practical setting where we have a data imbalance, our majority class will quickly become well-classified since we have much more data for it. Thus, in order to ensure that we also achieve high accuracy on our minority class, we can use the focal loss to give those minority class examples more relative weight during training.

In [5]:
def focal_loss(y_true, y_pred, gamma=20.0, alpha=0.25):
    """Binary focal loss, summed over every element of the batch.

    Args:
        y_true: tensor of ground-truth labels. Entries equal to 1 contribute
            to the positive term, entries equal to 0 to the negative term.
        y_pred: tensor of predicted positive-class probabilities
            (assumed to be a sigmoid output in [0, 1]).
        gamma: focusing exponent applied to (1 - p) for positives and to p
            for negatives. NOTE(review): the default of 20 is far above the
            value of 2 commonly used for focal loss -- confirm it is intended.
        alpha: weight of the positive term; the negative term gets 1 - alpha.

    Returns:
        Scalar tensor: the negated sum of the weighted log-likelihood terms.
    """
    # Compare labels in the same dtype as the predictions.
    y_true = tf.cast(y_true, y_pred.dtype)

    # Keep probabilities strictly inside (0, 1): a saturated sigmoid would
    # otherwise feed 0 into the logarithms and produce inf/NaN loss values.
    eps = K.epsilon()
    y_pred = K.clip(y_pred, eps, 1. - eps)

    # Positions that do not belong to a term are filled with the neutral
    # value (1 for positives, 0 for negatives) so they contribute exactly 0.
    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

    positive_term = K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
    negative_term = K.sum((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
    return -positive_term - negative_term

**0. SMOTE model**

In [12]:
# Build the SMOTE-resampled training set; the fixed random_state keeps the
# synthetic samples the same from run to run.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(train_X, train_y)
X_res, y_res = resampled

In [13]:
# Fully connected binary classifier for the SMOTE-resampled data.
# The input width is read from the training features instead of being
# hard-coded, so the network keeps matching the data if the columns change.
n_features = train_X.shape[1]

smote_model = keras.Sequential(
    [
        keras.layers.Dense(256, activation="relu", input_shape=(n_features,)),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        # Single sigmoid unit: the output is read as a probability downstream.
        keras.layers.Dense(1, activation='sigmoid')
    ]
)
smote_model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_32 (Dense)             (None, 256)               4452864   
_________________________________________________________________
dense_33 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_16 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_34 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_17 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 257       
Total params: 4,584,705
Trainable params: 4,584,705
Non-trainable params: 0
____________________________________________

In [14]:
# Confusion-matrix counts plus precision and recall, tracked while training.
metrics = [
    metric_cls(name=label)
    for label, metric_cls in (
        ("fn", keras.metrics.FalseNegatives),
        ("fp", keras.metrics.FalsePositives),
        ("tn", keras.metrics.TrueNegatives),
        ("tp", keras.metrics.TruePositives),
        ("precision", keras.metrics.Precision),
        ("recall", keras.metrics.Recall),
    )
]

# Adam optimizer with the notebook's custom focal loss.
smote_model.compile(optimizer='adam', loss=[focal_loss], metrics=metrics)

# The batch size equals the number of resampled rows, i.e. one batch per epoch.
smote_model.fit(
    X_res,
    y_res,
    batch_size=len(X_res),
    epochs=40,
    verbose=0,
)

2021-11-21 15:35:00.517308: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


<keras.callbacks.History at 0x321cfce80>

In [23]:
# Turn the SMOTE model's test-set outputs into boolean predictions at a 0.5
# cut-off and tabulate them against the true test labels.
test_scores = smote_model.predict(test_X)
y_pred = test_scores > 0.5
conf_matr = confusion_matrix(test_y, y_pred)
conf_matr

# TODO: maybe choose a better threshold than 0.5 using the validation set

**$k$ undersampling NNs models**

In [43]:
N_UNDERSAMPLED_MODELS = 100  # k: number of undersampled ensemble members


def build_undersampled_model(n_features):
    """Return an uncompiled MLP for one undersampled ensemble member.

    Args:
        n_features: number of input columns expected by the first layer.

    Returns:
        A keras.Sequential model ending in a single sigmoid unit.
    """
    return keras.Sequential(
        [
            keras.layers.Dense(1000, activation="relu", input_shape=(n_features,)),
            keras.layers.Dense(256, activation="relu"),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(16, activation="relu"),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(1, activation="sigmoid"),
        ]
    )


# The SMOTE-trained network is member 0 of the ensemble.
models = [smote_model]

for i in range(N_UNDERSAMPLED_MODELS):
    # Random undersampling of the training set; the loop index is the seed,
    # so every member gets its own seed and the run is repeatable.
    rus = RandomUnderSampler(random_state=i)
    X_res, y_res = rus.fit_resample(train_X, train_y)

    # Input width comes from the data rather than a hard-coded constant.
    model_i = build_undersampled_model(train_X.shape[1])

    # Metric objects hold their own state, so every model gets fresh copies
    # instead of sharing the instances in `metrics` across all members.
    model_i.compile(
        optimizer='adam',
        loss=[focal_loss],
        metrics=[type(m).from_config(m.get_config()) for m in metrics],
    )

    # The batch size equals the resampled set size, i.e. one batch per epoch.
    model_i.fit(X_res, y_res, batch_size=len(X_res), epochs=40, verbose=0)

    models.append(model_i)
        

2021-11-21 16:04:59.313224: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-21 16:05:02.634004: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-21 16:05:05.706310: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-21 16:05:08.812695: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-21 16:05:11.965844: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-21 16:05:15.110694: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


KeyboardInterrupt: 

**Choosing classification threshold**

In [41]:
# Pick the classification threshold for the averaged ensemble output that
# maximises balanced accuracy (mean of TPR and TNR) on the validation set.
best_eps = 0
best_acc = 0

# Average the validation outputs over all ensemble members. This does not
# depend on the threshold, so it is computed once before the scan.
sum_pred = models[0].predict(val_X)
for member in models[1:]:
    sum_pred += member.predict(val_X)
mean_pred = sum_pred / len(models)

# Candidate thresholds run from 0.40 to 0.99999 in steps of 1e-5. The mean of
# sigmoid outputs cannot exceed 1, so thresholds of 1 or more would all give
# the same all-negative prediction and are not scanned.
for eps in (k/100000 for k in range(60000)):
    ensemble_prediction = mean_pred > (0.40 + eps)
    conf_matr = confusion_matrix(val_y, ensemble_prediction)

    # Rows of the confusion matrix are true classes, columns are predictions.
    tpr = conf_matr[1][1]/(conf_matr[1][0] + conf_matr[1][1])
    tnr = conf_matr[0][0]/(conf_matr[0][0] + conf_matr[0][1])
    eps_acc = (tpr + tnr)/2
    # ">=" keeps the largest threshold among those tied for the best score.
    if eps_acc >= best_acc:
        best_acc = eps_acc
        best_eps = eps

print("Best threshold is: " + str(0.40 + best_eps) + '\n' + 'Best avg acc is: ' + str(best_acc))

Best threshold is: 0.49376000000000003
Best avg acc is: 0.6396628826969384


**Performance on test set**

# **Variable selection with permutation importance**

Complex models such as the neural networks above are hard to interpret. Feature importance techniques were developed to help address this interpretability problem. Feature importance techniques assign a score to each predictor based on its ability to improve predictions. This allows us to rank the predictors in our model based on their relative predictive power.

The idea behind feature importance is simple. Inputs that are useful for prediction contain valuable information. If you destroy that information by randomly shuffling the feature values, the quality of your predictions should decrease. If the decrease in quality is small, then the information in the original predictor wasn’t very impactful in determining your predictions — your model is still pretty good without it. Furthermore, if the decrease is large, then the information in the original predictor had a large impact on your predictions.

This idea is implemented in three simple steps. Say that you’ve trained an ML model and recorded some measure of quality for the predictions (e.g., MSE, log-loss, etc.). For each predictor in the dataset:

1. Randomly shuffle the data in the predictor while keeping the values of other predictors constant

2. Generate new predictions based on the shuffled values and evaluate the quality of your new predictions

3. Compute the feature importance score by calculating the decrease in the quality of your new predictions relative to your original predictions

Once you’ve computed feature importance scores for all of your features, you can rank them in terms of predictive usefulness. To help explain permutation feature importance more concretely, consider the following synthetic case study.