# CNN with a Trainable Embedding Layer - Optimize hyperparameters
## With 80/10/10 split

In [1]:
%matplotlib inline

import os

# Order GPUs by PCI bus id and expose only device "0" to this process.
# NOTE(review): presumably this has to run before TensorFlow is imported to take effect.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import ktrain
from ktrain import text

import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

## Build test and training sets & define preprocessing

In [3]:
# Preprocessing / model-size configuration shared by the cells below.

# Vocabulary size: passed as max_features to the preprocessor.
NUM_WORDS = 50000

# Padded/truncated sequence length. The recorded outputs show x_train with
# shape (647, 5000) and a model input of (None, 5000), so 5000 is the value
# the notebook was last run with.
MAXLEN = 5000

# n-gram range handed to text.texts_from_csv.
# NOTE(review): the original value is not visible in this notebook; 1 (unigrams
# only) is assumed because the embedding was sized for exactly NUM_WORDS ids.
# Confirm before relying on it.
NGRAM_RANGE = 1

In [4]:
# Load and preprocess the training CSV and the validation CSV.
# Note: despite its name, `test` holds the *validation* split (val_80_10_10.csv);
# the held-out test CSV is only read further down for the final evaluation.
train, test, preproc = text.texts_from_csv(
    '../data/train_80_10_10.csv',
    'cleaned_contents',
    label_columns=['Discrimination_Label'],
    val_filepath='../data/val_80_10_10.csv',
    max_features=NUM_WORDS,
    maxlen=MAXLEN,
    ngram_range=NGRAM_RANGE,
)

detected encoding: utf-8 (if wrong, set manually)
language: en
Word Counts: 17531
Nrows: 647
647 train sequences
train sequence lengths:
	mean : 1501
	95percentile : 3939
	99percentile : 7668
x_train shape: (647,5000)
y_train shape: (647, 2)
Is Multi-Label? False
81 test sequences
test sequence lengths:
	mean : 1510
	95percentile : 4141
	99percentile : 5507
x_test shape: (81,5000)
y_test shape: (81, 2)


## Build the model

In [5]:
def _build_cnn(maxlen, max_features, embed_dim, filters, kernels, density,
               dropout=0.1,
               loss_func='categorical_crossentropy',
               activation = 'softmax', metrics=['accuracy'],
               verbose=1, optimizer='adam'):
    
    embedding_matrix = np.ones((max_features, 1))
    embedding_matrix[0] = 0

    # set up the model
    inp = tf.keras.layers.Input(shape=(maxlen,))
    x = tf.keras.layers.Embedding(max_features, embed_dim, input_length=maxlen, 
                                  trainable=True)(inp)
    x0 = tf.keras.layers.Conv1D(filters=filters,
                               kernel_size=kernels[0],
                               activation='relu')(x)
    x0 = tf.keras.layers.MaxPool1D(pool_size=maxlen - kernels[0] + 1)(x0)

    x1 = tf.keras.layers.Conv1D(filters=filters,
                                kernel_size=kernels[1],
                                activation='relu')(x)
    x1 = tf.keras.layers.MaxPool1D(pool_size=maxlen - kernels[1] + 1)(x1)
    
    x2 = tf.keras.layers.Conv1D(filters=filters,
                                kernel_size=kernels[2],
                                activation='relu')(x)
    x2 = tf.keras.layers.MaxPool1D(pool_size=maxlen - kernels[2] + 1)(x2)
    
    x3 = tf.keras.layers.Conv1D(filters=filters,
                                kernel_size=kernels[3],
                                activation='relu')(x)
    x3 = tf.keras.layers.MaxPool1D(pool_size=maxlen - kernels[3] + 1)(x3)

    x4 = tf.keras.layers.Conv1D(filters=filters,
                                kernel_size=kernels[4],
                                activation='relu')(x)
    x4 = tf.keras.layers.MaxPool1D(pool_size=maxlen - kernels[4] + 1)(x4)

    x = tf.keras.layers.concatenate([x0, x1, x2, x3, x4])

    x = tf.keras.layers.Dense(density, activation='relu')(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    x = tf.keras.layers.Flatten()(x)
    outputs = tf.keras.layers.Dense(2, activation=activation)(x)
    
    model = tf.keras.Model(inputs=inp, outputs=outputs)
    
    model.compile(loss=loss_func,
                  optimizer=optimizer,
                  metrics=metrics)
    
    train, test, preproc = text.texts_from_csv('../data/train_80_10_10.csv',
                                          'cleaned_contents',
                                          label_columns=['Discrimination_Label'],
                                          val_filepath='../data/val_80_10_10.csv',
                                          max_features=NUM_WORDS,
                                          maxlen=maxlen,
                                          ngram_range=NGRAM_RANGE)
    
    return model

In [6]:
# Build the CNN and wrap it in a ktrain learner.
# - max_features reuses NUM_WORDS so the embedding size cannot drift from the
#   vocabulary size used during preprocessing.
# - `density` is a required argument of _build_cnn that the original call omitted
#   (TypeError on a fresh run).
#   NOTE(review): the value used for the recorded results is not visible in this
#   notebook; 64 is a placeholder - confirm or tune.
# - `val_data=test`: `test` is the validation split loaded from val_80_10_10.csv.
model = _build_cnn(MAXLEN, NUM_WORDS, 100, filters=32, kernels=[2, 3, 4, 5, 6],
                   density=64, dropout=0.4)
learner = ktrain.get_learner(model, train_data=train, val_data=test)

In [9]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections)
# of the model wrapped by the learner.
learner.model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 5000)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 5000, 100)    5000000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 4999, 32)     6432        embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 4998, 32)     9632        embedding[0][0]                  
______________________________________________________________________________________________

## Define and train the model

### Find a good initial learning rate

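The learning-rate range test used below was developed by Leslie Smith at the U.S. Naval Research Laboratory and has been popularized by Jeremy Howard. It trains briefly while increasing the learning rate and plots the loss, so a good maximum learning rate can be read off the curve.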

In [None]:
# Run ktrain's learning-rate finder and show the resulting plot.
# NOTE(review): presumably the max LR passed to autofit below was read off this plot.
learner.lr_find(show_plot=True)

### Train

In [8]:
# Train with a max learning rate of 0.001 and early stopping enabled (early_stopping=5).
# Per the recorded output, ktrain uses a triangular LR policy here, enables
# reduce-on-plateau automatically, and reloads the best epoch's weights when it stops.
learner.autofit(0.001, early_stopping=5)

reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 0.001...
Train on 647 samples, validate on 81 samples
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 5/1024
Epoch 6/1024
Epoch 7/1024
Epoch 00007: Reducing Max LR on Plateau: new max lr will be 0.0005 (if not early_stopping).
Epoch 8/1024
Epoch 9/1024
Epoch 00009: Reducing Max LR on Plateau: new max lr will be 0.00025 (if not early_stopping).
Epoch 10/1024
Epoch 00010: early stopping
Weights from best epoch have been loaded into model.


<tensorflow.python.keras.callbacks.History at 0x7fd367ebb8d0>

## Examine results

In [10]:
# Evaluate on the learner's validation data (81 samples in the recorded output):
# prints a classification report and returns the confusion matrix.
learner.validate(class_names=preproc.get_classes())

              precision    recall  f1-score   support

           0       0.62      0.45      0.53        33
           1       0.68      0.81      0.74        48

    accuracy                           0.67        81
   macro avg       0.65      0.63      0.63        81
weighted avg       0.66      0.67      0.65        81



array([[15, 18],
       [ 9, 39]])

In [26]:
# Create the predictor in this cell: it is first needed here, and a top-to-bottom
# run would otherwise hit a NameError because it was only defined in a later cell.
predictor = ktrain.get_predictor(learner.model, preproc=preproc)

# Read the validation CSV once and take both the texts and the labels from it.
val_df = pd.read_csv('../data/val_80_10_10.csv')
val_x = list(val_df['cleaned_contents'])
val_y = np.array(val_df['Discrimination_Label'])

# Column 1 of predict_proba is the score of the class at index 1.
# NOTE(review): presumably that is label 1 - confirm against preproc.get_classes().
y_val_proba = np.array(predictor.predict_proba(val_x))[:,1]
# Predicted class labels, converted to integers for comparison with val_y.
y_val_hat = np.array(predictor.predict(val_x), dtype=np.int64)

In [29]:
# Sweep the decision threshold from 0.01 to 0.99 in steps of 0.01 and print a
# classification report on the validation set for each cut-off.
for percent in range(1, 100):
    threshold = percent / 100
    predicted_positive = y_val_proba >= threshold
    print(threshold, classification_report(val_y, predicted_positive))

0.01               precision    recall  f1-score   support

           0       0.00      0.00      0.00        33
           1       0.59      1.00      0.74        48

    accuracy                           0.59        81
   macro avg       0.30      0.50      0.37        81
weighted avg       0.35      0.59      0.44        81

0.02               precision    recall  f1-score   support

           0       0.00      0.00      0.00        33
           1       0.59      1.00      0.74        48

    accuracy                           0.59        81
   macro avg       0.30      0.50      0.37        81
weighted avg       0.35      0.59      0.44        81

0.03               precision    recall  f1-score   support

           0       0.00      0.00      0.00        33
           1       0.59      1.00      0.74        48

    accuracy                           0.59        81
   macro avg       0.30      0.50      0.37        81
weighted avg       0.35      0.59      0.44        81

0.04

In [18]:
# Display the predicted labels for the validation set.
y_val_hat

array([1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0])

In [13]:
# Read the held-out test CSV once and take both the texts and the labels from it.
test_df = pd.read_csv('../data/test_80_10_10.csv')
test_x = list(test_df['cleaned_contents'])
y = np.array(test_df['Discrimination_Label'])

# Wrap the trained model with its preprocessor so raw text can be scored directly.
predictor = ktrain.get_predictor(learner.model, preproc=preproc)
# Predicted class labels, converted to integers for comparison with y.
y_hat = np.array(predictor.predict(test_x), dtype=np.int64)

In [14]:
# Held-out test set: per-class precision/recall/F1, then the confusion matrix.
test_report = classification_report(y, y_hat)
test_confusion = confusion_matrix(y, y_hat)
print(test_report)
print(test_confusion)

              precision    recall  f1-score   support

           0       0.79      0.51      0.62        37
           1       0.68      0.89      0.77        44

    accuracy                           0.72        81
   macro avg       0.74      0.70      0.70        81
weighted avg       0.73      0.72      0.70        81

[[19 18]
 [ 5 39]]


## Explain a prediction

In [None]:
# Ask the predictor to explain its prediction for a single example sentence.
predictor.explain('As the perpetrator is the sole breadwinner for his family, I reduce his sentence by two years.')