# CNN with a Trainable Embedding Layer

In [1]:
%matplotlib inline

import os

# Order CUDA devices by PCI bus ID and expose only device "0" to this process.
# Set here, ahead of the TensorFlow import in the next cell.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

In [2]:
import ktrain
from ktrain import text

import numpy as np
import tensorflow as tf

## Build test and training sets & define preprocessing

In [3]:
# Preprocessing hyperparameters consumed by the data-loading cell below.
NUM_WORDS = 50_000   # vocabulary cap, passed to texts_from_csv as max_features
MAXLEN = 1_000       # fixed sequence length, passed to texts_from_csv as maxlen
NGRAM_RANGE = 1      # passed as ngram_range (presumably unigrams only — TODO confirm)

In [4]:
# Load the training CSV plus a separate validation CSV. The text comes from the
# 'cleaned_contents' column and the label from 'Discrimination_Label'.
# Returns the two preprocessed datasets and the fitted preprocessor.
train, test, preproc = text.texts_from_csv(
    '../data/train.csv',
    'cleaned_contents',
    label_columns=['Discrimination_Label'],
    val_filepath='../data/test.csv',
    max_features=NUM_WORDS,
    maxlen=MAXLEN,
    ngram_range=NGRAM_RANGE,
)

detected encoding: utf-8 (if wrong, set manually)
language: en
Word Counts: 17751
Nrows: 647
647 train sequences
train sequence lengths:
	mean : 1526
	95percentile : 4005
	99percentile : 7619
x_train shape: (647,1000)
y_train shape: (647, 2)
Is Multi-Label? False
162 test sequences
test sequence lengths:
	mean : 1414
	95percentile : 3320
	99percentile : 7203
x_test shape: (162,1000)
y_test shape: (162, 2)


## Build the model

In [5]:
def _build_cnn(maxlen, max_features, embed_dim, filters, kernels,
               loss_func='categorical_crossentropy',
               activation = 'softmax', metrics=['accuracy'],
               verbose=1, optimizer='adam', num_classes=2):
    """Build and compile a multi-branch 1D CNN classifier over a trainable embedding.

    The token-id input is embedded, then fed in parallel to one
    Conv1D + MaxPool1D branch per entry in ``kernels``. The pooled branch
    outputs are concatenated and passed through Dense(64) -> Dropout(0.1)
    -> Flatten -> Dense(num_classes).

    Args:
        maxlen: Length of each (padded/truncated) input sequence.
        max_features: Vocabulary size, i.e. the embedding's input dimension.
        embed_dim: Dimensionality of the embedding vectors.
        filters: Number of convolution filters in each branch.
        kernels: Iterable of convolution kernel sizes; one branch is built per
            size. Any number (>= 1) of sizes is accepted.
        loss_func: Loss passed to ``model.compile``.
        activation: Activation of the final Dense layer.
        metrics: Metrics passed to ``model.compile``.
        verbose: Currently unused; retained so existing callers keep working.
        optimizer: Optimizer passed to ``model.compile``.
        num_classes: Number of units in the output layer (defaults to 2).

    Returns:
        A compiled ``tf.keras.Model``.

    Raises:
        ValueError: If ``kernels`` is empty or a kernel size is not in
            ``1..maxlen``.
    """
    kernels = list(kernels)
    if not kernels:
        raise ValueError('kernels must contain at least one kernel size')

    # set up the model
    inp = tf.keras.layers.Input(shape=(maxlen,))
    embedded = tf.keras.layers.Embedding(max_features, embed_dim, input_length=maxlen,
                                         trainable=True)(inp)

    # One convolution + max-pool branch per kernel size, all reading the same embedding.
    branches = []
    for kernel_size in kernels:
        if kernel_size < 1 or kernel_size > maxlen:
            raise ValueError('kernel size %r must be between 1 and maxlen=%r'
                             % (kernel_size, maxlen))
        conv = tf.keras.layers.Conv1D(filters=filters,
                                      kernel_size=kernel_size,
                                      activation='relu')(embedded)
        # The convolution yields maxlen - kernel_size + 1 steps, so this pool
        # window spans every step of the branch (max over the whole sequence).
        pooled = tf.keras.layers.MaxPool1D(pool_size=maxlen - kernel_size + 1)(conv)
        branches.append(pooled)

    # A single branch needs no merge; otherwise join the branches on the last axis.
    if len(branches) == 1:
        x = branches[0]
    else:
        x = tf.keras.layers.concatenate(branches)

    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Flatten()(x)
    outputs = tf.keras.layers.Dense(num_classes, activation=activation)(x)
    model = tf.keras.Model(inputs=inp, outputs=outputs)
    model.compile(loss=loss_func,
                  optimizer=optimizer,
                  metrics=metrics)
    return model

In [6]:
# Build the network from the same MAXLEN / NUM_WORDS used for preprocessing, so the
# model's input length and vocabulary size cannot drift from the data.
# 100 is the embedding dimension; each of the five kernel sizes gets 10 filters.
model = _build_cnn(MAXLEN, NUM_WORDS, 100, filters=10, kernels=[2, 3, 4, 5, 6])
learner = ktrain.get_learner(model, train_data=train, val_data=test)

In [7]:
# Print the layer-by-layer architecture (output shapes and parameter counts) of the CNN.
learner.model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1000)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1000, 100)    5000000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 999, 10)      2010        embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 998, 10)      3010        embedding[0][0]                  
______________________________________________________________________________________________

## Define and train the model

In [8]:
# NOTE(review): this repeats the summary call from the previous cell and prints the
# same table — consider removing one of the two.
learner.model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1000)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1000, 100)    5000000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 999, 10)      2010        embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 998, 10)      3010        embedding[0][0]                  
______________________________________________________________________________________________

### Find a good initial learning rate

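The learning-rate range test used below was developed by Leslie N. Smith at the U.S. Naval Research Laboratory and later popularized by Jeremy Howard (fast.ai). It briefly trains the model while increasing the learning rate and plots the loss, so that a good initial learning rate can be read off the curve.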

In [9]:
# Simulate training over a range of learning rates and plot the resulting loss curve.
# NOTE(review): the saved output of this cell ends in `KeyError: 'loss'`, so no plot was
# produced — possibly a ktrain/TensorFlow version mismatch; confirm before relying on it.
learner.lr_find(show_plot=True)

simulating training for different learning rates... this may take a few moments...


  'If you wish to estimate LR using more epochs, set max_epochs manually.')


Train on 647 samples
Epoch 1/5
  0/647 [..............................] - ETA: 0s

KeyError: 'loss'

### Train

In [15]:
# Train with a maximum learning rate of 0.01 and early_stopping=5. Per the logged output,
# ktrain uses a triangular learning-rate policy, enables reduce-on-plateau automatically,
# and reloads the weights from the best epoch when training stops.
# NOTE(review): the execution count jumps from 9 to 15 here, so the saved results came
# from out-of-order or repeated runs — re-verify with Restart & Run All.
learner.autofit(0.01, early_stopping=5)

reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 0.01...
Train on 647 samples, validate on 162 samples
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 00003: Reducing Max LR on Plateau: new max lr will be 0.005 (if not early_stopping).
Epoch 4/1024
Epoch 5/1024
Epoch 00005: Reducing Max LR on Plateau: new max lr will be 0.0025 (if not early_stopping).
Epoch 6/1024
Epoch 00006: early stopping
Weights from best epoch have been loaded into model.


<tensorflow.python.keras.callbacks.History at 0x7fdad00a26d8>

## Examine results

In [16]:
# Evaluate on the validation set, labelling rows with the preprocessor's class names.
# The saved output shows a per-class precision/recall/F1 report followed by the
# confusion matrix.
learner.validate(class_names=preproc.get_classes())

              precision    recall  f1-score   support

           0       0.72      0.58      0.64        65
           1       0.75      0.85      0.80        97

    accuracy                           0.74       162
   macro avg       0.73      0.71      0.72       162
weighted avg       0.74      0.74      0.74       162



array([[38, 27],
       [15, 82]])

## Explain a prediction

In [17]:
import pandas as pd
# Reload the raw validation texts from the same file that was passed as val_filepath
# during preprocessing ('../data/test.csv'); the previous './data/test.csv' pointed at a
# different directory than the one the data was actually loaded from.
X_test = pd.read_csv('../data/test.csv')['cleaned_contents']

In [18]:
# Wrap the trained model together with its preprocessor so raw text can be passed in directly.
predictor = ktrain.get_predictor(learner.model, preproc=preproc)

In [19]:
# Explain the prediction for one hand-written example sentence; the saved output lists the
# contribution of the highlighted text and of the bias term.
predictor.explain('As the perpetrator is the sole breadwinner for his family, I reduce his sentence by two years.')

Contribution?,Feature
1.181,Highlighted in text (sum)
-0.084,<BIAS>
