# Notebook for intent classifier model training

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

sys.path.append('..')
from src.utils import load_data

# Let DataFrame previews show up to 120 characters per cell before truncating.
pd.options.display.max_colwidth = 120

### Load the training and test sets

In [None]:
# Locations of the two CSV splits, relative to the notebook's folder.
train_file_path = '../dataset/train.csv'
test_file_path = '../dataset/test.csv'

# Read both splits with the project's loader, training split first.
df_train, df_test = [load_data(csv_path)
                     for csv_path in (train_file_path, test_file_path)]

In [None]:
# Preview the first five rows of the test split.
df_test.head(n=5)

In [None]:
# Preview the first five rows of the training split.
df_train.head(n=5)

### Clean the dataset

In [None]:
from src.models import INTENT_CLSF_STOPWORDS
from src.utils import clean_sentences
import nltk

# Fetch the NLTK resources (presumably required by the cleaning helpers
# in src.utils - TODO confirm which ones are actually used).
for _nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(_nltk_resource)

# One lemmatizer instance, passed to every cleaning call in this notebook.
INPUT_LEMMATIZER = nltk.wordnet.WordNetLemmatizer()
# Character policy handed to the cleaning helpers; it reads like the inside of
# a negated regex character class - TODO confirm against src.utils.
INPUT_CLEAN_POLICY = '^ a-z A-Z 0-9 @#'

In [None]:
# Clean the training questions using the shared stop-word list, character
# policy and lemmatizer.
_train_cleaning_options = {
    'stopwords': INTENT_CLSF_STOPWORDS,
    'clean_policy': INPUT_CLEAN_POLICY,
    'lemmatizer': INPUT_LEMMATIZER,
}
train_cleaned_questions = clean_sentences(sentences=df_train['question'],
                                          **_train_cleaning_options)

In [None]:
# Clean the test questions with exactly the same settings as the training
# questions, so both splits go through identical preprocessing.
_test_cleaning_options = {
    'stopwords': INTENT_CLSF_STOPWORDS,
    'clean_policy': INPUT_CLEAN_POLICY,
    'lemmatizer': INPUT_LEMMATIZER,
}
test_cleaned_questions = clean_sentences(sentences=df_test['question'],
                                         **_test_cleaning_options)

### Encode the input datasets

In [None]:
from src.utils import create_tokenizer, save_tokenizer, load_tokenizer
from src.utils import get_words_max_length, encode_sentences, pad_sentences

The tokenizer can be created from scratch or loaded from disk.

In [None]:
# Option 1 - build a new tokenizer from the cleaned training questions
# (uncomment the two lines below and comment out the load_tokenizer call):
# input_tokenizer = create_tokenizer(train_cleaned_questions, 
#                                   filters='!"#$%&()*+,-./:;<=>?[\]^_`{|}~')
# Option 2 (active) - reuse the tokenizer stored under ../model_params.
# NOTE(review): the '.pickle' extension suggests the file is unpickled; only
# load tokenizer files that come from a trusted source.
input_tokenizer = load_tokenizer('../model_params/input_tokenizer.pickle')

In [None]:
# Sequence length used later for padding, derived from the cleaned training
# questions.
WORDS_MAX_LENGTH = get_words_max_length(train_cleaned_questions)
# One more than the number of known words (presumably index 0 is reserved
# for padding - TODO confirm against the tokenizer implementation).
VOCAB_SIZE = 1 + len(input_tokenizer.word_index)

_size_report = "Vocab Size = %d and Maximum length = %d" % (VOCAB_SIZE, WORDS_MAX_LENGTH)
print(_size_report)

In [None]:
# Encode the cleaned questions of both splits with the same input tokenizer
# (training split first, then test split).
train_encoded_sentences, test_encoded_sentences = (
    encode_sentences(input_tokenizer, cleaned_questions)
    for cleaned_questions in (train_cleaned_questions, test_cleaned_questions)
)

In [None]:
# Pad the encoded questions of both splits to the common length
# WORDS_MAX_LENGTH (training split first, then test split).
train_padded_sentences, test_padded_sentences = (
    pad_sentences(encoded_sentences, WORDS_MAX_LENGTH)
    for encoded_sentences in (train_encoded_sentences, test_encoded_sentences)
)

### Encode the output intent string labels
The tokenizer for the output (intent label strings) uses a modified filter that does not contain the underscore '_' symbol.

As with the input tokenizer, it can be created from scratch or loaded from disk.

In [None]:
from src.utils import encode_output_labels, one_hot

# Characters filtered out by the output (intent label) tokenizer. Neither the
# underscore '_' nor the dot '.' is listed. Written as a raw string so the
# backslash before ']' is a literal character rather than an invalid escape
# sequence; the resulting value is unchanged.
OUTPUT_CLEAN_POLICY = r'!"#$%&()*+,-/:;<=>?@[\]^`{|}~'

In [None]:
# Option 1 - build a new tokenizer from the distinct intent labels of the
# training set (uncomment the three lines below and comment out the
# load_tokenizer call):
# intent_labels = list(set(df_train['intent']))
# output_tokenizer = create_tokenizer(intent_labels, 
#                                     filters = OUTPUT_CLEAN_POLICY)
# Option 2 (active) - reuse the tokenizer stored under ../model_params.
# NOTE(review): the '.pickle' extension suggests the file is unpickled; only
# load tokenizer files that come from a trusted source.
output_tokenizer = load_tokenizer('../model_params/output_tokenizer.pickle')

In [None]:
# Encode the intent labels of both splits with the output tokenizer
# (training split first, then test split).
train_encoded_output, test_encoded_output = (
    encode_output_labels(output_tokenizer, split_frame['intent'])
    for split_frame in (df_train, df_test)
)

In [None]:
# One-hot encode the label indices of both splits
# (training split first, then test split).
train_output_one_hot, test_output_one_hot = (
    one_hot(encoded_labels)
    for encoded_labels in (train_encoded_output, test_encoded_output)
)

### Split the training set into train/validation subsets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed so the same rows land in the validation subset on every run;
# without it the shuffled split (and every validation metric derived from it)
# differs between executions of the notebook.
SPLIT_RANDOM_SEED = 42

# Hold out 20% of the padded training questions (and their one-hot labels)
# for validation.
train_X, val_X, train_Y, val_Y = train_test_split(train_padded_sentences,
                                                  train_output_one_hot,
                                                  shuffle=True,
                                                  test_size=0.2,
                                                  random_state=SPLIT_RANDOM_SEED)

print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

### Create the model

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, Dense, Dropout, LSTM

In [None]:
# Default width of the softmax output layer: one unit per label known to the
# output tokenizer.
NR_OF_INTENT_CLASSES = len(output_tokenizer.word_index)


def create_model(vocab_size, max_length, nr_of_classes=None):
    """Build the (uncompiled) intent classifier network.

    The layer stack is: Embedding(128) -> Bidirectional(LSTM(128)) ->
    Dense(256, relu) -> Dropout(0.5) -> Dense(nr_of_classes, softmax).

    Args:
        vocab_size: Input dimension of the embedding layer.
        max_length: Length of the padded input sequences.
        nr_of_classes: Number of units in the softmax output layer. When
            omitted, the notebook-level NR_OF_INTENT_CLASSES is used, which
            is the behaviour callers had before this parameter existed.

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compiling
        it.
    """
    if nr_of_classes is None:
        nr_of_classes = NR_OF_INTENT_CLASSES

    model = Sequential()
    # NOTE(review): the embedding is frozen (trainable=False) and no
    # pretrained weights are passed here, so it keeps its initial values
    # during training - confirm this is intended.
    model.add(Embedding(vocab_size, 128, input_length=max_length, trainable=False))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(256, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(nr_of_classes, activation="softmax"))

    return model

In [None]:
# Instantiate the network for the vocabulary size and padded sequence length
# computed earlier in the notebook.
model = create_model(VOCAB_SIZE, WORDS_MAX_LENGTH)

# Multi-class set-up: categorical cross-entropy on one-hot labels, Adam
# optimizer, accuracy reported as the metric.
_compile_options = {
    'loss': "categorical_crossentropy",
    'optimizer': "adam",
    'metrics': ["accuracy"],
}
model.compile(**_compile_options)
model.summary()

### Train the model

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
import os

In [None]:
# Make sure the checkpoint directory exists; exist_ok avoids the separate
# "check, then create" step.
os.makedirs('trained_weights', exist_ok=True)
# File name pattern filled in by the checkpoint callback with the epoch
# number and the validation loss of that epoch.
filename = 'trained_weights/intent_classifier_{epoch:04d}-{val_loss:.8f}.h5'
tensorboard_log_dir = 'tb_logs'

# save_best_only is not passed, so (with the Keras default) a file is written
# after every epoch rather than only for the best one.
checkpoint = ModelCheckpoint(filename,
                             monitor='val_loss',
                             verbose=1,
                             mode='min')
# Scalar logging once per epoch only. The former `write_grads=False` argument
# was dropped: the TF2 callback ignores it and newer Keras versions reject
# unknown arguments.
tensorboard = TensorBoard(log_dir=tensorboard_log_dir,
                          histogram_freq=0,
                          update_freq='epoch',
                          write_graph=False,
                          write_images=False,
                          embeddings_freq=0)

# Train for 30 epochs, evaluating on the held-out validation subset after
# each one; `hist` keeps the per-epoch metrics for plotting.
hist = model.fit(train_X, train_Y,
                 epochs=30,
                 batch_size=16,
                 validation_data=(val_X, val_Y),
                 callbacks=[checkpoint, tensorboard])

Plot the loss and accuracy curves

In [None]:
# Two side-by-side panels: loss on the left, accuracy on the right, each with
# the training curve in red and the validation curve in green.
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(20,5))

_panels = (
    (ax[0], 'loss', 'categorical_crossentropy'),
    (ax[1], 'accuracy', 'accuracy'),
)
for _axis, _metric, _y_label in _panels:
    _axis.plot(hist.history[_metric], color='red', label='train')
    _axis.plot(hist.history['val_' + _metric], color='green', label='val')
    _axis.set_ylabel(_y_label)
    _axis.set_xlabel('epoch')
    _axis.legend()

plt.show()

###### Make some predictions

In [None]:
from src.utils import clean_sentence

In [None]:
# Replace the current weights of `model` with a checkpoint stored under
# ../model_params. The file name follows the checkpoint pattern used during
# training (presumably epoch 12 with validation loss 1.05701441).
model.load_weights('../model_params/intent_classifier_0012-1.05701441.h5')

In [None]:
# Example utterance to classify; change it and re-run the cells below to try
# other inputs.
input_text = 'How are you?'

##### Preprocess the input and predict

In [None]:
# Apply the same cleaning settings that were used for the training questions.
clean_text = clean_sentence(input_text,
                            stopwords=INTENT_CLSF_STOPWORDS,
                            clean_policy=INPUT_CLEAN_POLICY,
                            lemmatizer=INPUT_LEMMATIZER)

encoded_text = encode_sentences(input_tokenizer, clean_text)

# Check for unknown words: they presumably encode to an empty list, which is
# dropped here so that the reshape below sees one entry per known word.
if [] in encoded_text:
    encoded_text = list(filter(None, encoded_text))

# Arrange the encoded words as a single row (a batch of one sentence) and pad
# it to the length the model was built for.
out = np.array(encoded_text).reshape(1, len(encoded_text))
x = pad_sentences(out, WORDS_MAX_LENGTH)

# `predict` already returns the softmax probabilities; `predict_proba` was a
# deprecated alias that later TensorFlow releases removed.
pred = model.predict(x)[0]

Get the final output

In [None]:
# Position of the highest score in the prediction vector. Positions run from
# 0 to NR_OF_INTENT_CLASSES - 1, whereas the output labels are indexed
# starting from 1, hence the shift by one below.
_top_position = np.argmax(pred)
intent_index = _top_position + 1
confidence = round(pred[_top_position], 3)
predicted_intent = output_tokenizer.index_word[intent_index]

_report_template = "Predicted intent '{}' with confidence {:>4.3f}"
print(_report_template.format(predicted_intent, confidence))

### Evaluate model on test set

In [None]:
# Loss and accuracy of the current model on the full test set.
_test_metrics = model.evaluate(test_padded_sentences, test_output_one_hot)
test_loss, test_accuracy = _test_metrics

for _line_template, _value in (('Test loss = {}', test_loss),
                               ('Test accuracy = {}', test_accuracy)):
    print(_line_template.format(_value))

In [None]:
# Loss and accuracy of the current model on the full training set (including
# the rows that were held out for validation), for comparison with the test
# figures.
_train_metrics = model.evaluate(train_padded_sentences, train_output_one_hot)
train_loss, train_accuracy = _train_metrics

for _line_template, _value in (('Train loss = {}', train_loss),
                               ('Train accuracy = {}', train_accuracy)):
    print(_line_template.format(_value))

###### Plot confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn

In [None]:
# Class probabilities for every test question. `predict` returns the softmax
# output directly; `predict_proba` was a deprecated alias that later
# TensorFlow releases removed.
test_pred = model.predict(test_padded_sentences)

# Predicted label names: argmax positions start at 0 while the output
# tokenizer's indices start at 1, hence the + 1.
labels_pred = np.array([output_tokenizer.index_word[position + 1]
                        for position in np.argmax(test_pred, axis=1)])
# True label names: the encoded labels are used as tokenizer indices as-is.
labels_true = np.array([output_tokenizer.index_word[label_index]
                        for label_index in test_encoded_output.flatten()])

In [None]:
# Intent names in the order in which the output tokenizer lists them; this
# order fixes the rows and columns of the confusion matrix.
intents = list(output_tokenizer.word_index)
cm = confusion_matrix(labels_true, labels_pred, labels=intents)

In [None]:
%matplotlib inline
fig, ax = plt.subplots(figsize=(20,10))

df_cm = pd.DataFrame(cm, intents, intents)
sn.set(font_scale=0.5) # for label size
sn.heatmap(df_cm)

ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
fig.show()