### LSTM Model
##### Built in Keras

Import modules

In [1]:
import sys
sys.path.insert(0, '..')

In [2]:
import numpy as np
np.random.seed(49)
import pandas as pd

import os
import csv
import codecs
import logging
import pickle
import random
import keras
import sys

from keras.preprocessing.text import Tokenizer
from keras.layers.merge import concatenate, subtract
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Dropout, Embedding
from keras.layers.wrappers import Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from helpers import save_model

Using TensorFlow backend.


Read in training data and append training labels

In [3]:
def get_train_data(train_data_path='../data/train_data.csv', train_labels_path='../data/train_data.csv'):
    """Load the training questions and attach the `is_duplicate` labels.

    Args:
        train_data_path: CSV file holding the training question pairs.
        train_labels_path: CSV file holding the labels. It must share at
            least one column with the data file, because the two frames are
            joined with `DataFrame.merge` and no explicit key.

    Returns:
        A DataFrame: the training data merged with the labels frame.
    """
    # NOTE(review): both default paths point at the same file -- confirm
    # whether a separate labels CSV was intended.
    df_train = pd.read_csv(train_data_path)
    # Labels are re-attached from the labels file below, so any copy already
    # present in the data file is discarded. errors='ignore' keeps this working
    # when the data file was stored without the label column.
    df_train = df_train.drop(['is_duplicate'], axis=1, errors='ignore')
    df_labels = pd.read_csv(train_labels_path)
    logging.info('loaded training data')
    # With no `on` argument, merge joins on the columns common to both frames.
    return df_train.merge(df_labels)

Read in test data

In [4]:
def get_test_data(test_data_path='../data/test_data.csv'):
    """Load the test question pairs from a CSV file.

    Args:
        test_data_path: path of the CSV file to read.

    Returns:
        The DataFrame produced by `pd.read_csv`.
    """
    df_test = pd.read_csv(test_data_path)
    # Log only after the read succeeded, so a failed load never leaves a
    # misleading "loaded" message behind.
    logging.info('loaded test data')
    return df_test

Transform train and test sets into series

In [5]:
# Load both data sets with the default paths.
df_train = get_train_data()
df_test = get_test_data()

# Pull the question columns and the target out of the training frame.
train_qs_1, train_qs_2, labels = (
    pd.Series(df_train[column])
    for column in ('question1', 'question2', 'is_duplicate')
)

# Same question columns for the test frame (no labels there).
test_qs_1, test_qs_2 = (
    pd.Series(df_test[column])
    for column in ('question1', 'question2')
)

Create full text lists for processing

In [6]:
# Stringify every entry so each column becomes a plain list of str.
train_q1 = train_qs_1.astype(str).tolist()
train_q2 = train_qs_2.astype(str).tolist()

test_q1 = test_qs_1.astype(str).tolist()
test_q2 = test_qs_2.astype(str).tolist()

# Every question from both data sets. The previous version concatenated
# test question1 twice and never included test question2.
all_texts = train_q1 + train_q2 + test_q1 + test_q2

Using Keras Tokenizer, fit to all questions

In [16]:
# Vocabulary cap handed to the Keras Tokenizer as `num_words`.
MAX_TOK_WORDS = 100000

tokenizer = Tokenizer(num_words=MAX_TOK_WORDS)
tokenizer.fit_on_texts(all_texts)  # fit on train + test questions

# word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

Transform questions to word sequences

In [17]:
def make_sequence(text_series, tokenizer):
    """Convert a pandas Series of texts into tokenizer sequences.

    Every entry is cast to `str` first, then the resulting list is passed to
    `tokenizer.texts_to_sequences`, whose return value is returned unchanged.
    """
    texts = text_series.astype(str).tolist()
    return tokenizer.texts_to_sequences(texts)

In [18]:
# Integer sequences for both training question columns ...
train_seq_1, train_seq_2 = (
    make_sequence(questions, tokenizer)
    for questions in (train_qs_1, train_qs_2)
)

# ... and for both test question columns, using the same fitted tokenizer.
test_seq_1, test_seq_2 = (
    make_sequence(questions, tokenizer)
    for questions in (test_qs_1, test_qs_2)
)

Identify appropriate padding length
Take 99.5th percentile of sequence lengths

In [None]:
# All sequences (train and test, both questions) in one list.
full_seq = train_seq_1 + train_seq_2 + test_seq_1 + test_seq_2

# Padding length: the 99.5th percentile of sequence length, truncated to int.
seq_lengths = list(map(len, full_seq))
max_pad_len = int(np.percentile(seq_lengths, 99.5))  # 36

Apply padding to sequences

In [None]:
# Pad (or truncate) every sequence list to the common length `max_pad_len`.
padded_train_1, padded_train_2, padded_test_1, padded_test_2 = (
    pad_sequences(sequences, maxlen=max_pad_len)
    for sequences in (train_seq_1, train_seq_2, test_seq_1, test_seq_2)
)

Calculate class weights due to unbalanced data

In [None]:
# Share of duplicate (label 1) pairs in the training labels.
# `labels` is the is_duplicate Series extracted from df_train above; the
# former `df_labels` only exists as a local inside get_train_data(), so
# referencing it here raised a NameError.
dup_weight = float(labels.sum())/labels.count()
non_weight = 1 - dup_weight

# Ratio of non-duplicates to duplicates, used to up-weight the duplicate class.
re_weight = non_weight/dup_weight

class_weight = {0 : 1., 1: re_weight}

Create an embeddings index (word → vector) from GloVe (GloVe file stored locally)

In [None]:
def load_embeddings(glove_path):
    """Read a GloVe text file into a `{word: vector}` dictionary.

    Each non-blank line is expected to hold a token followed by its vector
    components, separated by single spaces.

    Args:
        glove_path: path of the GloVe text file (read as UTF-8).

    Returns:
        dict mapping each token to a float32 numpy array of its components.

    Raises:
        IOError: if the file cannot be opened. A warning explaining where to
            get the file is logged before the error is re-raised.
    """
    logging.info('loading embeddings from gloVe file')
    embeddings_index = {}
    try:
        glove = codecs.open(glove_path, encoding='utf-8')
    except IOError:
        logging.warning('no glove embeddings file supplied. please visit http://nlp.stanford.edu/data/glove.6B.zip and copy the file glove.6B.300d.txt into this directory')
        # Re-raise: there is nothing to read, and continuing would only fail
        # later on the unbound `glove` name.
        raise

    # The context manager closes the file even if a row fails to parse.
    with glove:
        for row in glove:
            if not row.strip():
                # Skip blank lines instead of storing an empty vector.
                continue
            # Strip the line terminator so it never ends up in the last field.
            word_dims = row.rstrip().split(' ')
            index = word_dims[0]
            dims = np.asarray(word_dims[1:], dtype='float32')
            embeddings_index[index] = dims

    return embeddings_index

Create word embeddings

In [None]:
def create_embeddings(word_index, glove_path=os.getcwd()+'/glove.6B.300d.txt', save=True):
    """Build the embedding matrix for `word_index` from GloVe vectors.

    Args:
        word_index: mapping of word -> integer row index.
        glove_path: GloVe text file passed to `load_embeddings`.
        save: when true, the matrix is pickled to
            '../models/embedding_matrix.txt'.

    Returns:
        A numpy array with `len(word_index) + 1` rows and 300 columns. Rows of
        words without a GloVe vector remain all zeros.
    """
    logging.info('creating word embeddings')
    glove_vectors = load_embeddings(glove_path)
    vocab_size = len(word_index)
    embedding_matrix = np.zeros((vocab_size + 1, 300))

    for word, row in word_index.items():
        # Indices beyond the matrix are ignored; so are words unknown to GloVe.
        vector = glove_vectors.get(word) if row <= vocab_size else None
        if vector is not None:
            embedding_matrix[row] = vector

    if not save:
        return embedding_matrix

    logging.info('saving embeddings to file')
    with open('../models/embedding_matrix.txt', 'wb') as filepath:
        pickle.dump(embedding_matrix, filepath)
    return embedding_matrix

In [None]:
def get_embeddings(filename='../models/embedding_matrix.txt'):
    """Return the pickled embedding matrix, building it if the file is absent.

    Args:
        filename: path of the pickled embedding matrix.

    Returns:
        The unpickled object when `filename` exists; otherwise the result of
        `create_embeddings(word_index)` using the notebook-level `word_index`.
    """
    if not os.path.isfile(filename):
        logging.warning('No embeddings found, please create embeddings for the data provided in the /notebooks/lstm_train notebook.')
        return create_embeddings(word_index)

    logging.info('loading embeddings from file')
    with open(filename, 'rb') as filepath:
        # NOTE: pickle.load can execute arbitrary code -- only point this at
        # files produced by this project.
        return pickle.load(filepath)

Create embedding layer for LSTM

In [None]:
# `embedding_matrix` was never assigned at notebook level (it only exists
# inside the helper functions), so fetch it here: loaded from the pickled
# file when present, otherwise built from GloVe.
embedding_matrix = get_embeddings()

# Embedding layer initialised with those weights; 300 matches the matrix width
# created in create_embeddings.
embedding_layer = Embedding(len(word_index)+1, 300, weights=[embedding_matrix], input_length=max_pad_len)

Define possible values for hyperparameters

In [None]:
# Candidate layer widths.
lstm_nodes = [200, 300, 400]
dense_nodes = [100, 200, 300]

# Candidate dropout rates (the dense grid is a separate copy of the LSTM grid).
lstm_drop = [0.1, 0.15, 0.2, 0.25, 0.3]
dense_drop = list(lstm_drop)

# Candidate activations for the dense layer(s).
dense_activation = ['relu', 'sigmoid']

# Whether the LSTM is wrapped in a Bidirectional layer.
lstm_bidirectional = [True, False]

Initialize hyperparameters by randomly selecting from defined options

This method is used to test performance of hyperparameters and can be overridden manually

In [None]:
# Draw one value per hyperparameter, in the order the grids are listed.
(lstm_nodes_choice,
 dense_nodes_choice,
 lstm_drop_choice,
 dense_drop_choice,
 dense_activation_choice,
 lstm_bidirectional_choice) = tuple(
    random.choice(options)
    for options in (lstm_nodes, dense_nodes,
                    lstm_drop, dense_drop,
                    dense_activation,
                    lstm_bidirectional)
)

Builds LSTM layer (bidirectional LSTM layer if chosen)

In [None]:
# Base recurrent layer; the same rate is used for input and recurrent dropout.
lstm_layer = LSTM(lstm_nodes_choice,
                  dropout=lstm_drop_choice,
                  recurrent_dropout=lstm_drop_choice)

# Optionally wrap it so the sequence is read in both directions.
if lstm_bidirectional_choice:
    lstm_layer = Bidirectional(lstm_layer)

Input layers for Question 1 and 2

In [None]:
# One integer input of length max_pad_len per question.
input_1, input_2 = (Input(shape=(max_pad_len,), dtype='int32') for _ in range(2))

# Both questions go through the SAME embedding layer ...
embedded_1, embedded_2 = embedding_layer(input_1), embedding_layer(input_2)

# ... and the SAME LSTM layer, so the two branches share weights.
q1, q2 = lstm_layer(embedded_1), lstm_layer(embedded_2)

Combine the outputs of the Q1 and Q2 LSTM layers

In [None]:
# Element-wise difference of the two question encodings.
question_diff = subtract([q1, q2])
# Regularise, then normalise, before the dense layers.
question_diff = Dropout(lstm_drop_choice)(question_diff)
combined_layer = BatchNormalization()(question_diff)

First Dense layer

In [None]:
# Dense -> Dropout -> BatchNormalization on top of the combined encoding.
dense_out = Dense(dense_nodes_choice, activation=dense_activation_choice)(combined_layer)
dense_out = Dropout(dense_drop_choice)(dense_out)
combined_layer = BatchNormalization()(dense_out)

Second Dense layer (optional)

In [None]:
# combined_layer = Dense(dense_nodes_choice, activation=dense_activation_choice)(combined_layer)
# combined_layer = Dropout(dense_drop_choice)(combined_layer)
# combined_layer = BatchNormalization()(combined_layer)

Prediction (output) layer

In [None]:
# Single sigmoid unit on top of the combined representation.
output_dense = Dense(1, activation='sigmoid')
prediction_layer = output_dense(combined_layer)

Compile model

In [None]:
# Two-input model (one padded sequence per question) with a single output.
model = Model(inputs=[input_1, input_2], outputs=prediction_layer)
model.compile(
    optimizer='nadam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

Fit model

In [None]:
epochs = 20
# Keep the fit result under its own name. It used to be assigned to `logging`,
# which replaced the imported `logging` module and broke every later
# logging.info / logging.warning call in the notebook's helper functions.
history = model.fit([padded_train_1,padded_train_2], labels , validation_split = 0.2, 
                    epochs=epochs, batch_size=1000, shuffle=True, class_weight = class_weight)

Save model

In [None]:
# NOTE(review): the first code cell already inserts '..' into sys.path and
# save_model is imported in the import cell, so this insert looks redundant.
sys.path.insert(0, '..')
# Persist the trained model via the project's helpers.save_model.
save_model(model, '../models/model_1/')

Use model to predict test data and export as csv

In [None]:
# Score the test pairs and flatten the predictions to one value per row.
test_predictions = model.predict([padded_test_1, padded_test_2])
test_columns = {"test_id": df_test['test_id'], "nn_out": test_predictions.ravel()}
test_df = pd.DataFrame(test_columns)
test_df.to_csv("../data/test_lstm_output.csv", index=False)

Use model to predict train data and export as csv

In [None]:
# Score the training pairs and flatten the predictions to one value per row.
train_predictions = model.predict([padded_train_1, padded_train_2])
train_columns = {"id": df_train['id'], "nn_out": train_predictions.ravel()}
train_df = pd.DataFrame(train_columns)
train_df.to_csv("../data/train_lstm_output.csv", index=False)