# Data Prep

## Import packages

In [None]:
import nltk
# Fetch NLTK data up front: word_tokenize (used by clean_question further down)
# needs tokenizer data at runtime.
# NOTE(review): presumably the 'popular' collection includes that data; a
# narrower download may be enough -- confirm before changing.
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [None]:
import numpy as np
import json
import pandas as pd
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import csv

## Initialize variables

In [None]:
# Root folder for every input and output file of this notebook.
# NOTE(review): the path looks like a Google Drive mount inside Colab -- adjust
# it when running elsewhere.
BASE_DIR = '/content/drive/My Drive/Machine Learning/Data/Quora/'
# Input files, read relative to BASE_DIR.
TRAIN_DATA = 'quora_train.csv'
GLOVE_FILE = 'glove.6B.300d.txt'
# Output files, written relative to BASE_DIR by the "Save processed data" cell.
Q1_TRAIN_FILE = 'q1_train.npy'
Q2_TRAIN_FILE = 'q2_train.npy'
LABEL_FILE = 'target_labels.npy'
GLOVE_EMBEDDING_MATRIX_FILE = 'glove_word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
# Upper bound on the vocabulary covered by the embedding matrix.
MAX_NB_WORDS = 200000
# Length passed as maxlen when the word sequences are padded.
MAX_SEQUENCE_LENGTH = 50
# Number of columns of the embedding matrix (one embedding vector per row).
EMBEDDING_DIM = 300

In [None]:
# NOTE(review): train_data is not referenced again in the visible cells; the
# questions are re-read from the same CSV with csv.DictReader further down.
train_data = pd.read_csv(BASE_DIR + TRAIN_DATA)

## Load and clean questions

In [None]:
# Collect the two question columns and the duplicate flag from the training CSV.
# The three lists stay index-aligned: entry i of each list comes from row i.
question_1 = []
question_2 = []
is_duplicate = []

# newline = '' hands line-ending handling to the csv module, as its
# documentation requires; otherwise line breaks inside quoted fields may be
# altered before the reader sees them.
with open(BASE_DIR + TRAIN_DATA, encoding = 'utf-8', newline = '') as csvfile:
  reader = csv.DictReader(csvfile)
  for row in reader:
    question_1.append(row['question1'])
    question_2.append(row['question2'])
    # kept as the raw CSV string here; converted to int when the label array is built
    is_duplicate.append(row['is_duplicate'])

print(f"# of questions: {len(question_1)}")

# of questions: 404290


In [None]:
# functions for cleaning questions
def load_doc(filepath):
  """Return the full contents of the text file at ``filepath``.

  The file is opened in text mode with the platform's default encoding.

  Args:
    filepath: path of the file to read.

  Returns:
    The file contents as a single string.

  Raises:
    OSError: if the file cannot be opened or read.
  """
  # the context manager closes the handle even when read() raises
  with open(filepath, 'r') as file:
    return file.read()

def clean_question(question):
  """Tokenize ``question`` and keep only its purely alphabetic tokens.

  Tokens for which ``str.isalpha`` is false (numbers, punctuation, mixed
  tokens) are dropped.

  Args:
    question: the question text as a string.

  Returns:
    The surviving tokens joined by single spaces.
  """
  alphabetic = (token for token in word_tokenize(question) if token.isalpha())
  return ' '.join(alphabetic)

def process_question(question):
  """Clean every entry of ``question`` and return the results as a list.

  Each entry is converted with ``str`` before cleaning, so non-string values
  are cleaned via their string form. Order is preserved.

  Args:
    question: iterable of raw questions.

  Returns:
    List of cleaned question strings, one per input entry.
  """
  return [clean_question(str(raw)) for raw in question]

# Clean both question lists; order is preserved, so entry i of each cleaned
# list still belongs to pair i.
question1_clean = process_question(question_1)
question2_clean = process_question(question_2)

## Tokenize words

In [None]:
# Fit a single vocabulary on both question lists so a word gets the same index
# whichever side of the pair it appears on.
questions = question1_clean + question2_clean
# Cap the ids the tokenizer emits with MAX_NB_WORDS. The embedding matrix built
# later only has rows up to that cap, so an uncapped tokenizer could emit ids
# without a matching row once the vocabulary grows beyond it.
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)

question1_word_sequences = tokenizer.texts_to_sequences(question1_clean)
question2_word_sequences = tokenizer.texts_to_sequences(question2_clean)

# word_index is the word -> integer id mapping used to align the embedding matrix
word_index = tokenizer.word_index
print(f"Words in index: {len(word_index)}")

Words in index: 76328


## Process Embedding

In [None]:
# define some functions to process pre-trained embedding
# nb_words: the tokenizer's vocabulary size, capped at MAX_NB_WORDS. The
# embedding matrix gets nb_words+1 rows.
nb_words = min(MAX_NB_WORDS, len(word_index))

def load_embedding(filepath):
  """Parse a text embedding file into a ``{word: vector}`` dictionary.

  Each line is split on single spaces; the first field is the word and the
  remaining fields are converted to a float32 vector.

  Args:
    filepath: path of the UTF-8 encoded embedding file.

  Returns:
    dict mapping each word to a one-dimensional float32 NumPy array.

  Raises:
    OSError: if the file cannot be opened.
    ValueError: if a vector component cannot be converted to a float.
  """
  embeddings = {}
  # the context manager closes the file even if a line fails to parse
  with open(filepath, 'r', encoding = 'utf-8') as file:
    for line in file:
      values = line.split(' ')
      word = values[0]
      embeddings[word] = np.asarray(values[1:], dtype = 'float32')
  return embeddings

def get_weight_matrix(embedding):
  """Build the embedding matrix whose rows follow the ids in ``word_index``.

  Row ``i`` receives the pre-trained vector of the word with id ``i``. Rows
  stay all-zero for words missing from ``embedding``; ids above
  ``MAX_NB_WORDS`` are skipped.

  Args:
    embedding: dict mapping words to their pre-trained vectors.

  Returns:
    NumPy array of shape ``(nb_words+1, EMBEDDING_DIM)``.
  """
  weights = np.zeros((nb_words+1, EMBEDDING_DIM))
  for token, row in word_index.items():
    # only look up words whose id is within the cap
    vector = embedding.get(token) if row <= MAX_NB_WORDS else None
    if vector is not None:
      weights[row] = vector
  return weights

# Load the pre-trained vectors, then build the matrix aligned with word_index.
raw_embedding = load_embedding(BASE_DIR + GLOVE_FILE)
word_embedding_matrix = get_weight_matrix(raw_embedding)
# Report the matrix shape and how many vectors the GloVe file provided.
print(f"embedding shape: {word_embedding_matrix.shape}")
print(f"Word embeddings: {len(raw_embedding)}")

embedding shape: (76329, 300)
Word embeddings: 400001


## Prepare training data

In [None]:
# Both question arrays are padded with identical settings: fixed length
# MAX_SEQUENCE_LENGTH, padding added at the end of each sequence.
padding_options = {'maxlen': MAX_SEQUENCE_LENGTH, 'padding': 'post'}
q1_data = pad_sequences(question1_word_sequences, **padding_options)
q2_data = pad_sequences(question2_word_sequences, **padding_options)

# The duplicate flags were read from the CSV as strings; convert them to ints.
labels = np.array(is_duplicate, dtype = 'int')

print(f"Shape of q1_data: {q1_data.shape}")
print(f"shape of q2_data: {q2_data.shape}")
print(f"Shape of target class: {labels.shape}")

Shape of q1_data: (404290, 50)
shape of q2_data: (404290, 50)
Shape of target class: (404290,)


## Save processed data

In [None]:
# Persist the prepared arrays under BASE_DIR. Each file is opened in a `with`
# block so its handle is flushed and closed as soon as the array is written,
# instead of being left open.
for filename, array in (
    (Q1_TRAIN_FILE, q1_data),
    (Q2_TRAIN_FILE, q2_data),
    (LABEL_FILE, labels),
    (GLOVE_EMBEDDING_MATRIX_FILE, word_embedding_matrix),
):
  with open(BASE_DIR + filename, 'wb') as f:
    np.save(f, array)

# nb_words is stored as JSON alongside the arrays so it can be reloaded with them.
with open(BASE_DIR + NB_WORDS_DATA_FILE,'w') as f:
  json.dump({'nb_words': nb_words}, f)

# Build and Train Model

## Import packages

In [None]:
import time, json
from keras.models import Model, load_model
from keras.layers import Input, Dense, Concatenate, Dropout, BatchNormalization, LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Initialize variables

In [None]:
# File names for the saved model and the architecture plot (joined onto
# BASE_DIR where they are used).
# NOTE(review): LSTM_MODEL_PATH is not referenced in the visible cells.
LSTM_MODEL_PATH = 'lstm_quora_question_pairs.h5'
BI_MODEL_PATH = 'bi_quora_question_pairs.h5'
PLOT_MODEL = 'bi_model.png'
# NOTE(review): these two repeat the values of MAX_SEQUENCE_LENGTH and
# EMBEDDING_DIM from the data-prep section; they must stay in sync with the
# saved arrays.
MAX_SEQUENCE_LENGTH = 50
GLOVE_EMBEDDING_DIM = 300
# Used both as the test fraction of train_test_split and as the
# validation_split passed to model.fit.
VALIDATION_SPLIT = 0.3
# Training hyper-parameters.
EPOCHS = 25
DROPOUT = 0.2
BATCH_SIZE = 32

## Load and split data

In [None]:
# Reload the arrays written by the data-prep section. Passing the path (rather
# than an already-open file object) lets np.load open and close each file
# itself, so no handles are left open.
q1_data = np.load(BASE_DIR + Q1_TRAIN_FILE)
q2_data = np.load(BASE_DIR + Q2_TRAIN_FILE)
glove_embedding_matrix = np.load(BASE_DIR + GLOVE_EMBEDDING_MATRIX_FILE)
labels = np.load(BASE_DIR + LABEL_FILE)
# nb_words sizes the Embedding layers (nb_words+1 input rows).
with open(BASE_DIR + NB_WORDS_DATA_FILE, 'r') as f:
  nb_words = json.load(f)['nb_words']
      

# split data
# Stack the two questions of every pair along a new axis 1, so one call to
# train_test_split splits both questions together with their label.
X = np.stack([q1_data, q2_data], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size = VALIDATION_SPLIT, random_state = 42)

# Unpack the pair axis back into separate question-1 / question-2 arrays.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]

## Build Model

In [None]:
# using GloVe embedding

def _encode_question(question_input):
  """Embed one question with the frozen GloVe weights and encode it with a BiLSTM."""
  embedded = Embedding(nb_words+1,
                       GLOVE_EMBEDDING_DIM,
                       input_length = MAX_SEQUENCE_LENGTH,
                       weights = [glove_embedding_matrix],
                       trainable = False)(question_input)
  return Bidirectional(LSTM(128))(embedded)

q1_input = Input(shape = (MAX_SEQUENCE_LENGTH,))
q2_input = Input(shape = (MAX_SEQUENCE_LENGTH,))

# Each question gets its own embedding + BiLSTM encoder (layers are not shared).
q1 = _encode_question(q1_input)
q2 = _encode_question(q2_input)

# Classifier head: three Dense -> BatchNormalization -> Dropout stages of
# decreasing width on top of the concatenated encodings.
merged = Concatenate()([q1, q2])
for units in (128, 64, 32):
  merged = Dense(units, activation = 'relu')(merged)
  merged = BatchNormalization()(merged)
  merged = Dropout(DROPOUT)(merged)

# Single sigmoid unit for the binary duplicate / not-duplicate output.
merged = Dense(1, activation = 'sigmoid')(merged)

model = Model(inputs = [q1_input, q2_input], outputs = merged)
opt = Adam(lr = 0.001)
model.compile(loss = 'binary_crossentropy', optimizer = opt, metrics = ['accuracy'])

In [None]:
# Wall-clock timing of the whole training run.
t0 = time.time()
# Checkpoint to BASE_DIR+BI_MODEL_PATH, monitoring val_loss with
# save_best_only, so the file on disk is the best-val_loss model rather than
# the last epoch.
callbacks = ModelCheckpoint(BASE_DIR+BI_MODEL_PATH, monitor = 'val_loss',
                            verbose = 0, save_best_only = True)
# NOTE: validation_split holds out a further VALIDATION_SPLIT share of the
# training arrays, on top of the test split made earlier. The 6191 steps per
# epoch in the log match roughly 0.7 * 0.7 of the 404290 pairs at batch size 32.
history = model.fit([Q1_train, Q2_train], 
                    y_train,
                    verbose = 2,
                    batch_size = BATCH_SIZE,
                    epochs = EPOCHS,
                    callbacks = [callbacks], 
                    validation_split = VALIDATION_SPLIT)
t1 = time.time()
print(f"total training time: {round((t1-t0)/60,2)} minutes")

Epoch 1/25
6191/6191 - 123s - loss: 0.5627 - accuracy: 0.7103 - val_loss: 0.5166 - val_accuracy: 0.7393
Epoch 2/25
6191/6191 - 123s - loss: 0.5012 - accuracy: 0.7538 - val_loss: 0.4816 - val_accuracy: 0.7611
Epoch 3/25
6191/6191 - 121s - loss: 0.4647 - accuracy: 0.7775 - val_loss: 0.4645 - val_accuracy: 0.7743
Epoch 4/25
6191/6191 - 121s - loss: 0.4288 - accuracy: 0.7993 - val_loss: 0.4611 - val_accuracy: 0.7779
Epoch 5/25
6191/6191 - 121s - loss: 0.3876 - accuracy: 0.8236 - val_loss: 0.4782 - val_accuracy: 0.7662
Epoch 6/25
6191/6191 - 119s - loss: 0.3524 - accuracy: 0.8437 - val_loss: 0.4704 - val_accuracy: 0.7824
Epoch 7/25
6191/6191 - 117s - loss: 0.3178 - accuracy: 0.8631 - val_loss: 0.4859 - val_accuracy: 0.7844
Epoch 8/25
6191/6191 - 117s - loss: 0.2836 - accuracy: 0.8817 - val_loss: 0.5224 - val_accuracy: 0.7619
Epoch 9/25
6191/6191 - 116s - loss: 0.2528 - accuracy: 0.8963 - val_loss: 0.5407 - val_accuracy: 0.7803
Epoch 10/25
6191/6191 - 116s - loss: 0.2257 - accuracy: 0.9097 -

## Evaluate the model with the best validation loss

In [None]:
# Reload the checkpoint saved during training; this rebinds `model`, replacing
# the in-memory model left by the last epoch. Then score it on the test split.
model = load_model(BASE_DIR + BI_MODEL_PATH)
loss, accuracy = model.evaluate([Q1_test, Q2_test], y_test, verbose = 0)
print(f"loss = {loss}, accuracy = {accuracy}")

loss = 0.46072694659233093, accuracy = 0.778327465057373


In [None]:
# Outputs of the sigmoid unit for the test pairs, rounded to 0/1 class labels.
y_pred = model.predict([Q1_test, Q2_test])
y_pred_class = np.round(y_pred) # argmax for categorical crossentropy

In [None]:
# classification_report expects (y_true, y_pred). The ground truth goes first
# so that precision, recall and support are attributed to the true classes;
# with the arguments swapped, precision and recall are exchanged and support
# counts the predictions instead.
class_rep = classification_report(y_test, y_pred_class)
print(class_rep)

              precision    recall  f1-score   support

         0.0       0.84      0.81      0.83     78790
         1.0       0.67      0.71      0.69     42497

    accuracy                           0.78    121287
   macro avg       0.76      0.76      0.76    121287
weighted avg       0.78      0.78      0.78    121287



In [None]:
from keras.utils import plot_model
# Write a diagram of the model to BASE_DIR + PLOT_MODEL, with layer shapes and
# layer names shown.
plot_model(model, to_file = BASE_DIR + PLOT_MODEL, show_shapes = True, show_layer_names = True)