In [299]:
import pandas as pd
import os

# Load the question-pair CSV from the user's home directory and keep only the
# first 400,000 rows of it.
dataset = os.path.expanduser('~/data/question_pair_dataframe.csv')
data = pd.read_csv(dataset, sep=',').iloc[:400000]

In [300]:
# Preview the first ten question pairs.
data.head(10)

Unnamed: 0,question1,question2,labels
0,Why did your brown dogs have black puppies?,Who contributed to non-euclidean?,0
1,What age boy dogs able to create puppies?,What is the most recent processor for the desk...,0
2,How do earth formed according to science?,The science concered with earth and its place ...,1
3,How long is opened corked wine good for?,How long is wine good after it is opened?,1
4,People contribute in the development of computer?,Who are the key people in computer development?,1
5,What is the most recent desktop processor by amd?,Exclamation point horseshoe?,0
6,What does key result areas?,How many calories is Coconut rum with Pineapple?,0
7,What is the function os vesicles?,What do vesicles function?,1
8,How is aerobic capacity expressed?,Average number of completed laps of aerobic ca...,1
9,What does the yellow light mean when it turns ...,When did the new jersey devils win the stanley...,0


In [301]:
import csv

# Read in GloVe word embeddings.
# One path per available vector size; each variable now points at the file
# matching its own dimension and uses the same ".txt" extension as the file
# that is actually loaded below.
embeddings_50d = "../../word_embeddings/glove.6B/glove.6B.50d.txt"
embeddings_100d = "../../word_embeddings/glove.6B/glove.6B.100d.txt"
embeddings_200d = "../../word_embeddings/glove.6B/glove.6B.200d.txt"
embeddings_300d = "../../word_embeddings/glove.6B/glove.6B.300d.txt"

# The first whitespace-separated column is the word and becomes the index; the
# remaining columns are the vector components.
embeddings = pd.read_table(
    embeddings_300d,
    sep=r"\s+",  # same meaning as the deprecated delim_whitespace=True
    index_col=0,
    header=None,
    quoting=csv.QUOTE_NONE,  # quote characters are ordinary tokens, not delimiters
    # Keep words such as "nan" or "null" as strings; pandas' default NA
    # detection would otherwise turn those index labels into NaN.
    keep_default_na=False,
)
embedding_dim = 300

# Guard against loading a file whose vector size disagrees with embedding_dim.
assert embeddings.shape[1] == embedding_dim, (
    f"expected {embedding_dim}-d vectors, file has {embeddings.shape[1]} columns"
)

The pre-processing step below takes approximately:
- 17 minutes for 200,000 question pairs
- 4 hours for ~2.5 million question pairs

In [302]:
import tensorflow as tf
import numpy as np

def build_vocab(data, tokenize=None):
    """Collect every distinct word used in the question pairs of ``data``.

    Args:
        data: Object exposing ``question1`` and ``question2`` iterables of
            sentences (for example a pandas DataFrame). Each value is passed
            through ``str()`` before tokenizing, so non-string cells such as
            NaN do not crash the tokenizer.
        tokenize: Optional callable mapping a string to a list of words.
            Defaults to ``tf.keras.preprocessing.text.text_to_word_sequence``.

    Returns:
        A tuple ``(vocab, vocab_length, max_sentence_length)`` where ``vocab``
        is an unordered list of the distinct words plus the ``"<UNK>"`` token,
        ``vocab_length`` is its length, and ``max_sentence_length`` is the
        largest number of words found in any single question.
    """
    if tokenize is None:
        tokenize = tf.keras.preprocessing.text.text_to_word_sequence

    vocab = set()
    max_sentence_length = 0

    # Put all words in the training corpus into a set
    for q1, q2 in zip(data.question1, data.question2):
        q1words = tokenize(str(q1))
        q2words = tokenize(str(q2))
        max_sentence_length = max(max_sentence_length, len(q1words), len(q2words))
        vocab.update(q1words, q2words)

    # Add token for unknown words
    vocab.add("<UNK>")
    return (list(vocab), len(vocab), max_sentence_length)

# Build the vocabulary once, then order it alphabetically so that a word's
# position in the list is deterministic.
vocab, vocab_length, max_sentence_length = build_vocab(data)
vocab = sorted(vocab)

print(f"Built vocab. Found {vocab_length} words. Longest sentence is {max_sentence_length} words.")

Built vocab. Found 17706 words. Longest sentence is 41 words.


In [None]:
import numpy as np

def build_embedding_matrix(vocab, embedding_dim, word_vectors=None):
    """Build the weight matrix that maps vocabulary indices to word vectors.

    Args:
        vocab: Sequence of words; the position of a word is its row index.
        embedding_dim: Number of components in each word vector.
        word_vectors: Optional table indexed by word (``.loc[word]`` must
            return that word's vector). Defaults to the notebook-level
            ``embeddings`` table.

    Returns:
        A ``(len(vocab), embedding_dim)`` float array. Row ``i`` holds the
        vector of ``vocab[i]``; words missing from ``word_vectors`` (including
        ``"<UNK>"``) keep an all-zero row.
    """
    if word_vectors is None:
        word_vectors = embeddings

    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    # Row 0 is deliberately left all-zero, as in the original loop bounds.
    # NOTE(review): index 0 is also the value that sequence padding produces
    # downstream, so vocab[0] and padding share a row -- confirm this is intended.
    for index in range(1, len(vocab)):
        word = vocab[index]
        try:
            embedding_matrix[index] = word_vectors.loc[word]
        except KeyError:
            # No pre-trained vector for this word: leave its row at zero.
            continue
    return embedding_matrix

# Build the (len(vocab), embedding_dim) weight matrix that is later passed to
# the Embedding layer as its initial weights.
embedding_matrix = build_embedding_matrix(vocab, embedding_dim)

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence


def build_word_index(data):
    """Convert each question in ``data`` into a fixed-length index array.

    Relies on the notebook-level ``vocab`` (word list) and
    ``max_sentence_length`` (target array length).

    Args:
        data: Object exposing ``question1``, ``question2`` and ``labels``
            (for example a pandas DataFrame).

    Returns:
        A DataFrame with columns ``question1`` and ``question2`` (one integer
        array of length ``max_sentence_length`` per row, left-padded with 0)
        and ``labels`` (copied from ``data``).

    Raises:
        ValueError: If a word is not in ``vocab`` and ``vocab`` has no
            ``"<UNK>"`` entry to fall back on.
    """
    # Build the word -> position table once; scanning the list with
    # vocab.index() for every word costs O(len(vocab)) per lookup.
    # setdefault keeps the first position of a word, like list.index does.
    word_to_index = {}
    for position, word in enumerate(vocab):
        word_to_index.setdefault(word, position)
    unknown_index = word_to_index.get("<UNK>")

    def lookup(word):
        """Return the vocabulary index of ``word``, or the <UNK> index."""
        try:
            return word_to_index[word]
        except KeyError:
            if unknown_index is None:
                raise ValueError(f"{word!r} is not in vocab and no <UNK> token is available")
            return unknown_index

    def process_sentence(sentence):
        word_sequence = text_to_word_sequence(str(sentence))
        indices = [lookup(word) for word in word_sequence]
        # Sentences longer than the target keep their trailing words so the
        # pad width below can never go negative.
        overflow = len(indices) - max_sentence_length
        if overflow > 0:
            indices = indices[overflow:]
        # dtype=int keeps empty sentences as integer arrays too.
        return np.pad(np.asarray(indices, dtype=int),
                      (max_sentence_length - len(indices), 0),
                      mode="constant")

    q1s = [process_sentence(sentence) for sentence in data.question1]
    q2s = [process_sentence(sentence) for sentence in data.question2]

    df = pd.DataFrame({
        "question1": q1s,
        "question2": q2s,
        "labels": data.labels
    })
    return df


# Turn every question into a fixed-length array of vocabulary indices
# (left-padded with 0); the labels column is carried over from `data`.
df = build_word_index(data)

In [None]:
from keras.layers import Input, Dense, Embedding, LSTM, Lambda, Conv1D, MaxPooling1D, Concatenate
from keras.models import Model

# One integer-index input per question, each max_sentence_length tokens long.
left_inputs = Input(shape=(max_sentence_length, ), dtype='int32')
right_inputs = Input(shape=(max_sentence_length, ), dtype='int32')

# A single frozen embedding layer, initialised from embedding_matrix, is shared
# by both branches.
embedding_layer = Embedding(
    input_dim=vocab_length,
    weights=[embedding_matrix],
    trainable=False,
    output_dim=embedding_dim,
    input_length=max_sentence_length)
embedding_left = embedding_layer(left_inputs)
embedding_right = embedding_layer(right_inputs)


def _ngram_convolution(kernel_size):
    """Create a 300-filter tanh Conv1D layer spanning `kernel_size` tokens."""
    # padding="same" keeps the sequence length, so the outputs can be
    # concatenated with the embeddings along the feature axis below.
    return Conv1D(
        filters=300,
        kernel_size=kernel_size,
        padding="same",
        use_bias=True,
        activation=tf.nn.tanh)


# # Bi-gram convolution
# bg_convolution_layer = Conv1D(
#     filters=16,
#     kernel_size=2,
#     padding="same",
#     use_bias=True,
#     activation=tf.nn.tanh)(embedding_left)
# # bg_pooling = MaxPooling1D(pool_size = 2)(bg_convolution_layer)

# Each convolution layer is created once and applied to BOTH branches so the
# two questions are encoded with the same weights.
tg_convolution = _ngram_convolution(3)     # tri-gram
fg_convolution = _ngram_convolution(4)     # 4-gram
sgram_convolution = _ngram_convolution(5)  # 5-gram

# Left-branch feature maps (names kept from the original cell).
tg_convolution_layer = tg_convolution(embedding_left)
# tg_pooling = MaxPooling1D(pool_size = 3)(tg_convolution_layer)
fg_convolution_layer = fg_convolution(embedding_left)
# fg_pooling = MaxPooling1D(pool_size = 4)(fg_convolution_layer)
sgram_convolution_layer = sgram_convolution(embedding_left)

merged_left = Concatenate(axis=2)([
    tg_convolution_layer,
    fg_convolution_layer,
    sgram_convolution_layer,
    embedding_left,
])
# The right branch convolves the RIGHT question's embeddings; previously it
# reused the left branch's feature maps.
merged_right = Concatenate(axis=2)([
    tg_convolution(embedding_right),
    fg_convolution(embedding_right),
    sgram_convolution(embedding_right),
    embedding_right,
])

In [None]:
import keras.backend as K

n_hidden_units = 30


def exponent_neg_manhattan_distance(left, right):
    """Return exp(-sum(|left - right|)), summed over axis 1 with that axis kept."""
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

# Encode both branches with the SAME LSTM layer so they share weights.
# This must run before the distance layer below, which consumes its outputs.
lstm = LSTM(n_hidden_units)
left_out = lstm(merged_left)
right_out = lstm(merged_right)

# Similarity metric for output
malstm_distance = Lambda(
    function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
    output_shape=lambda x: (x[0][0], 1))([left_out, right_out])

model = Model(inputs=[left_inputs, right_inputs], outputs=[malstm_distance])

In [None]:
from sklearn.model_selection import train_test_split
import itertools

# Hold out 20% of the rows for validation.
validation_set_size = round(0.20*len(df))
training_set_size = len(df) - validation_set_size

# Fixed seed so the train/validation split is identical on every re-run.
split_seed = 42

X = df[["question1","question2"]]
Y = df["labels"]

X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_set_size, random_state=split_seed)

# The model takes the two questions as separate inputs.
X_train = {
    'left': X_train.question1,
    'right': X_train.question2
}
X_validation = {
    'left': X_validation.question1,
    'right': X_validation.question2
}
Y_train = Y_train.values
Y_validation = Y_validation.values

# Stack each column of per-row index arrays into one 2-D array.
# The loop variable is named `split` so it does not overwrite the `dataset`
# path variable defined when the CSV was loaded.
for split, side in itertools.product([X_train, X_validation], ['left', 'right']):
    split[side] = pad_sequences(split[side],  maxlen=max_sentence_length)

# Sanity-check that the split produced the sizes computed above.
assert len(X_train['left']) == training_set_size
assert len(X_validation['right']) == validation_set_size

print(f"Training set size: {len(X_train['left'])}")
print(f"Validation set size: {len(X_validation['right'])}")

In [None]:
from keras.optimizers import Adadelta
from time import time
import datetime 
import tensorflow as tf

# Training hyper-parameters.
batch_size = 32
num_epoch = 10

# Plain gradient descent (learning rate 0.5) on a mean-squared-error loss.
sgd_optimizer = tf.train.GradientDescentOptimizer(0.5)
model.compile(loss='mean_squared_error', optimizer=sgd_optimizer, metrics=['accuracy'])

train_inputs = [X_train['left'], X_train['right']]
validation_inputs = [X_validation['left'], X_validation['right']]

# Time the whole fit so the total training duration can be reported.
start_time = time()
trained = model.fit(
    train_inputs,
    Y_train,
    batch_size=batch_size,
    epochs=num_epoch,
    validation_data=(validation_inputs, Y_validation),
)
elapsed = datetime.timedelta(seconds=time()-start_time)
print("Training time finished.\n{} epochs in {}".format(num_epoch, elapsed))