In [None]:
import pandas as pd
import os

# Load the question-pair CSV that lives under the current user's home directory.
dataset = os.path.expanduser("~/data/question_pair_dataframe.csv")
data = pd.read_csv(filepath_or_buffer=dataset, sep=",")

In [None]:
# Preview the first ten rows of the raw dataset.
data.head(10)

In [None]:
import csv

# Read in GloVe word embeddings.
# One path per available dimensionality. All four use the ".txt" extension of
# the file that is actually loaded below, and each name matches its dimension.
embeddings_50d = "../../word_embeddings/glove.6B/glove.6B.50d.txt"
embeddings_100d = "../../word_embeddings/glove.6B/glove.6B.100d.txt"
embeddings_200d = "../../word_embeddings/glove.6B/glove.6B.200d.txt"
embeddings_300d = "../../word_embeddings/glove.6B/glove.6B.300d.txt"

# Each line is "<word> <v1> ... <vN>" separated by whitespace; the word becomes
# the row index. QUOTE_NONE stops tokens containing quote characters from being
# treated as quoted fields. keep_default_na=False stops vocabulary words such
# as "null", "nan" or "NA" from being turned into NaN index entries.
embeddings = pd.read_table(
    embeddings_300d,
    sep=r"\s+",
    index_col=0,
    header=None,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

# Derive the vector width from the loaded table so it cannot drift from the file.
embedding_dim = embeddings.shape[1]

The pre-processing step below takes about 4 hours to complete for ~2.5 million question pairs.

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

# Vocabulary in embedding-row order: ws[i] is the word whose vector is row i.
ws = [str(word) for word in embeddings.axes[0]]

# Word -> row-index lookup table. setdefault keeps the FIRST position of any
# repeated word, which is exactly what ws.index(word) would return, but each
# lookup is now a constant-time dict access instead of a scan of the whole list.
word_to_index = {}
for position, word in enumerate(ws):
    word_to_index.setdefault(word, position)


def process_sentence(sentence):
    """Convert a sentence into a list of embedding row indices.

    The sentence is coerced to ``str`` and tokenised with Keras'
    ``text_to_word_sequence``; each token is replaced by its row index in
    ``embeddings``.

    Args:
        sentence: The raw question text (any object; ``str()`` is applied).

    Returns:
        list[int]: One index per token. Tokens that are not in the vocabulary
        map to 0.
    """
    word_sequence = text_to_word_sequence(str(sentence))
    # NOTE(review): 0 is also the index of the first vocabulary word, so
    # unknown tokens are indistinguishable from that word -- confirm whether a
    # dedicated out-of-vocabulary index is wanted.
    return [word_to_index.get(word, 0) for word in word_sequence]

# Encode every question in both columns as a list of embedding indices.
q1s = list(map(process_sentence, data.question1))
q2s = list(map(process_sentence, data.question2))

# Gather the encoded pairs together with their labels in one frame.
encoded_columns = {"question1": q1s, "question2": q2s, "labels": data.labels}
df = pd.DataFrame(encoded_columns)

In [None]:
# Longest encoded question, measured in tokens, across BOTH question columns.
# Every question from both columns goes through a single max(), because
# max(list_a, list_b) would pick one of the two lists by lexicographic
# comparison and ignore the other. len(question) counts tokens directly;
# len(str(question)) would count the characters of the list's printed form.
# default=0 keeps an empty frame from raising.
max_question_length = max(
    (len(question)
     for column in (df.question1, df.question2)
     for question in column),
    default=0,
)
print(f"Max length: {max_question_length}")

In [None]:
from sklearn.model_selection import train_test_split
import itertools

# Hold out 20% of the question pairs for validation.
validation_set_size = round(0.20 * len(df))
training_set_size = len(df) - validation_set_size

X = df[["question1", "question2"]]
Y = df["labels"]

# A fixed random_state makes the split reproducible across kernel restarts.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_set_size, random_state=42)

# The model takes the two questions as separate inputs, so each split is
# stored as a dict with one entry per side.
X_train = {
    'left': X_train.question1,
    'right': X_train.question2
}
X_validation = {
    'left': X_validation.question1,
    'right': X_validation.question2
}
Y_train = Y_train.values
Y_validation = Y_validation.values

# Pad every side of every split to one common length. The loop variable is
# named `split` so that it does not overwrite the `dataset` path variable
# defined when the CSV was loaded.
for split, side in itertools.product([X_train, X_validation], ['left', 'right']):
    split[side] = pad_sequences(split[side], maxlen=max_question_length)

# X_train / X_validation are dicts with two keys, so their len() is always 2;
# report the number of samples from the label arrays instead.
print(f"Training set size: {len(Y_train)}")
print(f"Validation set size: {len(Y_validation)}")

In [None]:
from keras.layers import Input, Dense, Embedding, LSTM, Lambda
from keras.models import Model
import keras.backend as K


def exponent_neg_manhattan_distance(left, right):
    """Return exp(-d), where d is the L1 distance between ``left`` and ``right``.

    The absolute element-wise difference is summed along axis 1 with
    ``keepdims=True``, then negated and exponentiated, so the result is
    never greater than 1.
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)


# Size of the LSTM output vector for each question.
n_hidden_units = 32

# One integer-index input per question of the pair.
left_input = Input(shape=(max_question_length, ), dtype='int32')
right_input = Input(shape=(max_question_length, ), dtype='int32')

# Shared embedding layer initialised with the loaded GloVe vectors. Row i of
# `embeddings` is the vector for index i, matching the indices produced during
# pre-processing. Without `weights=` the layer would start from random values
# and the pre-trained vectors would never be used. trainable=False keeps the
# pre-trained vectors fixed during training.
embedding_layer = Embedding(
    len(embeddings),
    embedding_dim,
    weights=[embeddings.values],
    input_length=max_question_length,
    trainable=False)
embedded_left = embedding_layer(left_input)
embedded_right = embedding_layer(right_input)

# A single LSTM instance is applied to both sides, so both branches share
# the same weights.
lstm = LSTM(n_hidden_units)
left_out = lstm(embedded_left)
right_out = lstm(embedded_right)

# Combine the two LSTM outputs into one value per pair via exp(-L1 distance).
malstm_distance = Lambda(
    function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
    output_shape=lambda x: (x[0][0], 1))([left_out, right_out])

model = Model(inputs=[left_input, right_input], outputs=[malstm_distance])

On a 6-core i9, the cell below takes many hours per epoch.

In [None]:
from keras.optimizers import Adadelta
from time import time
import datetime
import tensorflow as tf

# Training hyper-parameters.
batch_size = 32
num_epoch = 10

# NOTE(review): Adadelta is imported above but a plain TensorFlow gradient
# descent optimizer is what gets used here -- confirm which one is intended.
model.compile(
    optimizer=tf.train.GradientDescentOptimizer(0.5),
    loss='mean_squared_error',
    metrics=['accuracy'])

# Inputs are passed as [left, right] to match the order of the model's inputs.
train_inputs = [X_train['left'], X_train['right']]
validation_inputs = [X_validation['left'], X_validation['right']]

# Time the whole fit so the total training duration can be reported.
start_time = time()
trained = model.fit(
    x=train_inputs,
    y=Y_train,
    batch_size=batch_size,
    epochs=num_epoch,
    validation_data=(validation_inputs, Y_validation))
print(
    f"Training finished. {num_epoch} epochs in {datetime.timedelta(seconds=time() - start_time)}."
)