In [2]:
import pandas as pd
import os
# Read in data from dataset
dataset = os.path.expanduser('~/data/question_pair_dataframe.csv')
data = pd.read_csv(dataset,sep=',')

In [3]:
data[0:10]

Unnamed: 0,question1,question2,labels
0,Why did your brown dogs have black puppies?,Who contributed to non-euclidean?,0
1,What age boy dogs able to create puppies?,What is the most recent processor for the desk...,0
2,How do earth formed according to science?,The science concered with earth and its place ...,1
3,How long is opened corked wine good for?,How long is wine good after it is opened?,1
4,People contribute in the development of computer?,Who are the key people in computer development?,1
5,What is the most recent desktop processor by amd?,Exclamation point horseshoe?,0
6,What does key result areas?,How many calories is Coconut rum with Pineapple?,0
7,What is the function os vesicles?,What do vesicles function?,1
8,How is aerobic capacity expressed?,Average number of completed laps of aerobic ca...,1
9,What does the yellow light mean when it turns ...,When did the new jersey devils win the stanley...,0


In [4]:
import csv

# Read in GloVe word embeddings
embeddings_50d = "../../word_embeddings/glove.6B/glove.6B.100d.text"
embeddings_100d = "../../word_embeddings/glove.6B/glove.6B.100d.text"
embeddings_200d = "../../word_embeddings/glove.6B/glove.6B.200d.text"
embeddings_300d = "../../word_embeddings/glove.6B/glove.6B.300d.txt"

embeddings = pd.read_table(embeddings_300d, delim_whitespace=True, index_col=0, header=None, quoting=csv.QUOTE_NONE)
embedding_dim = 300

In [7]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

ws = [str(word) for word in embeddings.axes[0]]

def process_sentence(sentence):
    word_sequence = text_to_word_sequence(str(sentence))
    return [ws.index(word) if word in ws else 0 for word in word_sequence]

q1s = [process_sentence(sentence) for sentence in data.question1]
q2s = [process_sentence(sentence) for sentence in data.question2]

df = pd.DataFrame({
    "question1" : q1s,
    "question2" : q2s,
    "labels" : data.labels
})

In [8]:
max_question_length = max(max([len(str(question)) for question in df.question1],[len(str(question)) for question in df.question2]))
print(f"Max length: {max_question_length}")

Max length: 204


In [9]:
from sklearn.model_selection import train_test_split
import itertools

validation_set_size = 0.20*len(df)
training_set_size = len(df) - validation_set_size

X = df[["question1","question2"]]
Y = df["labels"]

X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,test_size=validation_set_size)

X_train = {
    'left': X_train.question1, 
    'right': X_train.question2
}
X_validation = {
    'left': X_validation.question1, 
    'right': X_validation.question2
}
Y_train = Y_train.values
Y_validation = Y_validation.values

for dataset, side in itertools.product([X_train, X_validation], ['left', 'right']):
    dataset[side] = pad_sequences(dataset[side],  maxlen=max_question_length)


ModuleNotFoundError: No module named 'sklearn'

In [None]:
from keras.layers import Input, Dense, Embedding, LSTM, Lambda
from keras.models import Model
import keras.backend as K

def exponent_neg_manhattan_distance(left, right):
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

n_hidden_units = 32;

left_input = Input(shape=(max_question_length,), dtype='int32')
right_input = Input(shape=(max_question_length,), dtype='int32')

embedding_layer = Embedding(len(embeddings), embedding_dim, input_length=max_question_length)
embedded_left = embedding_layer(left_input)
embedded_right = embedding_layer(right_input)

lstm = LSTM(n_hidden_units)

left_out = lstm(embedded_left)
right_out = lstm(embedded_right)

malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_out, right_out])

model = Model(inputs=[left_input, right_input], outputs=[malstm_distance])

In [None]:
from keras.optimizers import Adadelta
from time import time
gradient_clipping_norm = 1.25
batch_size = 32
n_epoch = 10

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

start_time = time()
trained = model.fit(
    [X_train['left'], X_train['right']], 
    Y_train, 
    batch_size=batch_size, 
    epochs=n_epoch,
    validation_data=([X_validation['left'], X_validation['right']], Y_validation)
)
print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-start_time)))