In [2]:
import pandas as pd
# Load the Quora duplicate-question pairs (tab-separated file) into a DataFrame.
dataset = "../../datasets/quora_duplicate_questions.tsv"
data = pd.read_csv(dataset, delimiter="\t")

In [3]:
# Preview the first ten rows of the raw question pairs.
data.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [4]:
import csv
import pandas as pd

# Read in GloVe word embeddings.
# Each path points at the file of the matching dimensionality. Previously the
# 50d entry referenced the 100d file, and the 50d/100d/200d paths ended in
# ".text" while the path that is actually loaded ends in ".txt".
embeddings_50d = "../../word_embeddings/glove.6B/glove.6B.50d.txt"
embeddings_100d = "../../word_embeddings/glove.6B/glove.6B.100d.txt"
embeddings_200d = "../../word_embeddings/glove.6B/glove.6B.200d.txt"
embeddings_300d = "../../word_embeddings/glove.6B/glove.6B.300d.txt"

# One row per token: the token becomes the index, the remaining columns are
# the vector components.
#  - sep=r"\s+" replaces the deprecated delim_whitespace=True.
#  - QUOTE_NONE keeps quote characters that appear as tokens from being
#    interpreted as field quoting.
#  - keep_default_na=False keeps tokens such as "null" or "nan" as strings
#    instead of letting pandas turn them into NaN index labels.
embeddings = pd.read_table(
    embeddings_300d,
    sep=r"\s+",
    index_col=0,
    header=None,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

# Derived from the loaded table so it cannot drift from the selected file.
embedding_dim = embeddings.shape[1]

In [6]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

# Vocabulary in embedding-row order. str() guards against index labels that
# are not strings (presumably tokens pandas parsed as NaN -- verify).
ws = [str(word) for word in embeddings.axes[0]]

# token -> row index, for O(1) lookups. Iterating in reverse lets the earliest
# position win when a token is duplicated, matching list.index() semantics.
# The previous `ws.index(word) if word in ws` scanned the whole vocabulary
# list twice for every token of every question.
word_to_index = {
    token: position for position, token in reversed(list(enumerate(ws)))
}


def process_sentence(sentence):
    """Convert a sentence into a list of embedding row indices.

    The sentence is coerced to ``str`` and tokenised with Keras'
    ``text_to_word_sequence``. Each token is replaced by its position in
    ``ws``; tokens that are not in the vocabulary map to 0.

    NOTE(review): 0 is also the row index of the first vocabulary entry, so
    unknown tokens are indistinguishable from that word (and from zero
    padding, if sequences are later padded with 0).

    Args:
        sentence: Raw question text; any object accepted by ``str()``.

    Returns:
        list[int]: One index per token, in order.
    """
    word_sequence = text_to_word_sequence(str(sentence))
    return [word_to_index.get(word, 0) for word in word_sequence]

# Encode both question columns as lists of embedding indices.
q1s = list(map(process_sentence, data["question1"]))
q2s = list(map(process_sentence, data["question2"]))

# Assemble the encoded pairs together with their duplicate labels.
encoded_columns = {
    "question1" : q1s,
    "question2" : q2s,
    "labels" : data["is_duplicate"],
}
df = pd.DataFrame(encoded_columns)

In [7]:
# Show a bounded preview rather than dumping the whole encoded frame.
df.head(10)

Unnamed: 0,question1,question2,labels
0,"[102, 14, 0, 1065, 21, 1065, 3372, 4, 4280, 6,...","[102, 14, 0, 1065, 21, 1065, 3372, 4, 4280, 6,...",0
1,"[102, 14, 0, 523, 3, 180152, 8849, 41, 16359, ...","[102, 54, 1927, 83, 0, 792, 78, 8396, 0, 18015...",0
2,"[197, 86, 41, 686, 0, 1512, 3, 192, 925, 2540,...","[197, 86, 925, 1512, 30, 1041, 21, 20463, 131,...",0
3,"[738, 913, 41, 7278, 191, 10678, 197, 86, 41, ...","[596, 0, 6345, 61, 7460, 1021, 795, 7460, 14, ...",0
4,"[42, 48, 13151, 6, 430, 0, 3191, 2982, 16904, ...","[42, 2120, 54, 3981, 6, 2982, 430]",0
5,"[29593, 41, 913, 7, 51980, 1662, 3539, 3377, 5...","[0, 7, 4026, 51980, 1662, 3377, 5, 56141, 6, 5...",1
6,"[189, 41, 987, 44842]","[102, 4215, 0, 1546, 5, 372, 25, 1264, 5, 974,...",0
7,"[197, 86, 41, 30, 7, 219, 21010]","[102, 189, 41, 88, 4, 30, 7, 353, 21010]",1
8,"[61, 88, 81, 234, 0, 773, 3, 0]","[61, 88, 81, 234, 773, 3, 5]",0
9,"[10469, 128, 86, 41, 22048, 192, 3812, 0, 0]","[197, 88, 41, 22048, 10469, 0, 10, 415, 925]",0


In [23]:
# Longest encoded question (in tokens) across BOTH question columns.
# The previous version measured len(str(question)), i.e. the character length
# of the list's printed form, and max(list_a, list_b) compared the two lists
# lexicographically, so only one column was actually considered.
max_question_length = max(
    len(question)
    for column in (df.question1, df.question2)
    for question in column
)
print(f"Max length: {max_question_length}")

15
15
15
13
16
20
5
8
9
13


[None, None, None, None, None, None, None, None, None, None]

In [None]:
from sklearn.model_selection import train_test_split
import itertools

# Hold out 10% of the rows (as an absolute count) for validation.
validation_set_size = round(0.10 * len(df))
training_set_size = len(df) - validation_set_size

X = df[["question1", "question2"]]
Y = df["labels"]

# A fixed random_state makes the split reproducible across kernel restarts.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_set_size, random_state=42)

# The model takes the two questions as separate inputs, so split each frame
# into its 'left' and 'right' side.
X_train = {
    'left': X_train.question1,
    'right': X_train.question2
}
X_validation = {
    'left': X_validation.question1,
    'right': X_validation.question2
}
Y_train = Y_train.values
Y_validation = Y_validation.values

# Pad every sequence to the common length expected by the model inputs.
# The loop variable is named `split` so it does not overwrite `dataset`, the
# path of the source TSV defined in the first cell.
for split, side in itertools.product([X_train, X_validation], ['left', 'right']):
    split[side] = pad_sequences(split[side], maxlen=max_question_length)


In [None]:
from keras.layers import Input, Dense, Embedding, LSTM, Lambda
from keras.models import Model
import keras.backend as K

def exponent_neg_manhattan_distance(left, right):
    """Return exp(-L1 distance) between ``left`` and ``right``.

    The absolute element-wise difference is summed over axis 1 (keeping that
    axis with size 1) and the negated sum is exponentiated, so identical
    inputs give 1 and the value decays towards 0 as they move apart.
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

# Size of the LSTM state used to encode each question.
n_hidden_units = 32

# One integer-index sequence per question of the pair.
left_input = Input(shape=(max_question_length,), dtype='int32')
right_input = Input(shape=(max_question_length,), dtype='int32')

# Shared embedding layer initialised from the loaded GloVe matrix and frozen.
# Previously the layer was created without weights, so the pretrained vectors
# were loaded but never used and training started from random embeddings.
# NOTE(review): `weights=` / `input_length=` are the Keras 2.x-style arguments
# this notebook already relies on -- confirm against the installed version.
embedding_layer = Embedding(
    len(embeddings),
    embedding_dim,
    weights=[embeddings.values],
    input_length=max_question_length,
    trainable=False)
embedded_left = embedding_layer(left_input)
embedded_right = embedding_layer(right_input)

# A single LSTM instance is applied to both sides, so both questions are
# encoded with the same weights.
lstm = LSTM(n_hidden_units)

left_out = lstm(embedded_left)
right_out = lstm(embedded_right)

# Similarity of the two encodings: exp(-L1 distance), one value per pair.
similarity_function = Lambda(
    function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
    output_shape=lambda x: (x[0][0], 1))([left_out, right_out])

model = Model(inputs=[left_input, right_input], outputs=[similarity_function])

In [None]:
from keras.optimizers import Adadelta
from time import time
import datetime
import tensorflow as tf

batch_size = 64
num_epoch = 10

# Regress the similarity output onto the 0/1 duplicate label.
model.compile(
    loss='mean_squared_error',
    optimizer=tf.train.GradientDescentOptimizer(0.5),
    metrics=['accuracy'])

start_time = time()
# `epochs` previously referenced the undefined name `n_epoch`, which raised
# NameError before training could start; the variable defined above is
# `num_epoch`.
trained = model.fit(
    [X_train['left'], X_train['right']],
    Y_train,
    batch_size=batch_size,
    epochs=num_epoch,
    validation_data=([X_validation['left'], X_validation['right']],
                     Y_validation))
print(
    f"Training time finished. {num_epoch} epochs in {datetime.timedelta(seconds=time() - start_time)}."
)

In [None]:
# Persist the model to the relative path below (resolved against the kernel's
# working directory).
# NOTE(review): the on-disk format chosen for a path without an extension
# depends on the Keras version -- confirm before relying on it.
model.save("semantic-similarity-v1")