In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Bidirectional, Lambda
import keras.backend as K
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt

np.random.seed(seed=69)

In [2]:
train_data = pd.read_csv("../input/quora-question-pairs/train.csv.zip")
print(train_data.shape)

(404290, 6)


In [3]:
random_indices_train = np.random.choice(train_data.shape[0], 250000, replace=False)
train_X = train_data.iloc[random_indices_train[:200000]]
test_X = train_data.iloc[random_indices_train[200000:]]

In [4]:
def text_to_word_list(text):
    
    text = str(text)
    text = text.lower()

    
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

In [5]:
word2idx = {}

In [6]:
def words_to_indices(words_list):
    indices_list = []
    for w in words_list:
        if w in word2idx:
            indices_list.append(word2idx[w])
        else:
            indices_list.append(word2idx["<unk>"])
    return indices_list

In [7]:
for q in ["question1", "question2"]:
    for i in train_X.index:
        indices = []
        for w in text_to_word_list(train_X[q][i]):
            if w in word2idx:
                indices.append(word2idx[w])
            else:
                word2idx[w] = len(word2idx)+1
                indices.append(word2idx[w])
        train_X.at[i, q] = indices
word2idx["<unk>"] = len(word2idx)+1
print(len(word2idx))

63144


In [8]:
for q in ["question1", "question2"]:
    for i in test_X.index:
        indices = words_to_indices(text_to_word_list(test_X[q][i]))
        test_X.at[i, q] = indices

In [9]:
max_seq_length = max(train_X.question1.map(lambda x: len(x)).max(),
                     train_X.question2.map(lambda x: len(x)).max(),
                     test_X.question1.map(lambda x: len(x)).max(),
                     test_X.question2.map(lambda x: len(x)).max())
print(max_seq_length)

244


In [10]:
X = train_X[["question1", "question2"]]
X = {"left":X["question1"], "right":X["question2"]}
Y = train_X["is_duplicate"].values
X_test = test_X[["question1", "question2"]]
X_test = {"left":X_test["question1"], "right":X_test["question2"]}
Y_test = test_X["is_duplicate"].values
print(Y.shape, Y_test.shape)

(200000,) (50000,)


In [11]:
for dataset in [X, X_test]:
    for side in ["left", "right"]:
        dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length, padding="post", value=0)


In [12]:
def exponent_neg_manhattan_distance(left, right):
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

In [13]:
def make_model(n_hidden=100, embedding_size=150):
    left_input = Input(shape=(max_seq_length, ), dtype="int32")
    right_input = Input(shape=(max_seq_length, ), dtype="int32")
    
    embedding_layer = Embedding(input_dim=1+len(word2idx), output_dim=embedding_size, input_length=max_seq_length)
    encoded_left = embedding_layer(left_input)
    encoded_right = embedding_layer(right_input)
    
    siamese_LSTM = Bidirectional(LSTM(n_hidden))
    
    left_output = siamese_LSTM(encoded_left)
    right_output = siamese_LSTM(encoded_right)
    
    distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])
    
    model = Model(inputs=[left_input, right_input], outputs=distance)
    
    return model

In [14]:
opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model = make_model()

In [15]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 244)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 244)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 244, 150)     9471750     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 200)          200800      embedding[0][0]       

In [16]:
model.compile(optimizer=opt, metrics=["accuracy"], loss="binary_crossentropy")

In [17]:
batch_size=128
epochs=25
early_stopper = EarlyStopping(patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint("ckpt_model.hdf5", save_best_only=True)

In [18]:
model_trained = model.fit([X["left"], X["right"]], Y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=[early_stopper, model_checkpoint])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25


In [19]:
model.evaluate([X_test["left"], X_test["right"]], Y_test)



[0.43732401728630066, 0.814740002155304]

In [20]:
model.save("my_model.hdf5")