In [None]:
# Imports To Be Added

import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import datetime
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.models import load_model
from keras.layers import Input, Embedding, LSTM, Merge
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint


In [None]:
# Removing Dots from the Training data as well as Test data

def dot_Word_Adjustment(data):
    """Strip periods from dotted, digit-free tokens of ``data``.

    The text is split on whitespace. Every token that contains a ``.``,
    is longer than two characters and contains no digit is selected.
    Each selected token is then replaced, by plain substring replacement
    over the whole text, with the same token minus all of its periods.
    Tokens containing a digit (for example ``3.5``) are never selected.

    Args:
        data: the text to clean (a string).

    Returns:
        The text after all substitutions have been applied.
    """
    dotted_tokens = [
        token
        for token in data.split()
        if '.' in token and len(token) > 2 and not re.search(r'\d', token)
    ]
    # The tokens were collected from the untouched text; the substitutions
    # are applied one after another in their order of appearance.
    for token in dotted_tokens:
        data = data.replace(token, token.replace('.', ''))
    return data

    

In [None]:
# Preprocessing of Data and Convert it into WordList

def text_To_Word(text):
    """Normalise a piece of text and split it into a list of word tokens.

    The input is converted to ``str``, lower-cased, passed through
    ``dot_Word_Adjustment`` and then rewritten by an ordered list of regex
    substitutions (character filtering, contraction expansion, spacing
    around operators, a few hand-picked abbreviations). The result is
    split on whitespace.

    Args:
        text: the value to tokenise; any object is accepted because it is
            first passed through ``str``.

    Returns:
        A list of lower-case string tokens (possibly empty).
    """
    # Order matters: each rule sees the output of the previous one.
    substitutions = (
        # NOTE(review): "+-=" inside the class is a range ('+' through '='),
        # so besides the listed symbols it also keeps e.g. ':' ';' and '<'.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"\'s", " "),
        # Leading space so that "i've" becomes "i have", not "ihave".
        (r"\'ve", " have "),
        # "can't" must be expanded before the generic "n't" rule; otherwise
        # the generic rule rewrites it first and this rule never matches.
        (r"can't", "cannot "),
        # Leading space so that "don't" becomes "do not", not "donot".
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): "\0" is an octal escape for NUL in a Python regex, so
        # this matches NUL followed by "s", not a literal "0s". NUL has
        # already been replaced by the first rule, so this looks like a
        # no-op -- confirm the intent before changing it.
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        # A number followed by "k" is expanded to thousands: "50k" -> "50000".
        (r"(\d+)(k)", r"\g<1>000"),
        (r"\s{2,}", " "),
    )

    text = str(text).lower()
    text = dot_Word_Adjustment(text)  # strip periods from dotted tokens
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.split()

In [None]:
# Load the training and test data, encode every question as a list of
# vocabulary indices, and build the embedding matrix.

training_data_set = pd.read_csv('train.csv')
testing_data_set = pd.read_csv('test.csv')
Embedding_file = 'GoogleNews-vectors-negative300.bin'
questions_columns = ['question1', 'question2']
stops = set(stopwords.words('english'))  # English stop words
vocabulary = dict()  # word -> integer index
# Index 0 is reserved: '<unk>' is only a placeholder for the all-zero
# embedding row and is never assigned to a real word.
inverse_vocabulary = ['<unk>']  # integer index -> word
word2vec = KeyedVectors.load_word2vec_format(Embedding_file, binary=True)

for dataset in [training_data_set, testing_data_set]:
    # Collect the encoded questions per column and assign each column once.
    # This replaces the per-cell DataFrame.set_value calls (set_value was
    # deprecated and later removed from pandas) and does not depend on the
    # frame's index labels.
    encoded_columns = {question: [] for question in questions_columns}
    # Rows are walked in order, question1 before question2, so vocabulary
    # indices are handed out in the same order as a row-by-row pass.
    for row_values in zip(*(dataset[question] for question in questions_columns)):
        for question, raw_text in zip(questions_columns, row_values):
            question_in_numbers = []  # the question as a list of word indices
            for word in text_To_Word(raw_text):
                # Drop a stop word only when it also has no pretrained vector.
                # `word in word2vec` is used instead of the `.vocab` attribute
                # so the membership test does not depend on that attribute.
                if word in stops and word not in word2vec:
                    continue
                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    inverse_vocabulary.append(word)
                question_in_numbers.append(vocabulary[word])
            encoded_columns[question].append(question_in_numbers)
    for question in questions_columns:
        dataset[question] = pd.Series(
            encoded_columns[question], index=dataset.index, dtype=object)

embedding_dim = 300
# One row per vocabulary word plus the reserved row 0. Words without a
# pretrained vector keep their random initialisation.
embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)
embeddings[0] = 0  # So that the padding will be ignored

for word, index in vocabulary.items():
    if word in word2vec:
        embeddings[index] = word2vec[word]
del word2vec  # the loaded vectors are no longer needed

In [None]:
# Prepare the inputs for the Manhattan LSTM (MaLSTM) training model

# Longest encoded question over both columns of both data sets; every
# sequence is padded to this length below.
max_seq_length = max(training_data_set.question1.map(len).max(),
                     training_data_set.question2.map(len).max(),
                     testing_data_set.question1.map(len).max(),
                     testing_data_set.question2.map(len).max())

validation_size_data = 40000
# Fixed: this previously referenced the undefined name `validation_size`.
training_size = len(training_data_set) - validation_size_data

X = training_data_set[questions_columns]
Y = training_data_set['is_duplicate']

# Split the labelled data into a training part and a validation part
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size_data)

# Each input set is a dict holding the left (question1) and right (question2) side
X_train = {'left': X_train.question1, 'right': X_train.question2}
X_validation = {'left': X_validation.question1, 'right': X_validation.question2}
# Fixed: this previously referenced the undefined name `test_set`.
X_test = {'left': testing_data_set.question1, 'right': testing_data_set.question2}

# Convert labels to their numpy representations
Y_train = Y_train.values
Y_validation = Y_validation.values

# Zero padding, applied identically to the training, validation and test inputs
for dataset, side in itertools.product([X_train, X_validation, X_test], ['left', 'right']):
    dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length)

assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

In [None]:
# Build, compile and train the siamese MaLSTM model.
# NOTE(review): Merge(mode=<callable>) and the nb_epoch argument are legacy
# Keras APIs -- confirm that the installed Keras version still provides them.

# Model variables
n_hidden = 50
gradient_clipping_norm = 1.25
batch_size = 64
n_epoch = 26

# The visible layer: one integer-index sequence per side
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

# Frozen embedding layer initialised from the prepared embedding matrix
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)

# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model:
# exp(-sum(|left - right|)) over the hidden units, one value per pair
malstm_distance = Merge(mode=lambda x: K.exp(-K.sum(K.abs(x[0]-x[1]), axis=1, keepdims=True)), output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model
malstm = Model([left_input, right_input], [malstm_distance])

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Start training
print("Training Started")
# Fixed: `time()` was called here but never imported. The already-imported
# datetime module is used instead; subtracting two datetimes yields the
# same kind of timedelta that was printed before.
training_start_time = datetime.datetime.now()

malstm_trained = malstm.fit([X_train['left'], X_train['right']], Y_train, batch_size=batch_size, nb_epoch=n_epoch,
                            validation_data=([X_validation['left'], X_validation['right']], Y_validation))

print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.datetime.now() - training_start_time))


In [None]:
# Plot the training history: accuracy vs. epochs first, then loss vs. epochs.
# Each entry: (train key, validation key, title, y-axis label, legend location)
history_plots = (
    ('acc', 'val_acc', 'Model Accuracy', 'Accuracy', 'upper left'),
    ('loss', 'val_loss', 'Model Loss', 'Loss', 'upper right'),
)

for train_key, validation_key, plot_title, y_label, legend_location in history_plots:
    plt.plot(malstm_trained.history[train_key])
    plt.plot(malstm_trained.history[validation_key])
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc=legend_location)
    plt.show()

In [None]:
# Save the trained model for further use
# (load_model is already imported in the imports cell at the top of the
# notebook, so the duplicate import that used to sit here was dropped.)
malstm.save('my_qoura_model_final.h5')

# Load the saved model back from disk
model = load_model('my_qoura_model_final.h5')

# Run the model on the test question pairs. Despite its name, `accuracy`
# holds the raw model outputs returned by predict(), one per pair; no
# accuracy is computed here.
accuracy = model.predict([X_test['left'], X_test['right']])

In [None]:
# Write the output file: one row per test question pair, holding its
# test_id and the model output for that pair.

# Fixed: the row count previously came from the undefined name `test`.
n_test_rows = len(testing_data_set)
test_id = np.array(testing_data_set.test_id).reshape(n_test_rows, 1)
accuracy = accuracy.reshape(n_test_rows, 1)
data = np.concatenate((test_id, accuracy), axis=1)
# Per-column formats keep test_id an integer instead of e.g. "12.0000".
# comments='' stops np.savetxt from prefixing the header with "# ", which
# would otherwise corrupt the first column name of the CSV.
np.savetxt('final_tcs_ai_problem_data.csv', data, fmt=['%d', '%.4f'], delimiter=',',
           header="test_id,probability", comments='')