# Quora Question Pairs

The Goal of the this competition is to find the similarity of the question. i.e. We need to predict the probability that given two questions are duplicate. The Link of the problem is:
[Quora Question Pairs](https://www.kaggle.com/c/quora-question-pairs)

# 1. Import the required library

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation


import re

import matplotlib.pyplot as plt



from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, BatchNormalization, TimeDistributed, Input
from keras.layers import Lambda, Activation, Flatten, Conv1D, concatenate

from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras import initializers
from keras import backend as K


Using TensorFlow backend.


# 2. Prepare the Data for training

## 2.1 Read the Data

In [28]:
# Loading only 100 datapoint for the experiment. For full training remove [:100] from both the line
train = pd.read_csv("./quora-question-pairs/train.csv")[:100]
test = pd.read_csv("./quora-question-pairs/test.csv")[:100]

In [3]:
print(train.shape)
print(test.shape)

(100, 6)
(100, 3)


## 2.2 Process the Data

In [4]:
train = train.fillna('empty')
test = test.fillna('empty')

In [5]:
stop_words = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']

In [6]:
def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    # Clean the text, with the option to remove stop_words and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\0k ", "0000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text) 
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)  
    text = re.sub(r"demonitization", "demonetization", text) 
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text) 
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text) 
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text) 
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text) 
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [7]:
def process_questions(question_list, questions, question_list_name, dataframe):
    '''transform questions and display progress'''
    for question in questions:
        question_list.append(text_to_wordlist(question))
        if len(question_list) % 100000 == 0:
            progress = len(question_list)/len(dataframe) * 100
            print("{} is {}% complete.".format(question_list_name, round(progress, 1)))

In [8]:
train_question1 = []
process_questions(train_question1, train.question1, 'train_question1', train)



train_question2 = []
process_questions(train_question2, train.question2, 'train_question2', train)



test_question1 = []
process_questions(test_question1, test.question1, 'test_question1', test)

test_question2 = []
process_questions(test_question2, test.question2, 'test_question2', test)

In [9]:
lengths = []
for question in train_question1:
    lengths.append(len(question.split()))

for question in train_question2:
    lengths.append(len(question.split()))

# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])



lengths.counts.describe()


print(np.percentile(lengths.counts, 99.0))
print(np.percentile(lengths.counts, 99.4))
print(np.percentile(lengths.counts, 99.5))
print(np.percentile(lengths.counts, 99.9))

22.0
22.0
22.024999999999977
26.00500000000008


In [10]:
# tokenize the words for all of the questions
all_questions = train_question1 + train_question2 + test_question1 + test_question2
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_questions)
print("Fitting is complete.")
train_question1_word_sequences = tokenizer.texts_to_sequences(train_question1)
print("train_question1 is complete.")
train_question2_word_sequences = tokenizer.texts_to_sequences(train_question2)
print("train_question2 is complete")

Fitting is complete.
train_question1 is complete.
train_question2 is complete


In [11]:
test_question1_word_sequences = tokenizer.texts_to_sequences(test_question1)
print("test_question1 is complete.")
test_question2_word_sequences = tokenizer.texts_to_sequences(test_question2)
print("test_question2 is complete.")

test_question1 is complete.
test_question2 is complete.


In [12]:
word_index = tokenizer.word_index
print("Words in index: %d" % len(word_index))

Words in index: 1325


## 2.3 Prepare the Train and Test

Following variable store the training and test data.
<ol>
    <li> ** Train ** </li>
    <ol>
    <li> <b> train_q1 :</b> Contains first question</li>
    <li> <b> train_q2 :</b> Contains Second question</li>
    </ol>
    <li> ** Test ** </li>
    <ol>
    <li> <b> train_q1 :</b> Contains first question</li>
    <li> <b> train_q1 :</b> Contains second question</li>
    </ol>    
</ol>

In [13]:
# Pad the questions so that they all have the same length.

max_question_len = 36

train_q1 = pad_sequences(train_question1_word_sequences, 
                              maxlen = max_question_len)
print("train_q1 is complete.")

train_q2 = pad_sequences(train_question2_word_sequences, 
                              maxlen = max_question_len)
print("train_q2 is complete.")

train_q1 is complete.
train_q2 is complete.


In [14]:
test_q1 = pad_sequences(test_question1_word_sequences, 
                             maxlen = max_question_len,
                             padding = 'post',
                             truncating = 'post')
print("test_q1 is complete.")

test_q2 = pad_sequences(test_question2_word_sequences, 
                             maxlen = max_question_len,
                             padding = 'post',
                             truncating = 'post')
print("test_q2 is complete.")

test_q1 is complete.
test_q2 is complete.


In [15]:
y_train = train.is_duplicate

In [16]:
embedding_dim = 300
nb_words = len(word_index)

# 3. Create the model

In [17]:
# Setting the parameter

units = 128 # Number of nodes in the Dense layers
dropout = 0.25 # Percentage of nodes to drop
nb_filter = 32 # Number of filters to use in Convolution1D
filter_length = 3 # Length of filter for Convolution1D

In [18]:
inp1 = Input(shape=(max_question_len,), name='input_1')
X = Embedding(input_dim= nb_words+1, 
              output_dim= embedding_dim, 
              input_length=max_question_len, 
              trainable= False
             )(inp1)
X = Conv1D(filters= nb_filter, kernel_size= filter_length, padding='same')(X)
X = BatchNormalization()(X)
X = Activation('relu')(X)
X = Dropout(dropout)(X)

X = Conv1D(filters= nb_filter, kernel_size= filter_length, padding='same')(X)
X = BatchNormalization()(X)
X = Activation('relu')(X)
X = Dropout(dropout)(X)
out1 = Flatten()(X)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [19]:
inp2 = Input(shape=(max_question_len,), name='input_2')
X = Embedding(input_dim= nb_words+1, 
              output_dim= embedding_dim, 
              input_length=max_question_len, 
              trainable= False
             )(inp2)
X = Conv1D(filters= nb_filter, kernel_size= filter_length, padding='same')(X)
X = BatchNormalization()(X)
X = Activation('relu')(X)
X = Dropout(dropout)(X)

X = Conv1D(filters= nb_filter, kernel_size= filter_length, padding='same')(X)
X = BatchNormalization()(X)
X = Activation('relu')(X)
X = Dropout(dropout)(X)
out2 = Flatten()(X)

In [20]:
inp3 = Input(shape=(max_question_len,), name='input_3')
X = Embedding(input_dim= nb_words+1, 
              output_dim= embedding_dim, 
              input_length=max_question_len, 
              trainable= False
             )(inp3)
X = TimeDistributed(Dense(embedding_dim))(X)
X = BatchNormalization()(X)
X = Activation('relu')(X)
X = Dropout(dropout)(X)
out3 = Lambda(lambda x: K.max(x, axis=1), output_shape=(embedding_dim,))(X)

In [21]:
inp4 = Input(shape=(max_question_len,), name='input_4')
X = Embedding(input_dim= nb_words+1, 
              output_dim= embedding_dim, 
              input_length=max_question_len, 
              trainable= False
             )(inp4)
X = TimeDistributed(Dense(embedding_dim))(X)
X = BatchNormalization()(X)
X = Activation('relu')(X)
X = Dropout(dropout)(X)
out4 = Lambda(lambda x: K.max(x, axis=1), output_shape=(embedding_dim,))(X)

In [22]:
merge12 = concatenate([out1, out2], name='merge_layer_1')
X = Dense(units=units*2)(merge12)
X = BatchNormalization()(X)
X = Activation('relu')(X)
X = Dropout(dropout)(X)

X = Dense(units=units)(X)
X = BatchNormalization()(X)
X = Activation('relu')(X)
model12 = Dropout(dropout)(X)

In [23]:
merge34 = concatenate([out3, out4], name='merge_layer_2')
X = Dense(units=units*2)(merge34)
X = BatchNormalization()(X)
X = Activation('relu')(X)
X = Dropout(dropout)(X)

X = Dense(units=units)(X)
X = BatchNormalization()(X)
X = Activation('relu')(X)
model34 = Dropout(dropout)(X)

In [24]:
merged = concatenate([model12, model34], name='final_merge')
X = Dense(units=units*2)(merged)
X = BatchNormalization()(X)
X = Activation('relu')(X)
X = Dropout(dropout)(X)

X = Dense(units=units)(X)
X = BatchNormalization()(X)
X = Activation('relu')(X)
X = Dropout(dropout)(X)

X = Dense(units=units)(X)
X = BatchNormalization()(X)
X = Activation('relu')(X)
X = Dropout(dropout)(X)

X = Dense(units=1)(X)
X = BatchNormalization()(X)
out = Activation('sigmoid')(X)


In [25]:
model = Model(inputs=[inp1, inp2, inp3, inp4], outputs=out)


In [26]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
from keras.utils import plot_model
plot_model(model, to_file='model.png')

In [None]:
save_best_weights = 'question_pairs_weights.h5'

callbacks = [ModelCheckpoint(save_best_weights, monitor='val_loss', save_best_only=True),
             EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto')]

history = model.fit([train_q1, train_q2, train_q1, train_q2],
                    y_train,
                    batch_size=256,
                    epochs=2, 
                    validation_split=0.15,
                    verbose=True,
                    shuffle=True,
                    callbacks=callbacks)

In [None]:
# Aggregate the summary statistics
summary_stats = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                              'train_acc': history.history['acc'],
                              'valid_acc': history.history['val_acc'],
                              'train_loss': history.history['loss'],
                              'valid_loss': history.history['val_loss']})


In [None]:
summary_stats

In [None]:
plt.plot(summary_stats.train_loss) # blue
plt.plot(summary_stats.valid_loss) # green
plt.show()

In [None]:
# Find the minimum validation loss during the training
min_loss, idx = min((loss, idx) for (idx, loss) in enumerate(history.history['val_loss']))
print('Minimum loss at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(min_loss))
min_loss = round(min_loss, 4)

In [None]:
# Make predictions with the best weights
#model.load_weights(save_best_weights)
predictions = model.predict([test_q1, test_q2, test_q1, test_q2], verbose = True, batch_size=512)

In [None]:
#Create submission
submission = pd.DataFrame(predictions, columns=['is_duplicate'])
submission.insert(0, 'test_id', test.test_id)
file_name = 'submission_{}.csv'.format(min_loss)
submission.to_csv(file_name, index=False)

In [None]:
submission.head()

In [None]:
model.save('quora_model.h5py')