# Quora question pairs: training

## Import packages

In [1]:
%matplotlib inline
from __future__ import print_function
import numpy as np
import pandas as pd
import datetime, time, json, csv
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from sklearn.model_selection import train_test_split

  (fname, cnt))
  (fname, cnt))
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Initialize global variables

In [2]:
# --- Input artefacts read by this notebook ---
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

# --- Output artefact: checkpoint written during training and reloaded later ---
MODEL_WEIGHTS_FILE = 'question_pairs_weights.h5'

# --- Input / embedding geometry ---
MAX_SEQUENCE_LENGTH = 25  # length of each question input fed to the model
EMBEDDING_DIM = 300       # width of each word vector

# --- Data partitioning ---
TEST_SPLIT = 0.1          # fraction passed to train_test_split as test_size
VALIDATION_SPLIT = 0.1    # fraction passed to model.fit as validation_split
RNG_SEED = 13371447       # random_state for the train/test split

# --- Optimisation hyper-parameters ---
NB_EPOCHS = 25
BATCH_SIZE = 32
DROPOUT = 0.1             # rate used by every Dropout layer in the classifier

## Load the dataset, embedding matrix and word count

In [3]:
# Hand the paths straight to np.load so NumPy opens *and closes* each file
# itself. Wrapping them in a bare open(..., 'rb') left four file handles
# dangling until the garbage collector happened to reclaim them.
q1_data = np.load(Q1_TRAINING_DATA_FILE)
q2_data = np.load(Q2_TRAINING_DATA_FILE)
labels = np.load(LABEL_TRAINING_DATA_FILE)
word_embedding_matrix = np.load(WORD_EMBEDDING_MATRIX_FILE)

# The vocabulary size is stored under the 'nb_words' key of a small JSON file;
# the model cell sizes its Embedding layers as nb_words + 1.
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']

## Partition the dataset into train and test sets

In [4]:
# Stack the two question arrays along a new axis 1 so that a single call to
# train_test_split keeps every (question1, question2, label) triple together.
y = labels
X = np.stack([q1_data, q2_data], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RNG_SEED,
)

# Undo the stacking: index 0 on axis 1 is question 1, index 1 is question 2.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]

## Define the model

In [5]:
def _encode_question(token_ids):
    """Turn one question input tensor into a fixed-size vector.

    A fresh (non-shared) stack is created on every call: a frozen Embedding
    initialised from ``word_embedding_matrix``, a per-timestep ReLU Dense
    layer of width ``EMBEDDING_DIM``, and a max over axis 1.
    """
    embedded = Embedding(
        nb_words + 1,
        EMBEDDING_DIM,
        weights=[word_embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )(token_ids)
    projected = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(embedded)
    return Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(projected)


# One input per question, each of length MAX_SEQUENCE_LENGTH.
question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

# Each question gets its own encoder; the two branches do not share weights.
q1 = _encode_question(question1)
q2 = _encode_question(question2)

# Classifier head: four identical Dense -> Dropout -> BatchNormalization
# groups applied to the concatenated question vectors.
merged = concatenate([q1, q2])
for _group in range(4):
    merged = Dense(200, activation='relu')(merged)
    merged = Dropout(DROPOUT)(merged)
    merged = BatchNormalization()(merged)

# Single sigmoid unit, trained with binary cross-entropy below.
is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1, question2], outputs=is_duplicate)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Model Summary

In [6]:
# Print the layer-by-layer summary (output shapes, parameter counts and
# connections) of the compiled model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 25, 300)      28679100    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 25, 300)      28679100    input_2[0][0]                    
__________________________________________________________________________________________________
time_distr

## Train the model, checkpointing weights with best validation accuracy

In [7]:
# Write the model to MODEL_WEIGHTS_FILE only when the monitored validation
# accuracy is the best seen so far (save_best_only=True).
checkpoint = ModelCheckpoint(
    MODEL_WEIGHTS_FILE,
    monitor='val_acc',
    save_best_only=True,
)
callbacks = [checkpoint]

print("Starting training at", datetime.datetime.now())
t0 = time.time()

# validation_split holds back a fraction of the training arrays for validation;
# verbose=2 prints one summary line per epoch.
history = model.fit(
    [Q1_train, Q2_train],
    y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCHS,
    validation_split=VALIDATION_SPLIT,
    callbacks=callbacks,
    verbose=2,
)

t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2018-04-10 06:31:24.934199
Train on 327474 samples, validate on 36387 samples
Epoch 1/25
 - 139s - loss: 0.5383 - acc: 0.7279 - val_loss: 0.5159 - val_acc: 0.7512
Epoch 2/25
 - 139s - loss: 0.4890 - acc: 0.7603 - val_loss: 0.5015 - val_acc: 0.7600
Epoch 3/25
 - 138s - loss: 0.4593 - acc: 0.7785 - val_loss: 0.4420 - val_acc: 0.7827
Epoch 4/25
 - 138s - loss: 0.4355 - acc: 0.7929 - val_loss: 0.4400 - val_acc: 0.7836
Epoch 5/25
 - 138s - loss: 0.4173 - acc: 0.8041 - val_loss: 0.4292 - val_acc: 0.7940
Epoch 6/25
 - 138s - loss: 0.3996 - acc: 0.8152 - val_loss: 0.4170 - val_acc: 0.7990
Epoch 7/25
 - 138s - loss: 0.3856 - acc: 0.8233 - val_loss: 0.4234 - val_acc: 0.7969
Epoch 8/25
 - 138s - loss: 0.3737 - acc: 0.8303 - val_loss: 0.4170 - val_acc: 0.8003
Epoch 9/25
 - 138s - loss: 0.3651 - acc: 0.8353 - val_loss: 0.4099 - val_acc: 0.8046
Epoch 10/25
 - 139s - loss: 0.3537 - acc: 0.8421 - val_loss: 0.4158 - val_acc: 0.8022
Epoch 11/25
 - 138s - loss: 0.3476 - acc: 0.8455 -

## Print best validation accuracy and epoch

In [8]:
# Report the epoch whose validation accuracy is highest. When several epochs
# tie, pick the FIRST one: max() with a key returns the first maximal item,
# whereas the previous max((val, idx), ...) form broke ties towards the last
# epoch. A best-only checkpoint is expected to be overwritten only on a strict
# improvement, so the first maximal epoch is the one whose weights are on disk.
val_acc_per_epoch = history.history['val_acc']
idx, max_val_acc = max(enumerate(val_acc_per_epoch), key=lambda pair: pair[1])
print('Maximum accuracy at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(max_val_acc))

Maximum accuracy at epoch 25 = 0.8122


## Evaluate the model with best validation accuracy on the test partition

In [9]:
# Restore the checkpoint saved during training before scoring, so the numbers
# below describe the best-validation weights rather than the last epoch's.
model.load_weights(MODEL_WEIGHTS_FILE)

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_metrics = model.evaluate([Q1_test, Q2_test], y_test, verbose=0)
loss, accuracy = test_metrics
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))

loss = 0.4101, accuracy = 0.8111


# Generating prediction values for test.csv

In [10]:
# Score every question pair in test.csv and write one probability per test_id.
# Download and extract test.csv next to this notebook first, otherwise this
# cell cannot run.
TESTING_FILE = 'test.csv'
OUTPUT_FILE = "OUTPUT_FILE.csv"
MAX_NB_WORDS = 200000
# MAX_SEQUENCE_LENGTH is intentionally not redefined here: the value from the
# configuration cell is the one the model was built with.

print("Processing", TESTING_FILE)

# The raw text lists get their own names; `question1` / `question2` are the
# model's Input tensors in the model-definition cell and must not be clobbered.
testid = []
test_question1 = []
test_question2 = []

with open(TESTING_FILE, encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=',')
    for row in reader:
        testid.append(row['test_id'])
        test_question1.append(row['question1'])
        test_question2.append(row['question2'])

# NOTE(review): the tokenizer is fitted on the *test* questions, so its word
# indices are presumably unrelated to the rows of word_embedding_matrix (which
# was built elsewhere), and may exceed the Embedding's nb_words + 1 rows.
# TODO: reuse the word index produced when the training data was prepared.
questions = test_question1 + test_question2
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(test_question1)
question2_word_sequences = tokenizer.texts_to_sequences(test_question2)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

# Pad/truncate every sequence to the fixed length the model inputs expect.
q1_data_test = pad_sequences(question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
q2_data_test = pad_sequences(question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_id = np.array(testid, dtype=int)
print('Shape of id tensor:', test_id.shape)
print('Shape of question1 test data tensor:', q1_data_test.shape)
print('Shape of question2 test data tensor:', q2_data_test.shape)

# Reload the checkpoint so this cell always predicts with the saved weights.
model.load_weights(MODEL_WEIGHTS_FILE)
solution = model.predict([q1_data_test, q2_data_test], batch_size=BATCH_SIZE, verbose=2)

# newline='' is what the csv module requires for files it writes; without it
# some platforms emit an extra blank line after every row. A single writer
# handles both the header and the data rows.
with open(OUTPUT_FILE, "w", newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(["test_id", "probability"])
    # solution has one column; pair each id with its predicted probability.
    writer.writerows(zip(test_id, solution[:, 0]))

Processing test.csv
Words in index: 101312
Shape of id tensor: (2345796,)
Shape of question1 test data tensor: (2345796, 25)
Shape of question2 test data tensor: (2345796, 25)
