# Final Neural Network Architecture

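This notebook builds, trains and evaluates the final model: a siamese bidirectional LSTM over pre-trained GloVe embeddings, combined with a dense layer over precomputed features, and then uses it to predict the Kaggle test set.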

## Download missing files

In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2018-12-19 15:16:42--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2018-12-19 15:16:42--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2018-12-19 15:18:11 (9.25 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


## Import stuff

In [0]:
import glob
import os
import re

import pandas as pd
import numpy as np
from keras.layers.embeddings import Embedding
from keras.layers import Dropout, BatchNormalization, Input, Embedding, Flatten, concatenate, Dense, LSTM, subtract, multiply, Bidirectional, Lambda
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import keras.backend as keras_backend

## Setup Hyperparameters

In [0]:
# --- Text input / embeddings ---
GLOVE_FILE = 'glove.6B.300d.txt'  # pre-trained vectors read when building the embedding matrix
EMBEDDING_SIZE = 300              # output dimension of the Embedding layer
SEQ_LENGTH = 25                   # every question is brought to this many tokens

# --- Network shape ---
LSTM_UNITS = 256
FEAT_DENSE_OUTPUT, FEAT_ACTIVATION = 64, 'relu'               # dense layer over the precomputed features
MERGED_DENSE_OUTPUT, MERGED_DENSE_ACTIVATION = 200, 'relu'    # dense layers after the merge
DROPOUT_RATE = 0.1

# --- Training ---
LOSS, OPTIMIZER = 'binary_crossentropy', 'adam'
EPOCHS, BATCH_SIZE = 25, 256

# --- Output files ---
# The checkpoint name is a template that is filled in per saved epoch.
CHECKPOINT_FILE = 'final-{epoch:02d}-{val_loss:.2f}-{val_acc:.3f}.h5'
SUBMISSION_FILE = 'final.csv'

## Setup Train Data

In [0]:
# Read the cleaned training set and force both question columns to str;
# they are tokenized as text further down.
df_train = pd.read_csv('cleaned_train_data.csv').astype(
    {'question1': str, 'question2': str}
)

In [0]:
# Keep only the row id and the target column.
df_labels = df_train.loc[:, ['id', 'is_duplicate']]

In [0]:
# Precomputed training features; 'id' is dropped so that only feature
# columns are fed to the network.
features = pd.read_csv('final_train_features.csv').drop(columns=['id'])

In [0]:
# Build one vocabulary over both question columns.
questions = pd.concat([df_train[column] for column in ('question1', 'question2')])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)

# Words are encoded as integers starting at 1, and those integers are used as
# row indices into the embedding matrix. The matrix therefore needs one more
# row than there are words (e.g. words encoded 1..21 need indices 0..21,
# i.e. 22 positions).
vocab_size = 1 + len(tokenizer.word_index)

In [0]:
# Integer-encode each question column with the fitted tokenizer...
q1_int_sequence, q2_int_sequence = (
    tokenizer.texts_to_sequences(df_train[column])
    for column in ('question1', 'question2')
)

# ...then bring every sequence to the fixed length SEQ_LENGTH.
q1_padded, q2_padded = (
    pad_sequences(int_sequence, maxlen=SEQ_LENGTH)
    for int_sequence in (q1_int_sequence, q2_int_sequence)
)

In [0]:
# Load the GloVe embeddings into a word -> vector lookup.
# The file is opened in a `with` block so the handle is closed even if parsing
# a line fails, and is decoded explicitly as UTF-8 instead of relying on the
# platform default encoding.
embeddings_index = dict()
with open(GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <w1> <w2> ... <wN>".
        embedding = line.split()
        word_key = embedding[0]
        word_weights = np.asarray(embedding[1:], dtype='float32')
        embeddings_index[word_key] = word_weights

# Create the embedding matrix for the words contained in the dataset.
# Row i holds the vector of the word the tokenizer encoded as i; rows for
# words that have no GloVe vector (and row 0) stay all-zero.
# The width comes from EMBEDDING_SIZE so it always agrees with the Embedding
# layer that consumes this matrix.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_SIZE))
for word, i in tokenizer.word_index.items():
    embedding = embeddings_index.get(word)
    if embedding is not None:
        embedding_matrix[i] = embedding

## Create the Neural Network

In [0]:
# Siamese part: a single embedding layer and a single bidirectional LSTM are
# instantiated once and applied to both questions, so both branches share weights.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_SIZE,
    weights=[embedding_matrix],  # initialised from the GloVe matrix
    input_length=SEQ_LENGTH,
    trainable=False,             # embedding weights are kept frozen
)
lstm_layer = Bidirectional(LSTM(units=LSTM_UNITS, activation='tanh'))

# Question 1 branch
q1_input = Input(shape=(SEQ_LENGTH,))
q1_model = lstm_layer(embedding_layer(q1_input))

# Question 2 branch
q2_input = Input(shape=(SEQ_LENGTH,))
q2_model = lstm_layer(embedding_layer(q2_input))

# Separate input for the precomputed features, followed by one dense layer.
features_layer = Input(shape=(features.shape[1],), dtype='float32')
features_dense = Dense(units=FEAT_DENSE_OUTPUT, activation=FEAT_ACTIVATION)(features_layer)

In [0]:
# Merge both question encodings, their element-wise difference and product,
# and the dense features into one vector.
merged_subtract = subtract([q1_model, q2_model])
merged_multiply = multiply([q1_model, q2_model])
merged_model = concatenate(
    [q1_model, q2_model, merged_subtract, merged_multiply, features_dense]
)

# Three identical Dropout -> BatchNormalization -> Dense stanzas.
for _stanza in range(3):
    merged_model = Dropout(DROPOUT_RATE)(merged_model)
    merged_model = BatchNormalization()(merged_model)
    merged_model = Dense(MERGED_DENSE_OUTPUT, activation=MERGED_DENSE_ACTIVATION)(merged_model)

# Single sigmoid unit as the output.
merged_model = Dense(1, activation='sigmoid')(merged_model)

In [0]:
# Wire the three inputs to the sigmoid output and compile the model.
model = Model(
    inputs=[q1_input, q2_input, features_layer],
    outputs=merged_model,
)
model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=['accuracy'])

In [0]:
# Display the summary of the assembled model to inspect its architecture.
model.summary()

## Train the Neural Network

### Train-validation split

In [0]:
# Split the question sequences, the features and the labels in ONE call.
# train_test_split partitions every array it is given with the same row
# indices, so the four outputs are aligned by construction rather than by
# repeating the call with the same random state. It also raises if the inputs
# do not all have the same number of rows.
rng_state = 42

y = df_labels['is_duplicate']
(Q1_train, Q1_test,
 Q2_train, Q2_test,
 feat_train, feat_test,
 y_train, y_test) = train_test_split(
    q1_padded, q2_padded, features, y,
    test_size=0.1, random_state=rng_state,
)

### Train

In [0]:
# Stop when validation stops improving, and keep a checkpoint file of the best
# model by validation loss.
early_stopping = EarlyStopping(patience=10, restore_best_weights=True)
checkpoint = ModelCheckpoint(CHECKPOINT_FILE, monitor='val_loss', save_best_only=True)
callbacks = [early_stopping, checkpoint]

# A tenth of the training rows is held out as the validation set.
history = model.fit(
    [Q1_train, Q2_train, feat_train],
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    validation_split=0.1,
)

In [0]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(history.history['acc'])
ax.plot(history.history['val_acc'])

ax.set_title('Training Performance')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['Train Accuracy', 'Validation Accuracy'], loc='upper left')
plt.show()

### Load the best checkpoint and test it
The checkpoint callback saves the best model according to validation loss, but the `model` variable may still hold the weights from the last epoch.

We load the best checkpoint and then evaluate it on the held-out test split.

In [0]:
# Locate the best checkpoint instead of relying on a hand-edited file name.
# Checkpoints are written with save_best_only=True, i.e. only when val_loss
# improves, so the most recently written file matching CHECKPOINT_FILE is the
# best model of the latest training run.
checkpoint_pattern = re.sub(r'\{[^}]*\}', '*', CHECKPOINT_FILE)  # template fields -> wildcards
checkpoint_paths = glob.glob(checkpoint_pattern)
if not checkpoint_paths:
    raise FileNotFoundError(
        'No checkpoint matching %r was found; run the training cell first.' % checkpoint_pattern
    )
best_checkpoint = max(checkpoint_paths, key=os.path.getmtime)

# Evaluate the best model on the held-out test split.
model = load_model(best_checkpoint)
model.evaluate([Q1_test, Q2_test, feat_test], y_test)

## Predict the Kaggle test set

In [0]:
# Read and prepare the test set.
# The question columns are cast to str exactly as for the training data, so
# that missing values do not reach the tokenizer as non-string objects.
df_test = pd.read_csv('test_data.csv')
df_test = df_test.astype({'question1': str, 'question2': str})

# Tokenize, sequence and pad the texts with the tokenizer fitted on the
# training questions.
test_q1_seq = tokenizer.texts_to_sequences(df_test['question1'])
test_q2_seq = tokenizer.texts_to_sequences(df_test['question2'])

# Use SEQ_LENGTH (not a literal) so the test inputs always match the length
# the model was built with.
test_q1_padded = pad_sequences(test_q1_seq, maxlen=SEQ_LENGTH)
test_q2_padded = pad_sequences(test_q2_seq, maxlen=SEQ_LENGTH)

In [0]:
# Precomputed features for the Kaggle test set.
test_features = pd.read_csv('final_test_features.csv')

In [0]:
# Predict
# `test_features_merged` was referenced here without being defined anywhere in
# the notebook, which raises NameError on a fresh kernel. It is now derived
# from `test_features`, mirroring the training side where the id column is
# dropped before the features are fed to the network.
# NOTE(review): assumes the test feature file holds the same feature columns
# as the training one plus at most an 'id'/'test_id' column — confirm.
test_features_merged = test_features.drop(columns=['id', 'test_id'], errors='ignore')
assert test_features_merged.shape[1] == features.shape[1], (
    'Test features have %d columns but the model expects %d'
    % (test_features_merged.shape[1], features.shape[1])
)

predicted = model.predict([test_q1_padded, test_q2_padded, test_features_merged])

In [0]:
# Start the submission from the test ids and attach the model output to it.
submission = df_test[['test_id']].copy()
submission['is_duplicate'] = predicted

In [0]:
# Turn the predicted scores into hard 0/1 labels with a 0.5 cut-off.
# Both masks are computed from the raw scores before anything is overwritten.
below_cutoff = submission.is_duplicate < 0.5
at_or_above_cutoff = submission.is_duplicate >= 0.5
submission.loc[below_cutoff, 'is_duplicate'] = 0
submission.loc[at_or_above_cutoff, 'is_duplicate'] = 1
submission['is_duplicate'] = submission['is_duplicate'].astype(int)

# Write the submission file without the DataFrame index.
submission.to_csv(SUBMISSION_FILE, index=False)