## Sequential NN

In [None]:
import pandas as pd
import numpy as np
from keras.layers.embeddings import Embedding
from keras.layers import Dropout, BatchNormalization, Input, Embedding, Flatten, concatenate, Dense
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from sklearn.model_selection import train_test_split

In [None]:
# Seed NumPy's global random number generator so NumPy-driven randomness is
# repeatable between runs.
# NOTE(review): this call alone does not seed the Keras backend - confirm
# whether weight initialisation and dropout need their own seed.
np.random.seed(42)

In [None]:
# Read the question pairs and their labels from the two training CSV files.
df_train, df_labels = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data.csv', 'train_labels.csv')
)

In [None]:
# Build one vocabulary from both question columns so that question1 and
# question2 are encoded with the same word ids.
questions = pd.concat([df_train[column] for column in ('question1', 'question2')])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)

In [None]:
# The embedding table is indexed directly by word id, so it needs one row for
# every id from 0 up to the largest id in tokenizer.word_index. With words
# encoded 1..N (e.g. 1..21) that means N + 1 rows (22), hence the "+ 1".

vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Integer-encode both question columns with the fitted tokenizer.
q1_int_sequence, q2_int_sequence = (
    tokenizer.texts_to_sequences(df_train[column])
    for column in ('question1', 'question2')
)

In [None]:
# Bring every encoded question to a fixed length of 25, matching the
# Input(shape=(25,)) layers of the network.
q1_padded, q2_padded = (
    pad_sequences(encoded, maxlen=25)
    for encoded in (q1_int_sequence, q2_int_sequence)
)

In [None]:
# Load the GloVe embeddings into a {word: vector} lookup.
# Each non-empty line is split on whitespace: the first field is the word and
# the remaining fields are parsed as its float32 vector.
embeddings_index = {}
# The context manager closes the file even if parsing raises. The encoding is
# pinned to UTF-8 so decoding does not depend on the platform default
# (presumably the GloVe text files are UTF-8 - verify for other files).
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        embedding = line.split()
        if not embedding:
            # Skip blank lines (e.g. a trailing newline) instead of raising
            # IndexError on embedding[0].
            continue
        word_key = embedding[0]
        word_weights = np.asarray(embedding[1:], dtype='float32')
        embeddings_index[word_key] = word_weights

In [None]:
# Build the (vocab_size, 100) weight table for the Embedding layers: the row
# at a word's id receives that word's GloVe vector. Words that have no GloVe
# entry keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, 100))
for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# Create the network: one identical input branch per question.
def _question_branch():
    """Return (input tensor, flattened embedding tensor) for one question.

    The branch takes 25 word ids, looks them up in a frozen Embedding layer
    initialised from embedding_matrix, and flattens the result into a single
    vector.
    """
    tokens = Input(shape=(25,))
    embedded = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=25, trainable=False)(tokens)
    return tokens, Flatten()(embedded)


q1_input, q1_model = _question_branch()
q2_input, q2_model = _question_branch()

In [None]:
# Classification head: join the two flattened question vectors, pass them
# through one hidden block (Dense -> Dropout -> BatchNormalization), and end
# in a single sigmoid unit.
joined = concatenate([q1_model, q2_model])
hidden = Dense(200, activation='relu')(joined)
hidden = Dropout(0.1)(hidden)
hidden = BatchNormalization()(hidden)

merged_model = Dense(1, activation='sigmoid')(hidden)

In [None]:
# Wrap the graph into a two-input model (question 1 ids, question 2 ids) whose
# output is the single sigmoid unit defined above.
model = Model(inputs=[q1_input,q2_input], outputs=merged_model)

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as an additional metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Stack the two padded arrays along a new axis 1 so that one call to
# train_test_split keeps question 1, question 2 and the label of every pair
# aligned. 10% of the pairs are held out.
y = df_labels['is_duplicate']
X = np.stack((q1_padded, q2_padded), axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Undo the stacking: position 0 on axis 1 is question 1, position 1 is question 2.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]

In [None]:
# Train for 15 epochs with mini-batches of 32 pairs.
# The split held out by train_test_split is passed as validation_data so that
# loss and accuracy on unseen pairs are reported after every epoch and stored
# in history.history; without it the hold-out split is never used and
# over-fitting cannot be detected.
history = model.fit(
    [Q1_train, Q2_train],
    y_train,
    validation_data=([Q1_test, Q2_test], y_test),
    epochs=15,
    verbose=2,
    batch_size=32,
)

In [None]:
# Apply the trained model to the question pairs in test_data.csv

In [None]:
# Load the question pairs to predict on (the 'question1', 'question2' and
# 'test_id' columns are read from it below).
df_test = pd.read_csv('test_data.csv')

In [None]:
# Encode the test questions with the tokenizer that was fitted on the
# training questions, so word ids match the embedding table.
test_q1_seq, test_q2_seq = (
    tokenizer.texts_to_sequences(df_test[column])
    for column in ('question1', 'question2')
)

In [None]:
# Same fixed length of 25 as used for the training sequences.
test_q1_padded, test_q2_padded = (
    pad_sequences(encoded, maxlen=25)
    for encoded in (test_q1_seq, test_q2_seq)
)

In [None]:
# Score every test pair; the model's output layer is a single sigmoid unit, so
# each prediction is a value between 0 and 1.
predicted = model.predict([test_q1_padded, test_q2_padded])

In [None]:
# Start the submission from the test ids (as an independent copy, so df_test
# is not modified) and attach the raw model outputs next to them.
submission = df_test[['test_id']].copy()
submission['is_duplicate'] = predicted

In [None]:
# Turn the predicted values into hard 0/1 labels with a 0.5 cut-off:
# values >= 0.5 become 1, everything else becomes 0.
# A single vectorized comparison replaces the two in-place masked writes and
# the separate cast. Note that a NaN value compares False and therefore
# becomes 0 rather than raising on the integer cast.
submission['is_duplicate'] = (submission['is_duplicate'] >= 0.5).astype(int)

In [57]:
# Preview the first rows of the submission as a sanity check.
submission.head()

Unnamed: 0,test_id,is_duplicate
0,15,1
1,20,0
2,21,0
3,23,0
4,34,0


In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the
# CSV so it only contains the test_id and is_duplicate columns.
submission.to_csv('nn_one_dense_layer.csv', index=False)

In [None]:
# Persist the trained model to 'nn_one_dense_layer.h5' so it can be reloaded
# without retraining.
model.save('nn_one_dense_layer.h5')