In [60]:
import numpy as np
np.random.seed(49)
import pandas as pd

import os
import csv
import codecs
import random
import keras
import sys

from keras.preprocessing.text import Tokenizer
from keras.layers.merge import concatenate, subtract
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Dropout, Embedding
from keras.layers.wrappers import Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.models import Model


Read in training data and append training labels

In [45]:
# Labels live in their own file; load them first so they can be merged in below.
df_labels = pd.read_csv('../data/train_labels.csv')

# Load the training rows, discard the 'is_duplicate' column shipped with them,
# and merge the labels in (pandas joins on the columns the two frames share).
df_train = (
    pd.read_csv('../data/train_data.csv')
    .drop(['is_duplicate'], axis=1)
    .merge(df_labels)
)

### Read in test
test = pd.read_csv('../data/test_data.csv')

In [46]:
###Create train and cross-validation sets
#train, CV = train_test_split(df_train, train_size = 0.8, random_state = 49)

In [47]:
### Pull the individual columns out as standalone Series

### Train: both questions, the label and the row id
train_qs_1, train_qs_2, labels, train_ids = (
    pd.Series(df_train[col])
    for col in ('question1', 'question2', 'is_duplicate', 'id')
)

### Test: both questions and the test row id
test_qs_1, test_qs_2, test_ids = (
    pd.Series(test[col])
    for col in ('question1', 'question2', 'test_id')
)

In [48]:
#Create full lists for text processing:

# astype(str) guarantees every entry is a string before it reaches the tokenizer.
train_q1 = train_qs_1.astype(str).tolist()
train_q2 = train_qs_2.astype(str).tolist()

test_q1 = test_qs_1.astype(str).tolist()
test_q2 = test_qs_2.astype(str).tolist()

# Corpus used to fit the tokenizer: every question from both splits.
# (Previously test question1 was added twice and test question2 was left out,
# so words appearing only in test question2 never entered the vocabulary.)
all_texts = train_q1 + train_q2 + test_q1 + test_q2

In [49]:
###Create word index from Glove
# embeddings_index maps each token in the GloVe file to its float32 vector.
embeddings_index = {}
glove_path = '/Users/laurentiusblindow/Repositories/aml-kaggle/glove.6B.300d.txt'

# The context manager closes the file even when a row fails to parse.
with codecs.open(glove_path, encoding='utf-8') as glove:
    for row in glove:
        # Strip the line terminator so it is not carried into the last value.
        word_dims = row.rstrip().split(' ')
        index = word_dims[0]
        if not index:
            # Blank line: there is no token to record.
            continue
        dims = np.asarray(word_dims[1:], dtype='float32')
        embeddings_index[index] = dims

In [50]:
## Tokenize everything using the Keras Tokenizer

## Fit the tokenizer on the combined corpus
max_tok_words = 100000
tokenizer = Tokenizer(num_words=max_tok_words)
tokenizer.fit_on_texts(all_texts)
word_index = tokenizer.word_index

# Convert each list of question strings into a list of integer sequences
train_seq_1, train_seq_2, test_seq_1, test_seq_2 = [
    tokenizer.texts_to_sequences(texts)
    for texts in (train_q1, train_q2, test_q1, test_q2)
]


In [51]:
# Choose the padding length from the distribution of sequence lengths
full_seq = train_seq_1 + train_seq_2 + test_seq_1 + test_seq_2
seq_lengths = list(map(len, full_seq))

# 99.5th percentile of the lengths, truncated to an int
max_pad_len = int(np.percentile(seq_lengths, 99.5))

In [52]:
# Pad / truncate every sequence list to max_pad_len
padded_train_1, padded_train_2 = [
    pad_sequences(seqs, maxlen=max_pad_len)
    for seqs in (train_seq_1, train_seq_2)
]

padded_test_1, padded_test_2 = [
    pad_sequences(seqs, maxlen=max_pad_len)
    for seqs in (test_seq_1, test_seq_2)
]

In [59]:
# Class weights

# Share of rows labelled as duplicates, and its complement
dup_flags = df_labels['is_duplicate']
dup_weight = float(dup_flags.sum()) / dup_flags.count()
non_weight = 1 - dup_weight

# Class 1 is weighted by the ratio non-duplicates : duplicates; class 0 keeps weight 1
re_weight = non_weight / dup_weight
class_weight = {0: 1., 1: re_weight}

In [53]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i; words without a GloVe entry keep an all-zero row.

index_length = len(word_index)
embedding_matrix = np.zeros((index_length + 1, 300))

for token, row_idx in word_index.items():
    # Only rows that exist in the matrix are filled.
    if row_idx <= index_length:
        glove_vec = embeddings_index.get(token)
        if glove_vec is not None:
            embedding_matrix[row_idx] = glove_vec


In [54]:
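#### Optional: Augment data to contain Q2 vs Q1 swapped - effectively doubling training data
#### NOTE: as written, the second line below would read the already-extended padded_train_1;
#### keep a copy of the original padded_train_1 before enabling this block.

# padded_train_1 = np.concatenate([padded_train_1,padded_train_2], axis = 0)
# padded_train_2 = np.concatenate([padded_train_2,padded_train_1], axis = 0)
# labels = np.concatenate([labels,labels], axis = 0)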

In [55]:
# Embedding layer feeding the LSTM, initialised from embedding_matrix.
# No `trainable` argument is passed here.
embedding_layer = Embedding(
    input_dim=index_length + 1,
    output_dim=300,
    weights=[embedding_matrix],
    input_length=max_pad_len,
)
# TODO: freeze = FALSE

In [56]:
# Hyperparameter search space: candidate values for each setting

# Layer widths
lstm_nodes = list(range(200, 401, 100))
dense_nodes = list(range(100, 301, 100))

# Dropout rates (same candidates for the LSTM and the dense layers)
lstm_drop = [0.1, 0.15, 0.2, 0.25, 0.3]
dense_drop = list(lstm_drop)

# Dense-layer activation and whether to wrap the LSTM in Bidirectional
dense_activation = ['relu', 'sigmoid']
lstm_bidirectional = [True, False]

In [57]:
# Configuration used for this run. The values are fixed here; the trailing
# comments show the random.choice(...) call that would sample each one instead.

# Layer widths
lstm_nodes_choice = 300    # random.choice(lstm_nodes)
dense_nodes_choice = 200   # random.choice(dense_nodes)

# Dropout rates
lstm_drop_choice = 0.2     # random.choice(lstm_drop)
dense_drop_choice = 0.05   # random.choice(dense_drop)

# Activation / architecture switches
dense_activation_choice = 'relu'     # random.choice(dense_activation)
lstm_bidirectional_choice = False    # random.choice(lstm_bidirectional)

In [61]:
# Build the recurrent encoder (wrapped in Bidirectional if picked).
# A single layer instance is applied to both questions below, so both branches
# use the same layer. No dropout / recurrent_dropout is passed in this variant.
lstm_layer = LSTM(lstm_nodes_choice)
if lstm_bidirectional_choice:
    lstm_layer = Bidirectional(lstm_layer)

# Question 1 branch: token ids -> embedding -> encoder
input_1 = Input(shape=(max_pad_len,), dtype='int32')
embedded_1 = embedding_layer(input_1)
q1 = lstm_layer(embedded_1)

# Question 2 branch: same embedding and encoder layers as question 1
input_2 = Input(shape=(max_pad_len,), dtype='int32')
embedded_2 = embedding_layer(input_2)
q2 = lstm_layer(embedded_2)

# Combine the two encodings by subtraction, then batch-normalise
combined_layer = subtract([q1, q2])
batch_norm_merge = BatchNormalization()
combined_layer = batch_norm_merge(combined_layer)

# Single dense layer followed by batch normalisation (no Dropout applied)
dense_hidden = Dense(dense_nodes_choice, activation=dense_activation_choice)
combined_layer = dense_hidden(combined_layer)
batch_norm_dense = BatchNormalization()
combined_layer = batch_norm_dense(combined_layer)

# Prediction layer: one sigmoid unit
prediction_layer = Dense(1, activation='sigmoid')(combined_layer)

# Assemble and compile the model
model = Model(inputs=[input_1, input_2], outputs=prediction_layer)
model.compile(
    loss='binary_crossentropy',
    optimizer='nadam',
    metrics=['acc'],
)

In [63]:
# Fit the model; the returned History object is kept in `logging`
epochs = 20
logging = model.fit(
    [padded_train_1, padded_train_2],
    labels,
    batch_size=1000,
    epochs=epochs,
    validation_split=0.2,
    shuffle=True,
    class_weight=class_weight,
)

Train on 258531 samples, validate on 64633 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
 38000/258531 [===>..........................] - ETA: 26:31:47 - loss: 0.1535 - acc: 0.9533

KeyboardInterrupt: 

In [None]:
# Put the parent directory on the import path before importing `helpers`
# (presumably that is where helpers.py lives - verify against the repo layout).
import sys
sys.path.insert(0, '..')
from helpers import save_model

# Persist the trained model under the directory named after this variant
model_dir = '../models/lstm-nodropout-subtract-1dense/'
save_model(model, model_dir)

In [None]:
# Per-epoch metrics recorded by Keras during fit().
acc = logging.history['acc']
val_acc = logging.history['val_acc']
loss = logging.history['loss']
val_loss = logging.history['val_loss']

# Size the constant columns from the number of epochs actually recorded, not
# from the requested `epochs`: if training stopped early the two differ, and
# columns of unequal length cannot be combined into one DataFrame.
epochs_logged = len(loss)

# Repeat each hyperparameter once per recorded epoch so it lines up with the metrics.
nodes_lstm = [lstm_nodes_choice] * epochs_logged
nodes_dense = [dense_nodes_choice] * epochs_logged

drop_lstm = [lstm_drop_choice] * epochs_logged
drop_dense = [dense_drop_choice] * epochs_logged

activation_dense = [dense_activation_choice] * epochs_logged

# NOTE: this rebinds `lstm_bidirectional` (earlier the list of candidate values);
# the name is kept because the performance table cell reads it.
lstm_bidirectional = [lstm_bidirectional_choice] * epochs_logged

# 1-based epoch numbers
epoch_cnt = range(1, epochs_logged + 1)


In [None]:
# One row per epoch: the run's hyperparameters alongside the training and
# validation metrics for that epoch.
performance_columns = dict(
    nodes_lstm=nodes_lstm,
    nodes_dense=nodes_dense,
    drop_lstm=drop_lstm,
    drop_dense=drop_dense,
    activation_dense=activation_dense,
    lstm_bidirectional=lstm_bidirectional,
    epoch_cnt=epoch_cnt,
    acc=acc,
    val_acc=val_acc,
    loss=loss,
    val_loss=val_loss,
)
performance = pd.DataFrame(performance_columns)

# Empty accumulator that per-run performance tables are appended to
master = pd.DataFrame()

In [None]:
# Add this run's rows to the accumulator with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
master = pd.concat([master, performance], ignore_index=True)

In [None]:
# Write the accumulated performance table next to the saved model (no index column)
performance_csv = "../models/lstm-nodropout-subtract-1dense/performance_20epochs.csv"
master.to_csv(performance_csv, index=False)

In [None]:
# Run the model on the padded test question pairs
test_inputs = [padded_test_1, padded_test_2]
test_predictions = model.predict(test_inputs)

In [None]:
# Pair each test id with the model output (flattened to 1-D) and write it to CSV
test_df = pd.DataFrame(
    dict(test_id=test_ids, nn_out=test_predictions.ravel())
)
test_df.to_csv("test_preds_nodropout_subtract-1dense_20e.csv", index=False)

In [None]:
# Run the model on the padded training question pairs
train_inputs = [padded_train_1, padded_train_2]
train_predictions = model.predict(train_inputs)

In [None]:
# Pair each training id with the model output (flattened to 1-D) and write it to CSV
train_df = pd.DataFrame(
    dict(id=train_ids, nn_out=train_predictions.ravel())
)
train_df.to_csv("preds_nodropout_subtract-1dense_20e.csv", index=False)

In [None]:
# Display the accumulated per-epoch performance table as the cell output
master

In [201]:
# from helpers import save_model
# save_model(model, 'models/lstm-final')