In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import pandas as pd
import numpy as np
import string
import re
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from keras.models import Sequential, Model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import CuDNNLSTM, CuDNNGRU
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import SpatialDropout1D
from keras.layers import Input, Conv1D, MaxPooling1D, concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Quick sanity check of the data directory; every file read in the later cells lives under ../input.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
def f1(y_true, y_pred):
    """Batch-wise F1 score, intended for use as a metric while training.

    Both tensors are clipped to [0, 1] and rounded before counting, so the
    counts are based on hard 0/1 decisions. The value is computed from the
    current batch only, not from the whole epoch.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        Scalar tensor: 2 * precision * recall / (precision + recall), with
        K.epsilon() added to every denominator to avoid division by zero.
    """
    # Counts shared by precision and recall.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    eps = K.epsilon()
    # Precision: how many of the selected items are relevant.
    batch_precision = hits / (flagged_positives + eps)
    # Recall: how many of the relevant items are selected.
    batch_recall = hits / (actual_positives + eps)

    return 2*((batch_precision*batch_recall)/(batch_precision+batch_recall+eps))

def calc_ratio(p):
    """Convert a positive-class fraction into a positive:negative ratio.

    Args:
        p: fraction of the resampled dataset for which y = 1.

    Returns:
        The ratio of y=1 to y=0, i.e. p divided by (1 - p).
    """
    negative_fraction = 1 - p
    return p / negative_fraction

def pretoken(X):
    """Normalise raw question text into lists of lowercase word tokens.

    Args:
        X: pandas Series of strings.

    Returns:
        Series with the same index whose values are lists of tokens:
        punctuation removed, lowercased, split on whitespace.
    """
    def _to_tokens(text):
        # Drop everything that is neither a word character nor whitespace.
        without_punct = re.sub(r'[^\w\s]','',text)
        return without_punct.lower().split()

    return X.map(_to_tokens)

def tokenize_padding(X, max_length, token):
    """Encode texts as integer sequences and pad them to a fixed length.

    Args:
        X: texts to encode, passed straight to token.texts_to_sequences.
        max_length: target sequence length handed to pad_sequences.
        token: fitted tokenizer providing texts_to_sequences.

    Returns:
        The result of pad_sequences with padding applied at the end
        ('post') of each sequence.
    """
    sequences = token.texts_to_sequences(X)
    return pad_sequences(sequences, maxlen=max_length, padding='post')
    
def train_preproc(X, max_length):
    """Prepare the training texts and fit the tokenizer on them.

    Args:
        X: pandas Series of raw training strings.
        max_length: fixed sequence length to pad to.

    Returns:
        Tuple (encoded, token, vocab_size):
          encoded    -- padded integer sequences for X,
          token      -- the Tokenizer fitted on X, to be reused for dev/test,
          vocab_size -- len(token.word_index) + 1, which leaves room for
                        index 0 in addition to every word index.
    """
    X = pretoken(X)
    token = Tokenizer()
    # Fit on the pre-tokenised argument itself. Fitting on a notebook-level
    # variable would build the vocabulary from differently-normalised text than
    # the sequences encoded below, and would fail when no such global exists.
    token.fit_on_texts(X)
    X = tokenize_padding(X,max_length,token)
    return X, token, len(token.word_index) + 1

def test_preproc(X, max_length, token):
    # function to prepare the dev and test data sets
    X = pretoken(X)
    X = tokenize_padding(X,max_length,token)
    return X

def cutoff(x, frac):
    """Return 1 when x is strictly greater than frac, otherwise 0."""
    return 1 if x > frac else 0

In [None]:
# Importing the data
# train.csv supplies the 'question_text' and 'target' columns used for the train/dev split;
# test.csv supplies 'question_text' for prediction and 'qid' for the submission file.
trainData = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

In [None]:
# Importing the glove word embeddings into a dict
def get_coefs(word, *args):
    """Turn one split embedding line into a (word, vector) pair.

    Args:
        word: the token, i.e. the first field of the line.
        *args: the remaining fields, each convertible to a float.

    Returns:
        Tuple of the word and a float32 numpy array built from args.
    """
    vector = np.asarray(args, dtype='float32')
    return word, vector
# Parse every line of the GloVe text file into a {word: vector} dict. The
# context manager closes the file handle as soon as parsing is done instead of
# leaving it open for the rest of the kernel session.
with open('../input/embeddings/glove.840B.300d/glove.840B.300d.txt', encoding = 'utf8') as glove_file:
    embeddings_index = dict(get_coefs(*line.split(" ")) for line in glove_file)

# Calculating the mean and std of the word embeddings
# np.stack requires a real sequence; a dict view is rejected by recent NumPy
# releases, so materialise the vectors into a list first.
glove_matrix = np.stack(list(embeddings_index.values()))
glove_mean = glove_matrix.mean()
glove_std = glove_matrix.std()

In [None]:
# Hold out 20% of the labelled questions as a dev set (fixed random_state so the
# split is repeatable), then encode the train, dev and test questions.
max_sen_length = 30 # arbitrary. to be tuned later
X_train, X_dev, y_train, y_dev = train_test_split(
    trainData['question_text'],
    trainData['target'],
    test_size=0.2,
    random_state=93,
)

# The tokenizer comes from the training call and is reused unchanged for dev and test.
X_train, token, vocab_size = train_preproc(X_train, max_sen_length)
X_dev = test_preproc(X_dev, max_sen_length, token)
X_test = test_preproc(test['question_text'], max_sen_length, token)

In [None]:
# Create a weight matrix for words
# Every row starts as a random draw matching the GloVe mean/std, so words
# without a pre-trained vector keep a plausible random embedding.
embedding_matrix = np.random.normal(glove_mean, glove_std, [vocab_size,300])
embedding_dim = embedding_matrix.shape[1]
for word, i in token.word_index.items():
    # Explicit lookup instead of a bare `except`, which would also hide
    # unrelated failures (including KeyboardInterrupt).
    vector = embeddings_index.get(word)
    # Skip missing words and any entry whose vector is not the expected width.
    if vector is not None and vector.shape == (embedding_dim,):
        embedding_matrix[i] = vector

In [None]:
# 2x LSTM Model
# Two stacked bidirectional LSTMs over the frozen (trainable = False) embedding
# layer, flattened and fed through a small dense head with dropout.
lstm_layers = [
    Embedding(vocab_size, 300, weights = [embedding_matrix], input_length = max_sen_length, trainable = False),
    SpatialDropout1D(0.2),
    Bidirectional(CuDNNLSTM(100, return_sequences = True)),
    Bidirectional(CuDNNLSTM(100, return_sequences = True)),
    Flatten(),
    Dense(16, activation = 'relu'),
    Dropout(0.5),
    Dense(8, activation = 'relu'),
    Dropout(0.5),
    Dense(1, activation = 'sigmoid'),
]
model2 = Sequential(lstm_layers)

opt = keras.optimizers.Adam(lr = 1e-3, decay = 1e-5)
model2.compile(loss = 'binary_crossentropy', optimizer = opt, metrics = [f1,'accuracy'])
model2.fit(
    X_train,
    y_train,
    batch_size = 256,
    epochs = 5,
    validation_data = (X_dev,y_dev),
    verbose = 2,
)

In [None]:
# CNN model with various filters and max pooling
# One Conv1D branch per kernel size runs over the same embedded input; each
# branch is max-pooled over time, flattened, and the branches are concatenated
# before the dense head.
filter_size = [1,2,3,4]
conv_lst = []
pool_lst = []
flat_lst = []

model_input = Input(shape=(max_sen_length,), dtype='int32', name = 'model_input')
emb_layer = Embedding(vocab_size, 300, weights = [embedding_matrix], input_length = max_sen_length, trainable = False)(model_input)
x = SpatialDropout1D(0.4)(emb_layer)
for kernel in filter_size:
    conv = Conv1D(filters = 64, kernel_size = kernel, padding = 'same', name = 'conv'+str(kernel))(x)
    # padding='same' keeps all max_sen_length time steps, so the pool window has
    # to span the full length; a window of max_sen_length+1-kernel would leave
    # the last kernel-1 steps out of the max.
    pooled = MaxPooling1D(pool_size = max_sen_length)(conv)
    conv_lst.append(conv)
    pool_lst.append(pooled)
    flat_lst.append(Flatten()(pooled))
x = concatenate(flat_lst)
x = Dense(16, activation = 'relu')(x)
x = Dropout(0.5)(x)
# Continue from x: wiring this layer to a single branch would bypass the
# concatenation, Dense(16) and Dropout above.
x = Dense(8, activation = 'relu')(x)
model_output = Dense(1, activation = 'sigmoid')(x)
opt = keras.optimizers.Adam(lr = 1e-3, decay = 1e-5)

model = Model(inputs=model_input, outputs = model_output)
model.compile(loss = 'binary_crossentropy', optimizer = opt, metrics = [f1,'accuracy'])
model.fit(X_train, y_train, epochs = 3, batch_size=256, validation_data = (X_dev,y_dev), verbose = 2)

In [None]:
# ensemble the 2 models
# Weighted average of the two models' dev-set scores: ensemble_p for the CNN
# (`model`), the remainder for the Bi-LSTM (`model2`).
ensemble_p = 0.3 # magic number first, will tune later on
cnn_dev_scores = model.predict(X_dev)
lstm_dev_scores = model2.predict(X_dev)
y_dev_predict = pd.DataFrame(ensemble_p*cnn_dev_scores + (1-ensemble_p)*lstm_dev_scores)

# estimating the cut off level to determine if the question is insincere
# Try thresholds 0.1, 0.2, ..., 0.6 and record the dev-set F1 of each.
f1_lst = []
for i in range(1,7):
    threshold = i/10
    dev_labels = y_dev_predict[0].map(lambda x : cutoff(x, threshold))
    f1_lst.append(f1_score(y_dev, dev_labels))

# argmax is 0-based while the thresholds start at 0.1, hence the +1.
ensemble_cutoff = (np.argmax(f1_lst) + 1)/10

In [None]:
# calculating the predictions for the test set
# Same weighted blend as on the dev set, applied to the test questions.
test_scores = ensemble_p*model.predict(X_test) + (1-ensemble_p)*model2.predict(X_test)
y_pred = pd.DataFrame(test_scores)

# preparing submission file
# Scores above the chosen cut-off become 1, everything else 0.
test_labels = y_pred[0].map(lambda x : cutoff(x, ensemble_cutoff))
my_submission = pd.DataFrame({'qid': test.qid, 'prediction': test_labels})
my_submission.to_csv('submission.csv',index=False)

This blended model (previous runs obtained a score of about 0.66) does not perform as well as the previous 2x Bi-LSTM with input and recurrent dropout (public score of 0.672).

**Next steps**

1. Implement an attention mechanism for the LSTM models.
2. Blend the GloVe word embedding with the 3 other embeddings available. Two possible approaches: take the simple average of the embeddings, or implement a method to learn the weights.

Hyperparameter tuning can be done later. The current priority is to learn, build and test as many different models, architectures and mechanisms as possible.