In [None]:
import os
import time 
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNGRU, Bidirectional, Embedding, Activation, Dropout
from keras.layers import GlobalMaxPool1D, Conv1D
from keras import regularizers, initializers, optimizers, layers, constraints
from keras.models import Model

In [None]:
# Read both competition files in one pass; paths are relative to the notebook.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

# Show (rows, columns) of each frame as a quick sanity check.
print("Train shape -", train_df.shape)
print("Test shape -", test_df.shape)

In [None]:
# Preview the first five training rows (five is the default for head()).
train_df.head()

In [None]:
# Hold out 10% of the labelled rows for validation; the fixed random_state
# makes the split repeatable.
# NOTE: this rebinds `train_df`, so re-running the cell splits the already
# reduced training frame again.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=1993)

# Raw question strings for each split; missing questions become "_na_".
train_X, val_X, test_X = (
    frame["question_text"].fillna("_na_").values
    for frame in (train_df, val_df, test_df)
)

In [None]:
# Shapes of the train/validation text arrays, plus two raw training questions.
print(train_X.shape)
print(val_X.shape)
print(train_X[1:3])

# Class balance of the training split: the sum of `target` is reported as the
# number of "inapt" questions, the non-null count as the total.
target_col = train_df["target"]
inapt_total = target_col.sum()
question_total = target_col.count()

print("Inapt Questions are", inapt_total)
print("Total Questions are", question_total)
print("Fraction of Inapt Questions", round(inapt_total / question_total, 3))

In [None]:
# Text-encoding hyper-parameters shared by the cells below.
vocab_size = 50000   # passed to the Tokenizer as num_words (vocabulary cap)
max_features = 300   # width of a single word-embedding vector
max_length = 100     # number of tokens considered per question

# Build the vocabulary from the training questions only.
t = Tokenizer(num_words=vocab_size)
t.fit_on_texts(train_X.tolist())  # the tokenizer is fed a plain Python list of strings

# Number of documents the tokenizer was fitted on. `t.word_counts`,
# `t.word_index` and `t.word_docs` can be printed the same way when debugging.
print(t.document_count)

In [None]:
# Encode every split with the tokenizer fitted on the training text, turning
# each question into a list of integer word ids.
# NOTE: the text arrays are rebound to their encoded form, so this cell is
# meant to run exactly once after the tokenizer cell.
train_X, val_X, test_X = (
    t.texts_to_sequences(texts)
    for texts in (train_X, val_X, test_X)
)

# Peek at two encoded questions from the train and the test split.
print(train_X[1:3])
print(test_X[1:3])

In [None]:
# Sequence padding: encoded questions have varying lengths, so bring each one
# to exactly `max_length` ids, giving every split a rectangular array.
train_X, val_X, test_X = (
    pad_sequences(encoded, maxlen=max_length)
    for encoded in (train_X, val_X, test_X)
)

# Target vectors matching the padded train/validation inputs.
train_y, val_y = (frame['target'].values for frame in (train_df, val_df))

# Show one padded training example.
print(train_X[1])

In [None]:
# IPython shell escape: list the contents of the embeddings input directory.
!ls ../input/embeddings/

In [None]:
# Load the pre-trained GloVe vectors and build the embedding matrix that
# initialises the model's Embedding layer.
EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Return ``(word, vector)`` for one parsed line of the embedding file.

    ``word`` is the first space-separated field of the line; ``arr`` holds the
    remaining fields, which are converted to a float32 NumPy array.
    """
    return word, np.asarray(arr, dtype='float32')


# `with` guarantees the file handle is closed once the dict is built, and the
# explicit encoding keeps decoding independent of the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file)

# np.stack needs a real sequence: a dict view is a non-sequence iterable, which
# NumPy deprecated in 1.16 and rejects in later releases.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = t.word_index
# Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of N words
# needs N + 1 rows; the matrix is still capped at `vocab_size` rows.
nb_words = min(vocab_size, len(word_index) + 1)

# Rows start as random draws matching the mean/std of the pre-trained vectors,
# so words missing from the embedding file keep a plausible random vector.
# The width comes from the loaded vectors themselves, so they always fit.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue  # id falls outside the matrix (beyond the vocabulary cap)
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Binary classifier: frozen pre-trained embeddings -> two stacked bidirectional
# GRUs -> max-pool over time -> small dense head with a sigmoid output.
inp = Input(shape=(max_length,))

# The layer's dimensions are read from the weight matrix itself, so the layer
# can never disagree with the weights it is initialised from.
# trainable=False keeps the pre-trained vectors fixed during training.
x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
              weights=[embedding_matrix], trainable=False)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = Bidirectional(CuDNNGRU(32, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)  # collapse the time axis: one vector per question
x = Dense(32, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(16, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(1, activation='sigmoid')(x)  # single sigmoid output per question

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
model.summary()

In [None]:
# Fit the model for two epochs, evaluating on the held-out validation split
# via `validation_data`.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

In [None]:
# Model outputs (sigmoid activations) for the validation split.
Glove_Result = model.predict([val_X], batch_size=1024, verbose=1)

# Sweep decision thresholds from 0.10 to 0.50 in steps of 0.01 and track the
# one with the highest validation F1. The best value is computed rather than
# hand-copied into a comment, because it changes from one training run to the
# next.
best_thresh, best_f1 = None, -1.0
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)  # remove floating-point noise from arange
    f1 = metrics.f1_score(val_y, (Glove_Result > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1 > best_f1:
        best_thresh, best_f1 = thresh, f1

print("Best threshold is {0} with F1 score {1}".format(best_thresh, best_f1))

In [None]:
# Run the trained model over the padded test questions; the raw model outputs
# are kept here and thresholded in a later step.
pred_glove_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)

In [None]:
# Convert the model outputs into 0/1 labels.
# NOTE(review): the 0.38 cut-off is hard-coded; confirm it matches the best
# threshold found on the validation split for the current training run.
pred_test_y = (pred_glove_test_y > 0.38).astype(int)

# Submission frame: one row per test question id with its predicted label.
out_df = pd.DataFrame({"qid": test_df["qid"].values}).assign(prediction=pred_test_y)
out_df.to_csv("submission.csv", index=False)