In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator#
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, Flatten
from keras.optimizers import Adam
from keras.models import Model
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Read the unlabelled test questions and the labelled training questions from the input directory.
test_df = pd.read_csv("../input/test.csv")
train_df = pd.read_csv("../input/train.csv")

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Count the missing values in each column of the training data.
train_df.isna().sum()

In [None]:
# Count the missing values in each column of the test data.
test_df.isna().sum()

In [None]:
# Class balance: number of training rows for each distinct target value.
train_df['target'].value_counts()

In [None]:
# Bar chart of the number of training questions per target value, with each bar's
# count written just above it.
# The column is passed as `x=` explicitly: seaborn's first positional parameter is not
# `x` in every release, whereas the keyword form behaves the same across versions.
ax = sns.countplot(x=train_df['target'])
for p in ax.patches:
    # Bar heights may come back as floats, which the integer format code '{:d}' rejects;
    # '{:.0f}' prints the count without decimals for both ints and floats.
    ax.annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.15, p.get_height()+1))
    

In [None]:
# Separate the training questions by label so their vocabularies can be inspected:
# target 0 -> sincere questions, target 1 -> insincere questions.
sincere_train = train_df.loc[train_df['target'].eq(0)]
insincere_train = train_df.loc[train_df['target'].eq(1)]

In [None]:
# Concatenate every sincere question into one space-separated string.
sincere_text = " ".join(sincere_train.question_text)
# The message reports a word count, so count whitespace-separated tokens;
# len() of the joined string itself would report characters instead.
print ("There are {} words in the total sincere questions.".format(len(sincere_text.split())))

In [None]:
# Concatenate every insincere question into one space-separated string.
insincere_text = " ".join(insincere_train.question_text)
# The message reports a word count, so count whitespace-separated tokens;
# len() of the joined string itself would report characters instead.
print ("There are {} words in insincere questions.".format(len(insincere_text.split())))

In [None]:
# Stop-word set for the word clouds, built from wordcloud's STOPWORDS collection.
stopwords = set(STOPWORDS)
# Word cloud of the sincere questions -- rendering is currently disabled (commented out).
#wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=10000, background_color="white").generate(sincere_text)
#plt.figure()
#plt.imshow(wordcloud, interpolation="bilinear")
#plt.axis("off")
#plt.show()

In [None]:
#wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=1000, background_color="white").generate(insincere_text)
#plt.figure()
#plt.imshow(wordcloud, interpolation="bilinear")
#plt.axis("off")
#plt.show()

In [None]:
# (rows, columns) of the training data before the train/validation split.
train_df.shape

In [None]:
# (rows, columns) of the test data.
test_df.shape

In [None]:
# Hold out 10% of the labelled rows as a validation set; the fixed seed makes the split repeatable.
# NOTE: train_df is rebound to the remaining 90%, so re-running this cell without
# reloading the data splits the already-reduced frame again.
train_df, val_df = train_test_split(
    train_df,
    random_state=144,
    test_size=0.1,
)

In [None]:
# Show the shape of the training split, then the validation split, one per line.
print(train_df.shape, val_df.shape, sep="\n")

In [None]:
## Preprocessing / model configuration
embed_size = 300 # dimensionality of each word embedding vector
max_features = 100000 # vocabulary size: tokenizer word limit and number of rows in the embedding matrix
maxlen = 100 # every question is padded/truncated to this many tokens

## Replace missing question text with the placeholder "##" and extract the raw arrays
train_X, val_X, test_X = (
    frame["question_text"].fillna("##").values
    for frame in (train_df, val_df, test_df)
)
print("before tokenization")
print(train_X.shape, val_X.shape, test_X.shape, sep="\n")

## Build the vocabulary on the training questions only, then map every split to integer sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

train_X, val_X, test_X = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_X, val_X, test_X)
)

print("after tokenization")
print(len(train_X), len(val_X), len(test_X), sep="\n")


In [None]:
## Bring every token sequence to exactly `maxlen` entries
train_X, val_X, test_X = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (train_X, val_X, test_X)
)

print("after padding")
print(len(train_X), len(val_X), len(test_X), sep="\n")

## Labels for the training and validation splits, in the same row order as the frames
val_y = val_df['target'].values
train_y = train_df['target'].values

In [None]:
# Shuffle the training and validation sets with a fixed seed so the order is repeatable.
np.random.seed(2018)
trn_idx = np.random.permutation(len(train_X))
val_idx = np.random.permutation(len(val_X))

# Apply the same permutation to the features and to the labels extracted in the padding
# cell, so every row keeps its own target.
# The labels are deliberately NOT re-read from train_df / val_df here: that would reset
# them to frame order while train_X / val_X stay shuffled from a previous execution, so
# re-running this cell would silently pair questions with the wrong targets.
train_X, train_y = train_X[trn_idx], train_y[trn_idx]
val_X, val_y = val_X[val_idx], val_y[val_idx]

In [None]:
# Network: embedding (no pretrained weights are passed) -> three stacked bidirectional
# CuDNN LSTMs -> dense head with a single sigmoid output.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)

# Recurrent stack; each layer returns the full sequence, so the next layer (and the
# Flatten below) receives one output vector per time step.
for lstm_units in (128, 128, 64):
    x = Bidirectional(CuDNNLSTM(lstm_units, return_sequences=True))(x)

x = Flatten()(x)
x = Dense(64, activation="relu")(x)
x = Dense(1, activation="sigmoid")(x)  # one probability per question

model = Model(inputs=inp, outputs=x)
model.compile(
    optimizer=Adam(lr=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for two epochs in mini-batches of 512, evaluating on the held-out split as validation data.
model.fit(
    train_X,
    train_y,
    validation_data=(val_X, val_y),
    epochs=2,
    batch_size=512,
)

In [None]:
# Model outputs (sigmoid probabilities) for the validation questions.
# The "glove" in the variable name is historical: the model above is built without pretrained embeddings.
val_pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)

In [None]:
# Report the validation F1 score for every cut-off from 0.10 to 0.50 in steps of 0.01.
for thresh in np.round(np.arange(0.1, 0.510, 0.01), 2):
    val_labels = (val_pred_glove_val_y > thresh).astype(int)
    f1 = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1))

In [None]:
# Decision cut-off for turning probabilities into labels
# (presumably picked from the F1 sweep above -- confirm against its output).
threshold = 0.37

In [None]:
# Model outputs (sigmoid probabilities) for the test questions.
# Despite the "val" in the name, these are test-set predictions.
pred_glove_val_y = model.predict([test_X], batch_size=1024, verbose=1)

In [None]:
# Convert the test-set probabilities into 0/1 labels using the `threshold` variable
# defined earlier in the notebook, instead of a second, different hard-coded cut-off,
# so the submission follows the single configured decision rule.
# ravel() flattens the one-column model output to 1-D; tolist() keeps y_pred a plain
# list of Python ints.
y_pred = (pred_glove_val_y.ravel() > threshold).astype(int).tolist()

In [None]:
# Assemble the submission table (question id plus predicted label) and write it to disk.
submission = pd.DataFrame({
    'qid': test_df['qid'],
    'prediction': y_pred,
})
submission.to_csv("submission.csv", index=False)