In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the names of the entries in that directory.

import os
print(os.listdir("../input"))

# Deep Learning libraries
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.layers import Dropout, LSTM
from keras.callbacks import ModelCheckpoint

In [None]:
# Print the names of the entries inside the embeddings input directory.
print(os.listdir("../input/embeddings"))

In [None]:
# Load the train and test CSVs plus the sample submission file.
# The cells below read the "question_text" column of train/test and the "target"
# column of train; `sub` later receives a "prediction" column and is written out
# as submission.csv.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
sub = pd.read_csv('../input/sample_submission.csv')

In [None]:
# Sizes and hyper-parameters shared by the cells below.
ngram_range = 1  # NOTE(review): not referenced anywhere else in the visible notebook
max_features = 50000  # passed to Tokenizer(num_words=...)
maxlen = 50  # length every token sequence is padded/truncated to
batch_size = 32  # NOTE(review): unused in the visible cells; model.fit hard-codes batch_size=128
embedding_dims = 300  # width of the embedding matrix and of the Embedding layer
epochs = 4  # number of epochs passed to model.fit

In [None]:
# Pull the question text and the labels out of the frames as arrays.
# NOTE(review): x_train / x_test are captured before any of the cleaning cells run
# and are not referenced again in the visible notebook; y_train is what gets
# passed to model.fit and to the F1 check further down.
x_train = train["question_text"].values
x_test = test["question_text"].values
y_train = train["target"].values

In [None]:
from gensim.models import KeyedVectors

# Load the pretrained GoogleNews vectors from their binary word2vec-format file.
# `embeddings_index` is used below for `word in ...` membership tests and
# `...[word]` vector lookups while the embedding matrix is built.
news_path = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [None]:
def clean_text(x):
    """Normalise punctuation in a single piece of text.

    The input is converted with ``str`` first. Then, character by character:
    ``/``, ``-`` and ``'`` become a space, ``&`` is padded with a space on each
    side, and every other listed punctuation mark (including the curly quotes)
    is deleted. All remaining characters are left untouched.

    Returns the cleaned string.
    """
    # Build one translation table, lowest priority first: later updates
    # overwrite earlier entries, so "/", "-" and "'" end up mapped to a space
    # even though they also appear in the deletion list.
    table = {ord(punct): '' for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'}
    table.update({ord(punct): f' {punct} ' for punct in '&'})
    table.update({ord(punct): ' ' for punct in "/-'"})
    return str(x).translate(table)

In [None]:
# Run the punctuation clean-up over the question text of both splits,
# overwriting the column in place.
for frame in (train, test):
    frame["question_text"] = frame["question_text"].apply(clean_text)

In [None]:
import re

def clean_numbers(x):
    """Mask runs of digits in ``x`` with ``#`` characters.

    Runs of five or more digits become ``#####``; runs of four, three and two
    digits become the same number of ``#``. A lone digit is left as it is.

    Returns the masked string.
    """
    # Order matters: the longest runs must be masked first so that the shorter
    # patterns only ever see what is left over.
    masking_rules = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for digit_pattern, mask in masking_rules:
        x = re.sub(digit_pattern, mask, x)
    return x

In [None]:
# Mask digit runs in the question text of both splits, overwriting the column in place.
for frame in (train, test):
    frame["question_text"] = frame["question_text"].apply(clean_numbers)

In [None]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Replacement table: each key is the text searched for in a question and each
# value is what that text is rewritten to.
mispell_dict = {
    'colour': 'color',
    'centre': 'center',
    'didnt': 'did not',
    'doesnt': 'does not',
    'isnt': 'is not',
    'shouldnt': 'should not',
    'favourite': 'favorite',
    'travelling': 'traveling',
    'counselling': 'counseling',
    'theatre': 'theater',
    'cancelled': 'canceled',
    'labour': 'labor',
    'organisation': 'organization',
    'wwii': 'world war 2',
    'citicise': 'criticize',
    'instagram': 'social medium',
    'whatsapp': 'social medium',
    'snapchat': 'social medium',
    'facebook': 'social medium',
}

# Keep both the lookup table and the compiled pattern at module level; the
# replacement function below reads them.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Rewrite every occurrence of a known key in ``text`` with its replacement.

    Uses the module-level ``mispellings_re`` pattern to find matches and the
    module-level ``mispellings`` table to look up what each match becomes.

    Returns the rewritten string.
    """
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

In [None]:
# Apply the spelling / brand-name replacements to both splits.
train["question_text"] = train["question_text"].apply(replace_typical_misspell)
test["question_text"] = test["question_text"].apply(replace_typical_misspell)

# Words dropped from every question before the tokenizer is fitted.
to_remove = ['a','to','of','and']

# Split each question on whitespace and drop the words listed above.
# The comparison is case-sensitive, so e.g. "A" or "And" is kept.
train_sentences = [
    [word for word in question.split() if word not in to_remove]
    for question in train["question_text"]
]

test_sentences = [
    [word for word in question.split() if word not in to_remove]
    for question in test["question_text"]
]

In [None]:
from keras.preprocessing.text import Tokenizer
# Fit the vocabulary on the pre-split sentences of train and test together.
# filters='' means the tokenizer strips no extra characters itself;
# lower=True lower-cases every token.
full_text = [*train_sentences, *test_sentences]
tk = Tokenizer(num_words=max_features, lower=True, filters='')
tk.fit_on_texts(full_text)

In [None]:
# Build the embedding weight matrix: row i receives the pretrained vector of
# the word whose tokenizer index is i. Rows that are never assigned (words
# missing from the pretrained vectors) stay all-zero, and `count` tallies how
# many vocabulary words were missing.
embedding_matrix = np.zeros((len(tk.word_index) + 1, embedding_dims))
count = 0
for word, i in tk.word_index.items():
    if word not in embeddings_index:
        count += 1
        continue
    embedding_matrix[i] = embeddings_index[word]

In [None]:
# The vectors that are needed have been copied into embedding_matrix; drop the
# reference to the loaded embeddings so their memory can be reclaimed.
del embeddings_index

In [None]:
# Convert the questions to integer sequences from the same pre-split,
# stop-word-filtered token lists the tokenizer was fitted on. Passing the raw
# "question_text" strings here instead would bypass the stop-word removal and
# re-split the text with the tokenizer's own rule rather than str.split().
train_tokenized = tk.texts_to_sequences(train_sentences)
test_tokenized = tk.texts_to_sequences(test_sentences)

In [None]:
# Display the encoded sequence of one training question (index 100) as a quick sanity check.
train_tokenized[100]

In [None]:
# Bring every sequence to length `maxlen` (pad_sequences defaults decide which
# side is padded/truncated); the results replace the variable-length sequences.
train_tokenized = sequence.pad_sequences(train_tokenized, maxlen=maxlen)
test_tokenized = sequence.pad_sequences(test_tokenized, maxlen=maxlen)

In [None]:
# Embedding layer initialised from the pretrained matrix and frozen
# (trainable=False), sized to match the tokenizer vocabulary.
embedding_layer = Embedding(
    input_dim=len(tk.word_index) + 1,
    output_dim=embedding_dims,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

In [None]:
from keras import backend as K

def f1(y_true, y_pred):
    """F1 score computed with Keras backend ops, usable as a ``metrics`` entry.

    Both inputs are clipped to [0, 1] and rounded before counting, so a
    prediction counts as positive once it rounds to 1. The value is computed
    over whatever tensors are passed in, i.e. batch-wise when Keras calls it
    during training.

    Parameters
    ----------
    y_true : tensor of ground-truth labels.
    y_pred : tensor of predicted scores, same shape as ``y_true``.

    Returns
    -------
    The harmonic mean of precision and recall; ``K.epsilon()`` is added to
    every denominator so empty classes do not divide by zero.
    """
    eps = K.epsilon()

    # Counts of true positives, actual positives and predicted positives.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = hits / (flagged_positives + eps)
    recall = hits / (actual_positives + eps)
    return 2 * ((precision * recall) / (precision + recall + eps))

In [None]:
from keras.layers import Input, Conv2D, MaxPool2D, Reshape, Concatenate, Flatten, BatchNormalization, Dropout
from keras.models import Model

In [None]:
filter_sizes = [1,2,3,4]
num_filters = 64

# Token ids -> frozen embeddings, then add a trailing channel axis so the
# embedded sequence can be fed to Conv2D.
sequence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Reshape((maxlen, embedding_dims, 1))(embedded_sequences)

# One branch per filter height. Each kernel spans the full embedding width,
# and the pooling window covers all maxlen - height + 1 positions the
# convolution produces, leaving a single value per filter.
maxpool_pool = []
for height in filter_sizes:
    branch = Conv2D(num_filters, kernel_size=(height, embedding_dims),
                    kernel_initializer='he_normal', activation='elu')(x)
    branch = MaxPool2D(pool_size=(maxlen - height + 1, 1))(branch)
    maxpool_pool.append(branch)

# Merge the branches, flatten, apply light dropout and finish with one sigmoid unit.
z = Concatenate(axis=1)(maxpool_pool)
z = Flatten()(z)
z = Dropout(0.1)(z)

preds = Dense(1, activation="sigmoid")(z)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])

# Write a checkpoint file only for epochs that improve the monitored
# validation loss; the epoch number is embedded in the file name.
checkpoint = ModelCheckpoint(
    'CNN2D-{epoch:03d}.h5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    verbose=1,
)

# Weight class 1 three times as heavily as class 0 in the loss.
class_weight = {0: 1., 1: 3.}

# happy learning!
model.fit(
    train_tokenized,
    y_train,
    batch_size=128,
    epochs=epochs,
    validation_split=0.01,
    class_weight=class_weight,
    callbacks=[checkpoint],
)

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Sanity-check F1 on the first 50,000 training rows at the 0.47 decision threshold.
# NOTE(review): model.fit was given the full training array with a small
# validation_split, so these leading rows were presumably trained on; treat the
# number as an optimistic in-sample figure, not a validation score.
train_pred = model.predict(train_tokenized[:50000], batch_size = 1024, verbose = 1)
train_predictions = np.squeeze(train_pred>0.47).astype(int)
# scikit-learn's signature is f1_score(y_true, y_pred): labels first, predictions second.
f1_score(y_train[:50000], train_predictions)

In [None]:
# Score the test questions, binarise at the 0.47 threshold and write the
# result into the sample-submission frame before saving it as a CSV.
pred = model.predict(test_tokenized, batch_size=1024, verbose=1)
predictions = (pred > 0.47).squeeze().astype(int)
sub['prediction'] = predictions
sub.to_csv("submission.csv", index=False)

In [None]:
# Display the (cleaned) text of the first ten test questions whose prediction is 1.
test['question_text'].iloc[np.where(predictions==1)[0][:10]].values