In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# List the data files under ../input -- the directory the loading cells read from.
# (Previously this listed ".", which does not match the surrounding description.)
print(os.listdir("../input"))


# Any results you write to the current directory are saved as output.

In [None]:
# Load the train and test CSV files and pull out the question strings.
def _load_split(csv_path):
    """Read one CSV split; return the frame and its question texts as a list.

    Missing question texts are replaced by the placeholder string "dieter".
    """
    frame = pd.read_csv(csv_path)
    questions = frame['question_text'].fillna("dieter").tolist()
    return frame, questions


train_df, train_text = _load_split('../input/train.csv')
train_label = train_df['target'].values

test_df, test_text = _load_split('../input/test.csv')

for message in (f'Training size: {len(train_text)}', f'Test size: {len(test_text)}'):
    print(message)

In [None]:
# --- Pre-trained embedding settings ---
# Root folder that holds the embedding files.
GLOVE_DIR = '../input/embeddings'
# Sequences are padded / truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 30
# Upper bound handed to the tokenizer as its vocabulary size.
MAX_NB_WORDS = 40_000
# Length of each GloVe word vector.
EMBEDDING_DIM = 300

# --- CNN model settings ---
# Kernel heights (in tokens) for the convolution branches.
filter_sizes = [1, 2, 3, 5]
# Number of feature maps produced by each convolution.
num_filters = 56
# Dropout rate.
drop = 0.3

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Reshape, Dropout, Concatenate
from keras.layers import Conv2D, MaxPool2D, Embedding, BatchNormalization
from keras.models import Model


from keras import backend as K
# Ask the Keras TensorFlow backend for the available GPUs through a private helper
# (note the leading underscore in the name).
# NOTE(review): the return value is neither stored nor displayed, because this is not the
# last expression of the cell -- presumably meant as a GPU sanity check; confirm.
K.tensorflow_backend._get_available_gpus()

def featurize_text(train_text, test_text):
    """
    Convert the raw train and test questions into padded integer sequences.

    A single Tokenizer (capped at MAX_NB_WORDS words) is fitted on the
    concatenation of both text lists, so train and test share one vocabulary.

    Args:
        train_text: list of training question strings.
        test_text: list of test question strings.

    Returns:
        (x_train, x_test, word_index): the two padded sequence arrays, each
        padded / truncated to MAX_SEQUENCE_LENGTH, and the tokenizer's
        word -> index dictionary.
    """
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(train_text + test_text)

    # Map every question to its list of word indices.
    train_seqs, test_seqs = (
        tokenizer.texts_to_sequences(texts) for texts in (train_text, test_text)
    )

    vocabulary = tokenizer.word_index
    print('Found %s unique tokens.' % len(vocabulary))

    # Bring all sequences to the same fixed length.
    padded = [
        pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
        for seqs in (train_seqs, test_seqs)
    ]

    return padded[0], padded[1], vocabulary

In [None]:
def get_model(word_index):
    """
    Build the multi-branch CNN text classifier on top of pre-trained GloVe vectors.

    The GloVe text file under GLOVE_DIR is parsed into a word -> vector lookup,
    from which an embedding matrix with ``len(word_index) + 1`` rows is filled.
    One Conv2D + MaxPool2D branch is created for every entry of ``filter_sizes``;
    the pooled outputs are concatenated, flattened, passed through dropout and
    fed to a single sigmoid unit.

    Args:
        word_index: dict mapping each word to its integer index
            (as returned by ``featurize_text``).

    Returns:
        An uncompiled keras ``Model`` that takes int32 sequences of length
        MAX_SEQUENCE_LENGTH and outputs one sigmoid value per sequence.

    Notes:
        Reads the module-level settings GLOVE_DIR, EMBEDDING_DIM,
        MAX_SEQUENCE_LENGTH, filter_sizes and num_filters, and the module-level
        ``y_train`` (only for a log line), so those must be defined before calling.
    """
    embeddings_index = {}
    glove_path = os.path.join(GLOVE_DIR, 'glove.840B.300d/glove.840B.300d.txt')
    # Context manager guarantees the (large) file is closed even if parsing fails.
    with open(glove_path, encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            # The last EMBEDDING_DIM fields are the vector; whatever precedes them is
            # the token (which may itself have been split on whitespace).
            word = ''.join(values[:-EMBEDDING_DIM])
            coefs = np.asarray(values[-EMBEDDING_DIM:], dtype='float32')
            embeddings_index[word] = coefs

    print('Found %s word vectors.' % len(embeddings_index))
    # NOTE: y_train is a module-level variable, not a parameter of this function.
    print('Found %s unique labels.' % y_train.shape[1])

    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    print("Creating Model...")
    inputs = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

    embedding = Embedding(input_dim=len(word_index) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                          input_length=MAX_SEQUENCE_LENGTH, trainable=True)(inputs)

    # Add a trailing channel axis so the sequence can be fed to Conv2D.
    reshape = Reshape((MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, 1))(embedding)

    # One convolution per configured filter size. Each kernel spans the full
    # embedding width, so it slides along the token axis only.
    # (The previous hand-unrolled version indexed filter_sizes[2] twice, so the
    # last configured size was never used.)
    conv_outputs = [
        Conv2D(num_filters, kernel_size=(size, EMBEDDING_DIM), padding='valid',
               kernel_initializer='he_normal', activation='elu')(reshape)
        for size in filter_sizes
    ]

    # With 'valid' padding a kernel of height `size` yields
    # MAX_SEQUENCE_LENGTH - size + 1 positions; pooling over all of them keeps
    # the maximum activation per feature map.
    pooled_outputs = [
        MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - size + 1, 1), strides=(1, 1), padding='valid')(conv)
        for size, conv in zip(filter_sizes, conv_outputs)
    ]

    concatenated_tensor = Concatenate(axis=1)(pooled_outputs)
    flatten = Flatten()(concatenated_tensor)
    # NOTE(review): the module-level `drop` setting (0.3) is not used here; the
    # rate is hard-coded to 0.2. Confirm which value is intended.
    dropout = Dropout(0.2)(flatten)
    preds = Dense(1, activation='sigmoid')(dropout)
    model = Model(inputs=inputs, outputs=preds)

    return model

In [None]:
# Turn the raw questions into padded index sequences and encode the labels.
x_train, x_test, w_index = featurize_text(train_text, test_text)
y_train = to_categorical(np.asarray(train_label))

# The raw texts and the training frame are not needed past this point.
del train_text, test_text, train_df

# Sanity check: one label row per training sequence.
assert x_train.shape[0] == y_train.shape[0]

In [None]:
# Build the network from the tokenizer vocabulary (get_model also reads the GloVe file).
model = get_model(w_index)
# Print the layer-by-layer summary of the freshly built model.
model.summary()

In [None]:
from keras.callbacks import Callback
from sklearn.metrics import f1_score
class F1Evaluation(Callback):
    """Keras callback that prints the F1 score on held-out data during training.

    Args:
        validation_data: ``(X_val, y_val)`` pair. ``X_val`` is passed to
            ``model.predict`` and ``y_val`` holds the true labels handed to
            ``f1_score``.
        interval: the score is computed on epochs where
            ``epoch % interval == 0`` (``epoch`` is zero-based).
        threshold: predictions strictly greater than this value are counted
            as the positive class. Defaults to 0.35.

    Raises:
        ValueError: if ``validation_data`` does not unpack into exactly two items
            (this includes the empty default).
    """

    def __init__(self, validation_data=(), interval=1, threshold=0.35):
        # super(Callback, self) would start the lookup *after* Callback and
        # therefore skip Callback.__init__; the zero-argument form runs it.
        super().__init__()

        self.interval = interval
        self.threshold = threshold
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print the F1 score at the end of a matching epoch.

        ``logs`` is accepted for compatibility with the Keras callback
        signature and is not used.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            # Binarize the sigmoid outputs before scoring.
            y_pred = (y_pred > self.threshold).astype(int)
            score = f1_score(self.y_val, y_pred)
            print("\n F1 Score - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split

# Training hyper-parameters.
batch_size = 256
epochs = 4

# Hold out 5% of the training rows for validation; the seed keeps the split repeatable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    train_label,
    test_size=0.05,
    random_state=42,
)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Callbacks used for this run: the per-epoch F1 report and a best-only checkpoint.
# (EarlyStopping and ReduceLROnPlateau are imported above but not wired in here.)
F1_Score = F1Evaluation(validation_data=(X_val, y_val), interval=1)
model_checkpoint = ModelCheckpoint('./quora.model', save_best_only=True, verbose=1)

hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[F1_Score, model_checkpoint],
    verbose=True,
)

In [None]:
# Score the test questions and binarize at the 0.35 cut-off.
y_pred = model.predict(x_test, batch_size=256, verbose=True) > 0.35
# Keep the first (only) output column as a flat list of 0/1 values.
y_pred = list(y_pred.astype(int)[:, 0])

# Pair each question id with its predicted label and write the submission file.
submit_df = pd.DataFrame({"qid": test_df["qid"], "prediction": y_pred})
submit_df.to_csv("submission.csv", index=False)

In [None]:
from IPython.display import HTML
import base64  
import pandas as pd  

def create_download_link(df, title="Download CSV file", filename="data.csv"):
    """Return an HTML anchor that downloads ``df`` as a CSV file.

    The frame is serialized without its index, base64-encoded and embedded in a
    ``data:text/csv`` URI.

    Args:
        df: object exposing ``to_csv`` (a pandas DataFrame).
        title: link text.
        filename: value of the anchor's ``download`` attribute.

    Returns:
        An ``IPython.display.HTML`` object wrapping the anchor markup.
    """
    encoded = base64.b64encode(df.to_csv(index=False).encode()).decode()
    anchor = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(anchor.format(payload=encoded, title=title, filename=filename))

# Build the download link for the submission frame; as the final expression shown in
# this cell, the returned HTML object is what the cell displays.
create_download_link(submit_df)