In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show what is available under the read-only input directory.
input_entries = os.listdir("../input")
print(input_entries)


# Any results you write to the current directory are saved as output.

In [None]:
# os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Dense, Dropout, Concatenate, Lambda, Flatten
from keras.layers import GlobalMaxPool1D
from keras.models import Model


import tqdm


# Combinations
This kernel contains a combination of previously tested models. For example, it may be useful to combine pretrained embeddings with ones that were trained on this particular dataset.

# Embeddings

In [None]:
# Every tokenized question is padded/truncated to this many word indices (pad_sequences maxlen, model Input shape).
MAX_SEQUENCE_LENGTH = 60
# Vocabulary cap shared by the Keras Tokenizer, Word2Vec training and the embedding matrices.
MAX_WORDS = 75000
# Vector width of the Word2Vec embeddings trained in this notebook.
EMBEDDINGS_TRAINED_DIMENSIONS = 100
# Vector width of the downloaded pretrained embeddings.
EMBEDDINGS_LOADED_DIMENSIONS = 300

## Custom
Train our own embeddings on the training data

In [None]:
import gensim, logging
from nltk.tokenize import sent_tokenize

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class SentenceGenerator(object):
    """Re-iterable stream of tokenized sentences for gensim's Word2Vec.

    Word2Vec expects every "sentence" to be a list of tokens. Yielding the raw
    sentence string makes gensim iterate over its characters and learn
    character vectors, so each sentence is split into words here.

    Words are produced with Keras' ``text_to_word_sequence`` (lower-casing and
    punctuation filtering with its default settings), which is the same
    normalisation a default ``keras.preprocessing.text.Tokenizer`` applies, so
    the trained vocabulary lines up with ``tokenizer.word_index`` keys.

    Args:
        texts: iterable of raw text strings (one document per item).
    """

    def __init__(self, texts):
        self.texts = texts

    def __iter__(self):
        # Local import keeps the class self-contained; Keras is already a
        # dependency of this notebook.
        from keras.preprocessing.text import text_to_word_sequence

        for text in self.texts:
            for sent in sent_tokenize(text):
                tokens = text_to_word_sequence(sent)
                # Sentences that consist only of punctuation tokenize to [].
                if tokens:
                    yield tokens
 

def train_w2v(texts, epochs=5):
    """Return a Word2Vec model for `texts`, reusing a saved model file if present.

    The file name encodes the embedding width, the epoch count and the
    vocabulary cap; when a file with that name exists in the working directory
    it is loaded instead of training. Otherwise a model is trained on the
    sentences produced by ``SentenceGenerator(texts)`` and saved under that name.

    Args:
        texts: iterable of raw text strings.
        epochs: number of training passes over the corpus.

    Returns:
        The loaded or freshly trained ``gensim.models.Word2Vec`` instance.
    """
    model_path = (
        "quora_w2v"
        f"_{EMBEDDINGS_TRAINED_DIMENSIONS}dimenstions"
        f"_{epochs!s}epochs"
        f"_{MAX_WORDS}words"
        ".model"
    )

    # Cache hit: skip training entirely.
    if os.path.isfile(model_path):
        w2v = gensim.models.Word2Vec.load(model_path)
        print("Word2Vec loaded from " + model_path)
        return w2v

    w2v = gensim.models.Word2Vec(
        SentenceGenerator(texts),
        size=EMBEDDINGS_TRAINED_DIMENSIONS,
        workers=4,
        max_final_vocab=MAX_WORDS,
        iter=epochs,
    )
    w2v.save(model_path)
    print("Word2Vec saved to " + model_path)
    return w2v

## Pretrained
Load (one of) the embeddings

In [None]:
def load_embeddings(file, encoding="utf-8"):
    """Parse a text-format embedding file into a ``{word: vector}`` dict.

    Each data line is expected to look like ``word v1 v2 ... vn`` with single
    spaces as separators. Lines of 100 characters or fewer (newline included)
    are skipped, which drops short non-data lines such as a ``<count> <dim>``
    header.

    Args:
        file: path of the embedding text file.
        encoding: text encoding used to read the file. It is passed explicitly
            because ``open()`` otherwise falls back to a locale-dependent
            default, which can fail on non-ASCII words.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array. If a word
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a data line contains a non-numeric coefficient.
    """
    embeddings = {}
    with open(file, encoding=encoding) as f:
        for line in f:
            if len(line) <= 100:
                continue
            # Drop the line terminator so it is not glued to the last coefficient.
            word, *coefs = line.rstrip("\n").split(" ")
            embeddings[word] = np.asarray(coefs, dtype='float32')

    print('Found %s word vectors.' % len(embeddings))
    return embeddings

# Data
Load the data.

In [None]:
# Later cells read the "question_text" column of both frames, plus "target" (train) and "qid" (test).
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")

In [None]:
BATCH_SIZE = 512
# Fraction of the training rows to work on; 1 uses (and shuffles) all of them.
Q_FRACTION = 1
# Fixed seed so the shuffle below is repeatable. The training cell uses
# validation_split, which holds out the tail of the data, so an unseeded
# shuffle would change the validation set on every run.
RANDOM_SEED = 42

questions = df_train.sample(frac=Q_FRACTION, random_state=RANDOM_SEED)
# Missing texts get the same placeholder in train and test.
question_texts = questions["question_text"].fillna("_na_").values
question_targets = questions["target"].values
test_texts = df_test["question_text"].fillna("_na_").values

print(f"Working on {len(questions)} questions")

In [None]:
# The vocabulary is learned from every training question, independent of the sampled subset.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df_train["question_text"].tolist())

In [None]:
# Word2Vec vectors fitted on the sampled questions (train_w2v reuses a saved model file when one exists).
custom_embeddings = train_w2v(question_texts, epochs=5)
# GloVe vectors parsed from the text file into a {word: vector} dict.
pretrained_embeddings = load_embeddings("../input/embeddings/glove.840B.300d/glove.840B.300d.txt")


In [None]:
from collections import defaultdict

def create_embedding_weights(tokenizer, embeddings, dimensions, max_words=None):
    """Build the initial weight matrix for a Keras ``Embedding`` layer.

    Row ``i`` holds the vector of the word whose tokenizer index is ``i``.
    Rows of words without a vector (and row 0) stay all-zero.

    Args:
        tokenizer: object exposing ``word_index`` (word -> integer index), as a
            fitted Keras ``Tokenizer`` does.
        embeddings: mapping-like object supporting ``word in embeddings`` and
            ``embeddings[word]``.
        dimensions: width of every embedding vector.
        max_words: vocabulary cap; words with an index at or above it are
            ignored. Defaults to the module-level ``MAX_WORDS``.

    Returns:
        ``numpy.ndarray`` of shape
        ``(min(len(word_index) + 1, max_words), dimensions)``.

    Side effects:
        Prints up to ten words that had no vector, in ``word_index`` iteration
        order.
    """
    if max_words is None:
        max_words = MAX_WORDS

    word_index = tokenizer.word_index
    # Keras Tokenizer indices start at 1 (0 is never assigned to a word), so
    # the largest index equals len(word_index) and needs its own row.
    words_count = min(len(word_index) + 1, max_words)
    embeddings_matrix = np.zeros((words_count, dimensions))

    not_embedded = []
    for word, i in word_index.items():
        if i >= max_words:
            continue
        if word not in embeddings:
            not_embedded.append(word)
            continue
        embedding_vector = embeddings[word]
        if embedding_vector is not None:
            embeddings_matrix[i] = embedding_vector

    print(not_embedded[:10])
    return embeddings_matrix

In [None]:
# One weight matrix per embedding source; row i holds the vector for tokenizer index i.
custom_emb_weights = create_embedding_weights(tokenizer, custom_embeddings, EMBEDDINGS_TRAINED_DIMENSIONS)
pretrained_emb_weights = create_embedding_weights(tokenizer, pretrained_embeddings, EMBEDDINGS_LOADED_DIMENSIONS)

# Model
Construct the model to use, e.g. a simple NN

In [None]:
from keras.layers import Conv2D, Reshape, MaxPool2D

# Heights (in tokens) of the convolution windows used by create_model.
filter_sizes = [1,2,3,5]
# Number of filters in each convolution layer.
num_filters = 42

def _frozen_embedding(tokens, weights):
    """Look up `tokens` in a non-trainable Embedding initialised from `weights`."""
    # Take the layer shape from the matrix itself so the two can never disagree.
    vocab_size, dimensions = weights.shape
    return Embedding(vocab_size,
                     dimensions,
                     weights=[weights],
                     trainable=False)(tokens)


def _multi_width_conv_features(embedded, dimensions):
    """Apply one full-width Conv2D per entry of `filter_sizes`, max-pool each, and flatten."""
    # Trailing channel axis: Conv2D needs (rows, cols, channels).
    embedded = Reshape((MAX_SEQUENCE_LENGTH, dimensions, 1))(embedded)
    convs = [
        Conv2D(num_filters, kernel_size=(size, dimensions),
               kernel_initializer='he_normal', activation='tanh')(embedded)
        for size in filter_sizes
    ]
    # A window of height `size` fits MAX_SEQUENCE_LENGTH - size + 1 times, so
    # this pool spans the whole conv output and keeps one value per filter.
    pooled = [
        MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - size + 1, 1))(conv)
        for size, conv in zip(filter_sizes, convs)
    ]
    merged = Concatenate(axis=1)(pooled)
    return Flatten()(merged)


def create_model():
    """Build and compile the two-branch CNN classifier.

    The same tokenized input feeds two frozen Embedding layers, one initialised
    from ``custom_emb_weights`` and one from ``pretrained_emb_weights``. Each
    embedded sequence goes through its own set of convolutions (one per entry
    of ``filter_sizes``, ``num_filters`` filters each) followed by max pooling.
    The two flattened feature vectors are concatenated, passed through dropout
    and a single sigmoid unit.

    Note: ``Concatenate`` needs at least two inputs, so ``filter_sizes`` must
    contain at least two entries.

    Returns:
        The compiled ``keras.models.Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric). The model summary is printed as a side
        effect.
    """
    tokenized_input = Input(shape=(MAX_SEQUENCE_LENGTH,), name="tokenized_input")

    trained = _frozen_embedding(tokenized_input, custom_emb_weights)
    pretrained = _frozen_embedding(tokenized_input, pretrained_emb_weights)

    trained = _multi_width_conv_features(trained, custom_emb_weights.shape[1])
    pretrained = _multi_width_conv_features(pretrained, pretrained_emb_weights.shape[1])

    x = Concatenate(axis=1)([pretrained, trained])
    x = Dropout(0.7)(x)
    out = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=[tokenized_input], outputs=out)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model


# Model evaluation




In [None]:
import sklearn
import keras
import matplotlib.pyplot as plt

# Predicted probabilities above this value are labelled 1 (used for the validation F1 and the submission).
THRESHOLD = 0.35

class F1EpochCallback(keras.callbacks.Callback):
    """Compute the validation F1 score at ``THRESHOLD`` after every epoch.

    The scores are printed and collected in ``self.f1s`` (one entry per epoch,
    reset at the start of each training run).

    NOTE(review): relies on Keras populating ``self.validation_data`` as
    ``[inputs, targets, ...]`` for a single-input model trained with validation
    data -- confirm against the installed Keras version.
    """

    def on_train_begin(self, logs=None):
        self.f1s = []

    def on_epoch_end(self, epoch, logs=None):
        # A bare `import sklearn` does not guarantee that the `metrics`
        # submodule is loaded, so import the function explicitly.
        from sklearn.metrics import f1_score

        val_inputs = self.validation_data[0]
        val_targets = self.validation_data[1]

        probabilities = self.model.predict(val_inputs)
        # Flatten both sides to 1-D label vectors before scoring.
        predictions = (probabilities > THRESHOLD).astype(int).ravel()
        f1 = f1_score(np.ravel(val_targets), predictions)

        print(f"validation_f1: {f1}")
        self.f1s.append(f1)
    
def display_model_history(history):
    """Plot the per-epoch training and validation loss.

    Args:
        history: object whose ``history`` dict has ``'loss'`` and ``'val_loss'``
            entries, such as the value returned by ``model.fit``. The
            ``'val_loss'`` curve is the one labelled "Test" in the legend.
    """
    for series_name in ('loss', 'val_loss'):
        plt.plot(history.history[series_name])
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.show()

def display_model_f1(f1_callback):
    """Plot the F1 scores stored in ``f1_callback.f1s``, one point per entry."""
    scores = f1_callback.f1s
    plt.plot(scores)
    plt.title('F1')
    plt.xlabel('Epoch')
    plt.ylabel('F1')
    plt.legend(['F1 score'], loc='upper right')
    plt.show()

# Training
Train the model. Also, experiment with different versions

## Prepare the data first
E.g. the tokenized words as well as the nlp features

In [None]:
# Encode each sampled question as word indices, then pad/truncate to a fixed length.
train_X = pad_sequences(
    sequences=tokenizer.texts_to_sequences(question_texts),
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [None]:
# %%time
model = create_model()
f1_callback = F1EpochCallback()
history = model.fit(
    x=[train_X],
    y=question_targets,
    # Reuse the notebook-level constant instead of repeating the literal.
    batch_size=BATCH_SIZE,
    epochs=20,
    callbacks=[f1_callback],
    # Keras holds out the last 1.5% of the samples for validation.
    validation_split=0.015,
    verbose=2)


In [None]:
# Loss curves from the fit history, then the per-epoch validation F1 collected by the callback.
display_model_history(history)
display_model_f1(f1_callback)

# Results

In [None]:
test_word_tokens = pad_sequences(tokenizer.texts_to_sequences(test_texts),
                       maxlen=MAX_SEQUENCE_LENGTH)

pred_test = model.predict([test_word_tokens], batch_size=1024, verbose=1)
# The model has a single sigmoid unit, so predict returns an (n, 1) array;
# flatten it so it maps onto one DataFrame column without relying on implicit
# 2-D handling.
pred_test = (pred_test > THRESHOLD).astype(int).ravel()

df_out = pd.DataFrame({"qid": df_test["qid"].values, "prediction": pred_test})
df_out.to_csv("submission.csv", index=False)