Model: Bidirectional LSTM over two types of word embeddings (built from characters using an RNN, and from one-hot word indices, [as in the paper](https://arxiv.org/abs/1604.05529)). Additional features from ["Simple Feature Engg Notebook - Spooky Author"](https://www.kaggle.com/sudalairajkumar/simple-feature-engg-notebook-spooky-author) and the sentence embedding from the LSTM output are used as input for the Dense layer.

LB: around 0.38

In [None]:
import csv
import re
import os
import pickle
import copy
import string
from collections import Counter

import pandas as pd
import numpy as np
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss

from keras.models import Model, load_model
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.layers import LSTM, Bidirectional, Dropout, Dense, Input, Embedding, BatchNormalization, TimeDistributed
from keras.layers.merge import concatenate

# Fixed seed for NumPy's global random generator so that NumPy-based
# randomness in this notebook is repeatable between runs.
# NOTE(review): only NumPy is seeded here; Keras/backend randomness is
# presumably still unseeded - confirm if exact reproducibility matters.
RANDOM_SEED = 43
np.random.seed(RANDOM_SEED)

# One-time NLTK resource downloads; uncomment when the 'punkt' tokenizer
# models or the 'stopwords' corpus are not installed yet.
# nltk.download('punkt')
# nltk.download('stopwords')

In [None]:
# Load the competition CSV files and turn the author label into a class id.
DATA_TRAIN = "../input/train.csv"
DATA_TEST = "../input/test.csv"

train = pd.read_csv(DATA_TRAIN, delimiter=',', quotechar='"')
test = pd.read_csv(DATA_TEST, delimiter=',', quotechar='"')

# Author initials -> integer class id (0, 1, 2 in this order).
author_to_index = {name: class_id for class_id, name in enumerate(("EAP", "HPL", "MWS"))}
train["author"] = train["author"].map(author_to_index)

# Quick look at both frames: row count followed by the first rows.
print("Train samples: {}".format(len(train)))
print(train.head())
print()
print("Test samples: {}".format(len(test)))
print(test.head())

In [None]:
# Word tokenizer used throughout the notebook: nltk.word_tokenize
# (punkt module of NLTK).
tokenize = nltk.word_tokenize

# Sanity check: show the first training sentence raw and tokenized.
first_sentence = train["text"][0]
print(first_sentence)
print(tokenize(first_sentence))

In [None]:
class Vocabulary(object):
    """Word <-> index mapping with occurrence counts, persisted via pickle.

    Index 0 is always the reserved placeholder "NotAWord" (see ``reset``).
    Words that are not in the vocabulary are mapped by ``get_word_index`` to
    the index one past the last known word, so a lookup table indexed by
    these values needs ``size() + 1`` rows.

    Attributes (documented here rather than inline):
        dump_filename: path used by ``save`` and ``load``.
        word_to_index: dict mapping word -> integer index.
        index_to_word: list mapping integer index -> word.
        counter: ``collections.Counter`` with the number of ``add_word``
            calls per word.
    """

    def __init__(self, dump_filename):
        """Create a vocabulary, restoring it from *dump_filename* if that file exists."""
        self.dump_filename = dump_filename
        self.word_to_index = {}
        self.index_to_word = []
        self.counter = Counter()
        self.reset()

        if os.path.isfile(self.dump_filename):
            self.load()

    def save(self):
        """Pickle the whole object to ``self.dump_filename``."""
        with open(self.dump_filename, "wb") as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    def load(self):
        """Replace this object's state with the one pickled at ``self.dump_filename``.

        SECURITY NOTE: unpickling executes code embedded in the file; only
        load dumps that were produced by ``save`` from a trusted source.
        """
        with open(self.dump_filename, "rb") as f:
            vocab = pickle.load(f)
        state = dict(vocab.__dict__)
        # Keep the path this instance was created with: the pickled path may
        # be stale if the dump file was moved or renamed after saving.
        state["dump_filename"] = self.dump_filename
        self.__dict__.update(state)

    def add_word(self, word):
        """Count one occurrence of *word* and return its (possibly new) index."""
        self.counter[word] += 1
        index = self.word_to_index.get(word)
        if index is None:
            index = len(self.index_to_word)
            self.index_to_word.append(word)
            self.word_to_index[word] = index
        return index

    def get_word_index(self, word):
        """Return the index of *word*, or the out-of-vocabulary index ``size()``."""
        return self.word_to_index.get(word, len(self.word_to_index))

    def get_word(self, index):
        """Return the word stored at *index* (raises IndexError if out of range)."""
        return self.index_to_word[index]

    def size(self):
        """Return the number of known entries, including the "NotAWord" placeholder."""
        return len(self.index_to_word)

    def reset(self):
        """Drop all words and counts, leaving only the placeholder at index 0."""
        self.word_to_index = {}
        self.index_to_word = []
        self.counter = Counter()
        self.word_to_index["NotAWord"] = 0
        self.index_to_word.append("NotAWord")
        self.counter["NotAWord"] = 1

    def shrink(self, num):
        """Keep only the *num* most frequent entries and re-index them.

        The placeholder takes part in the frequency ranking but always keeps
        index 0. The surviving words keep their original counts, so the
        vocabulary can be shrunk again later with a meaningful ranking.
        """
        pairs = self.counter.most_common(num)
        self.reset()
        for word, count in pairs:
            self.add_word(word)
            # add_word() restarted the count after reset(); restore the real one.
            self.counter[word] = count

In [None]:
def bow(train_texts, test_texts, tokenizer=nltk.word_tokenize, preprocessor=None,
        use_tfidf=False, max_features=None, bow_ngrams=(1, 1), analyzer='word'):
    """Vectorize train and test texts with one shared n-gram vocabulary.

    The vectorizer is fitted on the concatenation of both corpora, so the two
    returned matrices have identical columns (and, with ``use_tfidf``, the
    IDF weights are computed over train and test together).

    Args:
        train_texts: iterable of training strings.
        test_texts: iterable of test strings.
        tokenizer: tokenizer callable forwarded to the vectorizer.
        preprocessor: preprocessor callable forwarded to the vectorizer.
        use_tfidf: use ``TfidfVectorizer`` instead of ``CountVectorizer``.
        max_features: forwarded to the vectorizer.
        bow_ngrams: ``ngram_range`` forwarded to the vectorizer.
        analyzer: ``analyzer`` forwarded to the vectorizer.

    Returns:
        ``(train_data, test_data)``: the rows of the fitted document-term
        matrix that belong to the train and test texts respectively.
    """
    # Materialize as lists so "+" is a concatenation for any iterable input
    # (NumPy arrays or pandas Series would otherwise add element-wise).
    # The texts are never mutated, so no deep copy is needed.
    train_docs = list(train_texts)
    test_docs = list(test_texts)

    vectorizer_cls = TfidfVectorizer if use_tfidf else CountVectorizer
    vectorizer = vectorizer_cls(analyzer=analyzer, ngram_range=bow_ngrams, tokenizer=tokenizer,
                                preprocessor=preprocessor, max_features=max_features)

    data = vectorizer.fit_transform(train_docs + test_docs)
    n_train = len(train_docs)
    return data[:n_train], data[n_train:]

def _fit_nb_and_predict(train_data, train_answers, test_data):
    """Tune a multinomial Naive Bayes on *train_data* and predict class probabilities.

    ``alpha`` is selected by 5-fold grid search on log loss; the best
    cross-validation score is printed.

    Returns:
        ``(train_proba, test_proba)``. ``train_proba`` holds out-of-fold
        predictions: every training row is scored by a model that did not see
        it. ``test_proba`` comes from the best model refitted on all of
        *train_data*.
    """
    # Function-scope import: the file's import block does not provide this name.
    from sklearn.model_selection import cross_val_predict

    clf = GridSearchCV(estimator=MultinomialNB(),
                       param_grid={"alpha": [0.1, 0.3, 0.6, 0.9, 1.0]},
                       scoring="neg_log_loss", cv=5)
    clf.fit(train_data, train_answers)
    print("CV: {}".format(clf.best_score_))

    # Predicting the training rows with the model fitted on those same rows
    # yields over-confident probabilities that leak the label into the
    # downstream features, while unseen rows get genuine predictions.
    # Out-of-fold predictions keep both on the same footing.
    train_proba = cross_val_predict(clf.best_estimator_, train_data, train_answers,
                                    cv=5, method="predict_proba")
    return train_proba, clf.predict_proba(test_data)


def run_bow_nb(train_sentences, train_answers, test_sentences):
    """Naive Bayes over word counts (default ``bow`` settings).

    Args:
        train_sentences: list of training texts.
        train_answers: class labels aligned with *train_sentences*.
        test_sentences: list of texts to score.

    Returns:
        ``(train_proba, test_proba)`` class-probability arrays; the training
        part is out-of-fold.
    """
    train_data, test_data = bow(train_sentences, test_sentences)
    return _fit_nb_and_predict(train_data, train_answers, test_data)


def run_boc_nb(train_sentences, train_answers, test_sentences):
    """Naive Bayes over character-level TF-IDF features.

    Args:
        train_sentences: list of training texts.
        train_answers: class labels aligned with *train_sentences*.
        test_sentences: list of texts to score.

    Returns:
        ``(train_proba, test_proba)`` class-probability arrays; the training
        part is out-of-fold.
    """
    train_data, test_data = bow(train_sentences, test_sentences, tokenizer=None, use_tfidf=True, analyzer='char')
    return _fit_nb_and_predict(train_data, train_answers, test_data)

def _add_text_statistics(df, stopwords):
    """Add the per-sentence count features (and a helper "words" column) to *df* in place."""
    punctuation = set(string.punctuation)

    words = df["text"].apply(lambda text: text.split())
    df["words"] = words
    df["num_words"] = words.apply(len)
    df["num_unique_words"] = words.apply(lambda ws: len(set(ws)))
    df["num_chars"] = df["text"].apply(len)
    # The stopword list is lowercase, so compare case-insensitively; otherwise
    # capitalised stopwords (e.g. at the start of a sentence) are not counted.
    df["num_stopwords"] = words.apply(lambda ws: sum(1 for w in ws if w.lower() in stopwords))
    df["num_punctuations"] = df["text"].apply(lambda text: sum(1 for c in text if c in punctuation))
    df["num_words_upper"] = words.apply(lambda ws: sum(1 for w in ws if w.isupper()))
    df["num_words_title"] = words.apply(lambda ws: sum(1 for w in ws if w.istitle()))
    # np.mean([]) is NaN (with a RuntimeWarning); use 0.0 for texts with no
    # words so no NaN ends up in the feature matrix.
    df["mean_word_len"] = words.apply(lambda ws: np.mean([len(w) for w in ws]) if ws else 0.0)


def collect_additional_features(train, test):
    """Build the scaled hand-crafted feature matrices for *train* and *test*.

    Features per row, in column order: word count, unique word count,
    character count, stopword count, punctuation count, upper-case word count,
    title-case word count, mean word length, three class probabilities from
    ``run_bow_nb`` and three from ``run_boc_nb``.

    Args:
        train: DataFrame with "id", "text" and "author" columns.
        test: DataFrame with "id" and "text" columns ("author" is optional
            and ignored).

    Returns:
        ``(train_features, test_features)`` as arrays produced by a
        ``MinMaxScaler`` fitted on the training features only. The inputs are
        not modified.
    """
    train_df = train.copy()
    test_df = test.copy()
    eng_stopwords = set(nltk.corpus.stopwords.words("english"))

    for df in (train_df, test_df):
        _add_text_statistics(df, eng_stopwords)

    train_texts = train_df["text"].tolist()
    train_answers = train_df["author"].tolist()
    test_texts = test_df["text"].tolist()

    # Stack the Naive Bayes class probabilities (word-level, then char-level)
    # as extra features; columns 0/1/2 are the EAP/HPL/MWS class ids.
    for prefix, run_nb in (("nb_count", run_bow_nb), ("nb_count_chars", run_boc_nb)):
        pred_train, pred_test = run_nb(train_texts, train_answers, test_texts)
        for class_id, suffix in enumerate(("eap", "hpl", "mws")):
            column = "{}_{}".format(prefix, suffix)
            train_df[column] = pred_train[:, class_id]
            test_df[column] = pred_test[:, class_id]

    # Keep only the numeric feature columns.
    for df in (train_df, test_df):
        df.drop(["text", "id", "words"], axis=1, inplace=True)
        if "author" in df.columns:
            df.drop(["author"], axis=1, inplace=True)

    scaler = MinMaxScaler()
    train_features = scaler.fit_transform(train_df)
    test_features = scaler.transform(test_df)
    return train_features, test_features

In [None]:
VOCAB_PATH = "vocab.pickle"


def prepare_vocabulary(vocab_path, train, test, shrink_border=None):
    """Return the word vocabulary for the train and test texts.

    A ``Vocabulary`` is created for *vocab_path*. If it comes back holding at
    most one entry, it is filled with every token of ``train['text']`` and
    then ``test['text']`` and saved. With *shrink_border* set, the vocabulary
    is afterwards reduced with ``Vocabulary.shrink(shrink_border)``; this
    reduction is not saved.
    """
    vocabulary = Vocabulary(vocab_path)

    needs_build = vocabulary.size() <= 1
    if needs_build:
        stages = (
            (train, "Train vocabulary size: {}"),
            (test, "Train+test vocabulary size: {}"),
        )
        for frame, report in stages:
            for sentence in frame['text'].tolist():
                for word in tokenize(sentence):
                    vocabulary.add_word(word)
            print(report.format(vocabulary.size()))
        vocabulary.save()

    print("Vocabulary size: {}".format(vocabulary.size()))
    if shrink_border is None:
        return vocabulary

    vocabulary.shrink(shrink_border)
    print("Vocabulary size after shrink: {}".format(vocabulary.size()))
    return vocabulary


vocabulary = prepare_vocabulary(VOCAB_PATH, train, test)

In [None]:
# Characters with their own index (position in this string). Any other
# character gets the index len(CHAR_SET). Index 0 (the space) is also the
# fill value of unused, padded positions in the character matrix.
CHAR_SET = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-'\""


def get_samples(sentences, vocabulary, word_max_count, max_word_len):
    """Convert sentences into left-padded word-index and char-index matrices.

    Each sentence is tokenized and cut to its first *word_max_count* tokens;
    each token is cut to its first *max_word_len* characters. Values are
    right-aligned, so unused leading positions stay 0.

    Args:
        sentences: sequence of strings.
        vocabulary: object whose ``get_word_index(word)`` returns an int.
        word_max_count: number of word slots per sentence.
        max_word_len: number of character slots per word.

    Returns:
        ``(word_matrix, char_matrix)`` integer arrays of shape
        ``(len(sentences), word_max_count)`` and
        ``(len(sentences), word_max_count, max_word_len)``.
    """
    # One dict lookup per character instead of a linear CHAR_SET.index() scan.
    char_to_index = {ch: idx for idx, ch in enumerate(CHAR_SET)}
    unknown_char = len(CHAR_SET)

    n = len(sentences)
    # Builtin int instead of the np.int alias, which NumPy 1.24 removed.
    word_matrix = np.zeros((n, word_max_count), dtype='int')
    char_matrix = np.zeros((n, word_max_count, max_word_len), dtype='int')

    for i, sentence in enumerate(sentences):
        words = tokenize(sentence)[:word_max_count]
        if not words:
            # "-0:" would address the whole row and the empty assignment
            # would raise; a sentence without tokens stays all-padding.
            continue
        word_matrix[i, -len(words):] = [vocabulary.get_word_index(word) for word in words]

        first_slot = word_max_count - len(words)
        for slot, word in enumerate(words, start=first_slot):
            codes = [char_to_index.get(ch, unknown_char) for ch in word[:max_word_len]]
            if codes:
                char_matrix[i, slot, -len(codes):] = codes
    return word_matrix, char_matrix

def get_train_val_test_sets(x, y, x_test, vocabulary, word_max_count=80, max_word_len=30, val_part=0.1):
    """Build the model inputs for the train, validation and test splits.

    The rows of *x* are shuffled with a permutation seeded by ``RANDOM_SEED``;
    the last ``val_part`` share of the permutation becomes the validation
    split. The additional features of the validation and of the test split
    are both computed against the training split.

    Args:
        x: DataFrame with the labelled texts.
        y: labels aligned with the rows of *x*.
        x_test: DataFrame with the unlabelled texts.
        vocabulary: vocabulary passed on to ``get_samples``.
        word_max_count: word slots per sentence.
        max_word_len: character slots per word.
        val_part: fraction of *x* held out for validation.

    Returns:
        Three tuples: ``(words, chars, additional, y)`` for train,
        ``(words, chars, additional, y)`` for validation and
        ``(words, chars, additional)`` for test.
    """
    word_matrix, char_matrix = get_samples(x["text"].tolist(), vocabulary, word_max_count, max_word_len)

    # Seeded shuffle, then split at the train/validation border.
    n = x.shape[0]
    np.random.seed(RANDOM_SEED)
    perm = np.random.permutation(n)
    border = int(n*(1-val_part))
    idx_train, idx_val = perm[:border], perm[border:]

    train_frame = x.iloc[idx_train]
    extra_train, extra_val = collect_additional_features(train_frame, x.iloc[idx_val])

    labels = np.array(y, dtype='int32')
    train_set = (word_matrix[idx_train], char_matrix[idx_train], extra_train, labels[idx_train])
    val_set = (word_matrix[idx_val], char_matrix[idx_val], extra_val, labels[idx_val])

    word_matrix_test, char_matrix_test = get_samples(x_test["text"].tolist(),
                                                     vocabulary, word_max_count, max_word_len)
    # Only the test half of the result is needed here.
    _, extra_test = collect_additional_features(train_frame, x_test)
    test_set = (word_matrix_test, char_matrix_test, extra_test)

    return train_set, val_set, test_set


data_train, data_val, data_test = get_train_val_test_sets(train, train['author'].tolist(), test, vocabulary)

In [None]:
class SpookyRNN:
    """Three-class author classifier: BiLSTM over word + char embeddings plus extra features."""

    def __init__(self, rnn_units=64, dense_units=32, dropout=0.4, batch_size=256,
                 embeddings_dimensions=150, char_embeddings_dimension=20, max_word_len=30,
                 char_lstm_output_dim=64):
        """Store the hyper-parameters; the Keras model is created by ``build`` or ``load``.

        Args:
            rnn_units: total output size of the sentence-level BiLSTM
                (each direction gets half).
            dense_units: width of the hidden Dense layer.
            dropout: rate used for the LSTM input/recurrent dropout and the
                Dropout layer.
            batch_size: batch size for training and prediction.
            embeddings_dimensions: size of the word embeddings.
            char_embeddings_dimension: size of the character embeddings.
            max_word_len: number of character slots per word in the char input.
            char_lstm_output_dim: total output size of the char-level BiLSTM
                (each direction gets half).
        """
        self.batch_size = batch_size
        self.dropout = dropout
        self.rnn_units = rnn_units
        self.dense_units = dense_units
        self.embeddings_dimensions = embeddings_dimensions
        self.char_embeddings_dimension = char_embeddings_dimension
        self.char_lstm_output_dim = char_lstm_output_dim
        self.max_word_len = max_word_len

        self.model = None

    def build(self, n_additional_features, vocabulary_size):
        """Create and compile the network and store it in ``self.model``.

        Args:
            n_additional_features: width of the additional-features input.
            vocabulary_size: number of known word indices; the embedding gets
                one extra row (``vocabulary_size + 1``).
        """
        # Word-level branch: word index -> trainable embedding.
        word_index_input = Input(shape=(None,), dtype="int32", name="word_index_input")
        word_embeddings = Embedding(vocabulary_size + 1,
                                    self.embeddings_dimensions, name="word_embeddings")(word_index_input)

        # Char-level branch: a BiLSTM is run over the characters of every word
        # (TimeDistributed over the word axis) and yields one vector per word.
        char_input = Input(shape=(None, self.max_word_len), dtype="int32", name="char_input")
        char_embeddings = Embedding(len(CHAR_SET) + 1,
                                    self.char_embeddings_dimension, name='char_embeddings')(char_input)
        word_from_char_embeddings = TimeDistributed(Bidirectional(
            LSTM(self.char_lstm_output_dim // 2, dropout=self.dropout,
                 recurrent_dropout=self.dropout, name='CharLSTM')))(char_embeddings)

        additional_features_input = Input(shape=(n_additional_features, ), dtype='float32', name='add_input')

        # Sentence encoder over the concatenated per-word representations.
        lstm_input = concatenate([word_embeddings, word_from_char_embeddings], name="lstm_input")
        lstm_layer = Bidirectional(LSTM(self.rnn_units // 2, dropout=self.dropout,
                                        recurrent_dropout=self.dropout))(lstm_input)

        # Classifier head: sentence vector + additional features -> 3 classes.
        layer = concatenate([lstm_layer, additional_features_input], name="dense_input")
        dense = Dense(self.dense_units, activation='relu')(layer)
        dense = Dropout(self.dropout)(dense)

        predictions = Dense(3, activation='softmax')(dense)
        model = Model(inputs=[word_index_input, char_input, additional_features_input], outputs=predictions)

        model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])

        # summary() prints by itself and returns None, so it is not wrapped in
        # print() (that would emit an extra "None" line).
        model.summary()
        self.model = model

    def train(self, data_train, data_val, model_filename, enable_checkpoints=True):
        """Fit ``self.model`` for up to 50 epochs with early stopping on ``val_loss``.

        Args:
            data_train: ``(word_matrix, char_matrix, additional_features, y)``.
            data_val: same layout as *data_train*, used as validation data.
            model_filename: where the best model (lowest ``val_loss``) is saved.
            enable_checkpoints: save the best model to *model_filename*.
        """
        word_matrix_train, char_matrix_train, additional_features_matrix_train, y_train = data_train
        word_matrix_val, char_matrix_val, additional_features_matrix_val, y_val = data_val

        print("Train example:")
        print(word_matrix_train[0])
        print(char_matrix_train[0])
        print(additional_features_matrix_train[0])
        print(y_train[0])

        # Callback to prevent overfitting: with patience=0 training stops at
        # the first epoch whose val_loss does not improve.
        callbacks = [EarlyStopping(monitor='val_loss', patience=0)]

        # Callback to save best only model.
        if enable_checkpoints:
            callbacks.append(ModelCheckpoint(model_filename, monitor='val_loss', save_best_only=True))

        self.model.fit([word_matrix_train, char_matrix_train, additional_features_matrix_train], y_train,
                       validation_data=([word_matrix_val, char_matrix_val, additional_features_matrix_val], y_val),
                       epochs=50,
                       batch_size=self.batch_size,
                       shuffle=True,
                       callbacks=callbacks,
                       verbose=1)

    def load(self, filename: str) -> None:
        """Load a saved Keras model from *filename* into ``self.model`` and print its summary."""
        self.model = load_model(filename)
        self.model.summary()

    def predict(self, data_test, answer_filename, ids=None):
        """Predict class probabilities and write them as a submission CSV.

        Args:
            data_test: ``(word_matrix, char_matrix, additional_features)``.
            answer_filename: path of the CSV to write; its columns are
                "id", "EAP", "HPL" and "MWS".
            ids: row identifiers aligned with *data_test*. Defaults to the
                "id" column of the module-level ``test`` frame.
        """
        word_matrix, char_matrix, additional_features_matrix = data_test
        if ids is None:
            ids = test["id"]

        print("Test example: ")
        print(word_matrix[0])
        print(char_matrix[0])
        print(additional_features_matrix[0])
        preds = self.model.predict([word_matrix, char_matrix, additional_features_matrix],
                                   batch_size=self.batch_size, verbose=1)
        index_to_author = {0: "EAP", 1: "HPL", 2: "MWS"}
        columns = {"id": ids}
        for class_id, author in index_to_author.items():
            columns[author] = preds[:, class_id]
        submission = pd.DataFrame(columns)
        submission.to_csv(answer_filename, index=False)

In [None]:
# File name passed to SpookyRNN.train (checkpoint target) and to
# SpookyRNN.load (model to restore) in the cells below.
MODEL_FILENAME = "model.h5"

In [None]:
# Build the network for the width of the additional-features matrix
# (data_train[2]) and the vocabulary size, then train it; checkpoints go to
# MODEL_FILENAME.
rnn = SpookyRNN()
rnn.build(data_train[2].shape[1], vocabulary.size())
rnn.train(data_train, data_val, MODEL_FILENAME)

In [None]:
# Start from a fresh wrapper and restore the model saved in MODEL_FILENAME
# (instead of reusing the in-memory model from the training cell), then write
# the test-set predictions to 'answer.csv'.
rnn = SpookyRNN()
rnn.load(MODEL_FILENAME)
rnn.predict(data_test, 'answer.csv')