# Submission 1: Training using Embeddings

In [None]:
#Imports and downloads
import numpy as np
from __future__ import print_function
from keras.utils import to_categorical
from sklearn.metrics import classification_report
import pandas as pd
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from keras.layers import Dense, GlobalAveragePooling1D, Embedding, Conv1D
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.layers import Dropout, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import confusion_matrix

Here are some useful functions that we are defining to make our lives easier. These include some pre-processing work.

In [None]:
def make_labels(data):
    a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
    labels = np.array([a2c[a] for a in data.author])
    labels = to_categorical(labels)
    return labels

def get_text_only(data):
    return data["text"]

def calc_metrics(x, y_true, y_pred):
    return classification_report(y_true, y_pred)

def calc_confusion_matrix(y_true, y_pred):
    return confusion_matrix(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1))

# converts the sentences in text into a sequence of numbers
# input: an array of strings
# output: an array of  sequences of numbers
def convert_to_sequences_remove_chars(text):
    tokenizer = Tokenizer(split=" ", char_level=False)
    tokenizer.fit_on_texts(text);
    return tokenizer.texts_to_sequences(text)

# converts the sentences in text into a sequence of numbers
# input: an array of strings
# output: an array of  sequences of numbers
def convert_to_sequences_leave_chars(text):
    tokenizer = Tokenizer(filters="", split=" ", char_level=False)
    tokenizer.fit_on_texts(text);
    return tokenizer.texts_to_sequences(text)

# Pads the input arrays to be of equal length
# input: An array of sequences of numbers
# output: an array of sequences of numbers
def pad_data(text):
    maxlen = np.amax([len(x) for x in text], axis=0)
    return pad_sequences(sequences=text, maxlen=maxlen) 

# removes stop words from the sentences in text
# input: an array of strings
# output: an array of strings
def remove_stops(text):
    stops = set(stopwords.words("english"))
    return [" ".join([word for word in nltk.word_tokenize(words) if word not in stops]) for words in text]

# We had seen a submission where the person treated punctuation as distinct words, we thought this would be worth trying
#  seeing as some authors may have different patterns of punctuation
# input: an array of strings
# output: an array of strings
def convert_punctuation_to_words(texts):
    chars = "~!@#$%^&*()_+`-=,./;'<>?:\""
  
    for c in chars:
        texts = [text.replace(c, " " + c + " ") for text in texts]
    return texts

# Uses the Porter stemmer to stem each word in the texts
# input: an array of strings
# output: an array of strings
def stem_texts_porter(texts):
    stmr = PorterStemmer()
    return [" ".join([stmr.stem(word) for word in nltk.word_tokenize(text)]) for text in texts]

Now we are going to load the training data

In [None]:
# loads the training data
def load_training_data():
    return pd.read_csv("train.csv")
def load_test_data():
    return pd.read_csv("test.csv")

Now we create a function that will run our pipeline

In [None]:
# A function to execute a machine learning pipeline
# load_func : the function that loads training data
# preprocessing_func_arr : an array of functions that preprocess the data. Executed in order and the output feeds into the input of the next
# create_model_func : a function that creates a model to train and test.
def run(load_func, preprocess_func_arr, create_model_func, verbosity=2, preprocess_debug=False):
    print("Loading data")
    full_data = load_func()
    
    print("Getting labels")
    labels = make_labels(full_data)    
    
    print("Preprocessing")
    data = get_text_only(full_data)
    for func in preprocess_func_arr:
        data = func(data)
        if preprocess_debug is True:
            print(data[0])
    
    input_dim = max([max(x) for x in data]) + 1

    print("Creating model")
    model = create_model_func(input_dim)
    
    print("Model Summary", model.summary)
    
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)
    
    print("Training model")
    model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=64,
                 verbose=verbosity,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])
    print("Training complete")
    
    print("Testing model")
    y_pred = model.predict_classes(x_test)
    y_pred = to_categorical(y_pred, num_classes=3)  
    
    print("Test results")  
    print("accuracy", accuracy_score(y_test, y_pred))
    print("metrics")
    print(calc_metrics(x_test, y_test, y_pred))
    print("confusion matrix")
    print(calc_confusion_matrix(y_test, y_pred))
    
    print("Generating Kaggle test results - test_results.csv")
    test_data = load_test_data()
    data = test_data["text"]
    for func in preprocess_func_arr:
        data = func(data)
    out = model.predict(data)
    print(out.shape)

Next we will define some different models. We created a few here, but only one of them was any good. We found that the embedding and pooling model was the only one worth while. We explore convolutional networks more in submission 3.

In [2]:
# This is the best performing of the three models, it has the lowest loss of all three.
# All three have similar accuracies.
def embedding_and_pooling_model(input_dim, embedding_dims=500, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

def embedding_conv1d_pooling_model(input_dim, embedding_dims=100, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(Conv1D(16, 8, activation="relu"))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model



def deeper_with_multiple_convolutions(input_dim, embedding_dims=100, optimizer=RMSprop(lr=0.003)):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(Conv1D(128, 8, activation="relu"))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(128, 8, activation="relu"))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.5))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

Finally, we run our model!!

In [4]:
preprocessors = [
    convert_punctuation_to_words,
    remove_stops,
    convert_to_sequences_remove_chars,
    pad_data
]

run(load_training_data, preprocessors, embedding_and_pooling_model)

Loading data
Getting labels
Preprocessing
Creating model
Model Summary <bound method Container.summary of <keras.models.Sequential object at 0x0000017CC0373208>>
Training model
Train on 15663 samples, validate on 3916 samples
Epoch 1/64
 - 17s - loss: 1.0704 - acc: 0.4197 - val_loss: 1.0194 - val_acc: 0.4714
Epoch 2/64
 - 17s - loss: 0.8565 - acc: 0.6668 - val_loss: 0.7341 - val_acc: 0.7845
Epoch 3/64
 - 16s - loss: 0.5746 - acc: 0.8121 - val_loss: 0.5639 - val_acc: 0.8059
Epoch 4/64
 - 16s - loss: 0.4180 - acc: 0.8636 - val_loss: 0.5130 - val_acc: 0.8003
Epoch 5/64
 - 16s - loss: 0.3315 - acc: 0.8908 - val_loss: 0.4617 - val_acc: 0.8151
Epoch 6/64
 - 16s - loss: 0.2721 - acc: 0.9099 - val_loss: 0.4775 - val_acc: 0.8023
Epoch 7/64
 - 16s - loss: 0.2273 - acc: 0.9239 - val_loss: 0.4212 - val_acc: 0.8309
Epoch 8/64
 - 16s - loss: 0.1952 - acc: 0.9351 - val_loss: 0.4255 - val_acc: 0.8312
Epoch 9/64
 - 16s - loss: 0.1668 - acc: 0.9435 - val_loss: 0.4335 - val_acc: 0.8302
Training complete


This model was semi-sucessful. Putting it up on the Kaggle site we scored 2.55803, which put us in 1504th place, which isnt very good. However, this is a pretty simple model with no convolutional layers, but a lot of pre-processing. Throughout this competetion, we found that pre-processing was less important than having a good model. This could be becuase our pre-processes wasnt affecting the things that the networks were learning on. Since these networks are so black box, it is hard to tell if that was the case. Another reason is that we just were not doing the right sort of pre-processing and were not extracting feature that were actually important. From this jumping off point we continued on.