# Useful Functions 

In [41]:
import numpy as np
from __future__ import print_function
from keras.utils import to_categorical
from sklearn.metrics import classification_report

def make_labels(data):
    a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
    labels = np.array([a2c[a] for a in data.author])
    labels = to_categorical(labels)
    return labels

def get_text_only(data):
    return data["text"]

def calc_metrics(x, y_true, y_pred):
    return classification_report(y_true, y_pred)

from sklearn.metrics import confusion_matrix

def calc_confusion_matrix(y_true, y_pred):
    return confusion_matrix(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1))

import pandas as pd

# loads the training data
def load_training_data():
    return pd.read_csv("train.csv")

from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical

# A function to execute a machine learning pipeline
# load_func : the function that loads training data
# preprocessing_func_arr : an array of functions that preprocess the data. Executed in order and the output feeds into the input of the next
# create_model_func : a function that creates a model to train and test.
def run(load_func, preprocess_func_arr, create_model_func, verbosity=2, preprocess_debug=False):
    print("Loading data")
    full_data = load_func()
    
    print("Getting labels")
    labels = make_labels(full_data)    
    
    print("Preprocessing")
    data = get_text_only(full_data)
    for func in preprocess_func_arr:
        data = func(data)
        if preprocess_debug is True:
            print(data[0])
    
    input_dim = max([max(x) for x in data]) + 1

    print("Creating model")
    model = create_model_func(input_dim)
    
    print("Model Summary", model.summary)
    
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)
    
    print("Training model")
    model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=64,
                 verbose=verbosity,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])
    print("Training complete")
    
    print("Testing model")
    y_pred = model.predict_classes(x_test)
    y_pred = to_categorical(y_pred, num_classes=3)
    
    print("Test results")  
    print("accuracy", accuracy_score(y_test, y_pred))
    print("metrics")
    print(calc_metrics(x_test, y_test, y_pred))
    print("confusion matrix")
    print(calc_confusion_matrix(y_test, y_pred))
    

# Model creations functions

In [63]:
from keras.layers import Dense, GlobalAveragePooling1D, Embedding, Conv1D
from keras.models import Sequential

# This is the best performing of the three models, it has the lowest loss of all three.
# All three have similar accuracies.
def embedding_and_pooling_model(input_dim, embedding_dims=500, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

def embedding_conv1d_pooling_model(input_dim, embedding_dims=100, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(Conv1D(16, 8, activation="relu"))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

from keras.optimizers import RMSprop
from keras.layers import Dropout, MaxPooling1D

def deeper_with_multiple_convolutions(input_dim, embedding_dims=100, optimizer=RMSprop(lr=0.003)):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(Conv1D(128, 8, activation="relu"))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(128, 8, activation="relu"))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.5))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

# Preprocessing functions

In [44]:
from keras.preprocessing.text import Tokenizer

# converts the sentences in text into a sequence of numbers
# input: an array of strings
# output: an array of  sequences of numbers
def convert_to_sequences_remove_chars(text):
    tokenizer = Tokenizer(split=" ", char_level=False)
    tokenizer.fit_on_texts(text);
    return tokenizer.texts_to_sequences(text)

# converts the sentences in text into a sequence of numbers
# input: an array of strings
# output: an array of  sequences of numbers
def convert_to_sequences_leave_chars(text):
    tokenizer = Tokenizer(filters="", split=" ", char_level=False)
    tokenizer.fit_on_texts(text);
    return tokenizer.texts_to_sequences(text)

from keras.preprocessing.sequence import pad_sequences

# Pads the input arrays to be of equal length
# input: An array of sequences of numbers
# output: an array of sequences of numbers
def pad_data(text):
    maxlen = np.amax([len(x) for x in text], axis=0)
    return pad_sequences(sequences=text, maxlen=maxlen) 

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# removes stop words from the sentences in text
# input: an array of strings
# output: an array of strings
def remove_stops(text):
    stops = set(stopwords.words("english"))
    return [" ".join([word for word in nltk.word_tokenize(words) if word not in stops]) for words in text]

# We had seen a submission where the person treated punctuation as distinct words, we thought this would be worth trying
#  seeing as some authors may have different patterns of punctuation
# input: an array of strings
# output: an array of strings
def convert_punctuation_to_words(texts):
    chars = "~!@#$%^&*()_+`-=,./;'<>?:\""
  
    for c in chars:
        texts = [text.replace(c, " " + c + " ") for text in texts]
    return texts

from nltk.stem import PorterStemmer

# Uses the Porter stemmer to stem each word in the texts
# input: an array of strings
# output: an array of strings
def stem_texts_porter(texts):
    stmr = PorterStemmer()
    return [" ".join([stmr.stem(word) for word in nltk.word_tokenize(text)]) for text in texts]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Forrest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
preprocessors = [
    convert_punctuation_to_words,
    remove_stops,
    convert_to_sequences_remove_chars,
    pad_data
]

run(load_training_data, preprocessors, embedding_and_pooling_model)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Train on 15663 samples, validate on 3916 samples
Epoch 1/64
 - 20s - loss: 1.0705 - acc: 0.4241 - val_loss: 1.0160 - val_acc: 0.5092
Epoch 2/64
 - 19s - loss: 0.8618 - acc: 0.6673 - val_loss: 0.7334 - val_acc: 0.7347
Epoch 3/64
 - 18s - loss: 0.5805 - acc: 0.8122 - val_loss: 0.5640 - val_acc: 0.7985
Epoch 4/64
 - 18s - loss: 0.4244 - acc: 0.8640 - val_loss: 0.4899 - val_acc: 0.8052
Epoch 5/64
 - 18s - loss: 0.3368 - acc: 0.8895 - val_loss: 0.4460 - val_acc: 0.8230
Epoch 6/64
 - 19s - loss: 0.2782 - acc: 0.9065 - val_loss: 0.4338 - val_acc: 0.8238
Epoch 7/64
 - 19s - loss: 0.2329 - acc: 0.9233 - val_loss: 0.4048 - val_acc: 0.8401
Epoch 8/64
 - 19s - loss: 0.1961 - acc: 0.9347 - val_loss: 0.4077 - val_acc: 0.8355
Epoch 9/64
 - 19s - loss: 0.1695 - acc: 0.9430 - val_loss: 0.4090 - val_acc: 0.8315
Training complete
Testing model
Test results
accuracy 0.831460674157
metrics
             precision    recall  f1-score   s