In [134]:
import numpy as np
from __future__ import print_function

In [135]:
from keras.utils import to_categorical

def make_labels(data):
    """Build one-hot author labels for every row of ``data``.

    Parameters
    ----------
    data : object exposing an ``author`` attribute
        Iterating ``data.author`` must yield the author codes ``'EAP'``,
        ``'HPL'`` or ``'MWS'``; any other value raises ``KeyError``.

    Returns
    -------
    The array produced by ``to_categorical``: one row per entry of
    ``data.author`` and exactly one column per known author (3), in the
    order EAP, HPL, MWS.
    """
    a2c = {'EAP': 0, 'HPL': 1, 'MWS': 2}
    labels = np.array([a2c[a] for a in data.author])
    # Pin the width to the number of known authors. Without num_classes the
    # width is inferred from the largest id present, so a subset that lacks
    # 'MWS' would silently produce 2 columns and no longer match the model's
    # 3-unit output layer.
    return to_categorical(labels, num_classes=len(a2c))

def get_text_only(data):
    """Return the ``"text"`` entry of ``data``, looked up as ``data["text"]``."""
    text_column = data["text"]
    return text_column

In [153]:
from keras.callbacks import EarlyStopping

def run(load_func, preprocess_func, create_model_func,
        test_size=0.2, batch_size=16, epochs=64, patience=2,
        random_state=None):
    """Load the data, preprocess it, then build and train a model.

    Parameters
    ----------
    load_func : callable
        Called with no arguments; its result is passed to ``make_labels``
        and ``get_text_only``.
    preprocess_func : callable
        Receives the text returned by ``get_text_only`` and returns the
        model input (an iterable of integer sequences).
    create_model_func : callable
        Called with ``input_dim`` (largest value in the preprocessed data
        plus one) and returns an object with a Keras-style ``fit`` method.
    test_size : float, optional
        Forwarded to ``train_test_split``; the held-out part is used as
        validation data during training.
    batch_size, epochs : int, optional
        Forwarded to ``model.fit``.
    patience : int, optional
        ``patience`` of the ``EarlyStopping`` callback monitoring ``val_loss``.
    random_state : optional
        Forwarded to ``train_test_split``. Leave as ``None`` for the
        original unseeded split, or pass an int to make the split repeatable.

    Returns
    -------
    tuple
        ``(model, history)``: the trained model and whatever ``model.fit``
        returned. Earlier versions returned nothing, so callers that ignore
        the result are unaffected.
    """
    print("Loading data")
    full_data = load_func()

    print("Getting labels")
    labels = make_labels(full_data)

    print("Preprocessing")
    data = preprocess_func(get_text_only(full_data))

    # Largest value in the preprocessed data, plus one; handed to the model
    # factory as its input dimension.
    input_dim = max(max(x) for x in data) + 1

    print("Creating model")
    model = create_model_func(input_dim)

    # NOTE(review): train_test_split is not imported in any visible cell;
    # presumably sklearn.model_selection.train_test_split from an earlier
    # cell. Confirm this survives Restart & Run All.
    x_train, x_test, y_train, y_test = train_test_split(
        data, labels, test_size=test_size, random_state=random_state)

    print("Training model")
    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        validation_data=(x_test, y_test),
                        epochs=epochs,
                        verbose=2,
                        callbacks=[EarlyStopping(patience=patience, monitor='val_loss')])
    print("Training complete")

    # Hand the results back so the trained model can be evaluated or used
    # for prediction instead of being thrown away.
    return model, history

In [144]:
import pandas as pd

def load_training_data(path="train.csv"):
    """Read the training set from a CSV file with ``pd.read_csv``.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to ``"train.csv"`` (relative to
        the working directory), so existing zero-argument calls behave as
        before.

    Returns
    -------
    The DataFrame returned by ``pd.read_csv(path)``.
    """
    return pd.read_csv(path)

In [145]:
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.models import Sequential

def create_simple_model(input_dim, embedding_dims=20, optimizer='adam'):
    """Build and compile a three-layer Keras ``Sequential`` classifier.

    The stack is ``Embedding(input_dim, embedding_dims)`` ->
    ``GlobalAveragePooling1D`` -> ``Dense(3, softmax)``. It is compiled with
    categorical cross-entropy loss, the given ``optimizer`` and the
    ``accuracy`` metric.

    Parameters
    ----------
    input_dim : int
        ``input_dim`` of the embedding layer.
    embedding_dims : int, optional
        ``output_dim`` of the embedding layer.
    optimizer : optional
        Passed straight to ``compile``.

    Returns
    -------
    The compiled model.
    """
    layer_stack = [
        Embedding(input_dim=input_dim, output_dim=embedding_dims),
        GlobalAveragePooling1D(),
        Dense(3, activation='softmax'),
    ]
    classifier = Sequential(layer_stack)
    classifier.compile(optimizer=optimizer,
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    return classifier

In [146]:
import nltk
# Fetch the stop-word corpus read by remove_stops; when the package is already
# present nltk just reports it as up to date.
nltk.download("stopwords")
# NOTE(review): remove_stops also calls nltk.word_tokenize, which presumably
# needs the "punkt" tokenizer data. It is not downloaded here -- confirm it is
# installed on a fresh machine.
from nltk.corpus import stopwords

# removes stop words from the sentences in text
def remove_stops(text):
    """Strip English stop words from every sentence in ``text``.

    Each sentence is split with ``nltk.word_tokenize``; tokens that are not
    stop words are re-joined with single spaces, keeping their original
    casing.

    Parameters
    ----------
    text : iterable of str
        The sentences to filter.

    Returns
    -------
    list of str
        One filtered sentence per input sentence, in the same order.
    """
    stops = set(stopwords.words("english"))
    return [
        # The stop-word list is lowercase, so compare case-insensitively;
        # otherwise capitalised stop words ("The", "It", ...) slip through.
        " ".join(word for word in nltk.word_tokenize(sentence)
                 if word.lower() not in stops)
        for sentence in text
    ]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Forrest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [147]:
from keras.preprocessing.text import Tokenizer

# converts the sentences in text into a sequence of numbers
def convert_to_sequences(text, filters, to_lower):
    """Turn each sentence in ``text`` into a sequence of integers.

    A Keras ``Tokenizer`` (word level, split on single spaces) is fitted on
    ``text`` and then used to encode that same ``text``.

    Parameters
    ----------
    text : iterable of str
        Sentences used both to fit the tokenizer and to be encoded.
    filters : str
        Passed to ``Tokenizer(filters=...)``.
    to_lower : bool
        Passed to ``Tokenizer(lower=...)``.

    Returns
    -------
    The result of ``Tokenizer.texts_to_sequences(text)``.
    """
    word_indexer = Tokenizer(char_level=False, split=" ",
                             lower=to_lower, filters=filters)
    word_indexer.fit_on_texts(text)
    encoded = word_indexer.texts_to_sequences(text)
    return encoded

In [141]:
from keras.preprocessing.sequence import pad_sequences

def pad_data(text):
    """Pad every sequence in ``text`` to the length of the longest one.

    The longest length is measured here and handed to ``pad_sequences`` as
    ``maxlen``; all other ``pad_sequences`` options keep their defaults.
    """
    lengths = [len(sequence) for sequence in text]
    longest = np.max(lengths)
    return pad_sequences(sequences=text, maxlen=longest)

In [154]:
def preprocessing(text):
    """Baseline preprocessing: encode ``text`` as integer sequences, then pad.

    The tokenizer is given an empty filter string and ``to_lower=False``, so
    no characters are filtered out and casing is kept.
    """
    sequences = convert_to_sequences(text, filters="", to_lower=False)
    return pad_data(sequences)

# Train the baseline: load the CSV, encode and pad the text without any
# filtering or lowercasing, and fit the averaged-embedding classifier.
run(load_training_data, preprocessing, create_simple_model)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Train on 15663 samples, validate on 3916 samples
Epoch 1/64
 - 7s - loss: 1.0858 - acc: 0.4031 - val_loss: 1.0868 - val_acc: 0.3933
Epoch 2/64
 - 7s - loss: 1.0783 - acc: 0.4064 - val_loss: 1.0786 - val_acc: 0.3933
Epoch 3/64
 - 7s - loss: 1.0610 - acc: 0.4137 - val_loss: 1.0535 - val_acc: 0.4196
Epoch 4/64
 - 7s - loss: 1.0249 - acc: 0.4726 - val_loss: 1.0210 - val_acc: 0.4632
Epoch 5/64
 - 7s - loss: 0.9721 - acc: 0.5646 - val_loss: 0.9659 - val_acc: 0.5199
Epoch 6/64
 - 6s - loss: 0.9095 - acc: 0.6426 - val_loss: 0.9076 - val_acc: 0.6239
Epoch 7/64
 - 7s - loss: 0.8463 - acc: 0.6910 - val_loss: 0.8553 - val_acc: 0.6537
Epoch 8/64
 - 6s - loss: 0.7852 - acc: 0.7269 - val_loss: 0.8080 - val_acc: 0.6966
Epoch 9/64
 - 6s - loss: 0.7286 - acc: 0.7538 - val_loss: 0.7670 - val_acc: 0.7293
Epoch 10/64
 - 6s - loss: 0.6794 - acc: 0.7740 - val_loss: 0.7287 - val_acc: 0.7398
Epoch 11/64
 - 6s - loss: 0.6332 - acc: 0.7951 -

| Attempted | acc | loss | val_acc | val_loss | epochs |
|-----------|-----|------|---------|----------|--------|
| Tokenize only | 0.9182 | 0.2822 | 0.8026 | 0.4854 | 25/64 |
| Filter non-alphanumeric (!@#$%^&*()-=_+,./<>?;:'\") | 0.9399 | 0.1989 | 0.8292 | 0.4235 | 36/64 |
| texts_to_matrix instead of texts_to_sequences | 0.4054 | 1.0872 | 0.3961 | 1.0898 | 6/64 |
| Filter like above without converting to lowercase | 0.9478 | 0.1815 | 0.8253 | 0.4169 | 35/64 |
| Filter special chars and remove stop words | 0.9314 | 0.2331 | 0.8184 | 0.4475 | 22/64 |
| Filter 