In [93]:
import numpy as np
from __future__ import print_function

In [94]:
from keras.utils import to_categorical
from sklearn.metrics import classification_report

def make_labels(data):
    """Build one-hot author labels for every row of ``data``.

    Parameters
    ----------
    data : object with an ``author`` attribute
        ``data.author`` is iterated; every item must be one of the codes
        'EAP', 'HPL' or 'MWS'.

    Returns
    -------
    The result of ``to_categorical`` on the integer-encoded authors, with
    one column per known author code.

    Raises
    ------
    KeyError
        If ``data.author`` contains a code other than the three above.
    """
    a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
    labels = np.array([a2c[a] for a in data.author])
    # Pin the width to the number of known authors. Without num_classes,
    # to_categorical sizes its output from the largest index present, so a
    # subset lacking 'MWS' rows would come back with only two columns.
    labels = to_categorical(labels, num_classes=len(a2c))
    return labels

def get_text_only(data):
    """Return whatever ``data`` stores under the ``"text"`` key."""
    text_column = data["text"]
    return text_column

def calc_metrics(x, y_true, y_pred):
    """Return sklearn's ``classification_report`` for the given predictions.

    ``x`` is accepted but ignored; only ``y_true`` and ``y_pred`` are used.
    """
    report = classification_report(y_true=y_true, y_pred=y_pred)
    return report

from sklearn.metrics import confusion_matrix

def calc_confusion_matrix(y_true, y_pred):
    """Return the confusion matrix for row-wise encoded labels.

    Each row of ``y_true`` and ``y_pred`` is reduced to a class index with
    argmax along axis 1 before being passed to sklearn's
    ``confusion_matrix``.
    """
    true_classes = np.argmax(y_true, axis=1)
    predicted_classes = np.argmax(y_pred, axis=1)
    return confusion_matrix(true_classes, predicted_classes)

In [117]:
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

def run(load_func, preprocess_func, create_model_func, verbosity=2, random_state=None):
    """Load data, train a model on an 80/20 split and print test results.

    Parameters
    ----------
    load_func : callable
        Called with no arguments; its result is passed to ``make_labels``
        and ``get_text_only``.
    preprocess_func : callable
        Receives the output of ``get_text_only`` and returns the integer
        sequences that are fed to the model.
    create_model_func : callable
        Receives ``input_dim`` (largest value found in the preprocessed
        data, plus one) and returns a compiled model.
    verbosity : int, optional
        Forwarded to ``model.fit`` as ``verbose``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. Pass an int to make the split
        repeatable between runs; ``None`` keeps the previous behaviour.

    Returns
    -------
    None. Progress messages, the classification report and the confusion
    matrix are printed.
    """
    print("Loading data")
    full_data = load_func()

    print("Getting labels")
    labels = make_labels(full_data)

    print("Preprocessing")
    data = preprocess_func(get_text_only(full_data))

    # Largest index that occurs anywhere in the data, plus one.
    input_dim = max(max(x) for x in data) + 1

    print("Creating model")
    model = create_model_func(input_dim)

    x_train, x_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.2, random_state=random_state)

    print("Training model")
    # NOTE(review): the held-out split is also used as the early-stopping
    # validation set, so the test numbers printed below are optimistic.
    model.fit(x_train, y_train,
              batch_size=16,
              validation_data=(x_test, y_test),
              epochs=64,
              verbose=verbosity,
              callbacks=[EarlyStopping(patience=2, monitor='val_loss')])
    print("Training complete")

    print("Testing model")
    # Sequential.predict_classes is gone from current Keras releases; taking
    # the argmax of the predicted probabilities gives the same class ids and
    # works on old and new versions alike.
    y_pred = np.argmax(model.predict(x_test, verbose=0), axis=-1)
    # Use the label matrix width instead of a hard-coded class count.
    y_pred = to_categorical(y_pred, num_classes=labels.shape[1])

    print("Test results")
    print("metrics")
    print(calc_metrics(x_test, y_test, y_pred))
    print("confusion matrix")
    print(calc_confusion_matrix(y_test, y_pred))

In [110]:
import pandas as pd

def load_training_data(path="train.csv"):
    """Read the training CSV into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``"train.csv"``, which is the
        file the function always read before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The parsed file, exactly as returned by ``pd.read_csv``.
    """
    return pd.read_csv(path)

In [111]:
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.models import Sequential

def create_simple_model(input_dim, embedding_dims=20, optimizer='adam'):
    """Build and compile a three-layer Sequential classifier.

    The stack is an ``Embedding`` (``input_dim`` -> ``embedding_dims``),
    a ``GlobalAveragePooling1D`` and a 3-unit softmax ``Dense`` layer.
    The model is compiled with categorical cross-entropy loss, the given
    ``optimizer`` and an accuracy metric, then returned.
    """
    layer_stack = [
        Embedding(input_dim=input_dim, output_dim=embedding_dims),
        GlobalAveragePooling1D(),
        Dense(3, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [112]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# removes stop words from the sentences in text
def remove_stops(text):
    """Strip English stop words from every sentence in ``text``.

    Each sentence is split with ``nltk.word_tokenize``; tokens found in
    NLTK's English stop-word list are dropped and the remaining tokens are
    re-joined with single spaces.

    Parameters
    ----------
    text : iterable of str
        The sentences to filter.

    Returns
    -------
    list of str
        One filtered sentence per input sentence, in the same order.

    Notes
    -----
    NOTE(review): ``nltk.word_tokenize`` needs NLTK tokenizer data in
    addition to the "stopwords" corpus this notebook downloads -- confirm it
    is installed.
    """
    stops = set(stopwords.words("english"))
    return [
        # The stop-word list is lowercase, so compare case-insensitively;
        # otherwise capitalised stop words such as "The" or "I" are kept.
        " ".join(word for word in nltk.word_tokenize(sentence)
                 if word.lower() not in stops)
        for sentence in text
    ]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Forrest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [113]:
from keras.preprocessing.text import Tokenizer

# converts the sentences in text into a sequence of numbers
def convert_to_sequences(text, filters, to_lower):
    """Fit a word-level Keras ``Tokenizer`` on ``text`` and encode it.

    ``filters`` and ``to_lower`` are forwarded to the tokenizer as its
    ``filters`` and ``lower`` options; tokens are split on single spaces.
    The same ``text`` is used for fitting and for encoding.
    """
    word_tokenizer = Tokenizer(
        filters=filters,
        lower=to_lower,
        split=" ",
        char_level=False,
    )
    word_tokenizer.fit_on_texts(text)
    sequences = word_tokenizer.texts_to_sequences(text)
    return sequences

In [114]:
from keras.preprocessing.sequence import pad_sequences

def pad_data(text):
    """Pad every sequence in ``text`` to the length of the longest one.

    The target length is computed here and handed to Keras'
    ``pad_sequences`` as ``maxlen``; all other padding options keep their
    library defaults.
    """
    sequence_lengths = [len(sequence) for sequence in text]
    longest = np.max(sequence_lengths)
    return pad_sequences(text, maxlen=longest)

In [116]:
def convert_to_sequence_and_pad(text):
    """Encode ``text`` as integer sequences, then pad them to equal length.

    Tokenization uses an empty filter string and leaves casing untouched.
    """
    sequences = convert_to_sequences(text, filters="", to_lower=False)
    return pad_data(sequences)

# Experiment: tokenize + pad only, simple model, no training output.
run(
    load_func=load_training_data,
    preprocess_func=convert_to_sequence_and_pad,
    create_model_func=create_simple_model,
    verbosity=0,
)

TypeError: run() got an unexpected keyword argument 'verbosity'

In [None]:
def 

| Attempted | acc | loss | val_acc | val_loss | epochs |
|-----------|-----|------|---------|----------|--------|
| Tokenize only | 0.9182 | 0.2822 | 0.8026 | 0.4854 | 25/64 |
| Filter non alphanumeric (!@#$%^&*()-=_+,./<>?;:'\") | 0.9399 |0.1989 | 0.8292 | 0.4235 | 36/64|
| texts_to_matrix instead of texts_to_sequences | 0.4054 | 1.0872 | 0.3961 | 1.0898 | 6/64 |
| Filter like above without converting to lowercase | 0.9478 | 0.1815 | 0.8253 | 0.4169 | 35/64 |
| Filter special chars and remove stop words | 0.9314 | 0.2331 | 0.8184 | 0.4475 | 22/64 |
| Filter 