In [188]:
import numpy as np
from __future__ import print_function

In [189]:
from keras.utils import to_categorical
from sklearn.metrics import classification_report

def make_labels(data):
    a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
    labels = np.array([a2c[a] for a in data.author])
    labels = to_categorical(labels)
    return labels

def get_text_only(data):
    return data["text"]

def calc_metrics(x, y_true, y_pred):
    return classification_report(y_true, y_pred)

from sklearn.metrics import confusion_matrix

def calc_confusion_matrix(y_true, y_pred):
    return confusion_matrix(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1))

In [190]:
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical

def run(load_func, preprocess_func, create_model_func, verbosity=2):
    print("Loading data")
    full_data = load_func()
    
    print("Getting labels")
    labels = make_labels(full_data)    
    
    print("Preprocessing")
    data = preprocess_func(get_text_only(full_data))
    
    input_dim = max([max(x) for x in data]) + 1

    print("Creating model")
    model = create_model_func(input_dim)
    
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)
    
    print("Training model")
    model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=64,
                 verbose=verbosity,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])
    print("Training complete")
    
    print("Testing model")
    y_pred = model.predict_classes(x_test)
    y_pred = to_categorical(y_pred, num_classes=3)
    
    print("Test results")  
    print("accuracy", accuracy_score(y_test, y_pred))
    print("metrics")
    print(calc_metrics(x_test, y_test, y_pred))
    print("confusion matrix")
    print(calc_confusion_matrix(y_test, y_pred))

In [191]:
import pandas as pd

def load_training_data():
    return pd.read_csv("train.csv")

In [192]:
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.models import Sequential

def create_simple_model(input_dim, embedding_dims=20, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [193]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# removes stop words from the sentences in text
def remove_stops(text):
    stops = set(stopwords.words("english"))
    return [" ".join([word for word in nltk.word_tokenize(words) if word not in stops]) for words in text]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Forrest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [198]:
from keras.preprocessing.text import Tokenizer

# converts the sentences in text into a sequence of numbers
def convert_to_sequences(text, filters, to_lower):
    tokenizer = Tokenizer(filters=filters, lower=to_lower, split=" ", char_level=False)
    tokenizer.fit_on_texts(text);
    return tokenizer.texts_to_sequences(text)

from keras.preprocessing.text import text_to_word_sequence

def convert_to_word_sequence(text):
    return [text_to_word_sequence(words) for words in text]

In [195]:
from keras.preprocessing.sequence import pad_sequences

def pad_data(text):
    maxlen = np.amax([len(x) for x in text], axis=0)
    return pad_sequences(sequences=text, maxlen=maxlen)    

In [170]:
def convert_to_sequence_and_pad(text):
    return pad_data(convert_to_sequences(text, "", False))

run(load_training_data, convert_to_sequence_and_pad, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.831716036772
metrics
             precision    recall  f1-score   support

          0       0.81      0.87      0.84      1586
          1       0.91      0.75      0.82      1115
          2       0.80      0.86      0.83      1215

avg / total       0.84      0.83      0.83      3916

confusion matrix
[[1381   55  150]
 [ 172  837  106]
 [ 145   31 1039]]


In [122]:
def convert_to_sequence_and_pad_and_filter_chars(text):
    return pad_data(convert_to_sequences(text, "~!@#$%^&*()_+`-=,./;'<>?:\"", False))

run(load_training_data, convert_to_sequence_and_pad_and_filter_chars, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.821246169561
metrics
             precision    recall  f1-score   support

          0       0.74      0.92      0.82      1542
          1       0.87      0.81      0.84      1151
          2       0.92      0.70      0.80      1223

avg / total       0.84      0.82      0.82      3916

confusion matrix
[[1426   68   48]
 [ 201  928   22]
 [ 288   73  862]]


In [123]:
def remove_stopwords_then_convert_to_sequence_and_pad(text):
    return pad_data(convert_to_sequences(remove_stops(text), "", False))

run(load_training_data, remove_stopwords_then_convert_to_sequence_and_pad, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.835801838611
metrics
             precision    recall  f1-score   support

          0       0.77      0.92      0.84      1590
          1       0.92      0.78      0.84      1160
          2       0.87      0.78      0.82      1166

avg / total       0.85      0.84      0.84      3916

confusion matrix
[[1469   47   74]
 [ 204  900   56]
 [ 235   27  904]]


In [128]:
def convert_punctuation_to_words(text):
    t = text.replace(",", " , ")
    t = t.replace(".", " . ")
    t = t.replace("'", " ' ")
    t = t.replace(";", " ; ")
    t = t.replace(":", " : ")
    return t

def make_punctuation_words_and_convert_to_sequence(text):
    t = [convert_punctuation_to_words(x) for x in text]
    return pad_data(convert_to_sequences(t, "", False))

run(load_training_data, make_punctuation_words_and_convert_to_sequence, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
He cried aloud once ,  and a little later gave a gasp that was more terrible than a cry . 
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.853677221655
metrics
             precision    recall  f1-score   support

          0       0.86      0.87      0.86      1580
          1       0.92      0.80      0.86      1116
          2       0.80      0.88      0.84      1220

avg / total       0.86      0.85      0.85      3916

confusion matrix
[[1369   44  167]
 [ 110  898  108]
 [ 113   31 1076]]


In [129]:
def punctuation_as_words_and_remove_stopwords(text):
    t = convert_punctuation_to_words(text)
    t = remove_stops(t)
    return pad_data(convert_to_sequences(t, "", False))

run(load_training_data, punctuation_as_words_and_remove_stopwords, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Finding nothing else, not even gold, the Superintendent abandoned his attempts; but a perplexed look occasionally steals over his countenance as he sits thinking at his desk.
Finding nothing else , even gold , Superintendent abandoned attempts ; perplexed look occasionally steals countenance sits thinking desk .
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.837078651685
metrics
             precision    recall  f1-score   support

          0       0.80      0.88      0.83      1517
          1       0.90      0.79      0.84      1144
          2       0.84      0.83      0.84      1255

avg / total       0.84      0.84      0.84      3916

confusion matrix
[[1328   59  130]
 [ 171  903   70]
 [ 168   40 1047]]


In [130]:
def create_simple_model_with_fewer_embedding_dims(input_dim):
    return create_simple_model(input_dim, embedding_dims=10)

run(load_training_data, convert_to_sequence_and_pad, create_simple_model_with_fewer_embedding_dims, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.82328907048
metrics
             precision    recall  f1-score   support

          0       0.82      0.83      0.83      1559
          1       0.87      0.79      0.83      1167
          2       0.78      0.85      0.81      1190

avg / total       0.83      0.82      0.82      3916

confusion matrix
[[1291   83  185]
 [ 147  923   97]
 [ 131   49 1010]]


In [131]:
def create_simple_model_with_more_embedding_dims(input_dim):
    return create_simple_model(input_dim, embedding_dims=30)

run(load_training_data, convert_to_sequence_and_pad, create_simple_model_with_more_embedding_dims, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.812053115424
metrics
             precision    recall  f1-score   support

          0       0.91      0.71      0.80      1594
          1       0.79      0.89      0.83      1109
          2       0.75      0.87      0.81      1213

avg / total       0.82      0.81      0.81      3916

confusion matrix
[[1138  183  273]
 [  41  985   83]
 [  72   84 1057]]


In [136]:
import nltk
nltk.download("wordnet")
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_texts_to_verbs(texts):
    lmtzr = WordNetLemmatizer()
    return [" ".join([lmtzr.lemmatize(word, "v") for word in nltk.word_tokenize(text)]) for text in texts]

def lemmatize_and_convert_to_sequence_and_pad(text):
    t = lemmatize_texts_to_verbs(text)
    return convert_to_sequence_and_pad(t)

run(load_training_data, lemmatize_and_convert_to_sequence_and_pad, create_simple_model, verbosity=0)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Forrest\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Loading data
Getting labels
Preprocessing
The sky was serene; and, as I was unable to rest, I resolved to visit the spot where my poor William had been murdered.
The sky be serene ; and , as I be unable to rest , I resolve to visit the spot where my poor William have be murder .
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.832992849847
metrics
             precision    recall  f1-score   support

          0       0.83      0.85      0.84      1578
          1       0.93      0.75      0.83      1127
          2       0.78      0.89      0.83      1211

avg / total       0.84      0.83      0.83      3916

confusion matrix
[[1340   51  187]
 [ 157  848  122]
 [ 123   14 1074]]


In [137]:
import nltk
nltk.download("wordnet")
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_texts_to_nouns(texts):
    lmtzr = WordNetLemmatizer()
    return [" ".join([lmtzr.lemmatize(word, "n") for word in nltk.word_tokenize(text)]) for text in texts]

def lemmatize_and_convert_to_sequence_and_pad(text):
    t = lemmatize_texts_to_nouns(text)
    return convert_to_sequence_and_pad(t)

run(load_training_data, lemmatize_and_convert_to_sequence_and_pad, create_simple_model, verbosity=0)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Forrest\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Loading data
Getting labels
Preprocessing
The sky was serene; and, as I was unable to rest, I resolved to visit the spot where my poor William had been murdered.
The sky wa serene ; and , a I wa unable to rest , I resolved to visit the spot where my poor William had been murdered .
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.83835546476
metrics
             precision    recall  f1-score   support

          0       0.80      0.91      0.85      1589
          1       0.93      0.73      0.82      1161
          2       0.83      0.85      0.84      1166

avg / total       0.85      0.84      0.84      3916

confusion matrix
[[1443   34  112]
 [ 218  847   96]
 [ 146   27  993]]


In [142]:
from nltk.stem import PorterStemmer

def stem_texts(texts):
    stmr = PorterStemmer()
    return [" ".join([stmr.stem(word) for word in nltk.word_tokenize(text)]) for text in texts]
    
def stem_and_convert_to_sequence_and_pad(text):
    t = stem_texts(text)
    return convert_to_sequence_and_pad(t)

run(load_training_data, stem_and_convert_to_sequence_and_pad, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
The sky was serene; and, as I was unable to rest, I resolved to visit the spot where my poor William had been murdered.
the sky wa seren ; and , as I wa unabl to rest , I resolv to visit the spot where my poor william had been murder .
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.843462717058
metrics
             precision    recall  f1-score   support

          0       0.81      0.90      0.85      1606
          1       0.88      0.85      0.86      1108
          2       0.87      0.76      0.81      1202

avg / total       0.85      0.84      0.84      3916

confusion matrix
[[1446   73   87]
 [ 119  938   51]
 [ 224   59  919]]


In [143]:
from nltk.stem import SnowballStemmer

def stem_texts(texts):
    stmr = SnowballStemmer("english")
    return [" ".join([stmr.stem(word) for word in nltk.word_tokenize(text)]) for text in texts]
    
def stem_and_convert_to_sequence_and_pad(text):
    t = stem_texts(text)
    return convert_to_sequence_and_pad(t)

run(load_training_data, stem_and_convert_to_sequence_and_pad, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
I strove to think that all this grandeur was but more glaring infamy, and that, by planting his gold enwoven flag beside my tarnished and tattered banner, he proclaimed not his superiority, but his debasement.
i strove to think that all this grandeur was but more glare infami , and that , by plant his gold enwoven flag besid my tarnish and tatter banner , he proclaim not his superior , but his debas .
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.832992849847
metrics
             precision    recall  f1-score   support

          0       0.86      0.81      0.83      1588
          1       0.85      0.83      0.84      1136
          2       0.79      0.86      0.82      1192

avg / total       0.84      0.83      0.83      3916

confusion matrix
[[1283  118  187]
 [  96  948   92]
 [ 111   50 1031]]


In [203]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

def filter_words(sequences, filter_func):
    return [[word for word in sequence if filter_func(word) is not False] for sequence in sequences]

def create_infrequent_words_filter(num_to_keep):
    def f(text):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(text)
        t = convert_to_word_sequence(text)
        t = filter_words(t, lambda w: tokenizer.word_index[w] > num_to_keep)
        return [[tokenizer.word_index[word] for word in seq] for seq in t]
    return f

In [205]:
def remove_most_frequent_words(text):
    t = create_infrequent_words_filter(100)(text)
    return pad_data(t)

run(load_training_data, remove_most_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.827630234934
metrics
             precision    recall  f1-score   support

          0       0.78      0.91      0.84      1608
          1       0.90      0.76      0.82      1152
          2       0.84      0.78      0.81      1156

avg / total       0.83      0.83      0.83      3916

confusion matrix
[[1462   55   91]
 [ 197  872   83]
 [ 205   44  907]]


In [213]:
def remove_most_frequent_words(text):
    t = create_infrequent_words_filter(1000)(text)
    return pad_data(t)

run(load_training_data, remove_most_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.778855975485
metrics
             precision    recall  f1-score   support

          0       0.82      0.75      0.78      1600
          1       0.84      0.76      0.80      1111
          2       0.70      0.84      0.76      1205

avg / total       0.79      0.78      0.78      3916

confusion matrix
[[1195  108  297]
 [ 130  841  140]
 [ 136   55 1014]]


In [214]:
def remove_most_frequent_words(text):
    t = create_infrequent_words_filter(10000)(text)
    return pad_data(t)

run(load_training_data, remove_most_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.516598569969
metrics
             precision    recall  f1-score   support

          0       0.47      0.90      0.62      1574
          1       0.72      0.34      0.46      1169
          2       0.56      0.18      0.27      1173

avg / total       0.57      0.52      0.47      3916

confusion matrix
[[1414   82   78]
 [ 686  398   85]
 [ 886   76  211]]


In [215]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

def create_frequent_words_filter(num_to_keep):
    def f(text):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(text)
        t = convert_to_word_sequence(text)
        t = filter_words(t, lambda w: tokenizer.word_index[w] <= num_to_keep)
        return [[tokenizer.word_index[word] for word in seq] for seq in t]
    return f

In [218]:
def remove_least_frequent_words(text):
    t = create_frequent_words_filter(100)(text)
    return pad_data(t)

run(load_training_data, remove_least_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.577374872319
metrics
             precision    recall  f1-score   support

          0       0.56      0.76      0.65      1565
          1       0.64      0.36      0.46      1152
          2       0.58      0.54      0.56      1199

avg / total       0.59      0.58      0.56      3916

confusion matrix
[[1197  120  248]
 [ 510  416  226]
 [ 434  117  648]]


In [219]:
def remove_least_frequent_words(text):
    t = create_frequent_words_filter(1000)(text)
    return pad_data(t)

run(load_training_data, remove_least_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.728038815117
metrics
             precision    recall  f1-score   support

          0       0.76      0.73      0.74      1609
          1       0.63      0.81      0.71      1068
          2       0.81      0.66      0.73      1239

avg / total       0.74      0.73      0.73      3916

confusion matrix
[[1168  305  136]
 [ 145  861   62]
 [ 218  199  822]]


In [220]:
def remove_least_frequent_words(text):
    t = create_frequent_words_filter(5000)(text)
    return pad_data(t)

run(load_training_data, remove_least_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.813329928498
metrics
             precision    recall  f1-score   support

          0       0.78      0.86      0.82      1573
          1       0.88      0.74      0.81      1131
          2       0.81      0.82      0.82      1212

avg / total       0.82      0.81      0.81      3916

confusion matrix
[[1350   73  150]
 [ 207  836   88]
 [ 176   37  999]]


In [221]:
def remove_least_frequent_words(text):
    t = create_frequent_words_filter(10000)(text)
    return pad_data(t)

run(load_training_data, remove_least_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.833758937692
metrics
             precision    recall  f1-score   support

          0       0.77      0.91      0.84      1596
          1       0.87      0.79      0.83      1098
          2       0.91      0.78      0.84      1222

avg / total       0.84      0.83      0.83      3916

confusion matrix
[[1453   82   61]
 [ 198  863   37]
 [ 227   46  949]]


In [222]:
def remove_least_frequent_words(text):
    t = create_frequent_words_filter(15000)(text)
    return pad_data(t)

run(load_training_data, remove_least_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.828907048008
metrics
             precision    recall  f1-score   support

          0       0.76      0.92      0.83      1524
          1       0.86      0.82      0.84      1154
          2       0.92      0.73      0.81      1238

avg / total       0.84      0.83      0.83      3916

confusion matrix
[[1396   81   47]
 [ 175  943   36]
 [ 264   67  907]]


In [138]:
# example knn classifier: https://stackoverflow.com/questions/42872425/text-classification-using-knn