In [1]:
import numpy as np
from __future__ import print_function

In [2]:
from keras.utils import to_categorical
from sklearn.metrics import classification_report

def make_labels(data):
    a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
    labels = np.array([a2c[a] for a in data.author])
    labels = to_categorical(labels)
    return labels

def get_text_only(data):
    return data["text"]

def calc_metrics(x, y_true, y_pred):
    return classification_report(y_true, y_pred)

from sklearn.metrics import confusion_matrix

def calc_confusion_matrix(y_true, y_pred):
    return confusion_matrix(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1))

Using TensorFlow backend.


In [3]:
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical

def run(load_func, preprocess_func, create_model_func, verbosity=2):
    print("Loading data")
    full_data = load_func()
    
    print("Getting labels")
    labels = make_labels(full_data)    
    
    print("Preprocessing")
    data = preprocess_func(get_text_only(full_data))
    
    input_dim = max([max(x) for x in data]) + 1

    print("Creating model")
    model = create_model_func(input_dim)
    
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)
    
    print("Training model")
    model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=64,
                 verbose=verbosity,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])
    print("Training complete")
    
    print("Testing model")
    y_pred = model.predict_classes(x_test)
    y_pred = to_categorical(y_pred, num_classes=3)
    
    print("Test results")  
    print("accuracy", accuracy_score(y_test, y_pred))
    print("metrics")
    print(calc_metrics(x_test, y_test, y_pred))
    print("confusion matrix")
    print(calc_confusion_matrix(y_test, y_pred))

In [4]:
import pandas as pd

def load_training_data():
    return pd.read_csv("train.csv")

In [5]:
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.models import Sequential

def create_simple_model(input_dim, embedding_dims=20, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [6]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# removes stop words from the sentences in text
def remove_stops(text):
    stops = set(stopwords.words("english"))
    return [" ".join([word for word in nltk.word_tokenize(words) if word not in stops]) for words in text]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Forrest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from keras.preprocessing.text import Tokenizer

# converts the sentences in text into a sequence of numbers
def convert_to_sequences(text, filters, to_lower):
    tokenizer = Tokenizer(filters=filters, lower=to_lower, split=" ", char_level=False)
    tokenizer.fit_on_texts(text);
    return tokenizer.texts_to_sequences(text)

from keras.preprocessing.text import text_to_word_sequence

def convert_to_word_sequence(text):
    return [text_to_word_sequence(words) for words in text]

In [8]:
from keras.preprocessing.sequence import pad_sequences

def pad_data(text):
    maxlen = np.amax([len(x) for x in text], axis=0)
    return pad_sequences(sequences=text, maxlen=maxlen)    

In [9]:
# Converts each text into a sequence of number and then pads so all sequences have identical length
def convert_to_sequence_and_pad(text):
    return pad_data(convert_to_sequences(text, "", False))

In [11]:
run(load_training_data, convert_to_sequence_and_pad, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.823033707865
metrics
             precision    recall  f1-score   support

          0       0.87      0.79      0.83      1631
          1       0.76      0.88      0.82      1095
          2       0.83      0.81      0.82      1190

avg / total       0.83      0.82      0.82      3916

confusion matrix
[[1293  190  148]
 [  75  964   56]
 [ 113  111  966]]


In [20]:
# 1. Filters certain characters (~!@#$%^&*()_+`-=,./;'<>?:") from the text
# 2. converts each text into a sequence of numbers
# 3. pads each sequence to be the same length
def convert_to_sequence_and_pad_and_filter_chars(text):
    return pad_data(convert_to_sequences(text, "~!@#$%^&*()_+`-=,./;'<>?:\"", False))

sample = ["The quick brown fox jumped over the something, I can't remember what the fox jumped over, but it was brown."]
print("Before: ", sample[0])
print("After:  ", convert_to_sequence_and_pad_and_filter_chars(sample)[0])

Before:  The quick brown fox jumped over the something, I can't remember what the fox jumped over, but it was brown.
After:   [ 6  7  1  2  3  4  5  8  9 10 11 12 13  5  2  3  4 14 15 16  1]


In [21]:
run(load_training_data, convert_to_sequence_and_pad_and_filter_chars, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.83835546476
metrics
             precision    recall  f1-score   support

          0       0.86      0.83      0.84      1553
          1       0.94      0.77      0.84      1168
          2       0.76      0.91      0.83      1195

avg / total       0.85      0.84      0.84      3916

confusion matrix
[[1291   43  219]
 [ 135  899  134]
 [  83   19 1093]]


In [23]:
# 1. Removes stopwords using the nltk supplied stop words for english
# 2. Converts the texts to sequences of numbers
# 3. Pads the sequences to identical lengths
def remove_stopwords_then_convert_to_sequence_and_pad(text):
    return pad_data(convert_to_sequences(remove_stops(text), "", False))

sample = ["The quick brown fox jumped over the something, I can't remember what the fox jumped over, but it was brown."]
print("Before: ", sample[0])
print("After: ", remove_stopwords_then_convert_to_sequence_and_pad(sample)[0])

Before:  The quick brown fox jumped over the something, I can't remember what the fox jumped over, but it was brown.
After:  [ 5  6  1  2  3  7  4  8  9 10 11  2  3  4  1 12]


In [24]:
run(load_training_data, remove_stopwords_then_convert_to_sequence_and_pad, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.841164453524
metrics
             precision    recall  f1-score   support

          0       0.79      0.91      0.84      1598
          1       0.89      0.79      0.84      1089
          2       0.89      0.79      0.84      1229

avg / total       0.85      0.84      0.84      3916

confusion matrix
[[1456   66   76]
 [ 183  861   45]
 [ 210   42  977]]


In [26]:
# We had seen a submission where the person treated punctuation as distinct words, we thought this would be worth trying
#  seeing as some authors may have different patterns of punctuation
def convert_punctuation_to_words(text):
    t = text.replace(",", " , ")
    t = t.replace(".", " . ")
    t = t.replace("'", " ' ")
    t = t.replace(";", " ; ")
    t = t.replace(":", " : ")
    return t

# 1. Introduces spacing to make punctuation a distinct word
# 2. converts each text into a sequence of numbers
# 3. pads the data to the same length
def make_punctuation_words_and_convert_to_sequence(text):
    t = [convert_punctuation_to_words(x) for x in text]
    return pad_data(convert_to_sequences(t, "", False))

sample = ["Hello - The quick brown fox - jumped over the something - I can't remember what the fox jumped over, but it was brown!"]
print("Before: ", sample[0])
print("After: ", make_punctuation_words_and_convert_to_sequence(sample)[0])

Before:  Hello - The quick brown fox - jumped over the something - I can't remember what the fox jumped over, but it was brown!
After:  [ 6  1  7  8  9  2  1  3  4  5 10  1 11 12 13 14 15 16  5  2  3  4 17 18 19
 20 21]


In [27]:
run(load_training_data, make_punctuation_words_and_convert_to_sequence, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.848314606742
metrics
             precision    recall  f1-score   support

          0       0.88      0.82      0.85      1586
          1       0.83      0.86      0.85      1115
          2       0.83      0.87      0.85      1215

avg / total       0.85      0.85      0.85      3916

confusion matrix
[[1297  139  150]
 [  90  962   63]
 [  95   57 1063]]


In [28]:
# 1. Converts each punctuation to a separte word
# 2. removes the stop words from each text
# 3. converts the texts into sequences of numbers
# 4. Pads each sequence to the same length
def punctuation_as_words_and_remove_stopwords(text):
    t = convert_punctuation_to_words(text)
    t = remove_stops(t)
    return pad_data(convert_to_sequences(t, "", False))

sample = ["Hello - The quick brown fox - jumped over the something - I can't remember what the fox jumped over, but it was brown!"]
print("Before: ", sample[0])
print("After: ", make_punctuation_words_and_convert_to_sequence(sample)[0])

Before:  Hello - The quick brown fox - jumped over the something - I can't remember what the fox jumped over, but it was brown!
After:  [ 6  1  7  8  9  2  1  3  4  5 10  1 11 12 13 14 15 16  5  2  3  4 17 18 19
 20 21]


In [29]:
run(load_training_data, punctuation_as_words_and_remove_stopwords, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.834780388151
metrics
             precision    recall  f1-score   support

          0       0.76      0.94      0.84      1599
          1       0.93      0.78      0.85      1114
          2       0.89      0.74      0.81      1203

avg / total       0.85      0.83      0.83      3916

confusion matrix
[[1499   34   66]
 [ 196  874   44]
 [ 275   32  896]]


In [30]:
# This creates the simple model, but decreases the embedding dimensions to 10
def create_simple_model_with_fewer_embedding_dims(input_dim):
    return create_simple_model(input_dim, embedding_dims=10)

In [31]:
run(load_training_data, convert_to_sequence_and_pad, create_simple_model_with_fewer_embedding_dims, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.833758937692
metrics
             precision    recall  f1-score   support

          0       0.78      0.91      0.84      1593
          1       0.92      0.73      0.82      1119
          2       0.84      0.83      0.84      1204

avg / total       0.84      0.83      0.83      3916

confusion matrix
[[1445   45  103]
 [ 217  818   84]
 [ 179   23 1002]]


In [32]:
# Creates the simple model, but increases the embedding dimensions to 30
def create_simple_model_with_more_embedding_dims(input_dim):
    return create_simple_model(input_dim, embedding_dims=30)

In [33]:
run(load_training_data, convert_to_sequence_and_pad, create_simple_model_with_more_embedding_dims, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.821756894791
metrics
             precision    recall  f1-score   support

          0       0.80      0.87      0.83      1583
          1       0.79      0.87      0.83      1145
          2       0.90      0.72      0.80      1188

avg / total       0.83      0.82      0.82      3916

confusion matrix
[[1374  142   67]
 [ 124  991   30]
 [ 219  116  853]]


In [34]:
import nltk
nltk.download("wordnet")
from nltk.stem.wordnet import WordNetLemmatizer

# Does some lemmatizing on  the texts to hopefully make similar words the same
# set to prefer the verb version of words if there is are conflicting choices
def lemmatize_texts_to_verbs(texts):
    lmtzr = WordNetLemmatizer()
    return [" ".join([lmtzr.lemmatize(word, "v") for word in nltk.word_tokenize(text)]) for text in texts]

# 1. Lemmatizes each text
# 2. Converts each text into a sequence of numbers and pads the seqeunces to the same length
def lemmatize_verb_and_convert_to_sequence_and_pad(text):
    t = lemmatize_texts_to_verbs(text)
    return convert_to_sequence_and_pad(t)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Forrest\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [35]:
run(load_training_data, lemmatize_verb_and_convert_to_sequence_and_pad, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.850102145046
metrics
             precision    recall  f1-score   support

          0       0.82      0.90      0.86      1621
          1       0.94      0.79      0.86      1133
          2       0.83      0.84      0.84      1162

avg / total       0.86      0.85      0.85      3916

confusion matrix
[[1459   38  124]
 [ 165  896   72]
 [ 164   24  974]]


In [42]:
import nltk
nltk.download("wordnet")
from nltk.stem.wordnet import WordNetLemmatizer

# Does some lemmatizing on  the texts to hopefully make similar words the same
# set to prefer the verb version of words if there is are conflicting choices
def lemmatize_texts_to_nouns(texts):
    lmtzr = WordNetLemmatizer()
    return [" ".join([lmtzr.lemmatize(word, "n") for word in nltk.word_tokenize(text)]) for text in texts]

# 1. Lemmatizes each text
# 2. Converts each text into a sequence of numbers and pads the seqeunces to the same length
def lemmatize_noun_and_convert_to_sequence_and_pad(text):
    t = lemmatize_texts_to_nouns(text)
    return convert_to_sequence_and_pad(t)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Forrest\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [37]:
run(load_training_data, lemmatize_noun_and_convert_to_sequence_and_pad, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.820224719101
metrics
             precision    recall  f1-score   support

          0       0.77      0.89      0.83      1593
          1       0.94      0.72      0.82      1193
          2       0.81      0.82      0.82      1130

avg / total       0.83      0.82      0.82      3916

confusion matrix
[[1418   42  133]
 [ 242  863   88]
 [ 183   16  931]]


In [38]:
from nltk.stem import PorterStemmer

# Uses the Porter stemmer to stem each word in the texts
def stem_texts_porter(texts):
    stmr = PorterStemmer()
    return [" ".join([stmr.stem(word) for word in nltk.word_tokenize(text)]) for text in texts]
    
# 1. Stems texts with the porter stemmer
# 2. Convert texts into sequences of numbers and pads those
def stem_porter_and_convert_to_sequence_and_pad(text):
    t = stem_texts_porter(text)
    return convert_to_sequence_and_pad(t)

In [39]:
run(load_training_data, stem_porter_and_convert_to_sequence_and_pad, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.850102145046
metrics
             precision    recall  f1-score   support

          0       0.89      0.81      0.85      1557
          1       0.86      0.87      0.86      1143
          2       0.80      0.88      0.84      1216

avg / total       0.85      0.85      0.85      3916

confusion matrix
[[1260  107  190]
 [  74  993   76]
 [  84   56 1076]]


In [40]:
from nltk.stem import SnowballStemmer

# Uses the snowball stemmer to stem words in texts
def stem_texts_snowball(texts):
    stmr = SnowballStemmer("english")
    return [" ".join([stmr.stem(word) for word in nltk.word_tokenize(text)]) for text in texts]
  
# 1. stems words in texts with the snowball stemmer
# 2. converts texts to sequences of numbers and pads those
def stem_snowball_and_convert_to_sequence_and_pad(text):
    t = stem_texts_snowball(text)
    return convert_to_sequence_and_pad(t)

In [41]:
run(load_training_data, stem_snowball_and_convert_to_sequence_and_pad, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.847803881512
metrics
             precision    recall  f1-score   support

          0       0.84      0.87      0.85      1560
          1       0.84      0.88      0.86      1147
          2       0.87      0.80      0.83      1209

avg / total       0.85      0.85      0.85      3916

confusion matrix
[[1350  114   96]
 [ 100 1004   43]
 [ 163   80  966]]


In [203]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

# removes words from each sequence where the filter_func returns false
def filter_words(sequences, filter_func):
    return [[word for word in sequence if filter_func(word) is not False] for sequence in sequences]

def create_infrequent_words_filter(num_to_keep):
    def f(text):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(text)
        t = convert_to_word_sequence(text)
        t = filter_words(t, lambda w: tokenizer.word_index[w] > num_to_keep)
        return [[tokenizer.word_index[word] for word in seq] for seq in t]
    return f

In [205]:
def remove_most_frequent_words(text):
    t = create_infrequent_words_filter(100)(text)
    return pad_data(t)

run(load_training_data, remove_most_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.827630234934
metrics
             precision    recall  f1-score   support

          0       0.78      0.91      0.84      1608
          1       0.90      0.76      0.82      1152
          2       0.84      0.78      0.81      1156

avg / total       0.83      0.83      0.83      3916

confusion matrix
[[1462   55   91]
 [ 197  872   83]
 [ 205   44  907]]


In [213]:
def remove_most_frequent_words(text):
    t = create_infrequent_words_filter(1000)(text)
    return pad_data(t)

run(load_training_data, remove_most_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.778855975485
metrics
             precision    recall  f1-score   support

          0       0.82      0.75      0.78      1600
          1       0.84      0.76      0.80      1111
          2       0.70      0.84      0.76      1205

avg / total       0.79      0.78      0.78      3916

confusion matrix
[[1195  108  297]
 [ 130  841  140]
 [ 136   55 1014]]


In [214]:
def remove_most_frequent_words(text):
    t = create_infrequent_words_filter(10000)(text)
    return pad_data(t)

run(load_training_data, remove_most_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.516598569969
metrics
             precision    recall  f1-score   support

          0       0.47      0.90      0.62      1574
          1       0.72      0.34      0.46      1169
          2       0.56      0.18      0.27      1173

avg / total       0.57      0.52      0.47      3916

confusion matrix
[[1414   82   78]
 [ 686  398   85]
 [ 886   76  211]]


In [215]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

def create_frequent_words_filter(num_to_keep):
    def f(text):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(text)
        t = convert_to_word_sequence(text)
        t = filter_words(t, lambda w: tokenizer.word_index[w] <= num_to_keep)
        return [[tokenizer.word_index[word] for word in seq] for seq in t]
    return f

In [218]:
def remove_least_frequent_words(text):
    t = create_frequent_words_filter(100)(text)
    return pad_data(t)

run(load_training_data, remove_least_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.577374872319
metrics
             precision    recall  f1-score   support

          0       0.56      0.76      0.65      1565
          1       0.64      0.36      0.46      1152
          2       0.58      0.54      0.56      1199

avg / total       0.59      0.58      0.56      3916

confusion matrix
[[1197  120  248]
 [ 510  416  226]
 [ 434  117  648]]


In [219]:
def remove_least_frequent_words(text):
    t = create_frequent_words_filter(1000)(text)
    return pad_data(t)

run(load_training_data, remove_least_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.728038815117
metrics
             precision    recall  f1-score   support

          0       0.76      0.73      0.74      1609
          1       0.63      0.81      0.71      1068
          2       0.81      0.66      0.73      1239

avg / total       0.74      0.73      0.73      3916

confusion matrix
[[1168  305  136]
 [ 145  861   62]
 [ 218  199  822]]


In [220]:
def remove_least_frequent_words(text):
    t = create_frequent_words_filter(5000)(text)
    return pad_data(t)

run(load_training_data, remove_least_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.813329928498
metrics
             precision    recall  f1-score   support

          0       0.78      0.86      0.82      1573
          1       0.88      0.74      0.81      1131
          2       0.81      0.82      0.82      1212

avg / total       0.82      0.81      0.81      3916

confusion matrix
[[1350   73  150]
 [ 207  836   88]
 [ 176   37  999]]


In [221]:
def remove_least_frequent_words(text):
    t = create_frequent_words_filter(10000)(text)
    return pad_data(t)

run(load_training_data, remove_least_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.833758937692
metrics
             precision    recall  f1-score   support

          0       0.77      0.91      0.84      1596
          1       0.87      0.79      0.83      1098
          2       0.91      0.78      0.84      1222

avg / total       0.84      0.83      0.83      3916

confusion matrix
[[1453   82   61]
 [ 198  863   37]
 [ 227   46  949]]


In [222]:
def remove_least_frequent_words(text):
    t = create_frequent_words_filter(15000)(text)
    return pad_data(t)

run(load_training_data, remove_least_frequent_words, create_simple_model, verbosity=0)

Loading data
Getting labels
Preprocessing
Creating model
Training model
Training complete
Testing model
Test results
accuracy 0.828907048008
metrics
             precision    recall  f1-score   support

          0       0.76      0.92      0.83      1524
          1       0.86      0.82      0.84      1154
          2       0.92      0.73      0.81      1238

avg / total       0.84      0.83      0.83      3916

confusion matrix
[[1396   81   47]
 [ 175  943   36]
 [ 264   67  907]]


In [138]:
# example knn classifier: https://stackoverflow.com/questions/42872425/text-classification-using-knn

In [14]:
from keras.utils import to_categorical
# The preprocess_func_arr is an array of functions to preprocess the text. The func will be called in order and the 
#   output of each will be the input to the next
def run_knn(load_func, preprocess_func_arr, create_model_func):
    print("Loading data")
    full_data = load_func()
    
    print("Getting labels")
    labels = make_labels(full_data)    
    
    print("Preprocessing")
    data = get_text_only(full_data)
    for func in preprocess_func_arr:
        data = func(data)

    print("Creating model")
    model = create_model_func()
    
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)
    print(y_test[0])
    print("Training model")
    model.fit(x_train, y_train)
    print("Training complete")
    
    print("Testing model")
    y_pred = model.predict(x_test)
    print(y_pred[0])
    
    print("Test results")  
    print("accuracy", accuracy_score(y_test, y_pred))
    print("metrics")
    print(calc_metrics(x_test, y_test, y_pred))
    print("confusion matrix")
    print(calc_confusion_matrix(y_test, y_pred))

In [15]:
from sklearn.neighbors import KNeighborsClassifier

def create_knn_model():
    return KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

In [16]:
from keras.preprocessing.text import one_hot
from sklearn.feature_extraction.text import CountVectorizer

def convert_words_to_one_hot(text):
    print(text[0])
    vectorizer = CountVectorizer()
    text = vectorizer.fit_transform(text).todense()
    print(text[0])
    return text

In [17]:
run_knn(load_training_data, [convert_words_to_one_hot], create_knn_model)

Loading data
Getting labels
Preprocessing
This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.
[[0 0 0 ..., 0 0 0]]
Creating model
[ 1.  0.  0.]
Training model
Training complete
Testing model
[ 0.  0.  1.]
Test results
accuracy 0.415730337079
metrics
             precision    recall  f1-score   support

          0       0.46      0.73      0.57      1577
          1       0.62      0.08      0.14      1096
          2       0.55      0.32      0.40      1243

avg / total       0.53      0.42      0.39      3916

confusion matrix
[[1345   26  206]
 [ 893   84  119]
 [ 824   26  393]]


In [18]:
run_knn(load_training_data, [remove_stops, convert_words_to_one_hot], create_knn_model)

Loading data
Getting labels
Preprocessing
This process , however , afforded means ascertaining dimensions dungeon ; I might make circuit , return point whence I set , without aware fact ; perfectly uniform seemed wall .
[[0 0 0 ..., 0 0 0]]
Creating model
[ 1.  0.  0.]
Training model
Training complete
Testing model
[ 0.  0.  1.]
Test results
accuracy 0.378958120531
metrics
             precision    recall  f1-score   support

          0       0.47      0.55      0.51      1620
          1       0.50      0.04      0.07      1108
          2       0.38      0.47      0.42      1188

avg / total       0.45      0.38      0.36      3916

confusion matrix
[[1050   19  551]
 [ 716   39  353]
 [ 613   20  555]]
