In [22]:
import numpy as np
import pandas as pd
from __future__ import print_function

train = pd.read_csv('train.csv')
train.shape # three columns are id, text, author

(19579, 3)

In [23]:
from keras.utils import to_categorical

a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
labels = np.array([a2c[a] for a in train.author])
labels = to_categorical(labels)
print(labels)

[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 1.  0.  0.]
 ..., 
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 0.  1.  0.]]


In [24]:
text = train["text"]
print("text shape: ", text.shape)
print(text[0])

text shape:  (19579,)
This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.


In [25]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

def removeStops(words):
    stops = set(stopwords.words("english"))
    return [word for word in words if word not in stops]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Forrest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
from keras.preprocessing.text import Tokenizer

text = [removeStops(t) for t in text]
print(text[0])
tokenizer = Tokenizer(filters="!@#$%^&*()-=_+,./<>?;:'\"", lower=True, split=" ", char_level=False)
tokenizer.fit_on_texts(text);
docs = tokenizer.texts_to_sequences(text);
print("text: ", text[0])
print("tokenized: ", docs[0])

['T', 'h', ' ', 'p', 'r', 'c', 'e', ',', ' ', 'h', 'w', 'e', 'v', 'e', 'r', ',', ' ', 'f', 'f', 'r', 'e', ' ', 'e', ' ', 'n', ' ', 'e', 'n', ' ', 'f', ' ', 'c', 'e', 'r', 'n', 'n', 'g', ' ', 'h', 'e', ' ', 'e', 'n', 'n', ' ', 'f', ' ', ' ', 'u', 'n', 'g', 'e', 'n', ';', ' ', ' ', 'I', ' ', 'g', 'h', ' ', 'k', 'e', ' ', ' ', 'c', 'r', 'c', 'u', ',', ' ', 'n', ' ', 'r', 'e', 'u', 'r', 'n', ' ', ' ', 'h', 'e', ' ', 'p', 'n', ' ', 'w', 'h', 'e', 'n', 'c', 'e', ' ', 'I', ' ', 'e', ' ', 'u', ',', ' ', 'w', 'h', 'u', ' ', 'b', 'e', 'n', 'g', ' ', 'w', 'r', 'e', ' ', 'f', ' ', 'h', 'e', ' ', 'f', 'c', ';', ' ', ' ', 'p', 'e', 'r', 'f', 'e', 'c', 'l', ' ', 'u', 'n', 'f', 'r', ' ', 'e', 'e', 'e', ' ', 'h', 'e', ' ', 'w', 'l', 'l', '.']


AttributeError: 'list' object has no attribute 'lower'

In [69]:
from keras.preprocessing.sequence import pad_sequences

maxlen = np.amax([len(x) for x in docs], axis=0)
print("max doc length: ", maxlen)

docs = pad_sequences(sequences=docs, maxlen=maxlen)
print("docs[0]: ", len(docs[0]))

max doc length:  861
docs[0]:  861


In [70]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(docs, labels, test_size=0.2)
input_dim = np.max(docs) + 1
embedding_dims = 20

In [71]:
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.models import Sequential

def create_model(input_dim, embedding_dims=20, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [72]:
model = create_model(input_dim, embedding_dims=embedding_dims)

In [73]:
from keras.callbacks import EarlyStopping

model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=64,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])

Train on 15663 samples, validate on 3916 samples
Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64


<keras.callbacks.History at 0x1cada922ef0>

| Attempted | acc | loss | val_acc | val_loss | epochs |
|-----------|-----|------|---------|----------|--------|
| Tokenize only | 0.9182 | 0.2822 | 0.8026 | 0.4854 | 25/64 |
| Filter non alphanumeric (!@#$%^&*()-=_+,./<>?;:'\") | 0.9399 |0.1989 | 0.8292 | 0.4235 | 36/64|
| texts_to_matrix insteadt of texts_to_sequences | 0.4054 | 1.0872 | 0.3961 | 1.0898 | 6/64 |
| Filter like above without converting to lowercase | 0.9478 | 0.1815 | 0.8253 | 0.4169 | 35/64 |