In [1]:
#imports
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Hyperparameters, grouped by the stage that consumes them.

# Vocabulary / embedding
max_features = 5000   # NOTE(review): not referenced by any cell visible here - confirm it is still needed
embedding_dims = 20   # width of each learned word vector (passed to Embedding)

# Convolution
filters = 250         # number of Conv1D filters
kernel_size = 3       # Conv1D window length

# Dense layers
hidden_dims = 250     # base width of the hidden Dense layers

# Training loop
batch_size = 32       # batch_size passed to model.fit
epochs = 10           # maximum number of epochs passed to model.fit
patience = 2          # patience passed to EarlyStopping

In [22]:
# Load the training set and show a short preview of it
train = pd.read_csv('train.csv')
print(train.head())
print("training data shape:", train.shape)

# Map each author code to an integer class id, then one-hot encode the ids
a2c = dict(EAP=0, HPL=1, MWS=2)
labels = to_categorical(np.array([a2c[author] for author in train["author"]]))
print("train labels: ", labels)

# The raw sentences are the model input
text = train["text"]
print("train text shape: ", text.shape)

        id                                               text author
0  id26305  This process, however, afforded me no means of...    EAP
1  id17569  It never once occurred to me that the fumbling...    HPL
2  id11008  In his left hand was a gold snuff box, from wh...    EAP
3  id27763  How lovely is spring As we looked from Windsor...    MWS
4  id12958  Finding nothing else, not even gold, the Super...    HPL
training data shape: (19579, 3)
train labels:  [[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 1.  0.  0.]
 ..., 
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 0.  1.  0.]]
train text shape:  (19579,)


In [23]:
# Pre-processing
#
# Stopwords are deliberately kept: removing them reduces accuracy.

# Tokenize. filters="" means no characters are stripped before splitting on
# spaces, so punctuation stays attached to the lower-cased tokens.
tokenizer = Tokenizer(char_level=False, split=" ", lower=True, filters="")
tokenizer.fit_on_texts(text)
docs = tokenizer.texts_to_sequences(text)
print("text: ", text[0])
print("tokenized: ", docs[0])

# Length of the longest document, measured in tokens
maxlen = np.max([len(seq) for seq in docs])
print("max doc length: ", maxlen)

# Pad every document to maxlen so that all of them share the same length
print('Pad sequences (samples x time)')
docs = sequence.pad_sequences(docs, maxlen=maxlen)
print('padded docs shape:', docs.shape)


[nltk_data] Downloading package stopwords to C:\Users\Beth &
[nltk_data]     Dan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
text:  This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.
tokenized:  [26, 8207, 142, 1331, 31, 37, 272, 2, 12060, 1, 5088, 2, 10, 22483, 16, 6, 75, 160, 45, 22484, 3, 341, 4, 1, 274, 1996, 6, 302, 1062, 121, 123, 886, 2, 1, 22485, 39, 1332, 6237, 88, 1, 2401]
max doc length:  861
Pad sequences (samples x time)
padded docs shape: (19579, 861)


In [24]:
#Split the testing and training data
# A fixed random_state makes the split (and therefore every reported metric)
# reproducible under Restart Kernel -> Run All. labels is one-hot, so argmax
# recovers the class id; stratifying on it keeps the author proportions the
# same in the training and held-out parts.
x_train, x_test, y_train, y_test = train_test_split(
    docs, labels,
    test_size=0.2,
    random_state=42,
    stratify=labels.argmax(axis=1))

In [25]:
#Create the model
# Architecture: Embedding -> Conv1D -> global max pool -> two Dense hidden
# layers -> 3-way softmax over the author classes.
model = Sequential()


#Embedding layer
# Token ids in docs start at 0 (the padding value), so the embedding table
# needs max id + 1 rows.
input_dim = np.max(docs) + 1

model.add(Embedding(input_dim,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# Convolutional Layer
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))

# Max Pooling Layer
model.add(GlobalMaxPooling1D())

# Hidden Layer
model.add(Dense(hidden_dims+200))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Hidden Layer
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

#SoftMax Layer
model.add(Dense(3, activation='softmax'))

#Compile the model
# The targets are one-hot vectors over three mutually exclusive classes and
# the output layer is a softmax, so the matching loss is categorical
# cross-entropy. With binary_crossentropy each output unit is scored as an
# independent yes/no decision, and Keras resolves 'accuracy' to binary
# accuracy, which overstates the real classification accuracy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#Train and test the model
# NOTE(review): early stopping monitors val_loss on (x_test, y_test), so that
# split influences when training stops; it is a validation set rather than an
# untouched test set.
# The History object is kept so the per-epoch metrics can be inspected later.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    validation_data=(x_test, y_test),
                    epochs=epochs,
                    callbacks=[EarlyStopping(patience=patience, monitor='val_loss')])

Train on 15663 samples, validate on 3916 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x17b840c39b0>