# Submission 2 - RNN

In [1]:
#imports
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, LSTM, SimpleRNN
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


Here we set our hyper parameters that we will use elsewhere. Tbese were first set randomly and then tweaked.

In [2]:
#Set Hyper Parameters:
max_features = 5000
batch_size = 32
embedding_dims = 20
filters = 250
kernel_size = 3
hidden_dims = 100
epochs = 10
patience = 2

Here we read in the data and change the author from initals to numerical

In [3]:
# Read in the Data
train = pd.read_csv('train.csv')
print(train.head())
print("training data shape:", train.shape)

#Transform Author and Split X and Y
a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
labels = np.array([a2c[a] for a in train.author])
labels = to_categorical(labels)

print("train labels: ", labels)
text = train["text"]
print("train text shape: ", text.shape)

        id                                               text author
0  id26305  This process, however, afforded me no means of...    EAP
1  id17569  It never once occurred to me that the fumbling...    HPL
2  id11008  In his left hand was a gold snuff box, from wh...    EAP
3  id27763  How lovely is spring As we looked from Windsor...    MWS
4  id12958  Finding nothing else, not even gold, the Super...    HPL
training data shape: (19579, 3)
train labels:  [[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 1.  0.  0.]
 ..., 
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 0.  1.  0.]]
train text shape:  (19579,)


We pre-process the data and pad the sequence so that it is the same size. We did not remove stop words or do other pre-processing since we found that it reduce accuracy.

In [4]:
#Pre-Processing

#removing stopwords reduces accuracy

#Tokenize
tokenizer = Tokenizer(lower=True, split=" ", char_level=False)
tokenizer.fit_on_texts(text);
docs = tokenizer.texts_to_sequences(text);
print("text: ", text[0])
print("tokenized: ", docs[0])
maxlen = np.amax([len(x) for x in docs], axis=0)
print("max doc length: ", maxlen)

#Pad to all doc are the same length
print('Pad sequences (samples x time)')
docs = sequence.pad_sequences(docs, maxlen=maxlen)
print('padded docs shape:', docs.shape)


text:  This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.
tokenized:  [26, 2945, 143, 1372, 22, 36, 294, 2, 7451, 1, 2440, 2, 10, 4556, 16, 6, 79, 179, 48, 4245, 3, 295, 4, 1, 249, 1943, 6, 326, 74, 134, 123, 891, 2, 1, 313, 39, 1438, 4928, 98, 1, 430]
max doc length:  861
Pad sequences (samples x time)
padded docs shape: (19579, 861)


In [5]:
#Split the testing and training data
x_train, x_test, y_train, y_test = train_test_split(docs, labels, test_size=0.2)

In [6]:
#Create the model
model = Sequential()


#Embedding layer
input_dim = np.max(docs) + 1

model.add(Embedding(input_dim,
                    embedding_dims,
                    input_length=maxlen))

# Hidden Layer
model.add(Dense(hidden_dims, activation="relu"))
model.add(Dropout(0.2))

#RNN Layer
model.add(SimpleRNN(100, activation="relu"))

#SoftMax Layer
model.add(Dense(3, activation='softmax'))

#Compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

#Train and test the model
model.fit(x_train, y_train,
                 batch_size=batch_size,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=patience, monitor='val_loss')])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 861, 20)           518880    
_________________________________________________________________
dense_1 (Dense)              (None, 861, 100)          2100      
_________________________________________________________________
dropout_1 (Dropout)          (None, 861, 100)          0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 100)               20100     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 541,383
Trainable params: 541,383
Non-trainable params: 0
_________________________________________________________________
None
Train on 15663 samples, validate on 3916 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x1e0f9da07f0>

This seems very promising our accuracy was 87%, which is the highest we have run into so far. This suggests that an RNN is a sutiable model for this text classification

In [8]:
print("Generating Kaggle test results - test_results.csv")
test_data = pd.read_csv("test.csv")
data = test_data["text"]

docs = tokenizer.texts_to_sequences(data);

#Pad to all doc are the same length
docs = sequence.pad_sequences(docs, maxlen=maxlen)

out = pd.DataFrame(model.predict(docs))
ids = test_data["id"]
data_to_write = pd.concat([ids, out], axis=1)
data_to_write.columns =["id", "EAP", "HPL", "MWS"]
print(data_to_write.shape)
data_to_write.to_csv("test_results.csv", index=False)

Generating Kaggle test results - test_results.csv
Pad sequences (samples x time)
padded docs shape: (8392, 861)
(8392, 4)


Our Kaggle results were good, the best that we got during the competition. Our loss was 0.577, and it placed us number # 728 on the public leader board. This suggest that RNN is a good method for this data