### Using Convolutional Neural Networks for Author Prediction

Evan Gordon


#### Base Example Model
The point of this notebook is to experiment with tweaking the layer shapes and sampling features of the model without actually modifying the input data. I hope to find the best configuration I can this way, and then see how similar changes might affect my third model once I start making changes to the input data.

In [1]:
from __future__ import print_function
import os
import sys
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
#from nltk.corpus import stopwords
#import nltk

#nltk.download('stopwords')
#nltk.download('punkt')
# Preprocessing constants shared by the cells below.
MAX_NB_WORDS = 20000          # vocabulary cap handed to the Tokenizer
MAX_SEQUENCE_LENGTH = 1000    # every sentence is padded to this many tokens
EMBEDDING_DIM = 100           # width of one row of the embedding matrix
VALIDATION_SPLIT = 0.2        # fraction of rows held out for validation

# Read the training sentences into a dataframe.
dframe = pd.read_csv("../input/train.csv")

# Give every distinct author label an integer id, in order of first appearance.
author_names = dframe['author'].unique()
authors = {name: position for position, name in enumerate(author_names)}
print(authors)

dframe['author_id'] = dframe['author'].map(lambda name: authors[name])
dframe.head()

Using TensorFlow backend.


{'MWS': 2, 'EAP': 0, 'HPL': 1}


Unnamed: 0,id,text,author,author_id
0,id26305,"This process, however, afforded me no means of...",EAP,0
1,id17569,It never once occurred to me that the fumbling...,HPL,1
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1


In [89]:
def prepare_data(df):
    """Tokenize the sentences in ``df`` and split them into train/validation sets.

    Reads the ``text`` and ``author_id`` columns of ``df``, converts each
    sentence to a sequence of word indices (keeping at most ``MAX_NB_WORDS``
    words), pads the sequences to ``MAX_SEQUENCE_LENGTH`` and encodes the
    labels with ``to_categorical``. The rows are shuffled with
    ``np.random.shuffle`` and the last ``VALIDATION_SPLIT`` fraction is held
    out for validation.

    Returns:
        A 5-tuple ``(x_train, y_train, x_val, y_val, word_index)`` where
        ``word_index`` is the tokenizer's word -> index mapping.
    """
    sents = df['text'].tolist()
    labels = df['author_id'].tolist()

    # Fit the vocabulary on the sentences, then turn each one into indices.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(sents)
    sequences = tokenizer.texts_to_sequences(sents)
    print(len(sequences))
    print(sequences[0])

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # Shuffle data and labels with the same permutation so rows stay aligned.
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]

    num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    # Slice at an explicit index instead of using a negative offset: with zero
    # validation samples, ``data[:-0]`` would be empty and ``data[-0:]`` would
    # be the whole array, silently swapping the two sets.
    split_at = data.shape[0] - num_validation_samples
    return (data[:split_at], labels[:split_at],
            data[split_at:], labels[split_at:], word_index)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

def print_data(testx, testy, m):
    """Print loss, accuracy and per-class metrics of model ``m`` on a data set.

    Args:
        testx: model inputs to evaluate on.
        testy: one-hot encoded labels for ``testx`` (decoded here with argmax
            over axis 1).
        m: a compiled model exposing ``evaluate`` and ``predict``; it is
            expected to report accuracy as its first metric, so that
            ``evaluate`` returns ``[loss, accuracy]``.
    """
    score = m.evaluate(testx, testy, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    # ``predict_classes`` only exists on Sequential models and was removed in
    # later Keras/TensorFlow releases; for a multi-class softmax output it is
    # exactly the argmax over the predicted probabilities.
    predictions = np.argmax(m.predict(testx, verbose=0), axis=-1)
    Y_t = np.argmax(testy, axis=1)
    print(precision_recall_fscore_support(Y_t, predictions))
    print("Classification")
    print(classification_report(Y_t, predictions))
    print("Confusion Matrix")
    print(confusion_matrix(Y_t, predictions))

In [90]:
#  3. Load embeddings
x_train, y_train, x_val, y_val, word_index = prepare_data(dframe)

# Each line of the GloVe file is "<word> <float> <float> ...".
embeddings_index = {}
# ``with`` closes the file even if a line fails to parse; the explicit
# encoding avoids depending on the platform default.
with open(os.path.join('../input/', 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))
#  4. Create the Embedding matrix for the training set
# Keras' Tokenizer numbers words from 1 (0 is reserved), so a vocabulary of
# N words needs N + 1 rows; without the +1 the highest index would fall
# outside the matrix whenever the vocabulary is smaller than MAX_NB_WORDS.
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

19579
[26, 2945, 143, 1372, 22, 36, 294, 2, 7451, 1, 2440, 2, 10, 4556, 16, 6, 79, 179, 48, 4245, 3, 295, 4, 1, 249, 1943, 6, 326, 74, 134, 123, 891, 2, 1, 313, 39, 1438, 4928, 98, 1, 430]
<class 'list'>
Found 25943 unique tokens.
Shape of data tensor: (19579, 1000)
Shape of label tensor: (19579, 3)
Found 400000 word vectors.


In [77]:
from keras.models import Sequential

# Baseline model: three Conv1D layers on top of frozen pre-trained embeddings.
batch_size = 128
num_epochs = 5
conv_depth_1 = 128  # filters in the first convolution
conv_depth_2 = 128  # filters in the second convolution
conv_depth_3 = 128  # filters in the third convolution
nn_depth_1 = 128    # units in the hidden Dense layer
k_size = 5          # convolution kernel width
pool_size = 5       # max-pooling window
# TODO: add drop probability?
# TODO: add hidden size?

model1 = Sequential()
# Embedding weights come from embedding_matrix and are not trained.
model1.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: it is only meaningful on the first layer of a
# Sequential model, and the (1000, 1) previously given did not match the
# (1000, 100) tensor the embedding layer actually produces.
model1.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model1.add(MaxPooling1D(pool_size))
model1.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model1.add(MaxPooling1D(pool_size))
model1.add(Conv1D(conv_depth_3, k_size, activation='relu'))
model1.add(GlobalMaxPooling1D())
model1.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model1.add(Dense(len(authors), activation='softmax'))
model1.summary()
model1.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print("Training...")
model1.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Metrics below are computed on the same validation split used during fit.
print_data(x_val, y_val, model1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_113 (Conv1D)          (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_73 (MaxPooling (None, 199, 128)          0         
_________________________________________________________________
conv1d_114 (Conv1D)          (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_74 (MaxPooling (None, 39, 128)           0         
_________________________________________________________________
conv1d_115 (Conv1D)          (None, 35, 128)           82048     
_________________________________________________________________
global_max_pooling1d_27 (Glo (None, 128)               0         
__________

### Results

This model is based off of Joe Dumoulin's base model at: https://github.com/JoeDumoulin/CSCD439F17/blob/master/notebooks/Final%20Project/Keras%20Convolutional%20Network%20for%20Spooky%20Author%20ID0.ipynb

The resulting model came across as a bit bulky, so I intend to make a somewhat smaller model and see how a model with fewer layers fares compared to this one. Running this model a full 50 times like Joe did is not feasible on my current machine, so the evaluation of this model will have to be based on what I have. From here on I will split up my efforts and try to create two different models. 

The first step will be to try to find a better shape for the layers of the model.
If I can find a shape that produces a more accurate prediction of the data, I'll submit that model for testing.
I'll then create another model, but this time remove the stop words. This model will use the same shape as the previous model. By doing it this way I can create a somewhat controlled environment in which I can clearly see the effect of removing stop words from a dataset.


#### Model One 

##### attempt one: remove one convolutional layer 

In [78]:
from keras.models import Sequential

# Attempt one: the baseline with its third convolutional layer removed.
batch_size = 128
num_epochs = 5
conv_depth_1 = 128  # filters in the first convolution
conv_depth_2 = 128  # filters in the second convolution
conv_depth_3 = 128  # not used by this two-convolution variant
nn_depth_1 = 128    # units in the hidden Dense layer
k_size = 5          # convolution kernel width
pool_size = 5       # max-pooling window

model2 = Sequential()
# Embedding weights come from embedding_matrix and are not trained.
model2.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: it is only meaningful on the first layer of a
# Sequential model, and the (1000, 1) previously given did not match the
# (1000, 100) tensor the embedding layer actually produces.
model2.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model2.add(MaxPooling1D(pool_size))
model2.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model2.add(GlobalMaxPooling1D())
model2.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model2.add(Dense(len(authors), activation='softmax'))
model2.summary()
model2.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print("Training...")
model2.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Metrics below are computed on the same validation split used during fit.
print_data(x_val, y_val, model2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_116 (Conv1D)          (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_75 (MaxPooling (None, 199, 128)          0         
_________________________________________________________________
conv1d_117 (Conv1D)          (None, 195, 128)          82048     
_________________________________________________________________
global_max_pooling1d_28 (Glo (None, 128)               0         
_________________________________________________________________
dense_63 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_64 (Dense)             (None, 3)                 387       
Total para

Overall, given the 5 epochs I ran, this model ended up slightly more accurate than the base model (though only marginally so). I would hypothesize that this might not be the case if more epochs were run. To this end I'm going to try a compromise: I'm going to add the third layer back into the model, but this time I will decrease the size of that layer and of the Dense layer by half.

In [79]:
from keras.models import Sequential

# Compromise: third convolution restored, but it and the Dense layer are
# half the baseline size.
batch_size = 128
num_epochs = 5
conv_depth_1 = 128  # filters in the first convolution
conv_depth_2 = 128  # filters in the second convolution
conv_depth_3 = 64   # filters in the third convolution (halved)
nn_depth_1 = 64     # units in the hidden Dense layer (halved)
k_size = 5          # convolution kernel width
pool_size = 5       # max-pooling window
# TODO: add drop probability?
# TODO: add hidden size?

model3 = Sequential()
# Embedding weights come from embedding_matrix and are not trained.
model3.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: it is only meaningful on the first layer of a
# Sequential model, and the (1000, 1) previously given did not match the
# (1000, 100) tensor the embedding layer actually produces.
model3.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model3.add(MaxPooling1D(pool_size))
model3.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model3.add(MaxPooling1D(pool_size))
model3.add(Conv1D(conv_depth_3, k_size, activation='relu'))
model3.add(GlobalMaxPooling1D())
model3.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model3.add(Dense(len(authors), activation='softmax'))
model3.summary()
model3.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print("Training...")
model3.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Metrics below are computed on the same validation split used during fit.
print_data(x_val, y_val, model3)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_118 (Conv1D)          (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_76 (MaxPooling (None, 199, 128)          0         
_________________________________________________________________
conv1d_119 (Conv1D)          (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_77 (MaxPooling (None, 39, 128)           0         
_________________________________________________________________
conv1d_120 (Conv1D)          (None, 35, 64)            41024     
_________________________________________________________________
global_max_pooling1d_29 (Glo (None, 64)                0         
__________

This model was OK, but the recall ended up worse. Otherwise it produced almost identical results, so I think this model would end up inferior in the long run and I won't continue down this path. I'm going to roll back to the previous model and now try tweaking the kernel size.

In [80]:
from keras.models import Sequential

# Two-convolution model again, this time with a wider kernel (7 instead of 5).
batch_size = 128
num_epochs = 5
conv_depth_1 = 128  # filters in the first convolution
conv_depth_2 = 128  # filters in the second convolution
conv_depth_3 = 128  # not used by this two-convolution variant
nn_depth_1 = 128    # units in the hidden Dense layer
k_size = 7          # convolution kernel width
pool_size = 5       # max-pooling window

model4 = Sequential()
# Embedding weights come from embedding_matrix and are not trained.
model4.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: it is only meaningful on the first layer of a
# Sequential model, and the (1000, 1) previously given did not match the
# (1000, 100) tensor the embedding layer actually produces.
model4.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model4.add(MaxPooling1D(pool_size))
model4.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model4.add(GlobalMaxPooling1D())
model4.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model4.add(Dense(len(authors), activation='softmax'))
model4.summary()
model4.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print("Training...")
model4.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Metrics below are computed on the same validation split used during fit.
print_data(x_val, y_val, model4)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_121 (Conv1D)          (None, 994, 128)          89728     
_________________________________________________________________
max_pooling1d_78 (MaxPooling (None, 198, 128)          0         
_________________________________________________________________
conv1d_122 (Conv1D)          (None, 192, 128)          114816    
_________________________________________________________________
global_max_pooling1d_30 (Glo (None, 128)               0         
_________________________________________________________________
dense_67 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_68 (Dense)             (None, 3)                 387       
Total para

What I learned here is that increasing the kernel size didn't really help my model in a positive way. I'm going to now revert back to a kernel size of 5 but this time try increasing the size of the first convolutional layer, and then add some dropout to the model.

In [81]:
from keras.models import Sequential
from keras.layers import Dropout

# Kernel width back to 5, a wider first convolution (256 filters), and
# dropout after the pooling layers.
batch_size = 128
num_epochs = 5
conv_depth_1 = 256  # filters in the first convolution (doubled)
conv_depth_2 = 128  # filters in the second convolution
conv_depth_3 = 128  # not used by this two-convolution variant
nn_depth_1 = 128    # units in the hidden Dense layer
k_size = 5          # convolution kernel width
pool_size = 5       # max-pooling window
drop_prob_1 = 0.5   # dropout rate used by both Dropout layers
# TODO: add hidden size?

model5 = Sequential()
# Embedding weights come from embedding_matrix and are not trained.
model5.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: it is only meaningful on the first layer of a
# Sequential model, and the (1000, 1) previously given did not match the
# (1000, 100) tensor the embedding layer actually produces.
model5.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model5.add(MaxPooling1D(pool_size))
model5.add(Dropout(drop_prob_1))
model5.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model5.add(GlobalMaxPooling1D())
model5.add(Dropout(drop_prob_1))
model5.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model5.add(Dense(len(authors), activation='softmax'))
model5.summary()
model5.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print("Training...")
model5.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Metrics below are computed on the same validation split used during fit.
print_data(x_val, y_val, model5)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_123 (Conv1D)          (None, 996, 256)          128256    
_________________________________________________________________
max_pooling1d_79 (MaxPooling (None, 199, 256)          0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 199, 256)          0         
_________________________________________________________________
conv1d_124 (Conv1D)          (None, 195, 128)          163968    
_________________________________________________________________
global_max_pooling1d_31 (Glo (None, 128)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
__________

In [83]:
from keras.models import Sequential
from keras.layers import Dropout

# Two-convolution model with dropout and a narrower kernel (3 instead of 5).
batch_size = 128
num_epochs = 5
conv_depth_1 = 128  # filters in the first convolution
conv_depth_2 = 128  # filters in the second convolution
conv_depth_3 = 128  # not used by this two-convolution variant
nn_depth_1 = 128    # units in the hidden Dense layer
k_size = 3          # convolution kernel width
pool_size = 5       # max-pooling window
drop_prob_1 = 0.5   # dropout rate used by both Dropout layers

model6 = Sequential()
# Embedding weights come from embedding_matrix and are not trained.
model6.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: it is only meaningful on the first layer of a
# Sequential model, and the (1000, 1) previously given did not match the
# (1000, 100) tensor the embedding layer actually produces.
model6.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model6.add(MaxPooling1D(pool_size))
model6.add(Dropout(drop_prob_1))
model6.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model6.add(GlobalMaxPooling1D())
model6.add(Dropout(drop_prob_1))
model6.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model6.add(Dense(len(authors), activation='softmax'))
model6.summary()
model6.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print("Training...")
model6.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Metrics below are computed on the same validation split used during fit.
print_data(x_val, y_val, model6)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_127 (Conv1D)          (None, 998, 128)          38528     
_________________________________________________________________
max_pooling1d_81 (MaxPooling (None, 199, 128)          0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 199, 128)          0         
_________________________________________________________________
conv1d_128 (Conv1D)          (None, 197, 128)          49280     
_________________________________________________________________
global_max_pooling1d_33 (Glo (None, 128)               0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
__________

Yet again this model hasn't surpassed my original modification of the main model. I'm going to make one more attempt. In this one I'm reverting to that original modification and adding in the dropout layers.

In [84]:
from keras.models import Sequential
# Imported here so this cell does not depend on an earlier cell having
# brought Dropout into scope.
from keras.layers import Dropout

# The two-convolution model (kernel width 5) with dropout layers added.
batch_size = 128
num_epochs = 5
conv_depth_1 = 128  # filters in the first convolution
conv_depth_2 = 128  # filters in the second convolution
conv_depth_3 = 128  # not used by this two-convolution variant
nn_depth_1 = 128    # units in the hidden Dense layer
k_size = 5          # convolution kernel width
pool_size = 5       # max-pooling window
drop_prob_1 = 0.5   # dropout rate used by both Dropout layers

model7 = Sequential()
# Embedding weights come from embedding_matrix and are not trained.
model7.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: it is only meaningful on the first layer of a
# Sequential model, and the (1000, 1) previously given did not match the
# (1000, 100) tensor the embedding layer actually produces.
model7.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model7.add(MaxPooling1D(pool_size))
model7.add(Dropout(drop_prob_1))
model7.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model7.add(GlobalMaxPooling1D())
model7.add(Dropout(drop_prob_1))
model7.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model7.add(Dense(len(authors), activation='softmax'))
model7.summary()
model7.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print("Training...")
model7.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Metrics below are computed on the same validation split used during fit.
print_data(x_val, y_val, model7)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_24 (Embedding)     (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_129 (Conv1D)          (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_82 (MaxPooling (None, 199, 128)          0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 199, 128)          0         
_________________________________________________________________
conv1d_130 (Conv1D)          (None, 195, 128)          82048     
_________________________________________________________________
global_max_pooling1d_34 (Glo (None, 128)               0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
__________

It looks like, of all my different attempts, removing the third convolutional layer affected the model the most positively. Since none of my other attempts produced noticeably improved results within the given timeframe of 5 epochs, I am now going to run my best model for 25 epochs and see where its accuracy ends up.

In [85]:
# model 2 revisited: same two-convolution architecture, trained for 25 epochs.
# NOTE: this rebinds the name model2, replacing the 5-epoch model built earlier.
from keras.models import Sequential
batch_size = 128
num_epochs = 25
conv_depth_1 = 128  # filters in the first convolution
conv_depth_2 = 128  # filters in the second convolution
conv_depth_3 = 128  # not used by this two-convolution variant
nn_depth_1 = 128    # units in the hidden Dense layer
k_size = 5          # convolution kernel width
pool_size = 5       # max-pooling window

model2 = Sequential()
# Embedding weights come from embedding_matrix and are not trained.
model2.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: it is only meaningful on the first layer of a
# Sequential model, and the (1000, 1) previously given did not match the
# (1000, 100) tensor the embedding layer actually produces.
model2.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model2.add(MaxPooling1D(pool_size))
model2.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model2.add(GlobalMaxPooling1D())
model2.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model2.add(Dense(len(authors), activation='softmax'))
model2.summary()
model2.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print("Training...")
model2.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Metrics below are computed on the same validation split used during fit.
print_data(x_val, y_val, model2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_25 (Embedding)     (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_131 (Conv1D)          (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_83 (MaxPooling (None, 199, 128)          0         
_________________________________________________________________
conv1d_132 (Conv1D)          (None, 195, 128)          82048     
_________________________________________________________________
global_max_pooling1d_35 (Glo (None, 128)               0         
_________________________________________________________________
dense_77 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_78 (Dense)             (None, 3)                 387       
Total para

### Result of the Model
The accuracy of this model after going through the full 25 epochs turned out to be quite high. By human standards this would be a very accurate model, even though the validation accuracy is comparatively low.

Please see the next notebook (final3) for a further attempt at this subject.