### Spooky Author Predictions Attempt Three

#### Evan Gordon


In [6]:
from __future__ import print_function
import os
import sys
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources this notebook relies on (no-op when already cached).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Hyper-parameters shared by the preprocessing and model cells.
MAX_NB_WORDS = 20000         # vocabulary cap handed to the Keras Tokenizer
MAX_SEQUENCE_LENGTH = 1000   # length every token sequence is padded to
EMBEDDING_DIM = 100          # width of each embedding vector
VALIDATION_SPLIT = 0.2       # fraction of rows held out for validation

# Read the training data into a dataframe.
dframe = pd.read_csv("../input/train.csv")

# Turn author names into integer ids, numbered in order of first appearance.
authors = {name: number for number, name in enumerate(dframe['author'].unique())}
print(authors)
dframe['author_id'] = dframe['author'].map(authors)
dframe.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/naazarik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/naazarik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
{'HPL': 1, 'MWS': 2, 'EAP': 0}


Unnamed: 0,id,text,author,author_id
0,id26305,"This process, however, afforded me no means of...",EAP,0
1,id17569,It never once occurred to me that the fumbling...,HPL,1
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1


In [68]:
from nltk.tokenize import word_tokenize

def prepare_data(df):
    """Build padded token sequences and one-hot labels from ``df`` and split them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'text'`` column (raw sentences) and an ``'author_id'``
        column (integer class ids).

    Returns
    -------
    tuple
        ``(x_train, y_train, x_val, y_val, word_index)``. The last
        ``int(VALIDATION_SPLIT * n_rows)`` rows of the shuffled data form the
        validation set. ``word_index`` is the tokenizer's word -> id mapping
        with English stopwords removed; the sequences themselves still contain
        the stopword ids, so ids missing from ``word_index`` can occur in the
        data.

    Notes
    -----
    The shuffle uses NumPy's global random state, so the split differs between
    runs unless the caller seeds ``np.random`` beforehand.
    """
    sents = df['text'].tolist()
    labels = df['author_id'].tolist()
    stops = set(stopwords.words('english'))

    # Tokenize each sentence into a list of integer word ids.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(sents)
    sequences = tokenizer.texts_to_sequences(sents)
    print(len(sequences))
    print(sequences[0])

    # Vocabulary used later to fill the embedding matrix; stopwords are dropped
    # here so they are never assigned a pretrained vector.
    word_index = {w: i for w, i in tokenizer.word_index.items() if w not in stops}
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # Shuffle data and labels with the same permutation before splitting.
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]

    num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    # Split on an explicit index: slicing with ``[:-0]`` / ``[-0:]`` would give
    # an empty training set and put every row in validation when the
    # validation count rounds down to zero.
    split_at = data.shape[0] - num_validation_samples
    return data[:split_at], labels[:split_at], data[split_at:], labels[split_at:], word_index

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

def print_data(testx, testy, m):
    """Print loss, accuracy and per-class metrics for model ``m`` on ``testx``/``testy``.

    Parameters
    ----------
    testx : array-like
        Inputs forwarded unchanged to ``m.evaluate`` and ``m.predict``.
    testy : array-like
        One-hot encoded labels; class ids are recovered with an argmax along
        axis 1.
    m : model
        Object exposing ``evaluate`` and ``predict``. ``evaluate`` is expected
        to return ``[loss, accuracy]``, i.e. the model was compiled with
        accuracy as its first metric.
    """
    score = m.evaluate(testx, testy, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # ``Sequential.predict_classes`` no longer exists in recent Keras releases;
    # taking the argmax of the predicted class probabilities is the equivalent
    # for a multi-class softmax output and works on old and new versions.
    predictions = np.argmax(m.predict(testx), axis=-1)
    Y_t = np.argmax(testy, axis=1)

    print(precision_recall_fscore_support(Y_t, predictions))
    print("Classification")
    print(classification_report(Y_t, predictions))
    print("Confusion Matrix")
    print(confusion_matrix(Y_t, predictions))

In [69]:
#  3. Load embeddings
x_train, y_train, x_val, y_val, word_index = prepare_data(dframe)

# Map each GloVe token to its vector. Every line of the file is
# "<token> <v1> <v2> ...". The `with` block closes the file even if a line
# fails to parse.
embeddings_index = {}
with open(os.path.join('../input/', 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

#  4. Create the Embedding matrix for the training set
# The matrix needs one row per token id, so size it from the largest id in use
# rather than from len(word_index): word_index has had stopwords filtered out,
# so its length is smaller than its largest id, and the padded data can still
# contain stopword ids that are absent from word_index.
highest_index = max(
    max(word_index.values()) if word_index else 0,
    int(x_train.max()) if x_train.size else 0,
    int(x_val.max()) if x_val.size else 0,
)
num_words = min(MAX_NB_WORDS, highest_index + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        # ids at or above the vocabulary cap have no row in the matrix
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

19579
[26, 2945, 143, 1372, 22, 36, 294, 2, 7451, 1, 2440, 2, 10, 4556, 16, 6, 79, 179, 48, 4245, 3, 295, 4, 1, 249, 1943, 6, 326, 74, 134, 123, 891, 2, 1, 313, 39, 1438, 4928, 98, 1, 430]
Found 25808 unique tokens.
Shape of data tensor: (19579, 1000)
Shape of label tensor: (19579, 3)
Found 400000 word vectors.


In [71]:
from keras.models import Sequential
# Hyper-parameters for the three-convolution model.
batch_size = 128
num_epochs = 5
conv_depth_1 = 128
conv_depth_2 = 128
conv_depth_3 = 128
nn_depth_1 = 128
k_size = 5
pool_size = 5
# TODO: add drop probability?
# TODO: add hidden size?

model1 = Sequential()
# Frozen embedding layer initialised from the pretrained embedding_matrix.
model1.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: only the first layer of a Sequential model
# defines the input, and this convolution actually receives the embedding
# output of shape (MAX_SEQUENCE_LENGTH, EMBEDDING_DIM).
model1.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model1.add(MaxPooling1D(pool_size))
model1.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model1.add(MaxPooling1D(pool_size))
model1.add(Conv1D(conv_depth_3, k_size, activation='relu'))
model1.add(GlobalMaxPooling1D())
model1.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model1.add(Dense(len(authors), activation='softmax'))
model1.summary()
model1.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print("Training...")
model1.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Report metrics on the held-out validation split.
print_data(x_val, y_val, model1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 35, 128)           82048     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
__________

My first observation about this model is that it was the fastest to converge towards its optimal state. No matter what changes I made in my previous models, none of them converged nearly this fast; this one reached over 70% accuracy in just over 3 epochs.
Since the shape of this model is based on the one provided, my next goal is to take my best model from my second notebook and try training that.

In [72]:
from keras.models import Sequential
# Hyper-parameters for the two-convolution model (kernel size 5).
batch_size = 128
num_epochs = 5
conv_depth_1 = 128
conv_depth_2 = 128
nn_depth_1 = 128
k_size = 5
pool_size = 5

model2 = Sequential()
# Frozen embedding layer initialised from the pretrained embedding_matrix.
model2.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: only the first layer of a Sequential model
# defines the input, and this convolution actually receives the embedding
# output of shape (MAX_SEQUENCE_LENGTH, EMBEDDING_DIM).
model2.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model2.add(MaxPooling1D(pool_size))
model2.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model2.add(GlobalMaxPooling1D())
model2.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model2.add(Dense(len(authors), activation='softmax'))
model2.summary()
model2.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print("Training...")
model2.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Report metrics on the held-out validation split.
print_data(x_val, y_val, model2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 387       
Total para

In [74]:
from keras.models import Sequential
# Hyper-parameters for the two-convolution model, now with kernel size 3.
batch_size = 128
num_epochs = 5
conv_depth_1 = 128
conv_depth_2 = 128
nn_depth_1 = 128
k_size = 3
pool_size = 5

model2 = Sequential()
# Frozen embedding layer initialised from the pretrained embedding_matrix.
model2.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: only the first layer of a Sequential model
# defines the input, and this convolution actually receives the embedding
# output of shape (MAX_SEQUENCE_LENGTH, EMBEDDING_DIM).
model2.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model2.add(MaxPooling1D(pool_size))
model2.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model2.add(GlobalMaxPooling1D())
model2.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model2.add(Dense(len(authors), activation='softmax'))
model2.summary()
model2.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print("Training...")
model2.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Report metrics on the held-out validation split.
print_data(x_val, y_val, model2)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 998, 128)          38528     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 197, 128)          49280     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_8 (Dense)              (None, 3)                 387       
Total para

### Results

From looking at the accuracy of the different models that I produced here by modifying the input data and then tweaking certain values of the model itself, my second model (model2) was the most accurate thus far.

In order to get a better indicator of how accurate this model can get, I will now run it for 30 epochs.

In [76]:
# Hyper-parameters for the long run of the kernel-size-3 model.
batch_size = 128
# This run trains for 30 epochs; the value lives here so the constant and the
# fit() call below cannot disagree.
num_epochs = 30
conv_depth_1 = 128
conv_depth_2 = 128
nn_depth_1 = 128
k_size = 3
pool_size = 5

model2 = Sequential()
# Frozen embedding layer initialised from the pretrained embedding_matrix.
model2.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# No input_shape on this layer: only the first layer of a Sequential model
# defines the input, and this convolution actually receives the embedding
# output of shape (MAX_SEQUENCE_LENGTH, EMBEDDING_DIM).
model2.add(Conv1D(conv_depth_1, k_size, activation='relu'))
model2.add(MaxPooling1D(pool_size))
model2.add(Conv1D(conv_depth_2, k_size, activation='relu'))
model2.add(GlobalMaxPooling1D())
model2.add(Dense(nn_depth_1, activation='relu'))
# One softmax output per author.
model2.add(Dense(len(authors), activation='softmax'))
model2.summary()
model2.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model2.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_data=(x_val, y_val))

# Report metrics on the held-out validation split.
print_data(x_val, y_val, model2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 998, 128)          38528     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 197, 128)          49280     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_10 (Dense)             (None, 3)                 387       
Total para

### Results

While this model ended up with a high training accuracy of 96.5%, the validation accuracy turned out to be not much better than that of the rest of my models, at 68.7%. For future tests I might consider adding in garbage words to the model, removing smaller words (since larger words tend to be used less frequently and might help indicate who is talking), and possibly changing values in the model some more.