## Predicting Horror Authors Using Naive Bayes

#### Evan Gordon

In [11]:

import os
import nltk
import pandas as pd
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
df = pd.read_csv("../input/train.csv")

print(df.shape)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/naazarik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/naazarik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
(19579, 3)


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [12]:
#create function for preparing data
from nltk.tokenize import word_tokenize
def prepare_data(dataframe):
    corpus = dataframe['text'].tolist()# Read the text of the training examples  
    unique_labels = dataframe['author'].unique().tolist()
    word_tokenize(corpus[0])#tokenize
    stops = set(stopwords.words('english'))# Remove english stopwords from the tokenized lists
    modified_corpus = []
    for sent in corpus:
        modified_sent = []
        for term in word_tokenize(sent):
            if term not in stops:
                modified_sent.append(term)
        modified_corpus.append(modified_sent)
    #print(modified_corpus[0])#print without stopwords
    labels = dataframe['author'].tolist()
    print(unique_labels)
    labeled_corpus = list(zip(modified_corpus, labels))#label data
    print(labeled_corpus[0])#print without stopwords, but with labels
    return labeled_corpus

def add_class(prepared_data):
    data = []
    for passage in prepared_data:
        d = {}
        for term in passage[0]:
            d[term] = True
        data.append((d, passage[1]))
    print(data[0])
    print()
    return data

In [13]:
# create a labeled set of training features. len(all_words)
# in other words: all_words = set(word.lower() for passage in modified_corpus for word in passage[0])
labeled_data = prepare_data(df)
all_data = add_class(labeled_data)
    
print("Keys with features removed:")
print(list(all_data[0][0].keys()))

['EAP', 'HPL', 'MWS']
(['This', 'process', ',', 'however', ',', 'afforded', 'means', 'ascertaining', 'dimensions', 'dungeon', ';', 'I', 'might', 'make', 'circuit', ',', 'return', 'point', 'whence', 'I', 'set', ',', 'without', 'aware', 'fact', ';', 'perfectly', 'uniform', 'seemed', 'wall', '.'], 'EAP')
({'afforded': True, 'ascertaining': True, 'whence': True, 'This': True, 'might': True, 'I': True, 'however': True, 'point': True, '.': True, ';': True, 'seemed': True, 'make': True, 'circuit': True, ',': True, 'dungeon': True, 'means': True, 'without': True, 'uniform': True, 'dimensions': True, 'aware': True, 'fact': True, 'wall': True, 'return': True, 'set': True, 'perfectly': True, 'process': True}, 'EAP')

Keys with features removed:
['afforded', 'ascertaining', 'whence', 'This', 'might', 'I', 'however', 'point', '.', ';', 'seemed', 'make', 'circuit', ',', 'dungeon', 'means', 'without', 'uniform', 'dimensions', 'aware', 'fact', 'wall', 'return', 'set', 'perfectly', 'process']


In [14]:
from random import shuffle #get a random set of features to use to create negative examples for training.
def add_random_negatives(labeled, data_with_classes, percent_negative=1):#allows 0.0-2.0 for percent negative
    if(percent_negative > 2 or percent_negative < 0):
        percent_negative = 1
    all_words = set()  
    for passage in labeled:
        for word in passage[0]:
            all_words.add(word)
    all_words = list(all_words)
    print(all_words[:5])
    all_words_idx = [i for i,_ in enumerate(all_words)]
    shuffled_word_idxs = [i for i,_ in enumerate(all_words)]
    shuffle(shuffled_word_idxs)# shuffle the indexes so that we can produce reamdom samples
    print(shuffled_word_idxs[:5])
    print([all_words[shuffled_idx] for shuffled_idx in shuffled_word_idxs[:5]])
    # Add some of the shuffled terms as negative examples for each of the data samples.
    allw = len(all_words)
    idx = 0 # we are going to loop through the shuffled values.
    for passage in data_with_classes:
        sample = list(passage[0].keys())
        j = 0
        num_to_add = len(sample) * percent_negative#  add the same number of negative samples as positive. try another model with different ammount of these
        while j < num_to_add:
            current = all_words[shuffled_word_idxs[idx]]
            #print(current)
            if current not in sample:
                passage[0][current] = False#  add the current term as a negative sample
                j = j+1## increment j
            idx = idx+1
            if idx == allw:
                idx = 0 # reset and go around again                
    print(data_with_classes[0])
    return data_with_classes

In [7]:
all_data = add_random_negatives(labeled_data, all_data)

['fanlight', 'footman', 'Doctor', 'dreamings', 'Prophet']
[12257, 26067, 14504, 16736, 3149]
['Virtu', 'Metzengerstein', 'stem', 'Moreover', 'reveries']
({'impelled': False, 'irresponsibility': False, 'rules': False, 'fact': True, 'adaption': False, 'make': True, 'respectable': False, 'Guyon': False, 'bot': False, 'noises': False, 'ascertaining': True, "'Boy": False, 'red': False, 'Paterson': False, 'steeple': False, 'thy': False, 'Et': False, 'stroked': False, 'sojourn': False, 'obliterated': False, 'condensed': False, 'circled': False, 'traffic': False, 'peaked': False, 'i.e.': False, 'Leipsic': False, 'possibly': False, 'exaggerating': False, 'comer': False, 'Andrée': False, 'churning': False, 'point': True, 'killing': False, ';': True, 'I': True, 'process': True, 'downstairs': False, 'CHARMION': False, 'bygone': False, 'theosophical': False, 'Maine': False, 'functions': False, 'stem': False, 'equal': False, 'aware': True, 'without': True, 'afforded': True, 'Basil': False, 'Showed':

In [15]:
import math
def seperate_data(prepared_data, cv):# separate training and testing data
    shuffle(prepared_data)
    cv = CountVectorizer(max_features = 1500)
    X = cv.fit_transform(all_data2).toarray()
    train_len = math.ceil(len(prepared_data)*.8)
    train_data = prepared_data[:train_len]
    test_data = prepared_data[train_len:]
    test_data_stripped = list(test[0] for test in test_data)
    return train_data, test_data, test_data_stripped

In [9]:
train_data, test_data, test_data_stripped = seperate_data(all_data)
print(test_data_stripped[0])

classifier = nltk.NaiveBayesClassifier.train(train_data)#create Naive Bayes Classifier
classifier.show_most_informative_features()

{'tuberoses': False, 'offered': False, 'quenched': False, "D'Indaginé": False, 'next': False, 'proximity': False, 'marchers': False, 'cast': True, 'contrary': True, 'footstool': False, 'collocations': False, 'painful': True, 'horrify': False, 'circumstance': True, 'transacted': False, 'resided': False, 'Tem': False, 'timidly': False, 'Wheeler': False, 'rummaging': False, 'trolleys': False, 'universe': False, 'flowed': False, 'counterfeits': False, 'Oonai': False, 'desk': False, 'declivity': False, 'Alcyone': False, 'salts': False, 'weaken': False, 'mind': True, 'ossi': False, 'retrogression': False, 'Raymond': True, 'monopoly': False, 'wearying': False, 'every': True, 'observant': False, 'unhappy': False, 'aids': False, 'Malign': False, 'told': False, 'contagion': True, 'Trist': False, ';': True, "'impossibilities": False, 'Huge': False, 'body': False, 'eerily': False, 'inherit': False, 'foul': True, 'etc': False, 'Corroborates': False, 'stove': False, 'Calcutta': False, 'essence': Tru

In [26]:
def print_accuracy(classify):
    preds = [classify.classify(test) for test in test_data_stripped]
    #print(preds[0])
    #print(test_data[0][1])
    accuracy = 0.0
    len_preds = len(preds)
    for i in range(len_preds):
        accuracy += (preds[i] == test_data[i][1])
    
    accuracy /= len(preds)
    print("Accuracy:")
    print(accuracy)
    print("For authors:")
    print(classify.labels())
    print("Over test data:")

    dftest =  pd.read_csv("../input/test.csv")# read the test data
    print(dftest.shape)
    print(dftest.head())

In [12]:
print_accuracy(classifier)

Accuracy:
0.5376756066411239
For authors:
['MWS', 'HPL', 'EAP']
Over test data:
(8392, 2)
        id                                               text
0  id02310  Still, as I urged our leaving Ireland with suc...
1  id24541  If a fire wanted fanning, it could readily be ...
2  id00134  And when they had broken down the frail door t...
3  id27757  While I was thinking how I should possibly man...
4  id04081  I am not sure to what limit his knowledge may ...


The preceding code was a baseline example by Joe Doumolin at: https://github.com/JoeDumoulin/CSCD439F17/blob/master/notebooks/Final%20Project/Text%20Processing.ipynb using a Naive Bayes classifier model. The model ended up with about a 55% prediction accuracy. It is now my goal to try to make a model that does better than that. I've modified some of Joe's code to better suit that purpose. I first want to try decreasing/increasing the number of false examples per dataset. I also want to try and change how the model is setup so that the false words in each passage didn't come from the author who wrote that passage.
I want to first start by adding double the ammount of negatives to the model and see if that positively or negatively affects the model.

In [19]:
labeled_data = prepare_data(df)
prepared = add_class(labeled_data)
all_data2 = add_random_negatives(labeled_data, prepared, 2.0)

['EAP', 'HPL', 'MWS']
(['This', 'process', ',', 'however', ',', 'afforded', 'means', 'ascertaining', 'dimensions', 'dungeon', ';', 'I', 'might', 'make', 'circuit', ',', 'return', 'point', 'whence', 'I', 'set', ',', 'without', 'aware', 'fact', ';', 'perfectly', 'uniform', 'seemed', 'wall', '.'], 'EAP')
({'afforded': True, 'ascertaining': True, 'whence': True, 'This': True, 'might': True, 'I': True, 'however': True, 'point': True, '.': True, ';': True, 'seemed': True, 'make': True, 'circuit': True, ',': True, 'dungeon': True, 'means': True, 'without': True, 'uniform': True, 'dimensions': True, 'aware': True, 'fact': True, 'wall': True, 'return': True, 'set': True, 'perfectly': True, 'process': True}, 'EAP')

['obsessions', 'Gresset', 'retread', 'Birch', 'scymetars']
[18689, 22898, 13008, 4387, 11557]
['Stripes', "'but", 'Journal', 'immortal', 'shot']
({'afforded': True, 'despises': False, 'doll': False, 'Subterrene': False, 'purplish': False, 'whence': True, 'process': True, 'coach': Fal

In [24]:
#train_data, test_data, test_data_stripped, count_v = seperate_data(all_data2)
cv = CountVectorizer(max_features = 1500)#not using for now
shuffle(all_data2) 
#X = cv.fit_transform(all_data2).toarray()
#y = all_data2[:, 2].values

train_len = math.ceil(len(all_data2)*.8)
train_data = all_data2[:train_len]
test_data = all_data2[train_len:]
test_data_stripped = list(test[0] for test in test_data)
    #return train_data, test_data, test_data_stripped
print(test_data_stripped[0])

classifier2 = nltk.NaiveBayesClassifier.train(train_data)#create Naive Bayes Classifier
classifier2.show_most_informative_features()

{'cats': False, 'Mostly': False, 'Te': False, 'girdled': False, 'fru': False, 'comings': False, 'kinsmen': False, 'pots': False, 'interference': True, 'Rhone': False, 'wholesale': False, 'suspects': False, 'comprising': False, 'mxther': False, '.': True, 'copiousness': False, 'Arabella': False, 'casks': False, 'excited': True, 'reëntered': False, 'Health': False, 'ce': False, 'followed': True, 'shaggy': False, 'responded': False, 'consummation': False, 'violence': True, 'bounteous': False, 'Survive': False, ',': True, 'indignation': True, 'Grave': False, 'ingratitude': True, 'flight': False, 'public': True, 'renewed': True, 'blackest': True, 'muffling': False, 'Elizabeth': True, 'refuted': False, 'turned': True, 'inanimate': False, 'powerful': True, 'vale': False, 'creations': False, 'appeal': True, 'generous': True, 'murmur': True, 'incidents': False, 'allow': False, 'carried': False, 'approbation': True, 'poor': True, 'Landaff': False, 'Machen': False, 'neutral': False, 'derives': Fa

In [27]:
print_accuracy(classifier2)

Accuracy:
0.5149425287356322
For authors:
['HPL', 'MWS', 'EAP']
Over test data:
(8392, 2)
        id                                               text
0  id02310  Still, as I urged our leaving Ireland with suc...
1  id24541  If a fire wanted fanning, it could readily be ...
2  id00134  And when they had broken down the frail door t...
3  id27757  While I was thinking how I should possibly man...
4  id04081  I am not sure to what limit his knowledge may ...


After running this a few times it seems that doubling the ammount of negative examples within the model actually decreased the accuracy of the model significantly. The question that follows is, will decreasing the amount of negatives increase the accuracy?

In [28]:
data = prepare_data(df)
prepared = add_class(data)
all_data3 = add_random_negatives(data, prepared, 0.75)#each passage gets 75% of its size in negative values
shuffle(all_data3) 

train_len = math.ceil(len(all_data3)*.8)
train_data = all_data3[:train_len]
test_data = all_data3[train_len:]
test_data_stripped = list(test[0] for test in test_data)
print(test_data_stripped[0])

classifier3 = nltk.NaiveBayesClassifier.train(train_data)#create Naive Bayes Classifier
classifier3.show_most_informative_features()
print_accuracy(classifier3)

['EAP', 'HPL', 'MWS']
(['This', 'process', ',', 'however', ',', 'afforded', 'means', 'ascertaining', 'dimensions', 'dungeon', ';', 'I', 'might', 'make', 'circuit', ',', 'return', 'point', 'whence', 'I', 'set', ',', 'without', 'aware', 'fact', ';', 'perfectly', 'uniform', 'seemed', 'wall', '.'], 'EAP')
({'afforded': True, 'ascertaining': True, 'whence': True, 'This': True, 'might': True, 'I': True, 'however': True, 'point': True, '.': True, ';': True, 'seemed': True, 'make': True, 'circuit': True, ',': True, 'dungeon': True, 'means': True, 'without': True, 'uniform': True, 'dimensions': True, 'aware': True, 'fact': True, 'wall': True, 'return': True, 'set': True, 'perfectly': True, 'process': True}, 'EAP')

['obsessions', 'Gresset', 'retread', 'Birch', 'scymetars']
[22460, 19047, 7791, 26790, 10283]
['fruits', 'peopled', 'Nubium', 'rumble', 'citations']
({'afforded': True, 'Wisely': False, 'fades': False, 'despondence': False, 'citations': False, 'uniform': True, 'whence': True, 'Nubium

## Results of adjusting the number of negative features
While the Bayesian model didn't produce highly accurate results, I was able to make minor advancements upon the base model I started with to end up with a more accurate predictions. My best guess as to why reducing the ammount of negatives helped the model would be that it mayhave reduced the chance of similar words appearing as negatives in one of the passages. A future attempt might be to try and create a large dictionary of words by each given author and ensure none of the negatives for a given passage are ever used by that author.

In [41]:
#Print to cv file
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

def output_data(model):
    test = pd.read_csv("../input/train.csv")
    data = prepare_data(test)
    prepared = add_class(data)
    all_data = add_random_negatives(data, prepared, 0.75)
    #test["length"] = test["text"].apply(lambda x: len(str(x).split()))
    #corpus = test['text'].tolist()# Read the text of the training examples  
    #labels = test['author'].tolist()
    #labeled_data = prepare_data(df)
    #eng_stopwords = set(stopwords.words("english"))
    #test_corpus_text = []
    #all_data = add_class(labeled_data)
    #for i in range(0, test.shape[0]):
    #    corpus = test["text"][i]
    #    corpus = corpus.lower()
    #    corpus = corpus.split()
    #    ps = PorterStemmer()
    #    corpus = [ps.stem(word) for word in corpus if not word in eng_stopwords]
    #    corpus = ' '.join(corpus)
    #    test_corpus_text.append(corpus)
    #train_len = math.ceil(len(all_data))
    #train_data = all_data[:train_len]
    #x_data = all_data[train_len:]
    print(all_data.shape)
    dict(zip(all_data))
    print(type(all_data))
    y_predicted = model.prob_classify(all_data)
    #X_test_output = cv.transform(test_corpus_text).toarray()
    #y_prob_output = classifier.predict_proba(X_test_output)
    submission_df = pd.DataFrame(y_predicted,index=test['id'],columns=['EAP','HPL','MWS'])
    submission_df.to_csv("m1submission.csv")

output_data(classifier3)

['EAP', 'HPL', 'MWS']
(['This', 'process', ',', 'however', ',', 'afforded', 'means', 'ascertaining', 'dimensions', 'dungeon', ';', 'I', 'might', 'make', 'circuit', ',', 'return', 'point', 'whence', 'I', 'set', ',', 'without', 'aware', 'fact', ';', 'perfectly', 'uniform', 'seemed', 'wall', '.'], 'EAP')
({'afforded': True, 'ascertaining': True, 'whence': True, 'This': True, 'might': True, 'I': True, 'however': True, 'point': True, '.': True, ';': True, 'seemed': True, 'make': True, 'circuit': True, ',': True, 'dungeon': True, 'means': True, 'without': True, 'uniform': True, 'dimensions': True, 'aware': True, 'fact': True, 'wall': True, 'return': True, 'set': True, 'perfectly': True, 'process': True}, 'EAP')

['obsessions', 'Gresset', 'retread', 'Birch', 'scymetars']
[6800, 14378, 14049, 22975, 22001]
["'although", 'discharge', 'declin', 'devotions', 'armour']
({'afforded': True, 'walls': False, 'experience': False, '..': False, 'whence': True, 'process': True, 'mutable': False, '.': True

AttributeError: 'list' object has no attribute 'shape'