# Naive Bayes on R.Goldblatt's Stories

In [1]:
import string
import numpy
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import os
import textblob
import collections
from textblob import Word
from textblob.tokenizers import WordTokenizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

In [120]:
#Pulling in and cleaning data
tz = pd.read_csv(r'C:\Users\ced4689\Desktop\TVF\twilightzone.csv',sep=('|'),index_col=0)
hh = pd.read_csv(r'C:\Users\ced4689\Desktop\TVF\hitchhikers.csv',sep=('|'),index_col=0)

gf = pd.read_csv(r'C:\Users\ced4689\Documents\GitHub\twilightvalefalls\gravityfalls\gf_eps.csv',sep=('|'),index_col=0)
gf = gf.drop('text',axis=1)
gf = gf.rename(columns = {'source':'Source','title':'Title', 'date':'Date', 'handled_text':'Text'})
gf_strat = gf[:16]

rs = pd.read_csv(r'C:\Users\ced4689\Documents\GitHub\twilightvalefalls\rstories\rs_df.csv',sep=('|'),index_col=0)
rs = rs.drop('text',axis=1)
rs = rs.rename(columns = {'source':'Source','title':'Title', 'date':'Date', 'handled_text':'Text'})

nv = pd.read_csv(r'C:\Users\ced4689\Desktop\TVF\wtnv_final.csv',sep=('|'),index_col=0).reset_index()
nv_strat = nv[:16]

data = pd.concat([tz,hh,gf,nv,rs])
data = data.reset_index(drop=True)
data = data.drop([151,152],axis=0)
data = data.reset_index(drop=True)

data_strat = pd.concat([tz,hh,gf_strat,nv_strat,rs])
data_strat = data_strat.reset_index(drop=True)

In [118]:
# Creating the Class variables for each source material
data['Class'] = ''
for i,j in enumerate(data['Source']):
    if j == 'Twilight Zone':
        data['Class'][i] = 1
    elif j == "Hitchhiker's Guide to the Galaxy":
        data['Class'][i] = 2
    elif j == 'gravity falls':
        data['Class'][i] = 3
    elif j == 'WTNV':
        data['Class'][i] = 4
    else:
        data['Class'][i] = 5
        
data_strat['Class'] = ''
for i,j in enumerate(data_strat['Source']):
    if j == 'Twilight Zone':
        data_strat['Class'][i] = 1
    elif j == "Hitchhiker's Guide to the Galaxy":
        data_strat['Class'][i] = 2
    elif j == 'gravity falls':
        data_strat['Class'][i] = 3
    elif j == 'WTNV':
        data_strat['Class'][i] = 4
    else:
        data_strat['Class'][i] = 5

In [65]:
features = data['Text']
labels = data['Class'].astype(int)

features_strat = data_strat['Text']
labels_strat = data_strat['Class'].astype(int)

## Unstratified Naive Bayes
### Including R.Goldblatt's Stories

In [67]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, random_state=42)

In [82]:
x_train_counts = count_vect.fit_transform(x_train)

x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_train_tfidf

<171x22522 sparse matrix of type '<class 'numpy.float64'>'
	with 163725 stored elements in Compressed Sparse Row format>

In [83]:
mnb = MultinomialNB(alpha=0.1).fit(x_train_tfidf, y_train)

In [84]:
pred = mnb.predict(count_vect.transform(x_test))
print(confusion_matrix(y_test, pred))

[[ 0  0  0  2  0]
 [ 0  0  0  1  0]
 [ 0  0  4  2  0]
 [ 0  0  0 46  0]
 [ 0  0  0  2  0]]


In [85]:
print(accuracy_score(y_test,pred))

0.8771929824561403


The unstratified results produced a high accuracy only due to the classifier only choosing Welcome to Nightvale as its class. Because Nightvale has the most instances of text, this would make sense.

We decided to stratify the data so each source material is represented equally

## Stratified Naive Bayes
### Including R.Goldblatt's Stories

In [73]:
x_train_strat, x_test_strat, y_train_strat, y_test_strat = train_test_split(features_strat, labels_strat, random_state=42)

In [74]:
x_train_counts = count_vect.fit_transform(x_train_strat)

x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_train_tfidf

<42x13215 sparse matrix of type '<class 'numpy.float64'>'
	with 44127 stored elements in Compressed Sparse Row format>

In [80]:
mnb = MultinomialNB(alpha=0.1).fit(x_train_tfidf, y_train_strat)

In [78]:
pred_strat = mnb.predict(count_vect.transform(x_test_strat))
print(confusion_matrix(y_test_strat, pred_strat))

[[4 0 1 0]
 [0 3 0 0]
 [0 0 4 0]
 [0 0 2 0]]


In [79]:
print(accuracy_score(y_test_strat,pred_strat))

0.7857142857142857


Using the stratified test, only four classes were produced. This is due to Hitchhiker's Guide to the Galaxy only having one instance and being glossed over in the split of the data. Besides this, the Naive Bayes worked better in classifying under stratified conditions. The accuracy score is lower due to a smaller smaple size

We decided to use R.Goldblatt's text as the predictor to determine whether her stories are independent from one another

## Unstratified Naive Bayes
### Using to predict R. Goldblatt's Stories

In [104]:
x_train = data['Text'][data['Class'] != 5]
x_test = data['Text'][data['Class'] == 5]
y = data['Class'][data['Class'] != 5].astype(int)

In [105]:
x_train_counts = count_vect.fit_transform(x_train)

x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_train_tfidf

<221x27510 sparse matrix of type '<class 'numpy.float64'>'
	with 220998 stored elements in Compressed Sparse Row format>

In [106]:
mnb = MultinomialNB(alpha=0.1).fit(x_train_tfidf, y)

In [108]:
pred = mnb.predict(count_vect.transform(x_test))
print(pred)

[4 4 4 4 4 4 4]


Testing whether the unstratififed results would be any different from before while testing on R.Goldblatt's stories produced the same results, everything is Nightvale

## Stratified Naive Bayes
### Using to predict R. Goldblatt's Stories

In [109]:
x_train_strat = data_strat['Text'][data_strat['Class'] != 5]
x_test_strat = data_strat['Text'][data_strat['Class'] == 5]
y_strat = data_strat['Class'][data_strat['Class'] != 5].astype(int)

In [110]:
x_train_counts = count_vect.fit_transform(x_train_strat)

x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_train_tfidf

<49x14613 sparse matrix of type '<class 'numpy.float64'>'
	with 55812 stored elements in Compressed Sparse Row format>

In [111]:
mnb = MultinomialNB(alpha=0.1).fit(x_train_tfidf, y_strat)

In [116]:
pred_strat = mnb.predict(count_vect.transform(x_test_strat))
print(pred_strat)

[4 4 4 4 1 4 4]


Interesting with the stratified data, the results produced everything to coincide with Nightvale once again, except that for her 5th story 'Mobsters' which was classified to be from Twilight Zone