In [851]:
### Baseline sentiment analysis classifer ###
# Uses 5-fold cross validation and a Naive Bayes ML model #
# Outputs average accuracy of the model #

In [852]:
import tarfile
import nltk
import sys
import random
import numpy as np
import sklearn
from sklearn.model_selection import KFold, ShuffleSplit
import string
import re
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from nltk import classify
from nltk.classify import SklearnClassifier
from nltk import NaiveBayesClassifier, DecisionTreeClassifier, MaxentClassifier, maxent
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.metrics.scores import precision, recall


In [853]:
# training/testing data files
polaritytar = tarfile.open("../Data/review_polarity.tar.gz", "r")
polaritytar.extractall('../Data/Polarity_Data')

nrctar = tarfile.open("../Data/NRC-Sentiment-Emotion-Lexicons.tar.gz")
nrctar.extractall('../Data/NRC_Data')

In [854]:
### get all the lines from all the reviews ###

# lines from negative reviews
neglines = []
for nfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/neg'):
    openFile = open(('../Data/Polarity_Data/txt_sentoken/neg/' + nfilename),"r")
    neglines = openFile.readlines()

# lines from positive reviews
poslines = []
for pfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/pos'):
    openFile = open(('../Data/Polarity_Data/txt_sentoken/pos/' + pfilename),"r")
    poslines = openFile.readlines()
    

In [855]:
intensityFile = open('../Data/NRC_Data/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Affect-Intensity-Lexicon/NRC-AffectIntensity-Lexicon.txt')
intensitylines = intensityFile.readlines()

In [856]:
intensitywords = []
word_emotions = dict()
for line in intensitylines[1:]:
    stuff = line.strip().split("\t")
    tup = (stuff[0], stuff[2])
    if stuff[0] not in intensitywords:
        intensitywords.append(stuff[0])
    word_emotions.update({tup})

In [857]:
### tokenize each line based on whitespace ###

# tokens for positive reviews
poslines_tokens = []
for line in poslines:
    l = []
    for word in line.split():
        l.append(word)
    poslines_tokens.append(l)
# tokens for negative reviews
neglines_tokens = []
for line in neglines:
    l = []
    for word in line.split():
        l.append(word)
    neglines_tokens.append(l)

In [858]:
### helper function to remove non-alphanumeric characters and lowercase each token ###
def text_process(tweet_tokens):
    cleaned_tokens = []
    
    for token, tag in pos_tag(tweet_tokens):
        if len(token) != 0 and token not in string.punctuation and token.lower() not in stopwords.words('english'):
            cleaned_tokens.append(token.lower())

    #return pos_tag(cleaned_tokens)
    return cleaned_tokens

In [859]:
### clean up the tokens list ###
positive_cleaned_tokens = []
negative_cleaned_tokens = []

for tokens in poslines_tokens:
    positive_cleaned_tokens.append(text_process(tokens))

for tokens in neglines_tokens:
    negative_cleaned_tokens.append(text_process(tokens))

In [860]:
### helper function to create the model from the tokens list ###
def create_model(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            if token in intensitywords:
                yield {(token , word_emotions[token]) : True}
            else:
                yield {(token , "None") : True}

In [861]:
positive_tokens_for_model = create_model(positive_cleaned_tokens)
negative_tokens_for_model = create_model(negative_cleaned_tokens)

In [862]:
# categorize the tokens in each tweet to creat the dataset
positive_dataset = list()
for t in positive_tokens_for_model:
    for pair in t.keys():
        if pair[1] == "None" or pair[1] == "joy":
            positive_dataset.append( ({pair[0] : "Positive"} ,True) )
        else:
            positive_dataset.append( ({pair[0] : "Negative"}, True))

negative_dataset = list()
for t in negative_tokens_for_model:
    for k in t.keys():
        if pair[1] == "joy":
            negative_dataset.append( ({pair[0]: "Positive"}, True))
        else:
            negative_dataset.append(({pair[0] : "Negative"}, True))

# dataset will be shuffled later so no need to shuffle here
dataset = positive_dataset + negative_dataset


In [863]:
dataset[:30]

[({'capsule': 'Positive'}, True),
 ({'director': 'Positive'}, True),
 ({'cure': 'Positive'}, True),
 ({'brings': 'Positive'}, True),
 ({'weird': 'Positive'}, True),
 ({'complex': 'Positive'}, True),
 ({'concept': 'Positive'}, True),
 ({'screen': 'Positive'}, True),
 ({'one': 'Positive'}, True),
 ({'viewing': 'Positive'}, True),
 ({'enough': 'Positive'}, True),
 ({'understand': 'Positive'}, True),
 ({'fully': 'Positive'}, True),
 ({'premise': 'Positive'}, True),
 ({'pulse': 'Positive'}, True),
 ({'idea': 'Positive'}, True),
 ({'something': 'Positive'}, True),
 ({'ghosts': 'Positive'}, True),
 ({'internet': 'Positive'}, True),
 ({'film': 'Positive'}, True),
 ({'amazing': 'Positive'}, True),
 ({'apocalyptic': 'Positive'}, True),
 ({'style': 'Positive'}, True),
 ({'+2': 'Positive'}, True),
 ({'-4': 'Positive'}, True),
 ({'+4': 'Positive'}, True),
 ({'perhaps': 'Positive'}, True),
 ({'disturbing': 'Positive'}, True),
 ({'disturbed': 'Negative'}, True),
 ({'filmmaker': 'Positive'}, True)]

In [864]:
np_dataset = np.array(dataset)

In [865]:

# use k-fold cross validation with k = 10 to train and test
kfold = KFold(n_splits=9, shuffle=True, random_state=1)
nb_mean_accuracy, dt_mean_accuracy, bern_mean_accuracy = list(), list(), list()

for train, test in kfold.split(np_dataset):
    # use naive bayes classifier -  different classifiers are explored in revised version
    nb_classifier = NaiveBayesClassifier.train(np_dataset[train])
    nb_mean_accuracy.append(classify.accuracy(nb_classifier, np_dataset[test]))

    # dt_classifier = DecisionTreeClassifier.train(np_dataset[train])
    # dt_mean_accuracy.append(classify.accuracy(dt_classifier, np_dataset[test]))

    # bern_classifier = SklearnClassifier(BernoulliNB()).train(np_dataset[train])
    # bern_mean_accuracy.append(classify.accuracy(bern_classifier, np_dataset[test]))
    
# print the mean accuracy across all the folds
print("Naive Bayes accuracy is:", np.mean(nb_mean_accuracy))
#print("Decision Tree accuracy is:", np.mean(dt_mean_accuracy))
#print(" accuracy is:", np.mean(bern_mean_accuracy))


Naive Bayes accuracy is: 1.0


In [866]:
# list of changes:
    # 1. in text_process: only append token if len > 0 and not punctuation and not stop word-0
    # 2. Change n_splits to 9 (not sure why but accuracy goes from 84.7% to 85.5%)
    # 3. add POS tags to each token so it's (Token, POS)
    # 4. add in Decision Tree classifier