In [128]:
### Improved sentiment analysis classifer ###
# Uses k-fold cross validation and Naive Bayes, Decision Tree, and Bernoulli ML models #
# Outputs average accuracy of the model #

In [129]:
import tarfile
import collections
import nltk
import sys
import random
import string
import re
import numpy as np
import sklearn
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from nltk import classify
from nltk.classify import SklearnClassifier
from nltk import NaiveBayesClassifier, DecisionTreeClassifier
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.metrics.scores import precision, recall
from nltk.stem import WordNetLemmatizer



In [130]:
# training/testing data files
polaritytar = tarfile.open("../Data/review_polarity.tar.gz", "r")
polaritytar.extractall('../Data/Polarity_Data')

nrctar = tarfile.open("../Data/NRC-Sentiment-Emotion-Lexicons.tar.gz", 'r')
nrctar.extractall('../Data/NRC_Data')

In [131]:
### get all the lines from all the reviews ###

# lines from negative reviews
neglines = []
for nfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/neg'):
    open_file = open(('../Data/Polarity_Data/txt_sentoken/neg/' + nfilename),"r")
    neglines = open_file.readlines()

# lines from positive reviews
poslines = []
for pfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/pos'):
    open_file = open(('../Data/Polarity_Data/txt_sentoken/pos/' + pfilename),"r")
    poslines = open_file.readlines()
    

In [132]:
intensity_file = open('../Data/NRC_Data/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt')
intensity_lines = intensity_file.readlines()

In [133]:
lemmatizer = WordNetLemmatizer()

In [134]:
word_emotions = dict()
# use the data from affect-intensity file
for line in intensity_lines[1:]:
    features = line.split()
    # features[0]: the word
    # features[2]: the primary sentiment (fear, sadness, anger, joy)
    if features[2] == '1':
        word_emotion = (lemmatizer.lemmatize(features[0]), features[1])
        word_emotions.update({word_emotion})

In [135]:
### tokenize each line based on whitespace ###

# tokens for positive reviews
poslines_tokens = []
for line in poslines:
    l = []
    for word in line.split():
        l.append(word)
    poslines_tokens.append(l)
# tokens for negative reviews
neglines_tokens = []
for line in neglines:
    l = []
    for word in line.split():
        l.append(word)
    neglines_tokens.append(l)

In [136]:
### helper function to remove non-alphanumeric characters and lowercase each token ###
def clean_tokens(tokens):

    cleaned_tokens = []
    for token in tokens:
        # removing stop words from the vocabulary also decreases performance pretty significantly
        if len(token) != 0 and token not in string.punctuation: # and token.lower() not in stopwords.words('english'):
            cleaned_tokens.append(lemmatizer.lemmatize(token.lower()))

    # I also tried messing around with POS tags - appending them to the word, replacing the word, etc
    # I think POS tags might have been more helpful if it had a label for neg words - this might have been useful for the "conflicting sentiments" part a few cells down

    # return pos_tag(cleaned_tokens)
    return cleaned_tokens

In [137]:
### clean up the tokens list ###
positive_cleaned_tokens = []
negative_cleaned_tokens = []

for tokens in poslines_tokens:
    positive_cleaned_tokens.append(clean_tokens(tokens))

for tokens in neglines_tokens:
    negative_cleaned_tokens.append(clean_tokens(tokens))

In [138]:
### helper function to create the model from the tokens list ###
def create_model(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        # True is just a placeholder value
        yield dict([token, True] for token in tokens)

In [139]:
positive_tokens_for_model = create_model(positive_cleaned_tokens)
negative_tokens_for_model = create_model(negative_cleaned_tokens)

In [140]:
# categorize the tokens in each review
positive_dataset = [(t,"Positive") for t in positive_tokens_for_model]
negative_dataset = [(t,"Negative") for t in negative_tokens_for_model]


In [141]:
positive_emotions = ['positive', 'anticipation', 'joy', 'surprise', 'trust']
negative_emotions = ['anger', 'disgust', 'fear', 'negative', 'sadness']

In [142]:
# here we remove words from "conflicting" sentiments from the reviews
# i.e. if there is a word in a review marked as positive that has a "sadness" label, that word will be removed

# first we remove negative words from the set of positive reviews
pos_to_remove = list()
for (review, sentiment) in positive_dataset:
    for word in review:
        if word in word_emotions and word_emotions[word] in negative_emotions:
            pos_to_remove.append(word)

# kinda convoluted way of doing it because modify review while iterating through it = bad
for (review, sentiment) in positive_dataset:
    for neg_word in pos_to_remove:
        if neg_word in review.keys():
            review.pop(neg_word)

# remove positive words from negative reviews
# interestingly enough, doing this actually decreases the performance. I'm guessing it's because it's more common to negate a positive word to make a negative phrase than to negate a negative word and make a positive one
# I commented it out to maximize performance, but if I had time in abundance I might try playing with scope of negation stuff to see if I could get it to improve performance

# neg_to_remove = list()
# for (review, sentiment) in negative_dataset:
#     for word in review:
#         if word in word_emotions and word_emotions[word] in positive_emotions:
#             neg_to_remove.append(word)

# for (review, sentiment) in negative_dataset:
#     for pos_word in neg_to_remove:
#         if pos_word in review.keys():
#             review.pop(pos_word)


In [143]:
dataset = positive_dataset + negative_dataset
# I commented out the line below because I wanted to see how adding or changing just one feature would improve the performance, without the random variations caused by shuffling
#random.shuffle(dataset)


In [144]:
np_dataset = np.array(dataset)

In [145]:

# use k-fold cross validation with k = 9 to train and test
kfold = KFold(n_splits=9, shuffle=True, random_state=1)
nb_mean_accuracy, dt_mean_accuracy, bern_mean_accuracy = list(), list(), list()
nb_mean_precision, dt_mean_precision, bern_mean_precision = list(), list(), list()

for train, test in kfold.split(np_dataset):
    # naive bayes classifier
    nb_classifier = NaiveBayesClassifier.train(np_dataset[train])
    nb_mean_accuracy.append(classify.accuracy(nb_classifier, np_dataset[test]))

    # decitions tree classifier
    dt_classifier = DecisionTreeClassifier.train(np_dataset[train])
    dt_mean_accuracy.append(classify.accuracy(dt_classifier, np_dataset[test]))

    # bernoulli classifier
    bern_classifier = SklearnClassifier(BernoulliNB()).train(np_dataset[train])
    bern_mean_accuracy.append(classify.accuracy(bern_classifier, np_dataset[test]))
    
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    classifiers = [nb_classifier, dt_classifier, bern_classifier]

    for classifier in classifiers:
        for i, (feats, label) in enumerate(np_dataset[test]):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)   

        if classifier == nb_classifier:
            nb_mean_precision.append(precision(refsets['Positive'], testsets['Positive']))
            nb_mean_precision.append(precision(refsets['Negative'], testsets['Negative']))

        elif classifier == dt_classifier:
            dt_mean_precision.append(precision(refsets['Positive'], testsets['Positive']))
            dt_mean_precision.append(precision(refsets['Negative'], testsets['Negative']))

        elif classifier == bern_classifier:
            bern_mean_precision.append(precision(refsets['Positive'], testsets['Positive']))
            bern_mean_precision.append(precision(refsets['Negative'], testsets['Negative']))

# print the mean accuracy across all the folds for each classifier
print("Naive Bayes accuracy:", np.mean(nb_mean_accuracy))
print("Naive Bayes precision:", np.mean(nb_mean_precision))
print('\n')
print("Decision Tree accuracy:", np.mean(dt_mean_accuracy))
print("Decision Tree precision:", np.mean(dt_mean_precision))
print('\n')
print("Bernoulli accuracy:", np.mean(bern_mean_accuracy))
print("Bernoulli precision:", np.mean(bern_mean_precision))



Naive Bayes accuracy is: 0.8552188552188551
Naive Bayes precision is: 0.8728174603174602


Decision Tree accuracy is: 0.7575757575757576
Decision Tree precision is: 0.7366402116402115


Bernoulli accuracy is: 0.8367003367003368
Bernoulli precision is: 0.7150573192239859
