In [1]:
### Improved sentiment analysis classifer ###
# Uses k-fold cross validation and Naive Bayes, Decision Tree, and Bernoulli ML models #
# Outputs average accuracy of the model #

In [65]:
import tarfile
import nltk
import sys
import random
import string
import re
import numpy as np
import sklearn
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from nltk import classify
from nltk.classify import SklearnClassifier
from nltk import NaiveBayesClassifier, DecisionTreeClassifier
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.metrics.scores import precision, recall


In [66]:
# training/testing data files
polaritytar = tarfile.open("../Data/review_polarity.tar.gz", "r")
polaritytar.extractall('../Data/Polarity_Data')

nrctar = tarfile.open("../Data/NRC-Sentiment-Emotion-Lexicons.tar.gz")
nrctar.extractall('../Data/NRC_Data')

In [67]:
### get all the lines from all the reviews ###

# lines from negative reviews
neglines = []
for nfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/neg'):
    open_file = open(('../Data/Polarity_Data/txt_sentoken/neg/' + nfilename),"r")
    neglines = open_file.readlines()

# lines from positive reviews
poslines = []
for pfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/pos'):
    open_file = open(('../Data/Polarity_Data/txt_sentoken/pos/' + pfilename),"r")
    poslines = open_file.readlines()
    

In [68]:
intensity_file = open('../Data/NRC_Data/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Affect-Intensity-Lexicon/NRC-AffectIntensity-Lexicon.txt')
intensity_lines = intensity_file.readlines()

In [69]:
word_emotions = dict()
# use the data from affect-intensity file
for line in intensity_lines[1:]:
    features = line.strip().split("\t")
    # features[0]: the word
    # features[2]: the primary sentiment (fear, sadness, anger, joy)
    word_emotion = (features[0], features[2])
    word_emotions.update({word_emotion})

In [70]:
### tokenize each line based on whitespace ###

# tokens for positive reviews
poslines_tokens = []
for line in poslines:
    l = []
    for word in line.split():
        l.append(word)
    poslines_tokens.append(l)
# tokens for negative reviews
neglines_tokens = []
for line in neglines:
    l = []
    for word in line.split():
        l.append(word)
    neglines_tokens.append(l)

In [71]:
### helper function to remove non-alphanumeric characters and lowercase each token ###
def clean_tokens(tweet_tokens):
    cleaned_tokens = []
    
    for (token, tag) in pos_tag(tweet_tokens):
        # removing stop words from the vocabulary also decreases performance pretty significantly
        if len(token) != 0 and token not in string.punctuation: # and token.lower() not in stopwords.words('english'):
            cleaned_tokens.append(token.lower())

    # I also tried messing around with POS tags - appending them to the word, replacing the word, etc
    # I think POS tags might have been more helpful if it had a label for neg words - this might have been useful for the "conflicting sentiments" part a few cells down

    # return pos_tag(cleaned_tokens)
    return cleaned_tokens

In [72]:
### clean up the tokens list ###
positive_cleaned_tokens = []
negative_cleaned_tokens = []

for tokens in poslines_tokens:
    positive_cleaned_tokens.append(clean_tokens(tokens))

for tokens in neglines_tokens:
    negative_cleaned_tokens.append(clean_tokens(tokens))

In [73]:
### helper function to create the model from the tokens list ###
def create_model(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        # True is just a placeholder value
        yield dict([token, True] for token in tokens)

In [74]:
positive_tokens_for_model = create_model(positive_cleaned_tokens)
negative_tokens_for_model = create_model(negative_cleaned_tokens)

In [75]:
# categorize the tokens in each tweet to creat the dataset
positive_dataset = [(t,"Positive")
                     for t in positive_tokens_for_model]
negative_dataset = [(t,"Negative")
                     for t in negative_tokens_for_model]


In [76]:
# here we remove words from "conflicting" sentiments from the reviews
# i.e. if there is a word in a review marked as positive that has a "sadness" label, that word will be removed

# first we remove negative words from the set of positive reviews
pos_to_remove = list()
for (review, sentiment) in positive_dataset:
    for word in review:
        if word in word_emotions:
            if word_emotions[word] == "sadness" or word_emotions[word] == "anger" or  word_emotions[word] == "fear":
                pos_to_remove.append(word)

# kinda convoluted way of doing it because modify review while iterating through it = bad
for (review, sentiment) in positive_dataset:
    for neg_word in pos_to_remove:
        if neg_word in review.keys():
            review.pop(neg_word)

# remove positive words from negative reviews
# interestingly enough, doing this actually decreases the performance. I'm guessing it's because the "joy" label is the only one that is "positive", and it seems like there is less "precision" about which words can be labelled as joyful - is "custom" really a "joy" word? or "mucis"? what is mucis anyways??
# also, I'd guess that it's more common to negate a positive word to make a negative phrase than to negate a negative word and make a positive one
# I commented it out to maximize performance, but if I had time in abundance I might try playing with scope of negation stuff to see if I could get it to improve performance

# neg_to_remove = list()
# for (review, sentiment) in negative_dataset:
#     for word in review:
#         if word in word_emotions:
#             if word_emotions[word] == "joy":
#                 neg_to_remove.append(word)

# for (review, sentiment) in negative_dataset:
#     for pos_word in neg_to_remove:
#         if pos_word in review.keys():
#             review.pop(pos_word)


In [77]:
dataset = positive_dataset + negative_dataset
# I commented out the line below because I wanted to see how adding or changing just one feature would improve the performance, without the random variations caused by shuffling
#random.shuffle(dataset)


In [78]:
np_dataset = np.array(dataset)

In [79]:

# use k-fold cross validation with k = 9 to train and test
kfold = KFold(n_splits=9, shuffle=True, random_state=1)
nb_mean_accuracy, dt_mean_accuracy, bern_mean_accuracy = list(), list(), list()

for train, test in kfold.split(np_dataset):
    # naive bayes classifier
    nb_classifier = NaiveBayesClassifier.train(np_dataset[train])
    nb_mean_accuracy.append(classify.accuracy(nb_classifier, np_dataset[test]))

    # decitions tree classifier
    dt_classifier = DecisionTreeClassifier.train(np_dataset[train])
    dt_mean_accuracy.append(classify.accuracy(dt_classifier, np_dataset[test]))

    # bernoulli classifier
    bern_classifier = SklearnClassifier(BernoulliNB()).train(np_dataset[train])
    bern_mean_accuracy.append(classify.accuracy(bern_classifier, np_dataset[test]))
    
# print the mean accuracy across all the folds for each classifier
print("Naive Bayes accuracy is:", np.mean(nb_mean_accuracy))
print("Decision Tree accuracy is:", np.mean(dt_mean_accuracy))
print("Bernoulli accuracy is:", np.mean(bern_mean_accuracy))


Naive Bayes accuracy is: 0.8644781144781145
Decision Tree accuracy is: 0.7104377104377105
Bernoulli accuracy is: 0.8257575757575758


In [80]:
# without shuffle
# Naive Bayes accuracy is: 0.8644781144781145
# Decision Tree accuracy is: 0.7491582491582492
# Bernoulli accuracy is: 0.8257575757575758

In [81]:
# list of changes (i didn't keep all of them)
    # 1. in text_process: only append token if len > 0 and not punctuation and not stop word
    # 2. Change n_splits to 9 (maybe 10 was too many)
    # 3. add POS tags to each token so it's (Token, POS)
    # 4. add in Decision Tree and Bernoulli classifiers
    # 5. remove words with conflicting sentiment from positive