In [26]:
### Baseline sentiment analysis classifer ###
# Uses k-fold cross validation and a Naive Bayes ML model #
# Outputs average accuracy of the model #

In [92]:
import tarfile
import nltk
import sys
import random
import numpy as np
import sklearn
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.metrics import precision_recall_fscore_support, f1_score, precision_score, recall_score, accuracy_score
import string
import re
from sklearn.model_selection import train_test_split
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.metrics import *
import collections



In [103]:
# training/testing data files
polaritytar = tarfile.open("../Data/review_polarity.tar.gz", "r")
polaritytar.extractall('../Data/Polarity_Data')

In [104]:
### get all the lines from all the reviews ###

# lines from negative reviews
neglines = []
for nfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/neg'):
    openFile = open(('../Data/Polarity_Data/txt_sentoken/neg/' + nfilename),"r")
    neglines = openFile.readlines()

# lines from positive reviews
poslines = []
for pfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/pos'):
    openFile = open(('../Data/Polarity_Data/txt_sentoken/pos/' + pfilename),"r")
    poslines = openFile.readlines()
    

In [105]:
### tokenize each line based on whitespace ###

# tokens for positive reviews
poslines_tokens = []
for line in poslines:
    l = []
    for word in line.split():
        l.append(word)
    poslines_tokens.append(l)
# tokens for negative reviews
neglines_tokens = []
for line in neglines:
    l = []
    for word in line.split():
        l.append(word)
    neglines_tokens.append(l)

In [106]:
### helper function to remove non-alphanumeric characters and lowercase each token ###
def clean_tokens(tokens):
    cleaned_tokens = []
    
    for token in tokens:
        cleaned_tokens.append(token.lower())

    return cleaned_tokens

In [107]:
### clean up the tokens list ###
positive_cleaned_tokens = []
negative_cleaned_tokens = []

for tokens in poslines_tokens:
    positive_cleaned_tokens.append(clean_tokens(tokens))

for tokens in neglines_tokens:
    negative_cleaned_tokens.append(clean_tokens(tokens))

In [108]:
### helper function to create the model from the tokens list ###
def create_model(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tokens)

In [109]:
positive_tokens_for_model = create_model(positive_cleaned_tokens)
negative_tokens_for_model = create_model(negative_cleaned_tokens)

In [110]:
# categorize the tokens in each review
positive_dataset = [(t,"Positive") for t in positive_tokens_for_model]
negative_dataset = [(t,"Negative") for t in negative_tokens_for_model]

# dataset will be shuffled later so no need to shuffle here
dataset = positive_dataset + negative_dataset


In [111]:
np_dataset = np.array(dataset)

In [112]:
# use k-fold cross validation with k = 10 to train and test
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
mean_accuracy, mean_precision = list(), list()

for train_i, test_i in kfold.split(np_dataset):
    
    xtrain, xtest = np_dataset[train_i], np_dataset[test_i]
    classifier = NaiveBayesClassifier.train(xtrain)

    mean_accuracy.append(classify.accuracy(classifier, xtest))
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(xtest):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)   

    mean_precision.append(precision(refsets['Positive'], testsets['Positive']))
    mean_precision.append(precision(refsets['Negative'], testsets['Negative']))

# print the mean accuracy across all the folds
print("Accuracy:", np.mean(mean_accuracy))
print("Precision:", np.mean(mean_precision))


Accuracy: 0.8272727272727274
Precision: 0.8582738095238096
