In [41]:
import re
import csv
import random

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dvjr2\AppData\Roaming\nltk_data...


True

In [None]:
# process initial file to convert to two rows (review, sentiment)
file = open('moviereview_v2.csv', 'w')

with open('moviereview.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter = ',')
    for row in reader:
        sentiment = row[-1]
        row.pop()  
        file.write(re.sub('\W+',' ', ''.join(row)) + ',' + sentiment+'\n')
        
file.close()

In [35]:
# format for sentiment analysis
docs = []
docs_v =[]

with open('moviereview_v2.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter = ',')
    for row in reader:
        
        docs.append((row[0].split(), row[1]))
        docs_v.append(row[0])

In [45]:
len(docs)
docs = docs[1:] # get rid of header

# shuffle so not in order
random.shuffle(docs)
random.shuffle(docs)
random.shuffle(docs)

# create training and test docs
train_docs = docs[:1800]
test_docs = docs[1800:]

# Sentiment Analyzer

In [29]:
sentim_analyzer = SentimentAnalyzer()

all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in train_docs])

unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)

len(unigram_feats)

18236

In [33]:
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
training_set = sentim_analyzer.apply_features(train_docs)
test_set = sentim_analyzer.apply_features(test_docs)

trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

Training classifier


In [34]:
for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))

Evaluating NaiveBayesClassifier results...
Accuracy: 0.8407960199004975
F-measure [neg]: 0.8504672897196263
F-measure [pos]: 0.8297872340425531
Precision [neg]: 0.8348623853211009
Precision [pos]: 0.8478260869565217
Recall [neg]: 0.8666666666666667
Recall [pos]: 0.8125


# Vader

In [48]:
random.shuffle(docs_v)
random.shuffle(docs_v)
random.shuffle(docs_v)

In [46]:
docs_v = docs_v[1:]

In [53]:
sid = SentimentIntensityAnalyzer()

for s in docs_v:
    #print(s)
    ss = sid.polarity_scores(s)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()
    

compound: 0.989, neg: 0.115, neu: 0.734, pos: 0.151, 
compound: -0.9688, neg: 0.127, neu: 0.79, pos: 0.083, 
compound: 0.9365, neg: 0.131, neu: 0.732, pos: 0.137, 
compound: 0.4917, neg: 0.12, neu: 0.757, pos: 0.123, 
compound: -0.9765, neg: 0.116, neu: 0.788, pos: 0.095, 
compound: 0.8808, neg: 0.122, neu: 0.728, pos: 0.15, 
compound: -0.9942, neg: 0.161, neu: 0.759, pos: 0.08, 
compound: -0.7824, neg: 0.11, neu: 0.775, pos: 0.114, 
compound: 0.999, neg: 0.079, neu: 0.752, pos: 0.17, 
compound: -0.9625, neg: 0.156, neu: 0.736, pos: 0.109, 
compound: 0.9991, neg: 0.06, neu: 0.72, pos: 0.219, 
compound: 0.9971, neg: 0.07, neu: 0.786, pos: 0.144, 
compound: 0.3075, neg: 0.117, neu: 0.766, pos: 0.116, 
compound: 0.9989, neg: 0.04, neu: 0.822, pos: 0.138, 
compound: 0.2579, neg: 0.094, neu: 0.822, pos: 0.084, 
compound: 0.0929, neg: 0.121, neu: 0.756, pos: 0.123, 
compound: 0.9962, neg: 0.129, neu: 0.702, pos: 0.169, 
compound: 0.9936, neg: 0.051, neu: 0.797, pos: 0.152, 
compound: 0.9977,

compound: -0.9134, neg: 0.094, neu: 0.826, pos: 0.08, 
compound: -0.8648, neg: 0.114, neu: 0.794, pos: 0.092, 
compound: 0.938, neg: 0.117, neu: 0.743, pos: 0.14, 
compound: 0.9936, neg: 0.081, neu: 0.772, pos: 0.147, 
compound: -0.9445, neg: 0.108, neu: 0.793, pos: 0.099, 
compound: 0.9981, neg: 0.069, neu: 0.747, pos: 0.184, 
compound: 0.9806, neg: 0.114, neu: 0.728, pos: 0.157, 
compound: -0.9608, neg: 0.139, neu: 0.754, pos: 0.107, 
compound: -0.8738, neg: 0.105, neu: 0.799, pos: 0.096, 
compound: 0.9912, neg: 0.086, neu: 0.696, pos: 0.218, 
compound: 0.9935, neg: 0.073, neu: 0.76, pos: 0.167, 
compound: 0.2839, neg: 0.093, neu: 0.813, pos: 0.095, 
compound: -0.7895, neg: 0.08, neu: 0.852, pos: 0.068, 
compound: 0.9016, neg: 0.07, neu: 0.851, pos: 0.08, 
compound: 0.9936, neg: 0.078, neu: 0.791, pos: 0.13, 
compound: 0.9751, neg: 0.097, neu: 0.769, pos: 0.134, 
compound: 0.4829, neg: 0.124, neu: 0.743, pos: 0.133, 
compound: -0.9921, neg: 0.17, neu: 0.734, pos: 0.096, 
compound: 0.

compound: 0.9916, neg: 0.114, neu: 0.733, pos: 0.153, 
compound: 0.8376, neg: 0.138, neu: 0.705, pos: 0.157, 
compound: 0.9687, neg: 0.13, neu: 0.711, pos: 0.159, 
compound: 0.9073, neg: 0.107, neu: 0.756, pos: 0.138, 
compound: 0.9782, neg: 0.094, neu: 0.768, pos: 0.138, 
compound: -0.9838, neg: 0.152, neu: 0.715, pos: 0.133, 
compound: 0.9619, neg: 0.091, neu: 0.784, pos: 0.125, 
compound: 0.977, neg: 0.044, neu: 0.838, pos: 0.118, 
compound: 0.9985, neg: 0.057, neu: 0.774, pos: 0.169, 
compound: 0.9989, neg: 0.077, neu: 0.753, pos: 0.17, 
compound: 0.8784, neg: 0.144, neu: 0.697, pos: 0.159, 
compound: 0.9916, neg: 0.119, neu: 0.713, pos: 0.168, 
compound: 0.9988, neg: 0.107, neu: 0.712, pos: 0.18, 
compound: -0.9191, neg: 0.154, neu: 0.707, pos: 0.139, 
compound: 0.9993, neg: 0.041, neu: 0.804, pos: 0.155, 
compound: -0.9614, neg: 0.13, neu: 0.803, pos: 0.068, 
compound: 0.9892, neg: 0.07, neu: 0.809, pos: 0.121, 
compound: -0.9971, neg: 0.123, neu: 0.822, pos: 0.055, 
compound: -0

compound: 0.9923, neg: 0.057, neu: 0.821, pos: 0.122, 
compound: 0.9847, neg: 0.087, neu: 0.76, pos: 0.153, 
compound: -0.9787, neg: 0.13, neu: 0.8, pos: 0.07, 
compound: -0.9926, neg: 0.17, neu: 0.736, pos: 0.094, 
compound: 0.9893, neg: 0.053, neu: 0.787, pos: 0.16, 
compound: 0.9659, neg: 0.097, neu: 0.783, pos: 0.12, 
compound: 0.9993, neg: 0.038, neu: 0.782, pos: 0.181, 
compound: -0.9796, neg: 0.11, neu: 0.802, pos: 0.087, 
compound: 0.9982, neg: 0.078, neu: 0.76, pos: 0.162, 
compound: 0.9969, neg: 0.058, neu: 0.771, pos: 0.171, 
compound: 0.9442, neg: 0.079, neu: 0.788, pos: 0.133, 
compound: 0.9924, neg: 0.071, neu: 0.79, pos: 0.139, 
compound: 0.9578, neg: 0.081, neu: 0.82, pos: 0.099, 
compound: -0.9114, neg: 0.141, neu: 0.738, pos: 0.12, 
compound: -0.9734, neg: 0.121, neu: 0.812, pos: 0.067, 
compound: -0.8375, neg: 0.085, neu: 0.827, pos: 0.088, 
compound: -0.9871, neg: 0.174, neu: 0.675, pos: 0.151, 
compound: -0.9538, neg: 0.144, neu: 0.717, pos: 0.138, 
compound: 0.999

compound: 0.9954, neg: 0.048, neu: 0.83, pos: 0.122, 
compound: -0.8655, neg: 0.124, neu: 0.794, pos: 0.081, 
compound: -0.949, neg: 0.109, neu: 0.792, pos: 0.099, 
compound: 0.9945, neg: 0.102, neu: 0.694, pos: 0.204, 
compound: 0.9985, neg: 0.08, neu: 0.7, pos: 0.22, 
compound: -0.9982, neg: 0.254, neu: 0.678, pos: 0.067, 
compound: -0.9919, neg: 0.137, neu: 0.75, pos: 0.113, 
compound: 0.9982, neg: 0.06, neu: 0.752, pos: 0.187, 
compound: 0.9958, neg: 0.072, neu: 0.781, pos: 0.147, 
compound: 0.9971, neg: 0.1, neu: 0.734, pos: 0.166, 
compound: -0.9381, neg: 0.115, neu: 0.779, pos: 0.106, 
compound: 0.9997, neg: 0.058, neu: 0.725, pos: 0.217, 
compound: -0.9628, neg: 0.157, neu: 0.714, pos: 0.129, 
compound: -0.9617, neg: 0.13, neu: 0.778, pos: 0.092, 
compound: 0.9993, neg: 0.046, neu: 0.765, pos: 0.188, 
compound: 0.997, neg: 0.055, neu: 0.768, pos: 0.177, 
compound: 0.9991, neg: 0.049, neu: 0.749, pos: 0.202, 
compound: 0.9953, neg: 0.044, neu: 0.81, pos: 0.146, 
compound: -0.424

compound: 0.9799, neg: 0.088, neu: 0.786, pos: 0.126, 
compound: 0.9411, neg: 0.038, neu: 0.859, pos: 0.103, 
compound: 0.9965, neg: 0.037, neu: 0.794, pos: 0.17, 
compound: -0.7291, neg: 0.108, neu: 0.798, pos: 0.094, 
compound: 0.9936, neg: 0.103, neu: 0.749, pos: 0.148, 
compound: -0.8584, neg: 0.164, neu: 0.677, pos: 0.16, 
compound: 0.9989, neg: 0.099, neu: 0.737, pos: 0.164, 
compound: -0.9989, neg: 0.177, neu: 0.771, pos: 0.051, 
compound: 0.9994, neg: 0.08, neu: 0.729, pos: 0.192, 
compound: 0.9962, neg: 0.051, neu: 0.81, pos: 0.139, 
compound: -0.876, neg: 0.149, neu: 0.706, pos: 0.145, 
compound: 0.9649, neg: 0.059, neu: 0.809, pos: 0.131, 
compound: -0.8056, neg: 0.114, neu: 0.771, pos: 0.115, 
compound: 0.9989, neg: 0.105, neu: 0.699, pos: 0.196, 
compound: -0.9753, neg: 0.128, neu: 0.778, pos: 0.094, 
compound: 0.9995, neg: 0.039, neu: 0.74, pos: 0.221, 
compound: 0.9908, neg: 0.086, neu: 0.758, pos: 0.155, 
compound: -0.9873, neg: 0.133, neu: 0.762, pos: 0.105, 
compound:

compound: -0.8146, neg: 0.146, neu: 0.721, pos: 0.134, 
compound: 0.4503, neg: 0.116, neu: 0.758, pos: 0.126, 
compound: 0.9578, neg: 0.05, neu: 0.836, pos: 0.114, 
compound: -0.479, neg: 0.153, neu: 0.695, pos: 0.152, 
compound: -0.9873, neg: 0.205, neu: 0.65, pos: 0.146, 
compound: -0.9941, neg: 0.106, neu: 0.864, pos: 0.03, 
compound: 0.9693, neg: 0.078, neu: 0.816, pos: 0.106, 
compound: 0.9958, neg: 0.081, neu: 0.709, pos: 0.21, 
compound: 0.9237, neg: 0.077, neu: 0.813, pos: 0.11, 
compound: 0.9937, neg: 0.062, neu: 0.823, pos: 0.115, 
compound: 0.9981, neg: 0.094, neu: 0.769, pos: 0.137, 
compound: 0.7476, neg: 0.101, neu: 0.791, pos: 0.107, 
compound: 0.7598, neg: 0.1, neu: 0.78, pos: 0.121, 
compound: -0.9575, neg: 0.131, neu: 0.761, pos: 0.108, 
compound: -0.9644, neg: 0.137, neu: 0.741, pos: 0.122, 
compound: -0.995, neg: 0.232, neu: 0.659, pos: 0.109, 
compound: 0.9815, neg: 0.058, neu: 0.826, pos: 0.116, 
compound: -0.9805, neg: 0.11, neu: 0.822, pos: 0.068, 
compound: -0.

compound: 0.9998, neg: 0.072, neu: 0.663, pos: 0.265, 
compound: 0.8839, neg: 0.096, neu: 0.795, pos: 0.109, 
compound: 0.9989, neg: 0.072, neu: 0.776, pos: 0.151, 
compound: 0.9489, neg: 0.083, neu: 0.773, pos: 0.144, 
compound: 0.994, neg: 0.047, neu: 0.828, pos: 0.125, 
compound: 0.9977, neg: 0.045, neu: 0.782, pos: 0.173, 
compound: 0.9566, neg: 0.095, neu: 0.741, pos: 0.164, 
compound: 0.997, neg: 0.05, neu: 0.798, pos: 0.152, 
compound: -0.9925, neg: 0.149, neu: 0.773, pos: 0.078, 
compound: 0.9963, neg: 0.071, neu: 0.772, pos: 0.157, 
compound: 0.8887, neg: 0.037, neu: 0.863, pos: 0.1, 
compound: 0.996, neg: 0.106, neu: 0.731, pos: 0.163, 
compound: 0.9572, neg: 0.046, neu: 0.845, pos: 0.109, 
compound: 0.9989, neg: 0.11, neu: 0.589, pos: 0.3, 
compound: 0.8718, neg: 0.113, neu: 0.76, pos: 0.128, 
compound: 0.9951, neg: 0.027, neu: 0.814, pos: 0.159, 
compound: 0.9535, neg: 0.078, neu: 0.776, pos: 0.146, 
compound: 0.9917, neg: 0.069, neu: 0.824, pos: 0.107, 
compound: 0.9987, n

compound: 0.9901, neg: 0.11, neu: 0.75, pos: 0.14, 
compound: 0.6289, neg: 0.141, neu: 0.705, pos: 0.154, 
compound: 0.3639, neg: 0.122, neu: 0.735, pos: 0.143, 
compound: 0.7655, neg: 0.129, neu: 0.728, pos: 0.143, 
compound: 0.9976, neg: 0.048, neu: 0.798, pos: 0.153, 
compound: 0.7311, neg: 0.106, neu: 0.759, pos: 0.135, 
compound: 0.9746, neg: 0.083, neu: 0.802, pos: 0.116, 
compound: 0.9992, neg: 0.06, neu: 0.783, pos: 0.157, 
compound: 0.9993, neg: 0.047, neu: 0.724, pos: 0.229, 
compound: -0.3804, neg: 0.13, neu: 0.738, pos: 0.132, 
compound: -0.9692, neg: 0.125, neu: 0.786, pos: 0.09, 
compound: 0.9863, neg: 0.081, neu: 0.807, pos: 0.112, 
compound: 0.9902, neg: 0.069, neu: 0.817, pos: 0.114, 
compound: -0.6316, neg: 0.12, neu: 0.758, pos: 0.122, 
compound: 0.9871, neg: 0.11, neu: 0.753, pos: 0.137, 
compound: 0.9891, neg: 0.092, neu: 0.766, pos: 0.143, 
compound: 0.9955, neg: 0.073, neu: 0.78, pos: 0.147, 
compound: 0.9988, neg: 0.046, neu: 0.747, pos: 0.207, 
compound: -0.995

compound: 0.9982, neg: 0.111, neu: 0.728, pos: 0.16, 
compound: 0.997, neg: 0.102, neu: 0.69, pos: 0.208, 
compound: -0.4284, neg: 0.091, neu: 0.832, pos: 0.077, 
compound: -0.9795, neg: 0.149, neu: 0.739, pos: 0.112, 
compound: 0.9985, neg: 0.086, neu: 0.755, pos: 0.159, 
compound: -0.9202, neg: 0.108, neu: 0.797, pos: 0.095, 
compound: 0.9962, neg: 0.072, neu: 0.766, pos: 0.161, 
compound: 0.9673, neg: 0.101, neu: 0.774, pos: 0.125, 
compound: 0.736, neg: 0.086, neu: 0.785, pos: 0.129, 
compound: 0.8893, neg: 0.075, neu: 0.834, pos: 0.091, 
compound: 0.9295, neg: 0.098, neu: 0.782, pos: 0.12, 
compound: 0.9315, neg: 0.107, neu: 0.779, pos: 0.114, 
compound: 0.9701, neg: 0.084, neu: 0.785, pos: 0.131, 
compound: 0.8771, neg: 0.091, neu: 0.796, pos: 0.113, 
compound: 0.8816, neg: 0.117, neu: 0.747, pos: 0.136, 
compound: -0.8929, neg: 0.162, neu: 0.693, pos: 0.146, 
compound: 0.9975, neg: 0.063, neu: 0.813, pos: 0.123, 
compound: 0.9685, neg: 0.115, neu: 0.734, pos: 0.151, 
compound: 0

compound: 0.9893, neg: 0.064, neu: 0.793, pos: 0.143, 
compound: 0.9831, neg: 0.062, neu: 0.835, pos: 0.103, 
compound: 0.998, neg: 0.068, neu: 0.757, pos: 0.175, 
compound: 0.9977, neg: 0.084, neu: 0.748, pos: 0.169, 
compound: 0.913, neg: 0.098, neu: 0.779, pos: 0.123, 
compound: -0.8235, neg: 0.135, neu: 0.727, pos: 0.138, 
compound: 0.9074, neg: 0.1, neu: 0.788, pos: 0.112, 
compound: 0.9937, neg: 0.115, neu: 0.68, pos: 0.206, 
compound: 0.9931, neg: 0.068, neu: 0.791, pos: 0.141, 
compound: -0.5825, neg: 0.108, neu: 0.805, pos: 0.088, 
compound: 0.9914, neg: 0.108, neu: 0.734, pos: 0.157, 
compound: 0.9598, neg: 0.058, neu: 0.815, pos: 0.127, 
compound: 0.9995, neg: 0.091, neu: 0.705, pos: 0.204, 
compound: -0.9596, neg: 0.162, neu: 0.692, pos: 0.146, 
compound: 0.9993, neg: 0.068, neu: 0.78, pos: 0.152, 
compound: -0.964, neg: 0.162, neu: 0.748, pos: 0.09, 
compound: 0.9986, neg: 0.073, neu: 0.764, pos: 0.163, 
compound: 0.9771, neg: 0.072, neu: 0.795, pos: 0.133, 
compound: 0.99

compound: 0.996, neg: 0.1, neu: 0.735, pos: 0.165, 
compound: 0.1475, neg: 0.107, neu: 0.783, pos: 0.11, 
compound: 0.9886, neg: 0.085, neu: 0.772, pos: 0.143, 
compound: 0.9866, neg: 0.104, neu: 0.742, pos: 0.154, 
compound: -0.9957, neg: 0.241, neu: 0.691, pos: 0.069, 
compound: 0.9893, neg: 0.097, neu: 0.773, pos: 0.129, 
compound: 0.9914, neg: 0.082, neu: 0.79, pos: 0.128, 
compound: -0.9497, neg: 0.126, neu: 0.755, pos: 0.119, 
compound: 0.8414, neg: 0.08, neu: 0.83, pos: 0.09, 
compound: 0.9961, neg: 0.049, neu: 0.826, pos: 0.125, 
compound: 0.1165, neg: 0.139, neu: 0.727, pos: 0.134, 
compound: 0.9127, neg: 0.13, neu: 0.722, pos: 0.148, 
compound: 0.8134, neg: 0.097, neu: 0.794, pos: 0.11, 
compound: 0.9431, neg: 0.113, neu: 0.743, pos: 0.144, 
compound: 0.9945, neg: 0.094, neu: 0.77, pos: 0.136, 
compound: 0.9893, neg: 0.099, neu: 0.776, pos: 0.125, 
compound: 0.9939, neg: 0.104, neu: 0.727, pos: 0.169, 
compound: -0.8652, neg: 0.139, neu: 0.729, pos: 0.132, 
compound: 0.9701, 

compound: 0.9803, neg: 0.118, neu: 0.726, pos: 0.156, 
compound: 0.9963, neg: 0.036, neu: 0.787, pos: 0.177, 
compound: -0.9779, neg: 0.153, neu: 0.809, pos: 0.038, 
compound: -0.951, neg: 0.124, neu: 0.798, pos: 0.078, 
compound: 0.9892, neg: 0.118, neu: 0.723, pos: 0.159, 
compound: 0.9982, neg: 0.08, neu: 0.709, pos: 0.212, 
compound: 0.9977, neg: 0.13, neu: 0.701, pos: 0.169, 
compound: -0.198, neg: 0.113, neu: 0.789, pos: 0.098, 
compound: -0.984, neg: 0.159, neu: 0.731, pos: 0.11, 
compound: 0.9975, neg: 0.04, neu: 0.776, pos: 0.184, 
compound: -0.9913, neg: 0.143, neu: 0.738, pos: 0.119, 
compound: 0.9845, neg: 0.085, neu: 0.791, pos: 0.124, 
compound: -0.9532, neg: 0.114, neu: 0.791, pos: 0.095, 
compound: -0.1753, neg: 0.088, neu: 0.829, pos: 0.083, 
compound: 0.9983, neg: 0.052, neu: 0.759, pos: 0.188, 
compound: 0.9917, neg: 0.052, neu: 0.804, pos: 0.144, 
compound: -0.9957, neg: 0.149, neu: 0.759, pos: 0.092, 
compound: 0.9381, neg: 0.095, neu: 0.784, pos: 0.122, 
compound: