# Tutorial: Predicting Movie Review Sentiment with Naive Bayes

In [48]:
# A nice python class that lets you count how many times items occur in a list
from collections import Counter
import csv
import re

# Read in the training data
with open("Data/movie_sentiment_train.csv", 'r') as file:
    reviews = list(csv.reader(file))

#print(reviews)
    
def get_text(reviews, score):
      # Join together the text in the reviews for a particular tone.
      # We lowercase to avoid "Not" and "not" being seen as different words, for example.
    for r in reviews:
        return " ".join([r[0].lower() for r in reviews if r[1] == str(score)])

def count_text(text):
    # Split text into words based on whitespace. Simple but effective
    words = re.split("\s+", text)
    # Count up the occurence of each word
    return Counter(words)

negative_text = get_text(reviews, -1)
positive_text = get_text(reviews, 1)

# Generate word counts for negative tone
list_negative_counts = count_text(negative_text)

# Generate word counts for positive tone
list_positive_counts = count_text(positive_text)

print("Negative text sample: {0}".format(negative_text[:100]))
print("Positive text sample: {0}".format(positive_text[:100]))
print()

print("Negative Counts:",)
print(list_negative_counts)
print()
print("Positive Counts:")
print(list_positive_counts)

Negative text sample: a series of escapades demonstrating the adage that what is good for the goose is also good for the g
Positive text sample: this quiet , introspective and entertaining independent is worth seeking . a source of high hilarity

Negative Counts:
Counter({'a': 9, 'hard': 6, 'time': 6, 'sitting': 6, 'through': 6, 'this': 6, 'one': 6, 'of': 5, 'have': 5, ',': 4, '.': 4, 'would': 4, 'the': 3, 'is': 2, 'good': 2, 'for': 2, 'which': 2, 'i': 2, 'suspect': 2, 'series': 1, 'escapades': 1, 'demonstrating': 1, 'adage': 1, 'that': 1, 'what': 1, 'goose': 1, 'also': 1, 'gander': 1, 'some': 1, 'occasionally': 1, 'amuses': 1, 'but': 1, 'none': 1, 'amounts': 1, 'to': 1, 'much': 1, 'story': 1, 'even': 1, 'fans': 1, 'ismail': 1, 'merchant': 1, "'s": 1, 'work': 1, 'shakespearean': 1, 'tragedy': 1})

Positive Counts:
Counter({'and': 15, 'a': 14, ',': 10, '.': 8, 'of': 6, 'is': 5, 'this': 4, 'sweet': 4, 'modest': 4, 'ultimately': 4, 'winning': 4, 'story': 4, 'the': 4, 'be': 3, 'one': 3, 'p

In [49]:
import re
from collections import Counter

def get_y_count(score):
  # Compute the count of each classification occuring in the data.
  return len([r for r in reviews if r[1] == str(score)])

# We need these counts to use for smoothing when computing the prediction.
positive_review_count = get_y_count(1)
negative_review_count = get_y_count(-1)
#print("Positive Review Count = ", positive_review_count)
#print("Negative Review Count = ", negative_review_count)

# These are the class probabilities (we saw them in the formula as P(y)).
prob_positive = positive_review_count / len(reviews)
prob_negative = negative_review_count / len(reviews)
#print("prob_positive = ", positive_review_count, "/", len(reviews), " = ", prob_positive)
#print("prob_negative = ", negative_review_count, "/", len(reviews), " = ", prob_negative)

def make_class_prediction(text, counts, class_prob, class_count):
  prediction = 1
  text_counts = Counter(re.split("\s+", text))
  #print("Text Counts = ", text_counts)
  for word in text_counts:
      # For every word in the text, we get the number of times that word occured in the reviews for a given class, add 1 to smooth the value, and divide by the total number of words in the class (plus the class_count to also smooth the denominator).
      # Smoothing ensures that we don't multiply the prediction by 0 if the word didn't exist in the training data.
      # We also smooth the denominator counts to keep things even.
      #print(text_counts.get(word))
      prediction *=  text_counts.get(word) * ((counts.get(word, 0) + 1) / (sum(counts.values()) + class_count))
  # Now we multiply by the probability of the class existing in the documents.
  return prediction * class_prob

# As you can see, we can now generate probabilities for which class a given review is part of.
# The probabilities themselves aren't very useful -- we make our classification decision based on which value is greater.
print()
print("Review => {0}".format(reviews[0][0]))
print()
print("Negative prediction => {0}".format(make_class_prediction(reviews[0][0], list_negative_counts, prob_negative, negative_review_count)))
print()
print("Positive prediction => {0}".format(make_class_prediction(reviews[0][0], list_positive_counts, prob_positive, positive_review_count)))



Review => A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .

Negative prediction => 1.5336056959847352e-45

Positive prediction => 3.9540827896477066e-55


In [50]:
import csv

def make_decision(text, make_class_prediction):
    # Compute the negative and positive probabilities.
    negative_prediction = make_class_prediction(text, negative_counts, prob_negative, negative_review_count)
    positive_prediction = make_class_prediction(text, positive_counts, prob_positive, positive_review_count)

    # We assign a classification based on which probability is greater.
    if negative_prediction > positive_prediction:
      return -1
    return 1

with open("Data/movie_sentiment_test.csv", 'r') as file:
    test = list(csv.reader(file))

predictions = [make_decision(r[0], make_class_prediction) for r in test]

In [52]:
actual = [int(r[1]) for r in test]

from sklearn import metrics

# Generate the roc curve using scikit-learn.
fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)

# Measure the area under the curve.  The closer to 1, the "better" the predictions.
print("AUC of the predictions: {0}".format(metrics.auc(fpr, tpr)))

AUC of the predictions: 0.5


In [53]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

# Generate counts from text using a vectorizer.  There are other vectorizers available, and lots of options you can set.
# This performs our step of computing word counts.
vectorizer = CountVectorizer(stop_words='english')
train_features = vectorizer.fit_transform([r[0] for r in reviews])
test_features = vectorizer.transform([r[0] for r in test])

# Fit a naive bayes model to the training data.
# This will train the model using the word counts we computer, and the existing classifications in the training set.
nb = MultinomialNB()
nb.fit(train_features, [int(r[1]) for r in reviews])

# Now we can use the model to predict classifications for our test features.
predictions = nb.predict(test_features)

# Compute the error.  It is slightly different from our model because the internals of this process work differently from our implementation.
fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
print("Multinomial naive bayes AUC: {0}".format(metrics.auc(fpr, tpr)))

Multinomial naive bayes AUC: 0.75
