## Sentiment Analysis of IMDB Reviews using Naive Bayes

This dataset is much larger and more complex than the Rotten Tomatoes dataset.

#### Some imports to make code compatible with Python 2 as well as 3

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import os
import re
import tarfile

In [3]:
from six.moves import urllib

#### Import nltk, the Natural Language Processing Toolkit

This is one of the most popular packages for natural language processing on text data. It has APIs to access a large corpus of documents and other lexical resources

In [4]:
import numpy as np
import nltk

In [5]:
print(np.__version__)
print(nltk.__version__)

1.14.3
3.3


#### Automate the download, unzip and untar of the reviews dataset

The tarred and gzipped file is stored in the same directory as the code

In [10]:
DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    """Fetch the dataset archive from *url_path* unless it is already on disk.

    The archive is saved as ``DOWNLOADED_FILENAME`` relative to the current
    working directory. When a file with that name already exists, no network
    request is made. Two status lines are printed in either case.

    Note: despite the wording of the first status line, no checksum or size
    verification of the file is performed here.

    Args:
        url_path: URL of the archive to download.
    """
    already_present = os.path.exists(DOWNLOADED_FILENAME)
    if not already_present:
        urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)

    status_lines = (
        ('Found and verified file from this path: ', url_path),
        ('Downloaded file: ', DOWNLOADED_FILENAME),
    )
    for caption, value in status_lines:
        print(caption, value)

#### Clean up the reviews by removing special characters

In [11]:
# Matches every run of characters that is not an ASCII letter, digit or space.
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")


def get_reviews(dirname, positive=True):
    """Load and clean every ``.txt`` review found directly inside *dirname*.

    Each file is read as UTF-8, lower-cased, has its ``<br />`` tags replaced
    by a space, and has every character other than ASCII letters, digits and
    spaces removed.

    Args:
        dirname: Directory containing the review files. A trailing path
            separator is accepted but not required.
        positive: When true the reviews are labelled ``1``, otherwise ``0``.

    Returns:
        A list of ``(review_text, label)`` tuples, in the order returned by
        ``os.listdir`` (which is arbitrary).
    """
    # Local import keeps this block self-contained; io.open accepts an
    # ``encoding`` argument on both Python 2 and Python 3.
    import io

    label = 1 if positive else 0

    reviews = []
    for filename in os.listdir(dirname):
        if not filename.endswith(".txt"):
            continue

        # Read-only text mode: decoding happens once at the I/O boundary, so
        # no ``.decode()`` call is needed (and ``str`` has none on Python 3).
        path = os.path.join(dirname, filename)
        with io.open(path, 'r', encoding='utf-8') as f:
            review = f.read()

        review = review.lower().replace("<br />", " ")
        review = TOKEN_REGEX.sub('', review)

        # Pair the cleaned text with the positive/negative label
        reviews.append((review, label))

    return reviews

def extract_reviews():
    """Unpack the downloaded archive if needed and load the training reviews.

    The archive named by ``DOWNLOADED_FILENAME`` is extracted into the current
    working directory only when the ``aclImdb`` directory does not exist yet.

    Returns:
        A ``(positive_reviews, negative_reviews)`` pair, each being the list
        produced by ``get_reviews`` for ``aclImdb/train/pos/`` and
        ``aclImdb/train/neg/`` respectively.
    """
    # If the file has not already been extracted
    if not os.path.exists('aclImdb'):
        # The ``with`` statement closes the archive on exit, so no explicit
        # close() call is needed.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use it on archives from a trusted source.
        with tarfile.open(DOWNLOADED_FILENAME) as tar:
            tar.extractall()

    positive_reviews = get_reviews("aclImdb/train/pos/", positive=True)
    negative_reviews = get_reviews("aclImdb/train/neg/", positive=False)

    return positive_reviews, negative_reviews

In [12]:
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

download_file(URL_PATH)

('Found and verified file from this path: ', 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
('Downloaded file: ', 'ImdbReviews.tar.gz')


In [90]:
positive_reviews, negative_reviews = extract_reviews()

In [91]:
positive_reviews[:2]

[(u'bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt',
  1),
 (u'homelessness or houselessness as george carlin stated has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school work or vote for the m

In [115]:
# Number of reviews taken from EACH class (positive and negative) for training
TRAIN_DATA = 5000
# Upper slice bound for the held-out reviews: items TRAIN_DATA..TOTAL_DATA-1
# of each class (up to 1000 per class) are used for testing
TOTAL_DATA = 6000

# Training set: the first TRAIN_DATA positive reviews followed by the first
# TRAIN_DATA negative reviews
train_reviews = positive_reviews[:TRAIN_DATA] + negative_reviews[:TRAIN_DATA]

# Test reviews are kept in separate per-class lists and do not overlap with
# the training slices above
test_positive_reviews = positive_reviews[TRAIN_DATA:TOTAL_DATA]
test_negative_reviews = negative_reviews[TRAIN_DATA:TOTAL_DATA]

#### Get a list of all the unique words in the dataset, the vocabulary

In [116]:
def get_vocabulary(train_reviews):
    """Return the distinct whitespace-separated words used across all reviews.

    Args:
        train_reviews: Iterable of items whose first element is the review
            text (e.g. ``(review_text, label)`` tuples).

    Returns:
        A list holding each distinct word exactly once, in no particular
        order.
    """
    distinct_words = {
        token
        for entry in train_reviews
        for token in entry[0].split()
    }
    return list(distinct_words)

vocabulary = get_vocabulary(train_reviews)

In [117]:
len(vocabulary)

66056

In [118]:
vocabulary[:5]

[u'fawn', u'tsukino', u'4000odd', u'clichewise', u'mclendoncovey']

### Represent the words in the review as a feature vector

* *review_text* The review in text form

Each review is represented as a dictionary whose keys are all the words in the vocabulary. The value associated with each key is True if the word is present in the review and False otherwise.

In [4]:
def extract_features(review_text):
    """Build the presence/absence feature dictionary for one review.

    Args:
        review_text: The review as a single string; it is split on
            whitespace to obtain its words.

    Returns:
        A dict with one key per word in the module-level ``vocabulary``,
        mapped to ``True`` when that word occurs in the review and ``False``
        otherwise.
    """
    # A set gives constant-time membership tests for each vocabulary word
    words_in_review = set(review_text.split())
    return {term: (term in words_in_review) for term in vocabulary}

#### Map feature vector to labels

* *extract_features* Function to extract the features in feature vector form
* *train_reviews* Training dataset, a list of tuples of the form (review_text, label)

In [130]:
train_features = nltk.classify.apply_features(extract_features, train_reviews)

#### Train the classifier on the training data

In [131]:
trained_classifier = nltk.NaiveBayesClassifier.train(train_features)

In [5]:
def sentiment_calculator(review_text):
    """Classify a single review with the trained Naive Bayes classifier.

    Args:
        review_text: The review as a string. It is split on whitespace and
            matched against the vocabulary as-is, so text should already be
            normalized the same way as the training reviews.

    Returns:
        The label returned by ``trained_classifier.classify`` for the
        review's feature dictionary.
    """
    # Use the function's own argument; the previous code referenced an
    # undefined name (``review_words``) instead of ``review_text``.
    features = extract_features(review_text)
    return trained_classifier.classify(features)

In [133]:
sentiment_calculator("What an amazing movie!")

0

In [134]:
sentiment_calculator("What a terrible movie")

0

#### Classify and measure the accuracy of the model on test data

In [135]:
def classify_test_reviews(test_positive_reviews, test_negative_reviews, sentiment_calculator):
    """Classify the held-out reviews and print per-class and overall accuracy.

    Args:
        test_positive_reviews: Sequence of items whose first element is the
            text of a review known to be positive.
        test_negative_reviews: Same, for reviews known to be negative.
        sentiment_calculator: Callable taking the review text and returning
            a numeric label; a value ``> 0`` counts as a positive prediction
            and ``0`` as a negative prediction.

    Returns:
        None. Three lines are printed: accuracy on the positive reviews,
        accuracy on the negative reviews, and overall accuracy, each as a
        percentage with two decimals. An empty set is reported as 0.00%.
    """
    def _percent(hits, count):
        # Float arithmetic regardless of ``__future__`` division, and no
        # ZeroDivisionError when a test set is empty.
        return 100.0 * hits / count if count else 0.0

    positive_results = [sentiment_calculator(review[0]) for review in test_positive_reviews]
    negative_results = [sentiment_calculator(review[0]) for review in test_negative_reviews]

    true_positives = sum(1 for x in positive_results if x > 0)
    true_negatives = sum(1 for x in negative_results if x == 0)

    total_accurate = true_positives + true_negatives
    total = len(positive_results) + len(negative_results)

    print("Accuracy on positive reviews = %.2f%%" % _percent(true_positives, len(positive_results)))
    print("Accuracy on negative reviews = %.2f%%" % _percent(true_negatives, len(negative_results)))
    print("Overall accuracy = %.2f%%" % _percent(total_accurate, total))

In [136]:
classify_test_reviews(test_positive_reviews, test_negative_reviews, sentiment_calculator)

Accuracy on positive reviews = 74.70%
Accurance on negative reviews = 81.60%
Overall accuracy = 78.15%
