In [86]:
%load_ext autoreload
%autoreload 2

import csv
import math
from utility_functions import *
from component_functions import *


def classifier(review, cls, vblry):
    """
        Calculates the likelihood component of a review.
    """
    
    # sanitize review
    review = review.strip(' \n\t')
    for ch in [',', ';', '.', '"', '!', '(', ')', ':', '/', "-", '\\' ]:
        if ch in review:
            review = review.replace(ch, ' ')
    lst = list(review.strip().split())
    
    likelihood = 1
    for word in lst:
        likelihood*=likelihoodOfWordInCls(word, cls)

    return likelihood
        
    


def naive_bayes_classifier(testdoc):
    """
        This function takes a testdoc.txt file containing a several sentences each sentence representing a review.
        and returns a result.txt file containing assigned categories for each review, whether a positive review or negative.
    """
    
    # training dataset
    vocabulary = '../sentiment-labelled-sentences/mainDataset.csv'
    
    # classes used
    cls = [0, 1]
    
    
    with open(testdoc, 'r') as f1, open('../sentiment-labelled-sentences/result.txt', 'w') as f2:
        for review in f1:
            prob = []
            for c in cls:
                # calculating prior probability
                prior_prob = priorProb(vocabulary, c)
                # calculating likelihood
                likelihoodOfReview = classifier(review, c, vocabulary)
                # find the probability of review in a class
                prob_of_review_in_cls = likelihoodOfReview * prior_prob
                # append to list
                prob.append(prob_of_review_in_cls)
                
            # compare and select category of review
            f2.write(str(prob.index(max(prob))) + '\n')
            
    
    
    

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [87]:
# Testing countWrdsOfVocInCls function in component_functions.py
countWrdsOfVocInCls('../sentiment-labelled-sentences/mainDataset.csv', 1)

23098

In [88]:
# Testing likelihoodOfWordInCls in component_functions.py
likelihoodOfWordInCls('the', 1)

-4.363574377010446

In [89]:
# Testing priorProb function from component_functions.py
priorProb('../sentiment-labelled-sentences/mainDataset.csv', 1)

-0.3010299956639812

In [90]:
# Testing function that calculates likehood of a review in a class
classifier('young man.', 1, '../sentiment-labelled-sentences/mainDataset.csv')

19.0407813437021

In [None]:
# Testing the naive bayes classifier
naive_bayes_classifier('../sentiment-labelled-sentences/test.txt')

In [19]:
#Testing function that converts .txt to .csv format
convertToCSV('../sentiment-labelled-sentences/mainDataset.txt', '../sentiment-labelled-sentences/mainDataset.csv')

In [33]:
createList('../sentiment-labelled-sentences/mainDataset.csv')

[['A',
  'very',
  'very',
  'very',
  'slow',
  'moving',
  'aimless',
  'movie',
  'about',
  'a',
  'distressed',
  'drifting',
  'young',
  'man',
  '0'],
 ['Not',
  'sure',
  'who',
  'was',
  'more',
  'lost',
  'the',
  'flat',
  'characters',
  'or',
  'the',
  'audience',
  'nearly',
  'half',
  'of',
  'whom',
  'walked',
  'out',
  '0'],
 ['Attempting',
  'artiness',
  'with',
  'black',
  '&',
  'white',
  'and',
  'clever',
  'camera',
  'angles',
  'the',
  'movie',
  'disappointed',
  'became',
  'even',
  'more',
  'ridiculous',
  'as',
  'the',
  'acting',
  'was',
  'poor',
  'and',
  'the',
  'plot',
  'and',
  'lines',
  'almost',
  'non',
  'existent',
  '0'],
 ['Very', 'little', 'music', 'or', 'anything', 'to', 'speak', 'of', '0'],
 ['The',
  'best',
  'scene',
  'in',
  'the',
  'movie',
  'was',
  'when',
  'Gerardo',
  'is',
  'trying',
  'to',
  'find',
  'a',
  'song',
  'that',
  'keeps',
  'running',
  'through',
  'his',
  'head',
  '1'],
 ['The',
  'rest'

In [24]:
createDict('../sentiment-labelled-sentences/mainDataset.csv', 1)

['the', 'best', 'scene', 'in', 'the', 'movie', 'was', 'when', 'gerardo', 'is', 'trying', 'to', 'find', 'a', 'song', 'that', 'keeps', 'running', 'through', 'his', 'head']
['saw', 'the', 'movie', 'today', 'and', 'thought', 'it', 'was', 'a', 'good', 'effort', 'good', 'messages', 'for', 'kids']
['loved', 'the', 'casting', 'of', 'jimmy', 'buffet', 'as', 'the', 'science', 'teacher']
['and', 'those', 'baby', 'owls', 'were', 'adorable']
['the', 'movie', 'showed', 'a', 'lot', 'of', 'florida', 'at', "it's", 'best', 'made', 'it', 'look', 'very', 'appealing']
['the', 'songs', 'were', 'the', 'best', 'and', 'the', 'muppets', 'were', 'so', 'hilarious']
['it', 'was', 'so', 'cool']
['this', 'is', 'a', 'very', 'right', 'on', 'case', 'movie', 'that', 'delivers', 'everything', 'almost', 'right', 'in', 'your', 'face']
['this', 'review', 'is', 'long', 'overdue', 'since', 'i', 'consider', 'a', 'tale', 'of', 'two', 'sisters', 'to', 'be', 'the', 'single', 'greatest', 'film', 'ever', 'made']
["i'll", 'put', 'th

['i', 'got', 'this', 'phone', 'on', 'reccomendation', 'from', 'a', 'relative', 'and', "i'm", 'glad', 'i', 'did']
['i', 'had', 'absolutely', 'no', 'problem', 'with', 'this', 'headset', 'linking', 'to', 'my', '8530', 'blackberry', 'curve']
['i', 'know', 'that', 'sounds', 'funny', 'but', 'to', 'me', 'it', 'seemed', 'like', 'sketchy', 'technology', 'that', "wouldn't", 'work', 'well', 'well', 'this', 'one', 'works', 'great']
['this', 'phone', 'is', 'very', 'fast', 'with', 'sending', 'any', 'kind', 'of', 'messages', 'and', 'web', 'browsing', 'is', 'significantly', 'faster', 'than', 'previous', 'phones', 'i', 'have', 'used']
['nice', 'quality', 'build', 'unlike', 'some', 'cheap', 's***', 'out', 'there']
['this', 'item', 'is', 'fantastic', 'and', 'works', 'perfectly']
['and', 'i', 'just', 'love', 'the', 'colors']
['w810i', 'is', 'just', 'superb']
['great', 'charger']
['works', 'like', 'a', 'charm', 'it', 'work', 'the', 'same', 'as', 'the', 'one', 'i', 'got', 'with', 'the', 'phone']
['great', '

['it', 'plays', 'louder', 'than', 'any', 'other', 'speaker', 'of', 'this', 'size', 'the', 'price', 'is', 'so', 'low', 'that', 'most', 'would', 'think', 'the', 'quality', 'is', 'lacking', 'however', "it's", 'not']
['better', 'than', 'expected']
['battery', 'is', 'holding', 'up', 'well']
['linked', 'to', 'my', 'phone', 'without', 'effort']
['they', 'are', 'so', 'cool']
['easy', 'to', 'use']
['better', 'than', 'new']
['a', 'pretty', 'good', 'product']
['this', 'is', 'a', 'great', 'phone']
['for', 'the', 'price', 'on', 'amazon', 'it', 'is', 'an', 'excellent', 'product', 'which', 'i', 'would', 'highly', 'recommend']
['their', 'research', 'and', 'development', 'division', 'obviously', 'knows', 'what', "they're", 'doing']
['those', 'phones', 'are', 'working', 'just', 'fine', 'now']
['i', 'am', 'more', 'than', 'happy', 'with', 'this', 'product']
['overall', 'i', 'would', 'recommend', 'this', 'phone', 'over', 'the', 'new', 'walkman']
['it', 'fits', 'my', 'ear', 'well', 'and', 'is', 'comfortable

['if', 'you', 'love', 'authentic', 'mexican', 'food', 'and', 'want', 'a', 'whole', 'bunch', 'of', 'interesting', 'yet', 'delicious', 'meats', 'to', 'choose', 'from', 'you', 'need', 'to', 'try', 'this', 'place']
['an', 'excellent', 'new', 'restaurant', 'by', 'an', 'experienced', 'frenchman']
['great', 'steak', 'great', 'sides', 'great', 'wine', 'amazing', 'desserts']
['the', 'steak', 'and', 'the', 'shrimp', 'are', 'in', 'my', 'opinion', 'the', 'best', 'entrees', 'at', 'gc']
['i', 'had', 'the', 'opportunity', 'today', 'to', 'sample', 'your', 'amazing', 'pizzas']
['the', 'yellowtail', 'carpaccio', 'was', 'melt', 'in', 'your', 'mouth', 'fresh']
['just', 'spicy', 'enough', 'perfect', 'actually']
['last', 'night', 'was', 'my', 'second', 'time', 'dining', 'here', 'and', 'i', 'was', 'so', 'happy', 'i', 'decided', 'to', 'go', 'back']
['my', 'boyfriend', 'and', 'i', 'came', 'here', 'for', 'the', 'first', 'time', 'on', 'a', 'recent', 'trip', 'to', 'vegas', 'and', 'could', 'not', 'have', 'been', '

Counter({'the': 996,
         'best': 63,
         'scene': 7,
         'in': 206,
         'movie': 85,
         'was': 257,
         'when': 36,
         'gerardo': 1,
         'is': 417,
         'trying': 2,
         'to': 307,
         'find': 10,
         'a': 467,
         'song': 4,
         'that': 126,
         'keeps': 2,
         'running': 3,
         'through': 8,
         'his': 30,
         'head': 2,
         'saw': 9,
         'today': 5,
         'and': 674,
         'thought': 12,
         'it': 317,
         'good': 177,
         'effort': 2,
         'messages': 2,
         'for': 159,
         'kids': 7,
         'loved': 21,
         'casting': 5,
         'of': 310,
         'jimmy': 2,
         'buffet': 8,
         'as': 86,
         'science': 1,
         'teacher': 1,
         'those': 10,
         'baby': 2,
         'owls': 1,
         'were': 54,
         'adorable': 6,
         'showed': 3,
         'lot': 15,
         'florida': 1,
         'at': 47,
 