In [98]:
%load_ext autoreload
%autoreload 2

import csv
import math
from utility_functions import *
from component_functions import *
from combine_datasets import combineDataset


def classifier(review, cls, vblry):
    """
        Computes the likelihood component of a review for one class.

        Punctuation in the review is blanked out, the text is split into
        words, and the values returned by likelihoodOfWordInCls(word, cls)
        are added up. A review without words yields 0.

        `vblry` is accepted for the caller's convenience but is not used here.
    """
    # each of these characters is turned into a single space before splitting
    punctuation = ',;."!():/-\\?'
    blank_out = str.maketrans(punctuation, ' ' * len(punctuation))

    # split() without arguments also drops leading/trailing whitespace
    tokens = review.translate(blank_out).split()

    return sum(likelihoodOfWordInCls(token, cls) for token in tokens)
        
    


def naive_bayes_classifier(testdoc, cls, resultsdoc,
                           vocabulary='../sentiment-labelled-sentences/mainDataset.csv'):
    """
        Assigns a class to every review in a text file.

        Each line of `testdoc` is treated as one review. For every class c in
        `cls` the review is scored as
        classifier(review, c, vocabulary) + priorProb(vocabulary, c)
        and the class with the highest score is chosen; on a tie the class
        listed first in `cls` wins.

        Parameters:
            testdoc: path of the file holding one review per line.
            cls: the class labels to choose from, e.g. [0, 1].
            resultsdoc: path of the file that receives one predicted label
                per line; an existing file is overwritten.
            vocabulary: path of the training dataset passed on to priorProb
                and classifier. Defaults to the previously hard-coded CSV.

        Returns:
            A list with the predicted label of every review, in file order.

        Raises:
            ValueError: if `cls` is empty.
    """
    labels = list(cls)
    if not labels:
        # fail before resultsdoc is opened (and truncated) for writing
        raise ValueError("cls must contain at least one class")

    # The prior is independent of the review, so it is computed once per
    # class rather than once per review and class.
    # NOTE(review): assumes priorProb depends only on its arguments - confirm.
    priors = [priorProb(vocabulary, c) for c in labels]

    results = []
    with open(testdoc, 'r') as reviews, open(resultsdoc, 'w') as out:
        for review in reviews:
            # score of the review in every class: likelihood + prior
            scores = [classifier(review, c, vocabulary) + prior
                      for c, prior in zip(labels, priors)]

            # Translate the winning position back into its label; the bare
            # position is only a valid label when cls is exactly [0, 1, ...].
            category = labels[scores.index(max(scores))]

            results.append(category)
            out.write(str(category) + '\n')

    return results
            
    
def accuracy(testdoc, cls, resultsdoc, newtestdoc):
    """
        Measures the accuracy of the model on a labelled test file.

        Every non-blank line of `testdoc` must consist of the review text, a
        tab, and an integer label. The labels are collected, the review texts
        are written to `newtestdoc` (one per line, without labels), and that
        file is classified with naive_bayes_classifier. The return value is
        the fraction of reviews whose predicted class equals the known label.

        Parameters:
            testdoc: path of the labelled test file.
            cls: class labels handed to naive_bayes_classifier.
            resultsdoc: path handed to naive_bayes_classifier for its output.
            newtestdoc: path of the unlabelled copy; it is overwritten.

        Returns:
            Accuracy as a float between 0 and 1; 0.0 when `testdoc` holds no
            labelled reviews.

        Raises:
            ValueError: if a non-blank line has no tab or its label is not an
                integer.
    """
    # restructuring testdoc (with labels) into newtestdoc (without labels)
    known_results = []
    with open(testdoc, 'r') as labelled, open(newtestdoc, 'w') as unlabelled:
        for line_no, line in enumerate(labelled, start=1):
            if not line.strip():
                # a blank line carries neither a review nor a label
                continue
            # split on the LAST tab so a tab inside the review text survives
            review_text, sep, label = line.rpartition('\t')
            if not sep:
                raise ValueError(
                    "line {} of {!r} has no tab-separated label".format(line_no, testdoc))
            known_results.append(int(label.strip()))
            unlabelled.write(review_text + "\n")

    # results from naive bayes classifier (run even for an empty file so that
    # resultsdoc is still produced)
    predictions = naive_bayes_classifier(newtestdoc, cls, resultsdoc)

    if not known_results:
        # nothing to compare; avoid dividing by zero
        return 0.0

    # computing accuracy
    matches = sum(1 for expected, predicted in zip(known_results, predictions)
                  if expected == predicted)
    return matches / len(known_results)
    
    
        
    
    
    

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [99]:
# Spot-check countWrdsOfVocInCls (component_functions.py) on the training CSV for class 1
countWrdsOfVocInCls('../sentiment-labelled-sentences/mainDataset.csv', 1)

21032

In [100]:
# Spot-check likelihoodOfWordInCls (component_functions.py): the word 'bad' in class 1
likelihoodOfWordInCls('bad', 1)

-8.855188077741607

In [101]:
# Computing the accuracy of the model.
# Arguments: labelled test file, class labels, file for the predicted classes,
# and the file that receives the reviews with their labels stripped.
accuracy('../sentiment-labelled-sentences/test.txt', [0,1], '../sentiment-labelled-sentences/result.txt', '../sentiment-labelled-sentences/newdoc.txt')

[0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1]


0.83

In [27]:
# Spot-check priorProb (component_functions.py) on the training CSV for class 0
priorProb('../sentiment-labelled-sentences/mainDataset.csv', 0)

-0.7002117200730477

In [6]:
# Testing the function that calculates the likelihood of a review in a class
classifier('young man.', 1, '../sentiment-labelled-sentences/mainDataset.csv')

-16.309080489590965

In [57]:
# Testing the naive bayes classifier
# NOTE(review): test.txt is the same file accuracy() splits on tabs to read labels,
# so the label column is presumably classified as part of each review here -
# confirm, or pass the label-free newdoc.txt instead.
naive_bayes_classifier('../sentiment-labelled-sentences/test.txt', [0, 1], '../sentiment-labelled-sentences/result.txt')

[0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,


In [22]:
# Testing the function that converts .txt to .csv format:
# first argument is the .txt source, second the .csv destination
convertToCSV('../sentiment-labelled-sentences/mainDataset.txt', '../sentiment-labelled-sentences/mainDataset.csv')

In [23]:
# Inspecting what createList returns for the training CSV
createList('../sentiment-labelled-sentences/mainDataset.csv')

[['A',
  'very',
  'very',
  'very',
  'slow',
  'moving',
  'aimless',
  'movie',
  'about',
  'a',
  'distressed',
  'drifting',
  'young',
  'man',
  '0'],
 ['Not',
  'sure',
  'who',
  'was',
  'more',
  'lost',
  'the',
  'flat',
  'characters',
  'or',
  'the',
  'audience',
  'nearly',
  'half',
  'of',
  'whom',
  'walked',
  'out',
  '0'],
 ['Attempting',
  'artiness',
  'with',
  'black',
  '&',
  'white',
  'and',
  'clever',
  'camera',
  'angles',
  'the',
  'movie',
  'disappointed',
  'became',
  'even',
  'more',
  'ridiculous',
  'as',
  'the',
  'acting',
  'was',
  'poor',
  'and',
  'the',
  'plot',
  'and',
  'lines',
  'almost',
  'non',
  'existent',
  '0'],
 ['Very', 'little', 'music', 'or', 'anything', 'to', 'speak', 'of', '0'],
 ['The',
  'best',
  'scene',
  'in',
  'the',
  'movie',
  'was',
  'when',
  'Gerardo',
  'is',
  'trying',
  'to',
  'find',
  'a',
  'song',
  'that',
  'keeps',
  'running',
  'through',
  'his',
  'head',
  '1'],
 ['The',
  'rest'

In [24]:
# Inspecting what createDict returns for the training CSV and class 1
createDict('../sentiment-labelled-sentences/mainDataset.csv', 1)

Counter({'the': 896,
         'best': 60,
         'scene': 7,
         'in': 185,
         'movie': 71,
         'was': 230,
         'when': 27,
         'gerardo': 1,
         'is': 383,
         'trying': 2,
         'to': 280,
         'find': 10,
         'a': 424,
         'song': 4,
         'that': 114,
         'keeps': 2,
         'running': 2,
         'through': 7,
         'his': 28,
         'head': 2,
         'saw': 8,
         'today': 3,
         'and': 604,
         'thought': 12,
         'it': 277,
         'good': 158,
         'effort': 2,
         'messages': 2,
         'for': 146,
         'kids': 6,
         'loved': 16,
         'casting': 4,
         'of': 285,
         'jimmy': 2,
         'buffet': 6,
         'as': 80,
         'science': 1,
         'teacher': 1,
         'those': 9,
         'baby': 2,
         'owls': 1,
         'were': 49,
         'adorable': 4,
         'showed': 3,
         'lot': 12,
         'florida': 1,
         'at': 45,
  

In [21]:
# Dataset assembly calls, left disabled. Presumably each was run once to merge a
# labelled source file into mainDataset.txt - TODO confirm before re-enabling.
# combineDataset('../sentiment-labelled-sentences/imdb_labelled.txt', '../sentiment-labelled-sentences/mainDataset.txt')
# combineDataset('../sentiment-labelled-sentences/amazon_cells_labelled.txt', '../sentiment-labelled-sentences/mainDataset.txt')
# combineDataset('../sentiment-labelled-sentences/yelp_labelled.txt', '../sentiment-labelled-sentences/mainDataset.txt')