# David Saffo
## Comp379-HW4
## Naive Bayes Classifier

In [1]:
import re
import os
import string
from collections import Counter
import numpy as np
import pandas as pd
import random
import math
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

### Reading in the two data files, splitting each into a list of lines, merging them into one list, labeling each review pos or neg, shuffling the list, and splitting the data 70/15/15 into train, dev, and test sets

In [2]:
# Load both polarity files as lower-cased lists with one review per entry.
# The files are read inside `with` blocks so the handles are closed as soon
# as the text has been read.
# NOTE(review): splitting on "\n" leaves an empty string at the end of a list
# when the file ends with a newline, so an empty "review" may be labeled for
# each class. It is kept here so the dataset sizes stay unchanged; consider
# filtering empty lines.
with open("rt-polarity.pos", "r") as goodFile:
    goodReviews = goodFile.read().lower().split("\n")

with open("rt-polarity.neg", "r") as badFile:
    badReviews = badFile.read().lower().split("\n")

# One label per review, in the same order as the merged review list below.
ratings = ['pos'] * len(goodReviews) + ['neg'] * len(badReviews)
reviews = goodReviews + badReviews

# Pair every review with its label as a (category, value) tuple.
review = list(zip(ratings, reviews))

print (len(review))

# NOTE(review): no random seed is set, so the split differs between runs.
random.shuffle(review)

# 70% of the data for training, 15% each for dev and test.
split = round(len(review)*.70)
split2 = round(len(review)*.15)

# Each slice is removed from `review` after it is taken, so the next slice
# always starts at index 0 of what is left.
train = review[0:split]
del review[0:split]
dev = review[0:split2]
del review[0:split2]
test = review[0:split2]

print (len(train))
print (len(dev))
print (len(test))

10664
7465
1600
1599


### Helper functions for removing punctuation and splitting the text into lowercase words

In [3]:
def removeP(data):
    """Return `data` with every character in `string.punctuation` deleted."""
    # A translation table whose third argument lists the characters to drop.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    return data.translate(strip_punctuation)

def Trim2(data):
    """Split a review string into lower-cased word tokens.

    Punctuation is stripped with `removeP`, the text is lower-cased, and the
    result is split on runs of non-word characters.

    Returns a list of strings. The list can contain empty strings when the
    text starts or ends with a separator, because `re.split` keeps them.
    """
    data = removeP(data).lower()
    # Raw string: "\W" in a plain literal is an invalid escape sequence that
    # newer Python versions warn about; the pattern itself is unchanged.
    return re.split(r"\W+", data)

### Testing the trim function and counter method 

In [4]:
# Sanity check: tokenize the first positive review and tally its tokens.
a = Trim2(goodReviews[0])
c = Counter()
c.update(a)
print(c)

Counter({'the': 2, 'to': 2, 'going': 1, '': 1, 'destined': 1, 'than': 1, 'or': 1, 'arnold': 1, 'schwarzenegger': 1, 'new': 1, 'van': 1, 'greater': 1, 'a': 1, 'segal': 1, 'conan': 1, 'be': 1, 'and': 1, 'damme': 1, '21st': 1, 'jeanclaud': 1, 'steven': 1, 'rock': 1, 'is': 1, 'make': 1, 'centurys': 1, 'splash': 1, 'even': 1, 'hes': 1, 'that': 1})


### Training the model by counting words and their prior appearances/labels
###### Implementation learned here : http://blog.yhat.com/posts/naive-bayes-in-python.html

In [5]:
# bag:    total occurrences of each token across all training reviews.
# Wcount: occurrences of each token within each class.
# prior:  number of training reviews seen per class.
# docs:   the (category, text) pairs that were processed.
bag = {}
Wcount = {"pos": {}, "neg": {}}
prior = {"pos" : 0., "neg" : 0.}
docs = []

for label, text in train:
    docs.append((label, text))
    prior[label] += 1

    class_counts = Wcount[label]
    for token, freq in Counter(Trim2(text)).items():
        # Counts start from 0.0 so the stored totals are floats.
        bag[token] = bag.get(token, 0.0) + freq
        class_counts[token] = class_counts.get(token, 0.0) + freq

### Method for label prediction: takes the reviews one at a time and uses the Naive Bayes algorithm to predict each label

In [6]:
def Predict(data):
    """Predict a "pos" or "neg" label for every review in `data`.

    Args:
        data: sequence of (label, text) pairs. Only the text at index 1 is
            read; the label is ignored.

    Returns:
        A list of "pos"/"neg" strings, one per item of `data`, in order.

    Reads the module-level `prior`, `bag` and `Wcount` tables built by the
    training step.
    """
    if not data:
        return []

    # These totals do not change during prediction, so compute them once
    # instead of re-summing whole dictionaries for every word of every review.
    total_docs = sum(prior.values())
    total_words = sum(bag.values())
    total_pos_words = sum(Wcount["pos"].values())
    total_neg_words = sum(Wcount["neg"].values())

    log_prior_pos = math.log(prior["pos"] / total_docs)
    log_prior_neg = math.log(prior["neg"] / total_docs)

    scores = []
    for item in data:
        counts = Counter(Trim2(item[1]))

        log_pos = 0.0
        log_neg = 0.0

        for w, count in counts.items():
            # Skip tokens never seen in training and tokens of 1-2 characters.
            if w not in bag or len(w) <= 2:
                continue
            p_word = bag[w] / total_words
            p_w_give_pos = Wcount["pos"].get(w, 0.0) / total_pos_words
            p_w_give_neg = Wcount["neg"].get(w, 0.0) / total_neg_words

            # NOTE(review): the count is multiplied inside the log, and a
            # token unseen in a class contributes nothing to that class's
            # score (no smoothing). Kept as-is to preserve the existing
            # scoring; confirm whether this is intended.
            if p_w_give_pos > 0:
                log_pos += math.log(count * p_w_give_pos / p_word)
            if p_w_give_neg > 0:
                log_neg += math.log(count * p_w_give_neg / p_word)

        # Compare in log space. exp() is monotonic, so the ordering is the
        # same, but exponentiating very negative scores can underflow both
        # to 0.0 and make every such review default to "neg".
        Pscore = log_pos + log_prior_pos
        Nscore = log_neg + log_prior_neg

        scores.append("pos" if Pscore > Nscore else "neg")

    return scores

### Takes the list of predicted labels from above and runs accuracy and F1-score tests

In [7]:
# True labels of the dev set and the model's predictions for it.
target = [pair[0] for pair in dev]
predictions = Predict(dev)

# f1_score is given binary labels here: 1 for 'pos', 0 for anything else.
target2 = [1 if label == 'pos' else 0 for label in target]
predictions2 = [1 if guess == 'pos' else 0 for guess in predictions]

print("accuracy: " , accuracy_score(target, predictions))
print("f1 score: " , f1_score(target2, predictions2, average = 'binary'))

accuracy:  0.76125
f1 score:  0.757306226175


### Same as above but for test set for final scores 

In [8]:
# True labels of the test set and the model's predictions for it.
target = [pair[0] for pair in test]
predictions = Predict(test)

# f1_score is given binary labels here: 1 for 'pos', 0 for anything else.
target2 = [1 if label == 'pos' else 0 for label in target]
predictions2 = [1 if guess == 'pos' else 0 for guess in predictions]

print("accuracy: " , accuracy_score(target, predictions))
print("f1 score: " , f1_score(target2, predictions2, average = 'binary'))

accuracy:  0.775484677924
f1 score:  0.770607028754
