# Multi-nomial Naive Bayes Classifier

## Steps for classification using Naive Bayes:

1. `Clean your data` - different ways and options to do this. All I do is remove punctuation, 

In [1]:
from collections import defaultdict
from math import log, exp
import matplotlib.pyplot as plt
import numpy as np
from text_preprocessor import process_df,tokenizer

cleaned_df = process_df("ice_cream_reviews.csv")

# 3732 for training (80%), 1600 (20%) for testing, 
X_train = cleaned_df[1600:]
print(len(X_train))

X_test = cleaned_df[:1600]
print(len(X_test))


3732
1600


In [11]:
class NaiveBayesClassifier:
    
    def __init__(self, train_data, test_data):
        
        self.train = train_data
        self.test = test_data
       
        # separate positive and negative reviews
        self.pos_entries = self.train.loc[self.train["sentiment"]==1]
        self.neg_entries = self.train.loc[self.train["sentiment"]==0]
    
        # all words from each class (need to be lists since I turn them into bag of words)
        self.pos_vocab = [word for review in self.pos_entries["review"] for word in review.split()]
        self.neg_vocab = [word for review in self.neg_entries["review"] for word in review.split()]
        
        # bag of words for each class
        self.pos_BoW = self._BoW(self.pos_vocab)
        self.neg_BoW = self._BoW(self.neg_vocab)
    
    def _BoW(self, doc):
        
        d = defaultdict(lambda:0)
        for word in doc:
            d[word]+=1  
        return dict(d)


    # class log prior probability
    def _Pprob(self, c):
        # log of total # of c class reviews divided by total # of reviews
        return log(len(c) / (len(self.train)))
    
    # maxiumum likelihood probability
    def _MLprob(self, sentence):
        
        sentence = tokenizer(sentence)
        scores = [] 
        
        # positive and negative log prior probability
        pos_prob = self._Pprob(self.pos_entries)
        neg_prob = self._Pprob(self.neg_entries)
        
        
        # most important part of the classifier
        
        # calculates the likelihood probability of each word given the class prior probability
        
        # laplace (+1) smoothing to prevent possible 0 probabilities
        
        # reason for try/except: we dont want to deal with words not in training set, so if we try to acces an unattested word, just dont do anything
        
        # using log speeds up calculations
        
        # EQUATION: count of word+1 / len(all words in class + len(all reviews)). Need to add len(all reviews) to account for smoothing in numerator.
        
        for word in sentence: 
                try:
                    w = self.neg_BoW[word] + 1
                    #print("NEG w:",w) # testing
                    V = (len(self.neg_BoW)+ len(self.train))
                    neg_prob += log(w / V)
                except:
                    pass
                
                
        scores.append(exp(neg_prob)) 
        
        for word in sentence: 
                try:
                    w = self.pos_BoW[word]+ 1
                    #print("POS w:",w) # testing
                    V = (len(self.pos_BoW)+ len(self.train))
                    pos_prob += log(w / V)
                except:
                    pass
                
        scores.append(exp(pos_prob))
        
        # index 0 == negative, index 1 == positive
        return np.argmax(np.array(scores))
      
    def get_scores(self, extra=False):
         
        true_pos = 0
        false_pos = 0
        true_neg = 0
        false_neg = 0
        
        test_reviews = self.test["review"]
        test_labels = self.test["sentiment"]
        
        for label, review in zip(test_labels, test_reviews):
            
            MLprob = self._MLprob(review)

            if MLprob == 0 and label == 0:   # predicted neg, label neg 
                true_neg += 1
            elif MLprob == 1 and label == 1: # predicted pos, label pos
                true_pos += 1   
            elif MLprob == 0 and label == 1: # predicted neg, label pos
                false_neg += 1
            elif MLprob == 1 and label == 0: # predicted pos, label neg
                false_pos += 1 
                
        # option to print out more details about the results if desired
        if extra:
            print("TN:",true_neg)
            print("TP:",true_pos)
            print("FP:",false_pos)
            print("FN:",false_neg)
        
                
                
        accuracy = (true_neg + true_pos) / (true_pos + true_neg + false_neg + false_pos)
        precision = (true_pos) / (true_pos + false_pos)
        recall = (true_pos) / (true_pos + false_neg)
        f1score = (2* (precision*recall)) / (precision + recall)
        
        return round(accuracy,3), round(precision,3), round(recall,3), round(f1score, 3)
        
    def classification_report(self, extra=False):
        
        accuracy, precision, recall, f1score = self.get_scores(extra)
        print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-score: {f1score}")
        
        
    def confusion_matrix(self):
        pass
        
    def predict(self, review):
        pass
    
    def plotcounts(self):
        pass
    
    


# instantiate and fit the model to data
model = NaiveBayesClassifier(X_train, X_test)


In [10]:


model.classification_report()

Accuracy: 0.695
Precision: 0.814
Recall: 0.797
F1-score: 0.805
