# Multinomial Naive Bayes Implementation

In [1]:
from __future__ import division
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
import matplotlib as plt
import math
import logging


In [2]:

class MyMultinomialBayesClassifier():
    """Multinomial Naive Bayes classifier with additive smoothing.

    The model is fitted with ``train(X, y)`` and queried with ``predict(X)``.
    ``X`` is a 2-D array-like of non-negative feature counts with one row
    per document; ``y`` holds one class label per row.

    All probabilities are stored as natural logarithms so that scoring a
    document is a sum of products rather than a product of many small
    numbers.
    """

    def __init__(self, smooth=1):
        """Create an untrained classifier.

        Args:
            smooth: additive smoothing constant ``a`` used in
                ``P(x_i | y) = (N_yi + a) / (N_y + a * p)``, where ``p`` is
                the number of features. With ``smooth=0`` any feature never
                seen in a class gets log-probability ``-inf``.
        """
        self._smooth = smooth  # additive (add-one by default) smoothing
        self._feat_prob = []   # log P(feature | class), one row per class
        self._class_prob = []  # log P(class), aligned with self._classes
        self._classes = []     # sorted unique labels seen during train()
        # Kept only so existing code touching these attributes keeps working;
        # nothing in this class reads or writes them.
        self._Priors = []
        self._Ncls = []
        self._Nfeat = []

    def train(self, X, y):
        """Fit class priors and per-class feature log-probabilities.

        Calling ``train`` again replaces the previous model entirely.

        Args:
            X: 2-D array-like of feature counts, one row per document.
            y: array-like of class labels, one per row of ``X``.

        Raises:
            ValueError: if ``X`` is not 2-D, is empty, or its row count
                does not match the number of labels.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of feature counts")
        if X.shape[0] == 0:
            raise ValueError("cannot train on an empty data set")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of rows")

        # np.unique returns the labels sorted, so row k of every fitted
        # array below corresponds to classes[k].
        classes, counts = np.unique(y, return_counts=True)
        self._classes = classes

        # Log prior of each class: log(N_class / N_total). Assigned (not
        # appended) so that retraining does not accumulate stale entries.
        self._class_prob = np.log(counts / float(y.shape[0]))

        # Total frequency of every feature within each class.
        word_count = np.array(
            [X[y == label].sum(axis=0) for label in classes], dtype=float)

        # Total number of feature occurrences per class, as a column vector
        # so it broadcasts against word_count.
        total_word_count = word_count.sum(axis=1, keepdims=True)

        # P(x_i | y) = (N_yi + a) / (N_y + a * p). The denominator must
        # scale the feature count by the smoothing constant, otherwise the
        # probabilities only sum to one when a == 1.
        n_features = X.shape[1]
        self._feat_prob = np.log(
            (word_count + self._smooth)
            / (total_word_count + self._smooth * n_features))

        return

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        Args:
            X: 2-D array-like of feature counts with the same number of
                columns as the training data.

        Returns:
            A 1-D numpy array of labels (values taken from the ``y`` passed
            to ``train``), one per row of ``X``.

        Raises:
            RuntimeError: if called before ``train``.
            ValueError: if ``X`` is not 2-D or has the wrong column count.
        """
        if len(self._classes) == 0:
            raise RuntimeError("train() must be called before predict()")

        X = np.asarray(X)
        if X.ndim != 2 or X.shape[1] != self._feat_prob.shape[1]:
            raise ValueError(
                "X must be 2-D with %d columns" % self._feat_prob.shape[1])

        # Unnormalised log posterior for every (document, class) pair:
        # sum_i x_i * log P(x_i | y) + log P(y).
        posterior_prob = np.dot(X, self._feat_prob.T) + self._class_prob

        # Map the winning column index back to the actual class label.
        return self._classes[np.argmax(posterior_prob, axis=1)]


In [3]:

""" 
Here is the calling code

"""

# Newsgroups used for the four-way classification task.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Parts of each post to remove before vectorising.
remove = ('headers', 'footers', 'quotes')

# Fetch the train split first, then the test split, with identical settings.
data_train, data_test = (
    fetch_20newsgroups(
        subset=subset,
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )
    for subset in ('train', 'test')
)
print('data loaded')

y_train = data_train.target
y_test = data_test.target

print("Extracting features from the training data using a count vectorizer")

# binary=True records word presence/absence instead of raw counts.
# The vocabulary is learned from the training split only and reused for test.
vectorizer = CountVectorizer(stop_words='english', binary=True)
X_train = vectorizer.fit_transform(data_train.data).toarray()
X_test = vectorizer.transform(data_test.data).toarray()

# Smoothing parameter handed to the classifier.
alpha = 1
clf_multi = MyMultinomialBayesClassifier(alpha)
clf_multi.train(X_train, y_train)
y_pred2 = clf_multi.predict(X_test)

print("Multinomial naive bayes Accuracy Score",
      accuracy_score(y_test, y_pred2))


data loaded
Extracting features from the training data using a count vectorizer
Multinomial naive bayes Accuracy Score 0.777531411678
