# Bernoulli Naive Bayes Implementation

In [1]:
from __future__ import division
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
import matplotlib as plt
import math
import logging
from time import time

In [2]:
class MyBayesClassifier():
    """Bernoulli Naive Bayes classifier with additive smoothing.

    Every feature is treated as a binary "present / absent" indicator: any
    value greater than zero counts as present.  For each class the model
    stores the log prior and the smoothed probability that each feature is
    present in a document of that class.

    Parameters
    ----------
    smooth : number, default 1
        Additive smoothing constant (1 gives add-one smoothing).  It should
        be strictly positive; with 0 a feature that is always or never
        present in a class yields log(0) and the scores become NaN.
    """

    def __init__(self, smooth=1):
        if smooth < 0:
            raise ValueError("smooth must be non-negative")
        self._smooth = smooth  # This is for add one smoothing, don't forget!
        # After train(): array (n_classes, n_features) of P(feature present | class).
        self._feat_prob = []
        # After train(): array (n_classes,) of log prior probabilities.
        self._class_prob = []
        # After train(): sorted unique labels; row i of the arrays above
        # belongs to self._classes[i].
        self._classes = []
        # Unused placeholders, kept so existing attribute access keeps working.
        self.__Priors = []
        self._Ncls = []
        self._Nfeat = []

    def train(self, X, y):
        """Fit the model; any state from a previous call is replaced.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Dense feature matrix.  Values > 0 mean "feature present".
        y : array-like, shape (n_samples,)
            Class label of every row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from
            the length of ``y``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array")
        if X.shape[0] == 0:
            raise ValueError("cannot train on an empty data set")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must have the same number of samples")

        # Bernoulli model: only presence matters, not how often a term occurs.
        present = X > 0

        # np.unique returns the labels sorted, with matching per-class counts.
        classes, class_count = np.unique(y, return_counts=True)

        # Number of documents of each class that contain each feature.
        doc_freq = np.array(
            [present[y == cl].sum(axis=0) for cl in classes], dtype=float
        )

        # Assign (never append) so that re-training starts from a clean slate.
        self._classes = classes
        self._class_prob = np.log(class_count / float(len(y)))

        # Smoothed estimate of P(feature present | class).  The denominator
        # adds 2 * smooth because each feature has two outcomes.
        self._feat_prob = (doc_freq + self._smooth) / (
            class_count[:, np.newaxis] + 2.0 * self._smooth
        )
        return

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Dense feature matrix.  Values > 0 mean "feature present".

        Returns
        -------
        numpy.ndarray, shape (n_samples,)
            Predicted labels, drawn from the labels seen by ``train``.

        Raises
        ------
        RuntimeError
            If the classifier has not been trained yet.
        """
        if len(self._class_prob) == 0:
            raise RuntimeError("train() must be called before predict()")

        present = (np.asarray(X) > 0).astype(float)

        log_p = np.log(self._feat_prob)
        log_q = np.log(1.0 - self._feat_prob)

        # sum_j [x_j*log p_j + (1 - x_j)*log q_j]
        #   = sum_j log q_j + sum_j x_j*(log p_j - log q_j)
        # which needs a single matrix product instead of a loop over rows.
        scores = (
            np.dot(present, (log_p - log_q).T)
            + log_q.sum(axis=1)
            + self._class_prob
        )

        # Map the winning column back to the original label values.
        return self._classes[np.argmax(scores, axis=1)]


In [3]:
""" 
Here is the calling code

"""

# Newsgroups used in this experiment.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# Parts of each post that the loader is asked to strip.
remove = ('headers', 'footers', 'quotes')

# Load the train and test splits with identical settings.
data_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=remove)
data_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=remove)
print('data loaded')

y_train = data_train.target
y_test = data_test.target

print("Extracting features from the training data using a count vectorizer")
t0 = time()  # start timestamp; not read anywhere in this cell

# binary=True yields 0/1 presence indicators rather than term counts.
# Alternative tried earlier: analyzer='char', ngram_range=(1,3)
vectorizer = CountVectorizer(stop_words='english', binary=True)
X_train = vectorizer.fit_transform(data_train.data).toarray()
X_test = vectorizer.transform(data_test.data).toarray()

# Fit the hand-written classifier and score it on the held-out split.
clf = MyBayesClassifier()
clf.train(X_train, y_train)
y_pred1 = clf.predict(X_test)

print("Bernoulli Naive bayes Accuracy Score", accuracy_score(y_test, y_pred1))

data loaded
Extracting features from the training data using a count vectorizer
Bernoulli Naive bayes Accuracy Score 0.59940872136
