In [560]:
from sklearn import datasets
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import pandas as pd
import numpy as np
from numpy import log
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Fetching dataset

In [478]:
newstrain = datasets.fetch_20newsgroups(subset='train') # For training
newstest = datasets.fetch_20newsgroups(subset='test') # For testing

# Data preparation and cleaning

In [488]:
# Preparing list of stopwords.
myStopWords = list(string.punctuation) + stopwords.words('english')

# Using CountVectorizer from sklearn to extract features from training data.
CV =CountVectorizer(max_features=3000,stop_words=myStopWords)

#Using only training data for preparing vocabulary.
xtrain = CV.fit_transform(newstrain.data)

#Transforming xtrain and ytrain
xtrain = xtrain.toarray()
ytrain = newstrain.target

In [563]:
#Transforming xtest and ytest
xtest = (CV.transform(newstest.data)).toarray()
ytest = newstest.target

In [None]:
# To see the features run this cell
CV.get_feature_names()

# Using sklearn Multinomial Naive Bayes

In [483]:
from sklearn.naive_bayes import MultinomialNB

In [504]:
clf = MultinomialNB()
clf.fit(xtrain,ytrain)
ypred_sk = clf.predict(xtest)

# Implementing Multinomial Naive Bayes

In [494]:
#function to create a training dictionary out of the text files for training set, consisiting the frequency of
#words in our feature set (vocabulary) in each class or label of the 20 newsgroup

def model(X,Y):
    store = {}
    classes = list(set(Y))
    store['classes'] = len(classes)
    store["total"] = len(Y)
    for c in classes:
        store[c] = {}
        currentRows = Y==c
        currentClassX = X[currentRows]
        tempSum = 0
        for f in range(X.shape[1]):
            store[c][f] = (currentClassX[:,f]!=0).sum()
            tempSum += store[c][f]
        store[c]['total'] = len(currentClassX)
    return store

In [495]:
#function for calculating naive bayesian log probablity for each test document being in a particular class
def classProbability(X,c,model):
    p = log(model[c]['total']) - log(model['total'])
    for f in range(X.shape[0]):
        if(X[f]!=0):
            numerator = model[c][f]+1
            denominator = model[c]['total'] + X.shape[0]
            p += log(numerator)-log(denominator)
    return p

#predict function that predicts the class or label of test documents using train dictionary made using the fit() function
def predict(X,model):
    pred = []
    for t in X:
        bestP = -99999
        bestC = 0
        for f in range(model['classes']):
            tempP = classProbability(t,f,model)
            if( tempP > bestP):
                bestP=tempP
                bestC=f
        pred.append(bestC)
    return pred     

In [502]:
D = model(xtrain,ytrain)
ypred = predict(xtest,D)

In [535]:
ypred_Train = predict(xtrain,D)

# Comparision of results.

In [565]:
# Analysis of sklearn implementation of Naive Bayes
print("SKLEARN MULTINOMIAL NAIVE BAYES")
print("TRAIN SCORE : ", clf.score(xtrain,ytrain))
print("TEST SCORE : ", clf.score(xtest,ytest))
print("Classification Report : ")
print(classification_report(ytest,ypred_sk,target_names=newstest.target_names))
print("Confusion Matrix : ")
pd.DataFrame(confusion_matrix(ytest,ypred_sk))

SKLEARN MULTINOMIAL NAIVE BAYES
TRAIN SCORE :  0.8452359908078487
TEST SCORE :  0.7137546468401487
Classification Report : 
                          precision    recall  f1-score   support

             alt.atheism       0.66      0.70      0.68       319
           comp.graphics       0.50      0.74      0.60       389
 comp.os.ms-windows.misc       0.33      0.00      0.01       394
comp.sys.ibm.pc.hardware       0.47      0.64      0.54       392
   comp.sys.mac.hardware       0.62      0.76      0.68       385
          comp.windows.x       0.75      0.66      0.70       395
            misc.forsale       0.71      0.84      0.77       390
               rec.autos       0.73      0.82      0.77       396
         rec.motorcycles       0.71      0.92      0.80       398
      rec.sport.baseball       0.81      0.86      0.83       397
        rec.sport.hockey       0.96      0.81      0.88       399
               sci.crypt       0.92      0.81      0.86       396
         sci.elec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,223,1,0,1,0,0,1,4,12,2,1,1,0,5,1,13,8,3,4,39
1,5,288,0,14,14,20,11,5,2,4,0,1,10,5,5,1,1,1,0,2
2,0,92,1,160,37,57,11,4,9,0,0,1,4,6,6,0,0,0,0,6
3,0,20,0,252,56,1,18,7,2,1,0,1,30,0,4,0,0,0,0,0
4,0,13,0,35,291,0,16,2,4,2,0,1,15,2,4,0,0,0,0,0
5,0,76,2,18,7,260,6,0,3,3,0,3,4,6,5,0,2,0,0,0
6,0,7,0,17,13,0,329,9,3,1,1,1,6,2,0,0,0,0,1,0
7,2,3,0,1,5,1,11,324,22,1,1,0,14,3,4,0,0,1,2,1
8,0,0,0,1,0,0,5,15,365,2,0,0,4,2,0,0,1,0,3,0
9,3,2,0,1,1,0,11,9,5,340,11,0,4,0,4,0,3,0,2,1


In [538]:
# Analysis of my implementation of Naive Bayes.
print("SKLEARN MULTINOMIAL NAIVE BAYES")
print("TRAIN SCORE : ", accuracy_score(ytrain,ypred_Train))
print("TEST SCORE : ", accuracy_score(ytest,ypred))
print("Classification Report : ")
print(classification_report(ytest,ypred,target_names=newstest.target_names))
print("Confusion Matrix : ")
pd.DataFrame(confusion_matrix(ytest,ypred))

SKLEARN MULTINOMIAL NAIVE BAYES
TRAIN SCORE :  0.7386423899593424
TEST SCORE :  0.5813860860329262
Classification Report : 
                          precision    recall  f1-score   support

             alt.atheism       0.80      0.37      0.50       319
           comp.graphics       0.75      0.32      0.45       389
 comp.os.ms-windows.misc       0.70      0.27      0.39       394
comp.sys.ibm.pc.hardware       0.55      0.59      0.57       392
   comp.sys.mac.hardware       0.88      0.34      0.49       385
          comp.windows.x       0.69      0.72      0.71       395
            misc.forsale       0.92      0.52      0.67       390
               rec.autos       0.83      0.60      0.70       396
         rec.motorcycles       0.95      0.63      0.76       398
      rec.sport.baseball       0.96      0.44      0.61       397
        rec.sport.hockey       0.78      0.94      0.85       399
               sci.crypt       0.26      0.96      0.40       396
         sci.elec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,117,0,0,0,0,1,0,0,0,0,3,24,0,0,0,111,17,46,0,0
1,1,125,10,14,2,49,1,2,0,0,0,146,1,2,16,3,6,9,2,0
2,0,10,107,61,1,52,0,0,0,1,0,138,0,3,10,1,6,3,1,0
3,1,4,8,232,5,9,2,2,0,0,1,103,2,2,11,2,5,3,0,0
4,0,5,16,62,130,4,6,3,1,1,2,108,1,2,21,2,7,14,0,0
5,1,10,7,8,0,285,0,0,0,0,1,66,0,0,8,2,2,5,0,0
6,0,1,1,31,8,0,204,13,1,1,8,50,3,1,24,3,17,19,5,0
7,0,2,0,1,0,0,3,236,10,3,4,40,2,5,25,2,46,13,4,0
8,0,1,0,1,0,0,1,17,250,0,2,32,0,2,4,2,55,30,1,0
9,0,0,0,0,1,0,0,3,1,176,82,39,0,5,10,8,47,21,4,0


# Conclusion

In my version of multinomial naive bayes, many test cases are being misclassified as 'sci.crypt'. Sklearn's implementation is performing approximately 22% better than my version of naive bayes on basis of test accuracy scores.