## Implementing multinomial Naive Bayes classifier on ‘20 Newsgroups Dataset’

## Importing Libraries

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import os
import nltk
from string import punctuation
import re
from nltk.corpus import stopwords

## Get the training data

In [43]:
# get cwd 
cwd = os.getcwd()
mypath = cwd + '/20_newsgroups/'
folders = [f for f in os.listdir(mypath)]
folders.sort()
if folders[0] == ".DS_Store":
    folders.pop(0)
folders

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Stop Words

Stop words are words that show up a lot in every document (e.g. prepositions and pronouns). Since stop words are of no use for us we will not consider them.


In [44]:
nltk.download('stopwords')
nltk.download('punkt')
punctuations=list(punctuation)
stopWords=stopwords.words('english')
stopWords+=punctuations 
stopWords= set(stopWords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/divyansh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/divyansh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Our Vocabulary/Feature Set

Vocabulary will contain words that will act as features for our model , we will take some amount of words from all documents sorted by frequencies.

In [45]:
# vocab_dict will be a dictionary of the form {word: frequency} over all documents

data = {}
vocab_dict = {}
for folder in folders:
    data[folder] = []
    for doc in os.listdir(os.path.join(mypath, folder)):
        with open(os.path.join(mypath, folder,doc), 'r', encoding='latin-1') as f:
            text = f.read()
            text = text.lower()
            # remove any word that has a non alphabetical character
            text = re.sub(r'[^a-z]', ' ', text)
            temp=text.split()
            data[folder].append(temp)
            for token in temp:
                if token in vocab_dict:
                    vocab_dict[token] += 1
                elif len(token) >=5 and token not in stopWords:
                    vocab_dict[token] = 1
                    
            

len(vocab_dict)


90926

## Final Feature List



In [46]:
# Sort the dictionary based on frequency of each 'possible' vocabulary word
sorted_vocab=sorted(vocab_dict.items(),key=operator.itemgetter(1),reverse=True)
# Choosing top 2000 vocab words as features
feature_list=[]
for key in sorted_vocab:
    feature_list.append(key[0])
feature_list=feature_list[0:200] # K = 200 (number of words in vocab)
feature_list = set(feature_list)
feature_list

{'access',
 'actually',
 'agate',
 'always',
 'american',
 'andrew',
 'another',
 'anyone',
 'anything',
 'apple',
 'around',
 'article',
 'atheism',
 'athos',
 'autos',
 'available',
 'baseball',
 'based',
 'believe',
 'berkeley',
 'better',
 'called',
 'cantaloupe',
 'center',
 'change',
 'children',
 'christian',
 'clipper',
 'colorado',
 'columbia',
 'computer',
 'control',
 'could',
 'course',
 'crabapple',
 'crypt',
 'culture',
 'darwin',
 'david',
 'different',
 'distribution',
 'drive',
 'either',
 'electronics',
 'email',
 'enough',
 'europa',
 'every',
 'evidence',
 'example',
 'files',
 'first',
 'following',
 'followup',
 'forsale',
 'found',
 'games',
 'gatech',
 'general',
 'geneva',
 'getting',
 'given',
 'going',
 'government',
 'graphics',
 'great',
 'group',
 'gtefsd',
 'hardware',
 'harvard',
 'heard',
 'history',
 'hockey',
 'however',
 'howland',
 'human',
 'image',
 'information',
 'internet',
 'israel',
 'jesus',
 'jewish',
 'keywords',
 'least',
 'lines',
 'litt

## Data Preparation

In [47]:
def prepare_data():
    '''
    return : a dataframe of columns as features and rows as documents
    '''
    df = pd.DataFrame(columns = feature_list)
    Y=[]
    for folder in folders:
        print("|",end="")
        for doc in data[folder]:
            # print(":",end="")
            Y.append(folder)
            # Add a new row for every file
            df.loc[len(df)] = np.zeros(len(feature_list))
            for txt in doc:
                for word in txt.split():
                    if word in feature_list:
                        df.loc[len(df)-1,word] += 1
        
    # add Y as a column to dataframe
    df['Y']=Y
    return df

    

## Getting Test Data and Training Data

In [48]:
df = prepare_data()

||||||||||||||||||||

In [49]:
X = df.drop(['Y'],axis=1).values
#remove last column of X

Y = df['Y'].values
df.head()


Unnamed: 0,probably,technology,american,subject,power,support,government,state,might,files,...,keywords,looking,history,least,second,today,clipper,columbia,course,Y
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alt.atheism
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alt.atheism
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,alt.atheism
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alt.atheism
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alt.atheism


## Normalize 

In [50]:
# get the last column of df
last = df.iloc[:,-1]
# divide each row of df by sum of all values in that row except the last column
new_df = df.drop(['Y'],axis=1)
new_df = new_df.div(new_df.sum(axis=1), axis=0)
df = pd.concat([new_df, last], axis=1)


## Splitting the Data

In [51]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,random_state=0,test_size=0.25)

In [52]:
# count each distinct element in the y_train
from collections import Counter
Counter(y_train)


Counter({'talk.politics.misc': 741,
         'talk.religion.misc': 764,
         'sci.electronics': 756,
         'misc.forsale': 739,
         'rec.motorcycles': 716,
         'comp.sys.mac.hardware': 764,
         'rec.sport.hockey': 769,
         'talk.politics.mideast': 719,
         'soc.religion.christian': 745,
         'sci.med': 744,
         'comp.os.ms-windows.misc': 751,
         'talk.politics.guns': 751,
         'alt.atheism': 767,
         'comp.graphics': 747,
         'comp.sys.ibm.pc.hardware': 760,
         'rec.sport.baseball': 752,
         'sci.crypt': 767,
         'rec.autos': 731,
         'sci.space': 754,
         'comp.windows.x': 760})

## Using the inbuilt Multinomial Naive Bayes

In [53]:
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB()
clf.fit(x_train,y_train)
clf.score(x_test,y_test)

0.7642

## Priors

In [54]:
# calculate priors for each type of folder
priors ={}
for folder in folders:
    priors[folder] = len(y_train[y_train==folder]) / len(y_train)
    print(folder,priors[folder])


alt.atheism 0.05114356204574248
comp.graphics 0.04980996199239848
comp.os.ms-windows.misc 0.05007668200306728
comp.sys.ibm.pc.hardware 0.05067680202707208
comp.sys.mac.hardware 0.050943522037740885
comp.windows.x 0.05067680202707208
misc.forsale 0.04927652197106088
rec.autos 0.04874308194972328
rec.motorcycles 0.047742881909715276
rec.sport.baseball 0.05014336200573448
rec.sport.hockey 0.05127692205107688
sci.crypt 0.05114356204574248
sci.electronics 0.05041008201640328
sci.med 0.04960992198439688
sci.space 0.05027672201106888
soc.religion.christian 0.04967660198706408
talk.politics.guns 0.05007668200306728
talk.politics.mideast 0.047942921917716874
talk.politics.misc 0.04940988197639528
talk.religion.misc 0.050943522037740885


## CCD

In [55]:
word= 'example'
folder=folders[0]
(df[word][df['Y']==folder].sum() + 1) / (len(df[df['Y']==folder]) + len(feature_list))

0.004518957759377265

In [56]:
# calculate class conditional probabilities p(x|y) where x is a word and y is a newsgroup for each word in the vocabulary as a dataframe
conditional_probabilities = pd.DataFrame(columns=feature_list)
i=0;
for folder in folders:
    # add a new row for each folder
    conditional_probabilities.loc[len(conditional_probabilities)] = np.zeros(len(feature_list))
    # use df calculated above to calculate conditional probabilities
    for word in feature_list:
        #print(word)
        conditional_probabilities[word][i] = (df[word][df['Y']==folder].sum() + 1) / (len(df[df['Y']==folder]) + len(feature_list))
    i+=1
conditional_probabilities.head()


Unnamed: 0,probably,technology,american,subject,power,support,government,state,might,files,...,system,keywords,looking,history,least,second,today,clipper,columbia,course
0,0.002815,0.002969,0.001693,0.026948,0.00212,0.002765,0.001663,0.0211,0.003856,0.001132,...,0.006697,0.001485,0.00162,0.002694,0.003231,0.002037,0.00179,0.000903,0.002181,0.003926
1,0.002048,0.002884,0.001319,0.033009,0.001182,0.003349,0.000925,0.022417,0.002469,0.008202,...,0.005849,0.005926,0.007169,0.000883,0.002015,0.001788,0.001179,0.000943,0.003427,0.002181
2,0.002324,0.003116,0.001571,0.030831,0.001165,0.00308,0.000987,0.021891,0.002201,0.009495,...,0.00987,0.003932,0.003673,0.00087,0.002044,0.001581,0.001074,0.000942,0.003674,0.001391
3,0.002585,0.003457,0.001132,0.032156,0.003883,0.003982,0.000944,0.024436,0.003074,0.002778,...,0.013153,0.004035,0.003246,0.000859,0.002502,0.002889,0.001132,0.000863,0.004353,0.001601
4,0.002536,0.003397,0.001493,0.031956,0.006675,0.003033,0.000987,0.027947,0.003085,0.001612,...,0.012864,0.00285,0.003039,0.000833,0.002196,0.002171,0.001306,0.000833,0.002455,0.0019


In [57]:
# normalize each columns of conditional_probabilities
conditional_probabilities = conditional_probabilities.div(conditional_probabilities.sum(axis=0), axis=1)
# get the sum of each column
conditional_probabilities.sum(axis=0)

probably      1.0
technology    1.0
american      1.0
subject       1.0
power         1.0
             ... 
second        1.0
today         1.0
clipper       1.0
columbia      1.0
course        1.0
Length: 200, dtype: float64

## Multinomial Naive Bayes Classifier

In [58]:
a={'a':1,'b':2,'c':3}
max(a,key=a.get)

'c'

In [59]:
def predict(text):
    # calculate the probability of each word in the text belonging to each newsgroup
    probabilities = {}
    i=0
    for f_in in range(len(folders)):
        probabilities[folders[f_in]] = np.log(priors[folders[f_in]])
        for i in range(len(text)):
            probabilities[folders[f_in]] += np.log(conditional_probabilities.values[f_in][i])*text[i]
    # return the newsgroup with the highest probability
    # print(probabilities)
    return max(probabilities, key=probabilities.get)

In [60]:
def predict_data(x_test):
    predictions = []
    for i in range(len(x_test)):
        predictions.append(predict(x_test[i]))
    return predictions
    

In [61]:
from sklearn.metrics import classification_report,confusion_matrix
y_pred=predict_data(x_test)
# print(y_pred)

In [63]:
sum(y_pred==y_test)/len(y_test)

0.7904