## Implementing multinomial Naive Bayes classifier on ‘20 Newsgroups Dataset’

In [199]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\RAZORBLADE\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [200]:
pip install sklearn

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\RAZORBLADE\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [201]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import os
import nltk
from string import punctuation
import re
from nltk.corpus import stopwords

## Get the training data

In [202]:
mypath = r"C:\Users\RAZORBLADE\Desktop\20news-19997\20_newsgroups"
folders = [f for f in os.listdir(mypath)]
folders.sort()
folders

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Stop Words

Stop words are words that show up a lot in every document (e.g. prepositions and pronouns). Since stop words are of no use for us we will not consider them


In [203]:
nltk.download('stopwords')
nltk.download('punkt')
punctuations=list(punctuation)
stopWords=stopwords.words('english')
stopWords+=punctuations 
stopWords= set(stopWords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAZORBLADE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAZORBLADE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Our Vocabulary/Feature Set

In [204]:
# vocab_dict will be a dictionary of the form {word: frequency} over all documents

data = {}
vocab_dict = {}
for folder in folders:
    data[folder] = []
    for doc in os.listdir(os.path.join(mypath, folder)):
        with open(os.path.join(mypath, folder,doc), 'r') as f:
            text = f.read()
            text = text.lower()
            # remove any word that has a non alphabetical character
            text = re.sub(r'[^a-z]', ' ', text)
            temp=text.split()
            data[folder].append(temp)
            for token in temp:
                if token in vocab_dict:
                    vocab_dict[token] += 1
                elif len(token) >=5 and token not in stopWords:
                    vocab_dict[token] = 1
                    
            

len(vocab_dict)


90926

In [205]:
len(data[folders[0]][1])

5430

## Final Feature List

In [206]:
# Sort the dictionary based on frequency of each 'possible' vocabulary word
sorted_vocab=sorted(vocab_dict.items(),key=operator.itemgetter(1),reverse=True)
# Choosing top 2000 vocab words as features
feature_list=[]
for key in sorted_vocab:
    feature_list.append(key[0])
feature_list=feature_list[0:200] # K = 200 (number of words in vocab)
feature_list = set(feature_list)
feature_list

{'access',
 'actually',
 'agate',
 'always',
 'american',
 'andrew',
 'another',
 'anyone',
 'anything',
 'apple',
 'around',
 'article',
 'atheism',
 'athos',
 'autos',
 'available',
 'baseball',
 'based',
 'believe',
 'berkeley',
 'better',
 'called',
 'cantaloupe',
 'center',
 'change',
 'children',
 'christian',
 'clipper',
 'colorado',
 'columbia',
 'computer',
 'control',
 'could',
 'course',
 'crabapple',
 'crypt',
 'culture',
 'darwin',
 'david',
 'different',
 'distribution',
 'drive',
 'either',
 'electronics',
 'email',
 'enough',
 'europa',
 'every',
 'evidence',
 'example',
 'files',
 'first',
 'following',
 'followup',
 'forsale',
 'found',
 'games',
 'gatech',
 'general',
 'geneva',
 'getting',
 'given',
 'going',
 'government',
 'graphics',
 'great',
 'group',
 'gtefsd',
 'hardware',
 'harvard',
 'heard',
 'history',
 'hockey',
 'however',
 'howland',
 'human',
 'image',
 'information',
 'internet',
 'israel',
 'jesus',
 'jewish',
 'keywords',
 'least',
 'lines',
 'litt

## Data Preparation

In [207]:
def prepare_data():
    '''
    return : a dataframe of columns as features and rows as documents
    '''
    df = pd.DataFrame(columns = feature_list)
    Y=[]
    for folder in folders:
        for doc in data[folder]:
            Y.append(folder)
            # Add a new row for every file
            df.loc[len(df)] = np.zeros(len(feature_list))
            for txt in doc:
                for word in txt.split():
                    if word in feature_list:
                        df.loc[len(df)-1,word] += 1
        
    # add Y as a column to dataframe
    df['Y']=Y
    return df

    

## Getting Test Data and Training Data

In [208]:
df = prepare_data()

In [None]:
X = df.drop(['Y'],axis=1).values
#remove last column of X

Y = df['Y'].values
df.head()


Unnamed: 0,maybe,rochester,jesus,looking,support,heard,network,apple,atheism,example,...,crabapple,software,however,source,getting,given,research,electronics,culture,Y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,alt.atheism
1,1.0,0.0,4.0,1.0,1.0,0.0,0.0,0.0,57.0,11.0,...,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,alt.atheism
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,alt.atheism
3,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alt.atheism
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,alt.atheism


In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,random_state=0,test_size=0.25)

In [None]:
# count each distinct element in the y_train
from collections import Counter
Counter(y_train)


Counter({'comp.windows.x': 362,
         'comp.graphics': 396,
         'misc.forsale': 387,
         'comp.sys.ibm.pc.hardware': 376,
         'talk.politics.mideast': 378,
         'rec.autos': 374,
         'soc.religion.christian': 392,
         'comp.sys.mac.hardware': 372,
         'sci.crypt': 362,
         'sci.med': 364,
         'sci.electronics': 388,
         'talk.politics.guns': 356,
         'alt.atheism': 368,
         'talk.politics.misc': 375,
         'rec.motorcycles': 361,
         'rec.sport.hockey': 390,
         'talk.religion.misc': 384,
         'sci.space': 374,
         'comp.os.ms-windows.misc': 377,
         'rec.sport.baseball': 364})

## Using the inbuilt Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB()
clf.fit(x_train,y_train)
clf.score(x_test,y_test)

0.7668

## Posteriors

In [None]:
# calculate posteriors for each type of folder
posteriors ={}
for folder in folders:
    posteriors[folder] = len(y_train[y_train==folder]) / len(y_train)
    print(folder,posteriors[folder])


alt.atheism 0.04906666666666667
comp.graphics 0.0528
comp.os.ms-windows.misc 0.05026666666666667
comp.sys.ibm.pc.hardware 0.050133333333333335
comp.sys.mac.hardware 0.0496
comp.windows.x 0.048266666666666666
misc.forsale 0.0516
rec.autos 0.04986666666666666
rec.motorcycles 0.048133333333333334
rec.sport.baseball 0.04853333333333333
rec.sport.hockey 0.052
sci.crypt 0.048266666666666666
sci.electronics 0.05173333333333333
sci.med 0.04853333333333333
sci.space 0.04986666666666666
soc.religion.christian 0.05226666666666667
talk.politics.guns 0.047466666666666664
talk.politics.mideast 0.0504
talk.politics.misc 0.05
talk.religion.misc 0.0512


## CCD

In [None]:
word= 'example'
folder=folders[0]
(df[word][df['Y']==folder].sum() + 1) / (len(df[df['Y']==folder]) + len(feature_list))

0.21714285714285714

In [None]:
# calculate class conditional probabilities p(x|y) where x is a word and y is a newsgroup for each word in the vocabulary as a dataframe
conditional_probabilities = pd.DataFrame(columns=feature_list)
i=0;
for folder in folders:
    # add a new row for each folder
    conditional_probabilities.loc[len(conditional_probabilities)] = np.zeros(len(feature_list))
    # use df calculated above to calculate conditional probabilities
    for word in feature_list:
        #print(word)
        conditional_probabilities[word][i] = (df[word][df['Y']==folder].sum() + 1) / (len(df[df['Y']==folder]) + len(feature_list))
    i+=1
conditional_probabilities.head()


Unnamed: 0,maybe,rochester,jesus,looking,support,heard,network,apple,atheism,example,...,state,crabapple,software,however,source,getting,given,research,electronics,culture
0,0.061429,0.014286,0.174286,0.027143,0.108571,0.05,0.048571,0.134286,1.341429,0.217143,...,0.48,0.281429,0.021429,0.151429,0.024286,0.044286,0.118571,0.021429,0.001429,0.014286
1,0.037143,0.1,0.001429,0.164286,0.122857,0.05,0.085714,0.062857,0.001429,0.032857,...,0.545714,0.377143,0.327143,0.065714,0.128571,0.042857,0.058571,0.134286,0.007143,0.004286
2,0.05,0.165714,0.001429,0.078571,0.1,0.047143,0.124286,0.04,0.002857,0.052857,...,0.46,0.151429,0.197143,0.09,0.028571,0.054286,0.02,0.061429,0.008571,0.008571
3,0.047143,0.058571,0.002857,0.072857,0.132857,0.071429,0.06,0.022857,0.001429,0.047143,...,0.495714,0.41,0.171429,0.092857,0.017143,0.065714,0.018571,0.071429,0.01,0.001429
4,0.034286,0.062857,0.001429,0.05,0.068571,0.084286,0.102857,0.657143,0.001429,0.022857,...,0.561429,0.461429,0.161429,0.06,0.018571,0.062857,0.021429,0.042857,0.018571,0.001429


In [None]:
# normalize each columns of conditional_probabilities
conditional_probabilities = conditional_probabilities.div(conditional_probabilities.sum(axis=0), axis=1)
# get the sum of each column
conditional_probabilities.sum(axis=0)

maybe          1.0
rochester      1.0
jesus          1.0
looking        1.0
support        1.0
              ... 
getting        1.0
given          1.0
research       1.0
electronics    1.0
culture        1.0
Length: 200, dtype: float64

## Multinomial Naive Bayes Classifier

In [None]:
a={'a':1,'b':2,'c':3}
max(a,key=a.get)

'c'

In [None]:
def predict(text):
    # calculate the probability of each word in the text belonging to each newsgroup
    probabilities = {}
    i=0
    for f_in in range(len(folders)):
        probabilities[folders[f_in]] = np.log(posteriors[folders[f_in]])
        for i in range(len(text)):
            probabilities[folders[f_in]] += np.log(conditional_probabilities.values[f_in][i])*text[i]
    # return the newsgroup with the highest probability
    # print(probabilities)
    return max(probabilities, key=probabilities.get)

In [None]:
def predict_data(x_test):
    predictions = []
    for i in range(len(x_test)):
        print(':',end=" ")
        predictions.append(predict(x_test[i]))
    return predictions
    

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
y_pred=predict_data(x_test)
# print(y_pred)

: : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 

In [None]:
sum(y_pred==y_test)/len(y_test)

0.596