## Implementing multinomial Naive Bayes classifier on ‘20 Newsgroups Dataset’

In [58]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\RAZORBLADE\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [60]:
pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp39-cp39-win_amd64.whl (7.2 MB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Using legacy 'setup.py install' for sklearn, since package 'wheel' is not installed.
Installing collected packages: threadpoolctl, scikit-learn, sklearn
    Running setup.py install for sklearn: started
    Running setup.py install for sklearn: finished with status 'done'
Successfully installed scikit-learn-1.0.2 sklearn-0.0 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\RAZORBLADE\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import os
import nltk
from string import punctuation
from nltk.corpus import stopwords

## Get the training data

In [2]:
mypath = r"C:\Users\RAZORBLADE\Desktop\20news-19997\20_newsgroups"
files = [f for f in os.listdir(mypath)]
files.sort()
files

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Stop Words

Stop words are words that show up a lot in every document (e.g. prepositions and pronouns). Since stop words are of no use for us we will not consider them


In [3]:
nltk.download('stopwords')
nltk.download('punkt')
punctuations=list(punctuation)
stopWords=stopwords.words('english')
stopWords+=punctuations 
len(stopWords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAZORBLADE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAZORBLADE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


211

## Our Vocabulary/Feature Set

In [6]:
# vocab_dict will be a dictionary of the form {word: frequency} over all documents
vocab_dict = {}
for file in files:
    for doc in os.listdir(os.path.join(mypath, file)):
        with open(os.path.join(mypath, file,doc), 'r') as f:
            text = f.read()
            text = text.lower()
            for token in text.split():
                if token in vocab_dict:
                    vocab_dict[token] += 1
                elif len(token) >=5 and token not in stopWords:
                    vocab_dict[token] = 1

len(vocab_dict)


390200

## Final Feature List

In [7]:
# Sort the dictionary based on frequency of each 'possible' vocabulary word
sorted_vocab=sorted(vocab_dict.items(),key=operator.itemgetter(1),reverse=True)
# Choosing top 2000 vocab words as features
feature_list=[]
for key in sorted_vocab:
    feature_list.append(key[0])
feature_list=feature_list[0:2000] # K = 2000 (number of words in vocab)

## Data Preparation

In [48]:
def prepare_data(folder_location,file_location):
    '''
    :param file_location : location of folder to distinguish between training and test folder
    :return : Y as an array conatining all corresponding newsgroups and X as dataframe values where each column represents a particular word
                and the value is the count of the word in a given document/article.
    '''

    my_path = os.path.join(r"C:\Users\RAZORBLADE\Desktop",folder_location,file_location)
    df = pd.DataFrame(columns = feature_list)
    Y=[]
    for file in files:
        for doc in os.listdir(os.path.join(my_path, file)):
            Y.append(file)
            # Add a new row for every file
            df.loc[len(df)] = np.zeros(len(feature_list))
            with open(os.path.join(my_path, file,doc), 'r') as f:
                text = f.read()
                text = text.lower()
                for token in text.split():
                    if token.lower() in feature_list:
                        df[token.lower()][len(df)-1] += 1 
        
    # add Y as a column to dataframe
    df['newsgroups']=Y
    return df

    

## Getting Test Data and Training Data

In [None]:
df = prepare_data("20news-19997","20_newsgroups")

In [89]:
X = df.drop(['newsgroups'],axis=1).values
#remove last column of X

Y = df['newsgroups'].values
#check if there is a NaN value in df
df.head()


Unnamed: 0,subject:,from:,date:,newsgroups:,message-id:,lines:,path:,organization:,would,writes:,...,(including,resulting,yesterday,planetary,pitching,previously,replies,helps,cities,newsgroups
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alt.atheism
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,alt.atheism
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alt.atheism
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alt.atheism
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alt.atheism


In [68]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,random_state=0,test_size=0.25)

## Using the inbuilt Multinomial Naive Bayes

In [69]:
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB()
clf.fit(x_train,y_train)
clf.score(x_test,y_test)

0.829

## Posteriors

In [72]:
# calculate posteriors for each type of file
posteriors ={}
for file in files:
    posteriors[file] = len(y_train[y_train==file]) / len(y_train)
    print(file,posteriors[file])
    

alt.atheism 0.05114356204574248
comp.graphics 0.04980996199239848
comp.os.ms-windows.misc 0.05007668200306728
comp.sys.ibm.pc.hardware 0.05067680202707208
comp.sys.mac.hardware 0.050943522037740885
comp.windows.x 0.05067680202707208
misc.forsale 0.04927652197106088
rec.autos 0.04874308194972328
rec.motorcycles 0.047742881909715276
rec.sport.baseball 0.05014336200573448
rec.sport.hockey 0.05127692205107688
sci.crypt 0.05114356204574248
sci.electronics 0.05041008201640328
sci.med 0.04960992198439688
sci.space 0.05027672201106888
soc.religion.christian 0.04967660198706408
talk.politics.guns 0.05007668200306728
talk.politics.mideast 0.047942921917716874
talk.politics.misc 0.04940988197639528
talk.religion.misc 0.050943522037740885


## CCD

In [101]:
# calculate class conditional probabilities p(x|y) where x is a word and y is a newsgroup for each word in the vocabulary as a dataframe
conditional_probabilities = pd.DataFrame(columns=feature_list)
i=0;
for file in files:
    # add a new row for each file
    conditional_probabilities.loc[len(conditional_probabilities)] = np.zeros(len(feature_list))
    # use df calculated above to calculate conditional probabilities
    for word in feature_list:
        conditional_probabilities[word][i] = (df[word][df['newsgroups']==file].sum() + 1) / (len(df[df['newsgroups']==file]) + len(feature_list))
        i+=1
conditional_probabilities.head()


Unnamed: 0,subject:,from:,date:,newsgroups:,message-id:,lines:,path:,organization:,would,writes:,...,conditions,(including,resulting,yesterday,planetary,pitching,previously,replies,helps,cities
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Multinomial Naive Bayes Classifier

In [102]:
def predict(text):
    # calculate the probability of each word in the text belonging to each newsgroup
    probabilities = {}
    for file in files:
        probabilities[file] = np.log(posteriors[file])
        for word in text:
            if word in feature_list:
                probabilities[file] += np.log(conditional_probabilities[file][word])*vocab_dict[word]
    # return the newsgroup with the highest probability
    return max(probabilities, key=probabilities.get)

In [106]:
def predict_data(x_test):
    predictions = []
    for i in range(len(x_test)):
        predictions.append(predict(x_test[i]))
    return predictions

In [107]:
from sklearn.metrics import classification_report,confusion_matrix
y_pred=predict_data(x_test)
print(confusion_matrix(y_pred,y_test))
print(classification_report(y_pred,y_test))