In [1]:
import pandas as pd 
import numpy as np 
from collections import defaultdict

In [19]:
class NaiveBayes:
    
    def __init__(self, unique_classes):
        
        self.classes=unique_classes # Constructor is sinply passed with unique number of classes of the training set
       
    def addToBow(self, example, dict_index):
        column_name = example.name
        
        self.bow_dicts[dict_index][column_name] = example.value_counts() 
    
    
    def train(self, dataset, labels):
        
        
        self.examples=dataset
        self.labels=labels
        self.bow_dicts=np.array([defaultdict(lambda: defaultdict(lambda: 0)) for index in range(self.classes.shape[0])])
        
        
        #constructing BoW for each category
        for cat_index, cat in enumerate(self.classes):
          
            all_cat_examples=self.examples[self.labels==cat] #filter all examples of category == cat
            columns = dataset.columns
            
            #now costruct BoW of this particular category
            #np.apply_along_axis(self.addToBow, 0, all_cat_examples, columns,cat_index)
            all_cat_examples.apply(self.addToBow, axis = 0, dict_index = cat_index)
            
        #self.prob_classes=np.array([defaultdict(lambda: defaultdict(lambda: 0)) for index in range(self.classes.shape[0])])
        
        #for cat_index,cat in enumerate(self.classes):
                
            #Calculating prior probability p(c) for each class
        #    for key, value in nb.bow_dicts[cat_index].items():
                
        #        self.prob_classes[cat_index][value.name] = value/dataset.shape[0]
        
        self.prob_classes=np.empty(self.classes.shape[0])
        self.cat_word_counts=np.array([defaultdict(lambda: defaultdict(lambda: 0)) for index in range(self.classes.shape[0])])
        self.all_words=np.array([defaultdict(lambda: list()) for index in range(self.classes.shape[0])])
        
        for cat_index,cat in enumerate(self.classes):
        
            #Calculating prior probability p(c) for each class
            self.prob_classes[cat_index]=np.sum(self.labels==cat)/float(self.labels.shape[0]) 
            
            for key, value in nb.bow_dicts[cat_index].items():
                self.cat_word_counts[cat_index][value.name] = self.bow_dicts[cat_index][value.name] + 1
                values = self.bow_dicts[cat_index][value.name].keys()
                self.all_words[cat_index][value.name] += list(self.bow_dicts[cat_index][value.name].keys())
                self.all_words[cat_index][value.name] = list(set(self.all_words[cat_index][value.name]))
                
                
    
        
        
    def getExampleProb(self, test_example_row, columns):      
        
        likelihood_prob=np.zeros(self.classes.shape[0])
        
        #finding probability w.r.t each class of the given test example
        for cat_index,cat in enumerate(self.classes): 
            
            for column in columns:
                
                value_row_column = test_example_row[column]
                #get total count of this test token from it's respective training dict to get numerator value                           
                test_token_counts=self.bow_dicts[cat_index][column].get(value_row_column,0)+1
                
                #now get likelihood of this test_token word  
                denom = self.cat_word_counts[cat_index][column].get(value_row_column,0)+len(self.all_words[cat_index][column])+1
                test_token_prob=test_token_counts/float(denom) 
                
                #remember why taking log? To prevent underflow!
                likelihood_prob[cat_index]+=np.log(test_token_prob)
                
        # we have likelihood estimate of the given example against every class but we need posterior probility
        post_prob=np.empty(self.classes.shape[0])
        for cat_index,cat in enumerate(self.classes):
            post_prob[cat_index]=likelihood_prob[cat_index]+np.log(self.prob_classes[cat_index])                                  
      
        return post_prob
    
    def test(self, test_set):
            
        predictions=[] #to store prediction of each test example
        columns = list(test_set.columns)
        print(test_set.shape)

        for index, row in test_data.iterrows(): 
            #simply get the posterior probability of every example                                  
            post_prob=self.getExampleProb(row, columns) #get prob of this example for both classes

            #simply pick the max value and map against self.classes!
            predictions.append(self.classes[np.argmax(post_prob)])

        return np.array(predictions) 


In [3]:
a = np.arange(1, 21)

In [4]:
a

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])

In [5]:
np.split(a, [int(.7 * len(a)), int(.8 * len(a))])

[array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 array([15, 16]),
 array([17, 18, 19, 20])]

In [6]:
df = pd.read_csv("./data/mushrooms.csv")

In [7]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [8]:
train, validate, test = np.split(df, [int(.7 * len(df)), int(.8 * len(df))])

In [9]:
print(train.shape)
print(validate.shape)
print(test.shape)

(5686, 23)
(813, 23)
(1625, 23)


In [10]:
# Get all data and labels in particular dataframes
train_data = train.drop(columns='class')
train_labels = train['class']

validate_data = validate.drop(columns='class')
validate_labels = validate['class']

test_data = test.drop(columns='class')
test_labels = test['class']


In [11]:
print(train_data.shape)
print(train_labels.shape)

print(validate_data.shape)
print(validate_labels.shape)

print(test_data.shape)
print(test_labels.shape)


(5686, 22)
(5686,)
(813, 22)
(813,)
(1625, 22)
(1625,)


In [12]:
np.unique(train_labels)

array(['e', 'p'], dtype=object)

In [13]:

nb=NaiveBayes(np.unique(train_labels)) #instantiate a NB class object

print ("---------------- Training In Progress --------------------")
 
nb.train(train_data, train_labels) #start tarining by calling the train function

print ('----------------- Training Completed ---------------------')

---------------- Training In Progress --------------------
----------------- Training Completed ---------------------


In [14]:
pclasses=nb.test(test_data)

In [15]:
#check how many predcitions actually match original test labels
test_acc=np.sum(pclasses==test_labels)/float(test_labels.shape[0]) 

print ("Test Set Examples: ",test_labels.shape[0])
print ("Test Set Accuracy: ",test_acc*100,"%")

Test Set Examples:  1625
Test Set Accuracy:  89.66153846153846 %


In [20]:
validatePclasses=nb.test(validate_data)


In [21]:
validate_data.shape

(813, 22)

In [22]:
validatePclasses.shape

(1625,)

In [None]:
validate_labels.shape

In [None]:

#check how many predcitions actually match original test labels
validate_acc=np.sum(pclasses_v==validate_labels)/float(validate_labels.shape[0]) 

print ("Validate Set Examples: ",validate_labels.shape[0])
print ("Validate Set Accuracy: ",validate_acc*100,"%")