## Naive Bayes

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Use the mushroom dataset

In [6]:
# Load the raw mushroom data; the path is relative to the notebook's working directory.
df = pd.read_csv('bayes/mushrooms.csv')

In [7]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [8]:
df.shape

(8124, 23)

In [9]:
# Convert the categorical columns to numeric codes

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [10]:
 le = LabelEncoder()  # one encoder instance, reused for every column in the next cell

In [11]:
# DataFrame.apply calls le.fit_transform once per column, so every column gets its
# own integer coding. The single `le` is re-fitted on each call, so afterwards it
# presumably only remembers the classes of the last column processed -- keep one
# encoder per column if the codes ever need to be mapped back to the original letters.
ds = df.apply(func=le.fit_transform) # this will be applied to every column

In [12]:
ds.head()


Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [13]:
# Column 0 of the encoded frame is the `type` label; every remaining column is a feature.
data = ds.values
X, y = data[:, 1:], data[:, 0]

In [14]:
X.shape, y.shape

((8124, 22), (8124,))

In [15]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Custom NB

In [25]:
class CustomNB:
    """Naive Bayes classifier for categorical features, built on raw frequency counts.

    ``fit`` only stores the training data; every probability is counted from it
    on demand at prediction time. No smoothing is applied, so a feature value
    that never occurs together with a class gives that class a likelihood of
    zero.
    """

    def fit(self, X, y):
        """Store the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Encoded feature values, one row per example.
        y : array-like of shape (n_samples,)
            Class label of each row of ``X``.
        """
        # Coerce to arrays so the boolean-mask indexing used below also works
        # when plain Python lists are passed in.
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def prior_prob(self, label):
        """Return the prior P(y = label): the fraction of training rows with ``label``."""
        total = self.y_train.shape[0]
        # Summing a boolean mask counts its True entries.
        class_examples = np.sum(self.y_train == label)
        return class_examples / total

    def conditional_probability(self, feature_col, feature_val, label):
        """Return P(x[feature_col] = feature_val | y = label).

        Computed as the fraction of training rows of class ``label`` whose
        feature ``feature_col`` equals ``feature_val``. Returns 0.0 when
        ``label`` does not occur in the training data (instead of dividing
        by zero).
        """
        X_filtered = self.X_train[self.y_train == label]  # rows of this class only
        denominator = len(X_filtered)
        if denominator == 0:
            return 0.0
        numerator = np.sum(X_filtered[:, feature_col] == feature_val)
        return numerator / denominator

    def predict_point(self, X_test):
        """Predict the class label of a single example.

        Parameters
        ----------
        X_test : array-like of shape (n_features,)
            One example, with features in the same order as the training data.

        Returns
        -------
        The class label (a value taken from the training labels) with the
        largest posterior, prior * product of per-feature conditionals. Ties,
        including the case where every class has zero posterior, resolve to
        the smallest label.
        """
        classes = np.unique(self.y_train)  # sorted distinct labels
        point = np.asarray(X_test)
        log_posteriors = []
        # Work in log space: a product of many probabilities underflows to 0.0
        # for wide feature sets, whereas a sum of logs does not. log(0) = -inf
        # is the intended value for impossible combinations, so silence the
        # divide-by-zero warning numpy emits for it.
        with np.errstate(divide="ignore"):
            for label in classes:
                class_rows = self.X_train[self.y_train == label]
                # Per-feature P(x_i | y = label) for all features at once:
                # the fraction of this class's rows that match the point.
                conditionals = (class_rows == point).mean(axis=0)
                log_likelihood = np.sum(np.log(conditionals))
                log_posteriors.append(np.log(self.prior_prob(label)) + log_likelihood)

        # argmax yields a position in `classes`; map it back to the actual label
        # so that labels other than 0..k-1 are reported correctly.
        return classes[np.argmax(log_posteriors)]

    def predict(self, X_test):
        """Return a list with the predicted class label for every row of ``X_test``."""
        return [self.predict_point(point) for point in X_test]

    def score(self, X_test, y_test):
        """Return the accuracy: the fraction of rows of ``X_test`` predicted as ``y_test``."""
        # Convert both sides so the comparison is element-wise even when
        # y_test is a plain list.
        predictions = np.asarray(self.predict(X_test))
        return (predictions == np.asarray(y_test)).mean()
    

In [26]:
# CustomNB defines no __init__, so there are no hyperparameters to pass here.
model = CustomNB()

In [27]:
# fit() only stores the training arrays; probabilities are counted later, at prediction time.
model.fit(X_train, y_train)


In [28]:
model.predict(X_test[:10])

[0, 1, 1, 0, 1, 1, 1, 1, 0, 0]

In [29]:
y_test[:10]

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0])

In [32]:
# Accuracy on the first 1000 test rows only; the full test split is not scored here.
# Presumably a runtime concession -- every prediction recounts frequencies from the
# training data. TODO confirm and score the whole split if feasible.
model.score(X_test[:1000], y_test[:1000])

0.999