In [34]:
import pandas as pd
import numpy as np

In [67]:
# Load the mushroom dataset from a CSV file in the current working directory.
data = pd.read_csv('mushrooms.csv')

In [68]:
# Preview the first rows to check that the file parsed as expected.
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [69]:
# (rows, columns) of the loaded frame.
data.shape

(8124, 23)

In [70]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [71]:
# Replace every categorical value with an integer code, column by column.
# NOTE(review): the same encoder instance is refit for each column, so after
# this line `le` presumably only holds the mapping of the last column --
# keep one encoder per column if the original letters are needed later.
# `data` is also overwritten, so the raw frame is no longer available.
le = LabelEncoder()
data = data.apply(func=le.fit_transform)

In [72]:
# Switch from the DataFrame to its underlying NumPy array for slicing below.
data_arr = data.values

In [73]:
# Features: every column except the first.
X = data_arr[:,1:]

In [74]:
# Target: the first column (shown as `class` in the data.head() output).
y = data_arr[:,0]

In [75]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across kernel restarts.
# (The pasted REPL continuation prompt "..." was removed: IPython silently
# strips it, but it is a SyntaxError when the cell is run as plain Python.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [76]:
# Peek at the first few encoded training labels.
y_train[:5]

array([0, 1, 0, 0, 1])

In [110]:
class CustomNB:
    """Naive Bayes classifier for categorical features.

    Probabilities are plain relative frequencies counted on the training
    data. No smoothing is applied, so a feature value never seen together
    with a class makes that class's likelihood exactly zero.
    """

    def fit(self, X, y):
        """Store the training data and pre-group its rows by class.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Categorical feature values (any dtype comparable with ``==``).
        y : array-like of shape (n_samples,)
            Class label of each row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D with one label per row of
            ``X``, or the training set is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-dimensional with one label per row of X")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")

        self.__X = X
        self.__y = y
        # Computed once here instead of once per predicted point.
        self.__classes = np.unique(y)
        # Group the rows by class up front; otherwise every conditional_prob
        # call would have to re-mask and copy the whole training matrix.
        self.__rows_by_class = {label: X[y == label] for label in self.__classes}

    def prior_prob(self, label):
        """Return P(y == label): the fraction of training rows with ``label``."""
        total = self.__y.shape[0]
        class_examples = np.sum(self.__y == label)
        return class_examples / float(total)

    def conditional_prob(self, feature_col, feature_val, label):
        """Return P(X[feature_col] == feature_val | y == label).

        This is the fraction of training rows of class ``label`` whose
        column ``feature_col`` equals ``feature_val``. A label that has no
        training rows yields 0.0 instead of a 0/0 NaN.
        """
        rows = self.__rows_by_class.get(label)
        if rows is None or len(rows) == 0:
            return 0.0
        matches = np.sum(rows[:, feature_col] == feature_val)
        return matches / len(rows)

    def predict_point(self, X):
        """Predict the class label of a single example.

        ``X`` is one row holding ``n_features`` values. The score of each
        class is ``prior * product of per-feature conditional
        probabilities``; the label with the highest score is returned
        (ties go to the first label in sorted order).
        """
        point = np.asarray(X)
        n_features = self.__X.shape[1]

        posteriors = []
        for label in self.__classes:
            likelihood = 1.0
            for feature in range(n_features):
                likelihood *= self.conditional_prob(feature, point[feature], label)
            posteriors.append(self.prior_prob(label) * likelihood)

        # argmax gives a position in `classes`; map it back to the actual
        # label so the result is also right when labels are not 0..k-1.
        return self.__classes[np.argmax(posteriors)]

    def predict(self, X):
        """Predict a label for every row of ``X``; returns a 1-D array."""
        return np.array([self.predict_point(row) for row in np.asarray(X)])

    def score(self, X, y):
        """Return the accuracy: the fraction of rows of ``X`` predicted as ``y``."""
        return (self.predict(X) == np.asarray(y)).mean()
    
    

In [105]:
class CustomNB2:
    """Naive Bayes classifier for categorical features (annotated version).

    Probabilities are plain relative frequencies counted on the training
    data. No smoothing is applied, so a feature value never seen together
    with a class makes that class's likelihood exactly zero.
    """

    def fit(self, X, y):
        """Store the training data and pre-group its rows by class.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Categorical feature values (any dtype comparable with ``==``).
        y : array-like of shape (n_samples,)
            Class label of each row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D with one label per row of
            ``X``, or the training set is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-dimensional with one label per row of X")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")

        self.X_train = X
        self.y_train = y
        # Sorted distinct labels, computed once instead of per predicted point.
        self.classes_ = np.unique(y)
        # All training examples of each class, grouped once so that
        # conditional_prob does not re-filter the full matrix on every call.
        self._rows_by_class = {label: X[y == label] for label in self.classes_}

    # label = which class you want this probability for
    def prior_prob(self, label):
        """Return P(y == label): the fraction of training rows with ``label``."""
        total = self.y_train.shape[0]
        class_examples = np.sum(self.y_train == label)
        # float() forces true division regardless of the integer types involved
        return class_examples / float(total)

    # P(Xi = feature_val | y = label) for the i-th feature (feature_col = i)
    def conditional_prob(self, feature_col, feature_val, label):
        """Return P(X[feature_col] == feature_val | y == label).

        Out of all training examples that belong to class ``label``, this is
        the fraction whose column ``feature_col`` equals ``feature_val``.
        A label that has no training rows yields 0.0 instead of a 0/0 NaN.
        """
        X_filtered = self._rows_by_class.get(label)  # all examples in class label
        if X_filtered is None or len(X_filtered) == 0:
            return 0.0
        numerator = np.sum(X_filtered[:, feature_col] == feature_val)
        denominator = len(X_filtered)
        return numerator / denominator

    # the conditional probability is evaluated for every feature of the example
    def predict_point(self, X_test):
        """Predict the class label of a single example.

        ``X_test`` is one example with ``n_features`` values. The label with
        the highest ``prior * likelihood`` is returned (ties go to the first
        label in sorted order).
        """
        point = np.asarray(X_test)
        n_features = self.X_train.shape[1]

        posteriors = []
        # unnormalised posterior for each class
        for label in self.classes_:
            # posterior = prior * likelihood
            likelihood = 1.0
            for feature in range(n_features):
                likelihood *= self.conditional_prob(feature, point[feature], label)
            posteriors.append(self.prior_prob(label) * likelihood)

        # argmax gives a position in `classes_`; map it back to the actual
        # label so the result is also right when labels are not 0..k-1.
        return self.classes_[np.argmax(posteriors)]

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``; returns a 1-D array."""
        return np.array([self.predict_point(point) for point in np.asarray(X_test)])

    def score(self, X_test, y_test):
        """Return the accuracy: the fraction of rows predicted as ``y_test``."""
        return (self.predict(X_test) == np.asarray(y_test)).mean()

In [111]:
# Instantiate the hand-written classifier defined above.
model = CustomNB()

In [112]:
# Fit on the training split only; the test split stays unseen until scoring.
model.fit(X_train,y_train)


In [113]:
# Smoke test: predict just the first 10 held-out rows.
model.predict(X_test[:10])

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0], dtype=int32)

In [114]:
# Accuracy on the held-out split (fraction of rows predicted correctly).
model.score(X_test,y_test)

0.9973890339425587

In [115]:
# Inspect one encoded test example (one integer code per feature).
X_test[1]

array([2, 2, 2, 0, 8, 1, 0, 1, 0, 1, 0, 2, 2, 6, 6, 0, 2, 1, 0, 7, 4, 2])