In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
data = pd.read_csv('mushrooms.csv')

In [68]:
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [69]:
data.shape

(8124, 23)

In [70]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [71]:
le = LabelEncoder()
data = data.apply(func=le.fit_transform)

In [72]:
data_arr = data.values

In [73]:
X = data_arr[:,1:]

In [74]:
y = data_arr[:,0]

In [75]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [76]:
y_train[:5]

array([0, 1, 0, 0, 1])

In [110]:
class CustomNB:
    def fit(self,X,y):
        self.__X = X
        self.__y = y
    
    def prior_prob(self, label): 
        total = self.__y.shape[0]
        class_examples = np.sum(self.__y == label)
        return class_examples / float(total)  
    
    def conditional_prob(self,feature_col,feature_val,label):
        X_filtered = self.__X[self.__y==label]
        numerator = np.sum(X_filtered[:,feature_col] == feature_val)
        denominator = len(X_filtered)
        return numerator/denominator
    
    def predict_point(self,X):
        classes = np.unique(self.__y)
        n_features = self.__X.shape[1]
        
        post_prob = []
        for label in classes:
            likehood = 1
            for feature in range(n_features):
                cond_prob = self.conditional_prob(feature,X[feature],label)
                likehood *= cond_prob
            prior = self.prior_prob(label)
            post_prob.append(prior*likehood)
        
        return np.argmax(post_prob)
    
    def predict(self,X):
        res = []
        for row in X:
            res.append(self.predict_point(row))
        return np.array(res)
    
    def score(self,X,y):
        return (self.predict(X) == y).mean()
    
    

In [111]:
model = CustomNB()

In [112]:
model.fit(X_train,y_train)


In [113]:
model.predict(X_test[:10])

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0], dtype=int32)

In [114]:
model.score(X_test,y_test)

0.9973890339425587

In [115]:
X_test[1]

array([2, 2, 2, 0, 8, 1, 0, 1, 0, 1, 0, 2, 2, 6, 6, 0, 2, 1, 0, 7, 4, 2])

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
le = LabelEncoder()

In [5]:
data = data.apply(func=le.fit_transform)

In [6]:
data = data.values

In [42]:
class NBMD:
    def fit(self,data,class_index):
        self.__data_dict = dict()
        self.class_indx = class_index
        for each in data:
            clss = each[class_index]
            if clss not in self.__data_dict:
                self.__data_dict[clss] = list()
            self.__data_dict[clss].append(np.delete(each,class_index))
        
        self.calc()
        
    def mean(self,nums):
        ans = sum(nums) / float(len(nums))
        return ans
    
    def stdev(self,nums):
        avg = self.mean(nums)
        var = np.sum((nums - avg)**2) / float(len(nums) - 1)
        ans = math.sqrt(var)
        return ans
    
    def calc(self):
        self.__Summary = dict()
        for clss in self.__data_dict.keys():
            summary = [[self.mean(column),self.stdev(column),len(column)] for column in zip(*self.__data_dict[clss])]
            if(clss not in self.__Summary):
                self.__Summary[clss] = list()
            self.__Summary[clss] = summary
    
    def predict_single(self,X):
        prob = list()
        for key in self.__data_dict.keys():
            likehood = 1
            for column,feature in enumerate(X):
                likehood *= self.gaussian_pdf(feature,column,key)
            prob.append(likehood)
                
        return np.argmax(prob)
    
    def predict(self,X):
        #print(self.__Summary)
        ans = []
        for each in X:
            each = np.delete(each,self.class_indx)
            ans.append(self.predict_single(each))
        return np.array(ans)
    
    def gaussian_pdf(self,feature,col,key):
        avg = self.__Summary[key][col][0]
        stdev = self.__Summary[key][col][1]
        if(stdev!=0):
            #print(f'keys : {self.__Summary.keys()} , {len(self.__Summary[key][0])}')
            p2 = math.exp(-(feature - avg)**2 / 2 * stdev**2)
            #print(f'stdev : {stdev} , col : {col} , avg : {avg} , len(sum) : {len(self.__Summary[key])}')
            return (1 / stdev * math.sqrt(2*math.pi)) * p2
        return 1
    
    def score(self,X_test,y_test):
        return (self.predict(X_test) == y_test).mean()
        

In [19]:
arr = np.array([1,2,3])
np.sum((arr-1)**2)

5

In [7]:
dic = {1:[
    np.array([1,2,3,4]),
    np.array([4,5,6,7]),
    np.array([10,20,30,40]),
    np.array([11,12,13,14])
        ],
      0:[
    np.array([0,12,14,16]),
    np.array([100,101,102,103]),
    np.array([10,20,30,40]),
    np.array([11,12,13,14])
      ]}

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X = data

In [10]:
y = data[:,0]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [43]:
model = NBMD()

In [44]:
model.fit(X_train,0)

In [45]:
model.predict(X_test[:10])

array([1, 1, 1, 0, 1, 1, 1, 1, 0, 0], dtype=int64)

In [46]:
y_test[:10]

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0])

In [47]:
model.score(X_test,y_test)

0.7094367773218948

In [73]:
def fun(t):
    return t if t else 1

In [75]:
fun(4)

4