In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the mushroom data set. The path is relative, so it resolves against the
# working directory the notebook kernel was started in.
df = pd.read_csv("../datasets/bayes/mushrooms.csv")

In [5]:
df["type"].unique()

array(['p', 'e'], dtype=object)

In [6]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [8]:
le = LabelEncoder()

In [9]:
# Integer-encode every column. DataFrame.apply calls le.fit_transform once per
# column, refitting the same encoder each time, so afterwards le.classes_ only
# describes the last column that was encoded (not the "type" target).
out = df.apply(le.fit_transform)

In [10]:
le.classes_

array(['d', 'g', 'l', 'm', 'p', 'u', 'w'], dtype=object)

In [11]:
out.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [21]:
class NB:
    """Naive Bayes classifier for categorical (discrete-valued) features.

    Every column of the training DataFrame is treated as an independent
    categorical feature. After ``fit`` the learned tables are stored in
    ``self.probs`` with the layout::

        {klass: {"prior":   P(klass),
                 "columns": {column: {value: P(value | klass)}},
                 "unseen":  {column: probability used for a value that was
                                     never observed together with klass}}}

    Parameters
    ----------
    alpha : float, default 0.0
        Additive (Laplace/Lidstone) smoothing strength. With the default of
        0.0 no smoothing is applied and a value never seen with a class
        gives that class zero probability. Must be non-negative.
    """

    def __init__(self, alpha=0.0):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and per-class value probabilities.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features, one categorical feature per column.
        y : pandas.Series, numpy.ndarray, list or tuple
            Class label for each row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        # `y == klass` must be element-wise, which plain sequences do not give.
        if isinstance(y, (list, tuple)):
            y = np.asarray(y)

        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("cannot fit NB on an empty dataset")
        if len(y) != n_samples:
            raise ValueError("X and y must have the same number of rows")

        alpha = self.alpha
        # Number of distinct values per feature; only affects the result
        # when alpha > 0 (smoothing denominator).
        n_categories = {
            column: len(np.unique(X[column])) for column in X.columns
        }

        probs = {}
        klasses, klass_counts = np.unique(y, return_counts=True)

        for klass, klass_count in zip(klasses, klass_counts):
            klass_cut = X.loc[y == klass]
            n_klass = len(klass_cut)

            columns = {}
            unseen = {}
            for column in klass_cut.columns:
                # Normalise by the size of this class, not by len(X):
                # dividing by len(X) would store the joint P(value, klass)
                # and multiply the prior into the score once per feature.
                denom = n_klass + alpha * n_categories[column]
                vals, val_counts = np.unique(
                    klass_cut[column], return_counts=True
                )
                columns[column] = dict(zip(vals, (val_counts + alpha) / denom))
                unseen[column] = alpha / denom

            probs[klass] = {
                "prior": klass_count / n_samples,
                "columns": columns,
                "unseen": unseen,
            }

        self.probs = probs

    def predict(self, X):
        """Return a list with the predicted class for every row of ``X``."""
        return [self.predict_point(row) for _, row in X.iterrows()]

    def predict_point(self, row):
        """Return the most probable class for a single sample.

        ``row`` must support ``row[column]`` for every feature column seen
        during ``fit`` (for example a row yielded by ``DataFrame.iterrows``).
        Scores are accumulated as log-probabilities so that many features do
        not underflow to zero. Ties are broken in favour of the largest
        class label.
        """
        results = []

        for klass, klass_d in self.probs.items():
            log_total = np.log(klass_d["prior"])
            unseen = klass_d.get("unseen", {})

            for column, pro in klass_d["columns"].items():
                p = pro.get(row[column], unseen.get(column, 0.0))
                if p > 0:
                    log_total += np.log(p)
                else:
                    # A zero factor makes the whole product zero.
                    log_total = -np.inf
                    break

            results.append((log_total, klass))

        return max(results)[1]

    def score(self, X, y):
        """Return the fraction of rows in ``X`` whose prediction equals ``y``.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly one label per row of ``X``.
        """
        # Compare as arrays so that a plain list for `y` is compared
        # element-wise instead of collapsing to a single bool.
        yp = np.asarray(self.predict(X))
        y_true = np.asarray(y)
        if yp.shape != y_true.shape:
            raise ValueError("X and y must have the same number of rows")
        return np.mean(yp == y_true)
            
        

In [22]:
# Hold out 33% of the rows for evaluation. Column 0 of `out` is used as the
# target and the remaining columns as features; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    out.iloc[:, 1:], out.iloc[:, 0], test_size=0.33, random_state=42)

In [23]:
model = NB()

In [24]:
model.fit(X_train, y_train)

In [25]:
model.predict(X_test[:10])

[0, 1, 1, 0, 1, 1, 1, 1, 0, 0]

In [26]:
model.score(X_test, y_test)

0.999627004848937

In [28]:
out.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [29]:
out.shape

(8124, 23)