In [19]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [31]:
# Load the mushroom dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../datasets/bays/mushrooms.csv")

In [32]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [33]:
le = LabelEncoder()

In [34]:
# Integer-encode every column of the frame. The single shared encoder is
# re-fit on each column in turn, so afterwards `le.classes_` only describes
# whichever column was fit last, not the whole frame.
df = df.apply(lambda column: le.fit_transform(column))

In [35]:
# Split the encoded frame into the target column and the remaining feature columns.
y = df["type"]
X = df.drop(columns=["type"])

In [36]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
cap_shape                   8124 non-null int64
cap_surface                 8124 non-null int64
cap_color                   8124 non-null int64
bruises                     8124 non-null int64
odor                        8124 non-null int64
gill_attachment             8124 non-null int64
gill_spacing                8124 non-null int64
gill_size                   8124 non-null int64
gill_color                  8124 non-null int64
stalk_shape                 8124 non-null int64
stalk_root                  8124 non-null int64
stalk_surface_above_ring    8124 non-null int64
stalk_surface_below_ring    8124 non-null int64
stalk_color_above_ring      8124 non-null int64
stalk_color_below_ring      8124 non-null int64
veil_type                   8124 non-null int64
veil_color                  8124 non-null int64
ring_number                 8124 non-null int64
ring_type                   8

In [37]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [38]:
le.classes_

array(['d', 'g', 'l', 'm', 'p', 'u', 'w'], dtype=object)

In [39]:
set(df["habitat"])

{0, 1, 2, 3, 4, 5, 6}

In [40]:
set(df["type"])

{0, 1}

In [74]:
def check_prior(data, value):
    """Return the fraction of entries in ``data`` that are equal to ``value``.

    Parameters
    ----------
    data : pandas.Series, numpy.ndarray, list or tuple
        Observed labels. Must be non-empty; the result is divided by
        ``len(data)``.
    value : scalar
        The label whose relative frequency is wanted.

    Returns
    -------
    float
        ``count(data == value) / len(data)``.
    """
    # A plain list/tuple compared with a scalar yields a single ``False``
    # instead of an element-wise mask, which would silently give 0.0.
    # Convert those to an array first; array-likes are left untouched.
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)
    return np.sum(data == value) / len(data)

In [75]:
X.loc[0]

cap_shape                   5
cap_surface                 2
cap_color                   4
bruises                     1
odor                        6
gill_attachment             1
gill_spacing                0
gill_size                   1
gill_color                  4
stalk_shape                 0
stalk_root                  3
stalk_surface_above_ring    2
stalk_surface_below_ring    2
stalk_color_above_ring      7
stalk_color_below_ring      7
veil_type                   0
veil_color                  2
ring_number                 1
ring_type                   4
spore_print_color           2
population                  3
habitat                     5
Name: 0, dtype: int64

In [77]:
# `prior` was not assigned in any earlier cell, so this cell failed on a fresh
# kernel. Build the per-class prior from the labels before displaying it.
prior = {klass: check_prior(y, klass) for klass in set(y)}
prior

{0: 0.517971442639094, 1: 0.48202855736090594}

In [124]:
class NB:
    """Naive Bayes classifier for categorical features held in a DataFrame.

    ``fit`` estimates, for every class, the prior ``P(class)`` and a table
    ``P(column == value | class)`` for every distinct value seen in each
    column. ``predict_point`` multiplies the prior by the matching table
    entries and returns the class with the largest product.

    Parameters
    ----------
    alpha : float, default 0.0
        Additive (Laplace) smoothing strength. With the default of 0.0 the
        probabilities are raw relative frequencies, so a value never observed
        together with a class gets probability 0.0 and vetoes that class.
        A positive ``alpha`` replaces each estimate with
        ``(count + alpha) / (n_class_rows + alpha * n_distinct_values)``.

    Attributes
    ----------
    model : dict
        ``model[klass][col][val]`` is the estimated ``P(col == val | klass)``.
    prior : dict
        ``prior[klass]`` is the fraction of training rows labelled ``klass``.
    """

    def __init__(self, alpha=0.0):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha
        self.model = {}
        self.prior = {}
        # Probability used for a feature value that was not seen during fit,
        # keyed as _unseen[klass][col].
        self._unseen = {}

    def fit(self, X, y):
        """Estimate class priors and per-column conditional probabilities.

        Parameters
        ----------
        X : pandas.DataFrame
            One row per sample, one categorical feature per column.
        y : pandas.Series, numpy.ndarray or list
            Class label for each row of ``X``, in the same row order.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        labels = np.asarray(y)
        if len(labels) != len(X):
            raise ValueError("X and y must have the same number of rows")

        klasses = list(set(y))
        # The distinct values of a column do not depend on the class, so
        # collect them once instead of once per class.
        col_values = {col: set(X[col]) for col in X.columns}

        model = {}
        prior = {}
        unseen = {}
        for klass in klasses:
            # Positional boolean mask: works for list/ndarray labels and does
            # not rely on X and y sharing an index.
            mask = labels == klass
            select = X.loc[mask]
            n_rows = len(select)

            prior[klass] = np.sum(mask) / len(labels)
            model[klass] = {}
            unseen[klass] = {}
            for col, values in col_values.items():
                denom = n_rows + self.alpha * len(values)
                model[klass][col] = {
                    val: (np.sum(select[col] == val) + self.alpha) / denom
                    for val in values
                }
                # A value absent from the training data is treated as a
                # zero-count value of this column.
                unseen[klass][col] = self.alpha / denom

        self.model = model
        self.prior = prior
        self._unseen = unseen

    def predict_point(self, point):
        """Return the most probable class label for a single sample.

        Parameters
        ----------
        point : mapping or pandas.Series
            Feature values indexed by the column names used in ``fit``.

        Returns
        -------
        The class label (as passed to ``fit``) with the largest
        ``prior * product of conditional probabilities``. Ties go to the
        class that comes first in ``self.prior``.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.prior:
            raise ValueError("NB instance is not fitted yet; call fit(X, y) first")

        klasses = list(self.prior)
        scores = []
        for klass in klasses:
            p = self.prior[klass]
            for col, table in self.model[klass].items():
                fallback = self._unseen.get(klass, {}).get(col, 0.0)
                # .get avoids a KeyError for values never seen during fit.
                p *= table.get(point[col], fallback)
            scores.append(p)
        # argmax is a position in `scores`; map it back to the class label so
        # the result is correct even when labels are not 0..k-1.
        return klasses[int(np.argmax(scores))]

    def predict(self, X):
        """Return the predicted class label for every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Samples with the same columns that were used in ``fit``.

        Returns
        -------
        numpy.ndarray
            One predicted label per row, in row order.
        """
        return np.array([self.predict_point(row) for _, row in X.iterrows()])

In [125]:
nb = NB()

In [126]:
nb.fit(X, y)

In [127]:
# Spot-check: classify the first ten training rows one at a time.
for row_label in range(10):
    predicted = nb.predict_point(X.loc[row_label])
    print(predicted)

1
0
0
1
0
0
0
0
1
0


In [108]:
y[:10]

0    1
1    0
2    0
3    1
4    0
5    0
6    0
7    0
8    1
9    0
Name: type, dtype: int64

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1
5,5,3,9,1,0,1,0,0,5,0,...,2,7,7,0,2,1,4,2,2,1
6,0,2,8,1,0,1,0,0,2,0,...,2,7,7,0,2,1,4,2,2,3
7,0,3,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,3,3
9,0,2,9,1,0,1,0,0,2,0,...,2,7,7,0,2,1,4,2,3,3
10,5,3,9,1,3,1,0,0,2,0,...,2,7,7,0,2,1,4,3,2,1
11,5,3,9,1,0,1,0,0,5,0,...,2,7,7,0,2,1,4,2,3,3
12,0,2,9,1,0,1,0,0,10,0,...,2,7,7,0,2,1,4,3,3,1


In [122]:
nb.model[0]['cap_shape']

{0: 0.09600760456273764,
 1: 0.0,
 2: 0.37927756653992395,
 3: 0.05418250950570342,
 4: 0.0076045627376425855,
 5: 0.4629277566539924}

In [123]:
nb.model[1]['cap_shape']

{0: 0.012257405515832482,
 1: 0.0010214504596527069,
 2: 0.39734422880490294,
 3: 0.15321756894790603,
 4: 0.0,
 5: 0.43615934627170583}

0
1
2
3
4
