In [42]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [26]:
# Load the mushroom table from a CSV file in the current working directory.
data_orig = pd.read_csv('./mushrooms.csv')

In [27]:
print(data_orig.shape)

(8124, 23)


In [28]:
data_orig.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [29]:
# Raw table as a NumPy array (values are still the original letter codes).
# Used by the LabelEncoder example below; `data` is rebound to the encoded table later on.
data = data_orig.values

In [30]:
print(data.shape)

(8124, 23)


# LabelEncoder example

In [31]:
# Take the first column of the raw array (the `type` column in the preview above)
# as a small sample to demonstrate label encoding on.
xtemp = data[:,0]
print(xtemp.shape)

(8124,)


In [32]:
print(xtemp[:10])

['p' 'e' 'e' 'p' 'e' 'e' 'e' 'e' 'p' 'e']


In [33]:
# Throwaway encoder used only for the single-column demonstration.
letemp = LabelEncoder()

In [34]:
# Learn the label -> integer mapping from the sample column, then encode that same column.
# `fit` returns the encoder itself, so the two steps can be chained.
xconv = letemp.fit(xtemp).transform(xtemp)

In [35]:
print(xconv)

[1 0 0 ..., 0 1 0]


In [36]:
letemp.classes_

array(['e', 'p'], dtype=object)

In [37]:
# Decode the integer codes back to labels to check that the encoding round-trips.
orig = letemp.inverse_transform(xconv)

In [38]:
print(orig[:10])
print(xtemp[:10])

['p' 'e' 'e' 'p' 'e' 'e' 'e' 'e' 'p' 'e']
['p' 'e' 'e' 'p' 'e' 'e' 'e' 'e' 'p' 'e']


# Dataset preprocessing

In [39]:
# Encoder used to turn the categorical columns of the full table into integer codes.
le = LabelEncoder()

In [40]:
# Encode every column to integer codes: `apply` calls `le.fit_transform` once per column.
# NOTE(review): the same `le` is refit on each column, so afterwards it only holds the
# mapping of the last column processed. Keep one encoder per column if the codes ever
# need to be decoded back to the original letters.
data = data_orig.apply(le.fit_transform)

In [41]:
print(data)

      type  cap_shape  cap_surface  cap_color  bruises  odor  gill_attachment  \
0        1          5            2          4        1     6                1   
1        0          5            2          9        1     0                1   
2        0          0            2          8        1     3                1   
3        1          5            3          8        1     6                1   
4        0          5            2          3        0     5                1   
5        0          5            3          9        1     0                1   
6        0          0            2          8        1     0                1   
7        0          0            3          8        1     3                1   
8        1          5            3          8        1     6                1   
9        0          0            2          9        1     0                1   
10       0          5            3          9        1     3                1   
11       0          5       

In [43]:
print(type(data))

<class 'pandas.core.frame.DataFrame'>


In [44]:
# Convert the encoded table to a NumPy array.
# `np.asarray` accepts a DataFrame as well as an ndarray, so this cell can be re-run safely;
# `data.values` raises AttributeError on a second run because `data` is then already an array.
data = np.asarray(data)

In [45]:
# Column 0 is the target; every remaining column is a feature.
ydata, xdata = data[:,0], data[:,1:]
print(xdata.shape,ydata.shape)

(8124, 22) (8124,)


In [46]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
xtrain, xval, ytrain, yval = train_test_split(
    xdata,
    ydata,
    test_size=0.2,
    random_state=2,
)

In [47]:
print(xtrain.shape,xval.shape,ytrain.shape,yval.shape)

(6499, 22) (1625, 22) (6499,) (1625,)


# Naive Bayes Classifier

In [77]:
def prior_prob(ytrain,label):
    """Return the prior probability of `label`.

    The prior is the fraction of entries in `ytrain` that are equal to `label`.

    Parameters
    ----------
    ytrain : numpy.ndarray
        1-D array of training labels.
    label : scalar
        The class whose relative frequency is wanted.

    Returns
    -------
    float
        Number of matching labels divided by the total number of labels.
    """
    total = float(ytrain.shape[0])
    matches = np.sum(ytrain == label)
    return matches / total

In [78]:
prior_prob(ytrain,0)

0.51515617787351897

In [79]:
def cond_prob(xtrain,ytrain,feature_col,feature_val,label,alpha=0.0):
    """Estimate P(feature == feature_val | class == label) from the training data.

    With the default ``alpha=0`` this is the plain relative frequency: among the
    training rows whose label is `label`, the fraction whose column `feature_col`
    equals `feature_val`.

    Parameters
    ----------
    xtrain : numpy.ndarray
        2-D array of training features (rows are samples).
    ytrain : numpy.ndarray
        1-D array of training labels, aligned with the rows of `xtrain`.
    feature_col : int
        Index of the feature column to inspect.
    feature_val : scalar
        Feature value whose conditional probability is wanted.
    label : scalar
        Class to condition on.
    alpha : float, optional
        Additive (Laplace) smoothing strength. When positive, the estimate is
        ``(count + alpha) / (n_label + alpha * n_values)``, where `n_values` is the
        number of distinct values the column takes in `xtrain`. This keeps a value
        never seen with `label` from getting probability exactly 0.

    Returns
    -------
    float
        The estimated conditional probability. Returns 0.0 (instead of the NaN
        produced by 0/0) when there is nothing to estimate from, e.g. no training
        row carries `label` and no smoothing is requested.

    Raises
    ------
    ValueError
        If `alpha` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Restrict to the rows of the requested class, then count the matching feature values.
    xfilt = xtrain[ytrain==label]
    numerator = np.sum(xfilt[:,feature_col]==feature_val)
    denominator = float(xfilt.shape[0])

    if alpha > 0:
        # One pseudo-count of size alpha for every distinct value of this feature.
        n_values = np.unique(xtrain[:,feature_col]).size
        numerator = numerator + alpha
        denominator += alpha * n_values

    # A NaN here would propagate into the posterior and be picked by argmax.
    if denominator == 0:
        return 0.0
    return numerator/denominator

In [96]:
def predictions(xtrain,ytrain,xtest):
    """Classify each row of `xtest` with a categorical Naive Bayes model.

    For every class the posterior score is ``prior * product of per-feature
    conditional probabilities``, where the prior is the relative frequency of the
    class in `ytrain` and each conditional probability is the relative frequency
    of the test row's feature value among the training rows of that class. The
    class with the highest score wins; ties go to the first class in sorted order.

    Scores are accumulated as sums of logarithms, so that a product over many
    features cannot underflow to 0 and hide the difference between classes.

    Parameters
    ----------
    xtrain : array-like
        2-D training features (rows are samples, columns are categorical features).
    ytrain : array-like
        1-D training labels, aligned with the rows of `xtrain`.
    xtest : iterable of rows
        Rows to classify; each row must have one value per column of `xtrain`.

    Returns
    -------
    list
        One predicted label per row of `xtest`, taken from the distinct values of
        `ytrain` (labels keep their original type, e.g. integers or strings).
    """
    xtrain = np.asarray(xtrain)
    ytrain = np.asarray(ytrain)

    classes = np.unique(ytrain)
    n_train = float(ytrain.shape[0])

    # Work that does not depend on the test row is done once per class, not once
    # per (test row, class, feature): the class's training rows and its log prior.
    class_rows = [xtrain[ytrain == label] for label in classes]
    log_priors = [np.log(rows.shape[0] / n_train) for rows in class_rows]

    result = []

    for ix in xtest:

        log_posteriors = np.empty(len(classes))

        for k, rows in enumerate(class_rows):

            # Per-feature relative frequency of this row's values within the class.
            cond = (rows == ix).sum(axis=0) / float(rows.shape[0])

            # log(0) is -inf for a value never seen with this class; that is the
            # intended "impossible" score, so silence the divide warning.
            with np.errstate(divide='ignore'):
                log_posteriors[k] = log_priors[k] + np.log(cond).sum()

        # Index into `classes` directly so the label is returned unchanged,
        # rather than being coerced by stacking it next to a float score.
        result.append(classes[np.argmax(log_posteriors)])

    return result

In [97]:
# Predict a label for every validation row, using the training split as the model's data.
pred = predictions(xtrain,ytrain,xval)

In [98]:
def score(ytest,pred):
    """Return the accuracy of `pred` against the true labels `ytest`.

    Parameters
    ----------
    ytest : numpy.ndarray
        1-D array of true labels.
    pred : sequence
        Predicted labels, compared element by element with `ytest`.

    Returns
    -------
    float
        Number of positions where the two agree, divided by ``ytest.shape[0]``.
    """
    n_samples = ytest.shape[0]
    n_correct = np.sum(ytest == pred)
    return n_correct / n_samples

In [99]:
score(yval,pred)

0.99446153846153851