#### Mushroom classification using Naive Bayes
- Data import
- Data preprocessing
- Data visualization (not in this case)
- Model
- Accuracy

In [1]:
### Data import
import pandas as pd

# Read the mushroom dataset from the working directory and preview ten rows.
data = pd.read_csv("mushrooms.csv")
data.head(10)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [2]:
## Data preprocessing
# Replace the values of every column with integer codes, one column at a time.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# `apply` calls `le.fit_transform` once per column, so this single `le`
# instance is refit for each column in turn.
# NOTE(review): presumably `le` ends up holding only the mapping of the last
# column processed - confirm before relying on `le.inverse_transform`.
data = data.apply(le.fit_transform)
data.head(n=10)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
5,0,5,3,9,1,0,1,0,0,5,...,2,7,7,0,2,1,4,2,2,1
6,0,0,2,8,1,0,1,0,0,2,...,2,7,7,0,2,1,4,2,2,3
7,0,0,3,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,3,3
8,1,5,3,8,1,6,1,0,1,7,...,2,7,7,0,2,1,4,2,4,1
9,0,0,2,9,1,0,1,0,0,2,...,2,7,7,0,2,1,4,2,3,3


In [3]:
# Split by position: column 0 is the target, the remaining columns are features.
# `data` is deliberately NOT rebound to a NumPy array here; the previous
# `data = data.values` made this cell fail with AttributeError when re-run,
# because an ndarray has no `.values` attribute.
X = data.iloc[:,1:].values
Y = data.iloc[:,0].values

print("The shape of X is {}".format(X.shape))
print("The shape of Y is {}".format(Y.shape))

The shape of X is (8124, 22)
The shape of Y is (8124,)


In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing. The seed is fixed so that the split
# (and therefore the reported accuracy) is reproducible across kernel restarts.
X_Train,X_Test,Y_Train,Y_Test = train_test_split(X,Y,test_size=0.2,random_state=42)

print("The shape of X_Train is {}".format(X_Train.shape))
print("The shape of X_Test is {}".format(X_Test.shape))
print("The shape of Y_Train is {}".format(Y_Train.shape))
# This line previously labelled the Y_Test shape as "X_Test".
print("The shape of Y_Test is {}".format(Y_Test.shape))

The shape of X_Train is (6499, 22)
The shape of X_Test is (1625, 22)
The shape of Y_Train is (6499,)
The shape of X_Test is (1625,)


In [10]:
## Distinct class labels present in the training targets
import numpy as np

unique_classes = np.unique(Y_Train)
print("The classes in classification is {}".format(unique_classes))

The classes in classification is [0 1]


In [21]:
### Model

def priorProb(y,label):
    """Estimate the prior probability of ``label``.

    The estimate is the fraction of entries in ``y`` that are equal to
    ``label`` (number of matches divided by ``y.shape[0]``).
    """
    is_label = (y == label)
    return is_label.sum() / y.shape[0]

def conditional_prob(x_train,y_train,feature_col,feature_val,label,alpha=0.0):
    """Estimate P(feature == feature_val | class == label) from training data.

    Parameters
    ----------
    x_train : 2-D array of training features (rows are samples).
    y_train : 1-D array of training labels, aligned with the rows of ``x_train``.
    feature_col : index of the feature column to inspect.
    feature_val : value of that feature whose probability is estimated.
    label : class label to condition on.
    alpha : optional additive (Laplace) smoothing strength. The default ``0.0``
        gives the plain relative frequency, exactly as before. With
        ``alpha > 0`` the estimate is
        ``(count + alpha) / (n_label + alpha * n_categories)``, where
        ``n_categories`` is the number of distinct values seen in the column.

    Returns
    -------
    The estimated conditional probability. If ``label`` has no training samples
    and no smoothing is requested, ``0.0`` is returned instead of the NaN
    (plus RuntimeWarning) that a 0/0 division would produce.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Build the class mask once and reuse it for both the count and the filter.
    label_mask = (y_train == label)
    feature_values = x_train[:, feature_col]
    n_label = np.sum(label_mask)
    n_match = np.sum(feature_values[label_mask] == feature_val)

    if alpha:
        n_categories = np.unique(feature_values).size
        numerator = n_match + alpha
        denominator = n_label + alpha * n_categories
    else:
        numerator = n_match
        denominator = n_label

    # Guard against 0/0 when the class (or the whole training set) is empty.
    if denominator == 0:
        return 0.0
    return numerator/float(denominator)

def predict(x_train,y_train,x_test):
    """Predict the class label of a single sample with Naive Bayes.

    For every class found in ``y_train`` the unnormalised posterior is
    computed as the class prior times the product of the per-feature
    conditional probabilities of the values in ``x_test``.

    Parameters
    ----------
    x_train : 2-D array of training features (rows are samples).
    y_train : 1-D array of training labels, aligned with the rows of ``x_train``.
    x_test : 1-D sequence holding one value per feature column of ``x_train``.

    Returns
    -------
    The class label (an element of ``np.unique(y_train)``) with the largest
    posterior. Ties resolve to the first such class in sorted order.

    Raises
    ------
    ValueError
        If ``y_train`` is empty, so there is no class to choose from.
    """
    classes = np.unique(y_train)
    if classes.size == 0:
        raise ValueError("cannot predict from an empty training set")

    n_features = x_train.shape[1]
    post_prob = []

    for label in classes:
        likelihood = 1.0
        for f in range(n_features):
            likelihood *= conditional_prob(x_train,y_train,f,x_test[f],label)
        post_prob.append(likelihood*priorProb(y_train,label))

    # np.argmax yields a position in `classes`, not a label; map it back so the
    # result is correct even when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_prob)]

In [26]:
### Accuracy
# Classify each held-out sample, then report the fraction predicted correctly.
y_pred = [predict(X_Train,Y_Train,sample) for sample in X_Test]

accuracy = (y_pred==Y_Test).sum()/Y_Test.shape[0]
print("The accuracy is {}".format(accuracy))

The accuracy is 0.9975384615384615
