# Naive Bayes - Mushroom Classification
The goal is to predict the class of a mushroom (the `type` column, encoded as `e` or `p`) from its other categorical features. We will use a Naive Bayes model for this classification.

## Load Dataset

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [10]:
# Load the mushroom dataset (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv("../Datasets/Mushrooms/mushrooms.csv")
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


## Encode the Categorical Data as Numerical Labels

In [13]:
le = LabelEncoder()
# DataFrame.apply calls le.fit_transform once per column, so every column is
# encoded independently into integer codes. The same encoder object is refit
# on each call, so afterwards `le` only holds the mapping of the last column.
ds = df.apply(le.fit_transform)
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [15]:
# Work on the raw ndarray: column 0 is the target, the remaining columns are
# the features.
data = ds.values
dataX, dataY = data[:, 1:], data[:, 0]

# One shape per line: full table, feature matrix, target vector.
print(data.shape, dataX.shape, dataY.shape, sep="\n")

(8124, 23)
(8124, 22)
(8124,)


## Split the Data into Training and Test Sets

In [43]:
# Hold out 10% of the rows for testing. A fixed random_state makes the split
# (and therefore the predictions and accuracy derived from it) reproducible
# from run to run.
xTrain, xTest, yTrain, yTest = train_test_split(
    dataX, dataY, test_size=0.1, random_state=42
)
print(xTrain.shape,yTrain.shape)
print(xTest.shape,yTest.shape)

(7311, 22) (7311,)
(813, 22) (813,)


In [21]:
# List the distinct encoded class labels present in the target vector.
np.unique(dataY)

array([0, 1])

## Building Classifier

In [23]:
def prior_prob(y_train, label):
    """Return the prior probability P(y = label) estimated from y_train.

    The prior is the fraction of training examples whose label equals
    ``label``.

    Args:
        y_train: 1-D array-like of training labels.
        label: The class label whose prior is wanted.

    Returns:
        The relative frequency of ``label`` in ``y_train`` (0.0 when the
        label never occurs).

    Raises:
        ZeroDivisionError: If ``y_train`` is empty.
    """
    # Accept plain sequences as well as ndarrays.
    y_train = np.asarray(y_train)
    total_examples = y_train.shape[0]
    if total_examples == 0:
        raise ZeroDivisionError("prior_prob: y_train contains no examples")
    # np.sum counts in native code; the builtin sum would iterate the array
    # one element at a time in Python.
    class_examples = np.sum(y_train == label)
    return class_examples / total_examples

In [34]:
def cond_prob(x_train, y_train, feature_col, feature_val, label, alpha=0.0):
    """Return P(x[feature_col] = feature_val | y = label) from training data.

    The estimate is the fraction of training rows of class ``label`` whose
    feature ``feature_col`` equals ``feature_val``.

    Args:
        x_train: 2-D array-like of training features (rows are examples).
        y_train: 1-D array-like of training labels, one per row of x_train.
        feature_col: Column index of the feature being tested.
        feature_val: Value of that feature in the query point.
        label: Class label to condition on.
        alpha: Additive (Laplace) smoothing strength, expected to be >= 0.
            The default 0.0 gives the plain relative frequency. A positive
            value keeps a feature value that was never seen with ``label``
            from producing a probability of exactly zero.

    Returns:
        The (optionally smoothed) conditional probability.

    Raises:
        ZeroDivisionError: If no training row has class ``label`` and no
            smoothing mass is available to make the denominator non-zero.
    """
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    # Take the single column first, then keep only the rows of this class;
    # this avoids copying every feature of the class just to read one column.
    feature_values = x_train[:, feature_col]
    class_values = feature_values[y_train == label]

    numerator = np.sum(class_values == feature_val)
    denominator = class_values.shape[0]

    if alpha:
        # Additive smoothing: add alpha pseudo-counts for each distinct value
        # this feature takes anywhere in the training data.
        n_categories = np.unique(feature_values).size
        numerator = numerator + alpha
        denominator = denominator + alpha * n_categories

    if denominator == 0:
        raise ZeroDivisionError(
            "cond_prob: no training examples available for the given label"
        )
    return numerator / denominator

## Compute the Posterior Probability for Each Test Example and Make a Prediction

In [35]:
def predict(x_train, y_train, xTest):
    """Predict the class label of a single test point with Naive Bayes.

    For every class present in ``y_train`` the unnormalised posterior
    ``prior * product of per-feature conditional probabilities`` is computed,
    and the class with the largest value is returned.

    Args:
        x_train: 2-D array of training features (rows are examples).
        y_train: 1-D array of training labels, one per row of x_train.
        xTest: A single testing point with n features.

    Returns:
        The label (an element of ``np.unique(y_train)``) with the highest
        posterior score.
    """
    classes = np.unique(y_train)
    n_features = x_train.shape[1]
    post_probs = []  # One unnormalised posterior per entry of `classes`.

    for label in classes:
        # Naive Bayes assumption: features are conditionally independent
        # given the class, so the likelihood is a product over features.
        likelihood = 1.0
        for i in range(n_features):
            likelihood *= cond_prob(x_train, y_train, i, xTest[i], label)
        prior = prior_prob(y_train, label)
        post_probs.append(likelihood * prior)

    # np.argmax gives a position in `classes`, not a label. Map it back so
    # the result is also correct when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_probs)]
    

In [44]:
# Sanity check on one held-out example: the first printed value is the
# predicted label, the second is the true label.
output = predict(xTrain, yTrain, xTest[2])
print(output)
print(yTest[2])

0
0


In [45]:
def score(x_train, y_train, x_test, y_test):
    """Return the accuracy of the classifier on a test set.

    Each row of ``x_test`` is classified with ``predict`` using the training
    data, and the fraction of predictions equal to ``y_test`` is returned.
    """
    predictions = [
        predict(x_train, y_train, x_test[row])
        for row in range(x_test.shape[0])
    ]
    # Element-wise comparison against the true labels, then count the hits.
    n_correct = sum(predictions == y_test)
    return n_correct / len(x_test)

In [46]:
# Accuracy of the classifier on the held-out test set.
score(xTrain,yTrain,xTest,yTest)

0.99753997539975403