# Naive Bayes - Mushroom Dataset
The goal is to predict the class of a mushroom (the `type` column) from its other features. We will use a Naive Bayes model for this classification.

## Import Libraries

In [142]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Load the dataset

In [143]:
# Load the mushroom table from a CSV path relative to the working directory.
df = pd.read_csv('dataset/mushrooms.csv')
df.head()  # preview the first rows of the raw (still categorical) data

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [144]:
df.shape  # (number of rows, number of columns) of the loaded frame

(8124, 23)

## Encode the categorical data into numerical data

In [145]:
# NOTE(review): the same encoder instance is refit on every column in turn, so after
# this cell `le` presumably remembers only the last column's mapping - confirm before
# relying on le.inverse_transform.
le = LabelEncoder()
ds = df.apply(le.fit_transform) # Applies the transformation to each column (converts categorical data into numerical data)

In [146]:
# Split the encoded table into a feature matrix X and a target vector Y.
# (The former leading `ds.head()` was dropped: it was not the last expression
# of the cell, so its result was never displayed.)
data = ds.values
X = data[:,1:]   # every column after the first is a feature
Y = data[:,0]    # the first column holds the class label (`type`)
print(data.shape)
print(X.shape)
print(Y.shape)
print(data[:5,:])   # first five encoded rows, label included

(8124, 23)
(8124, 22)
(8124,)
[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


## Split the dataset into training and test sets

In [147]:
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# every number computed from it, reproducible from run to run.
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2,random_state = 42)

In [148]:
# Check the split sizes: features and labels must have the same number of rows.
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [149]:
# Distinct encoded labels and how many examples carry each one (class balance).
np.unique(Y,return_counts=True)

(array([0, 1], dtype=int64), array([4208, 3916], dtype=int64))

## Building our classifier

In [150]:
def prior_prob(y_train,label):
    '''Return the prior P(y = label) as a relative frequency.

    y_train : 1-D array of class labels.
    label   : the class whose share of y_train is wanted.

    The result is (number of entries equal to label) / (number of entries).
    '''
    n_matching = (y_train == label).sum()
    n_total = y_train.shape[0]
    return n_matching/n_total


In [151]:
# Toy label vector (seven 1s, three 0s) so the priors can be verified by hand.
y = np.array([1]*7 + [0]*3)
print(y.shape)
print(prior_prob(y,0),prior_prob(y,1),sep='\n')

(10,)
0.3
0.7


In [152]:
# Earlier debugging draft of cond_prob (with print statements), disabled by
# wrapping it in a string literal. It is never executed; the live definition follows.
'''def cond_prob(x_train,y_train,feature_col,feature_val,label):
    print(x_train.shape,y_train.shape,feature_col,feature_val,label)
    y = np.sum(y_train == label)
    x = x_train[y_train == label]
    x = x[:,feature_col]
    print(x)
    x = np.sum(x == feature_val)
    print(x)
    print(y)
    print(x,y)
    return x/y
    '''
def cond_prob(x_train,y_train,feature_col,feature_val,label):
    '''Estimate P(X[feature_col] = feature_val | y = label) from counts.

    x_train     : 2-D array, one row per example and one column per feature.
    y_train     : 1-D array of class labels aligned with the rows of x_train.
    feature_col : index of the feature column to inspect.
    feature_val : the value that feature must take.
    label       : the class being conditioned on.

    Returns (rows of class `label` whose feature equals feature_val)
    divided by (rows of class `label`).
    '''
    in_class = (y_train == label)                # one boolean per training row
    class_column = x_train[in_class,feature_col] # that feature, restricted to the class
    hits = np.sum(class_column == feature_val)
    class_size = np.sum(in_class)

    return hits/float(class_size)

In [153]:
# Disabled sanity check for cond_prob, kept as an inert string literal:
# the cell only echoes the string, it does not run the code inside it.
'''x = np.array([1,1,0,1,0])
x = x.reshape((-1,1))
y = np.array([1,0,1,1,0])
cond_prob(x,y,0,1,1)
'''

'x = np.array([1,1,0,1,0])\nx = x.reshape((-1,1))\ny = np.array([1,0,1,1,0])\ncond_prob(x,y,0,1,1)\n'

## Next step: compute the posterior probability for each test example and make predictions

In [154]:
def predict(x_train,y_train,xtest):
    '''Predict the class label of a single test point with Naive Bayes.

    For every class present in y_train the unnormalised posterior
        P(y = label) * prod_f P(X_f = xtest[f] | y = label)
    is estimated from frequency counts in the training data, and the label
    with the largest value is returned. No smoothing is applied, so a feature
    value never seen together with a class gives that class a posterior of 0.

    x_train : 2-D array, one row per example and one column per feature.
    y_train : 1-D array of class labels aligned with the rows of x_train.
    xtest   : a single testing point, one value per feature.

    Returns the winning class label itself (an element of np.unique(y_train)),
    not its position in the list of classes.

    Raises ValueError if y_train is empty.
    '''

    classes = np.unique(y_train)
    if classes.size == 0:
        raise ValueError("y_train must contain at least one example")

    n_features = x_train.shape[1]
    total_examples = y_train.shape[0]
    post_probs = [] # posterior (up to a constant) of every class for this testing point

    for label in classes:
        # Select the rows of this class once per class, not once per feature.
        x_filtered = x_train[y_train == label]
        class_examples = x_filtered.shape[0]

        # likelihood = prod_f P(X_f = xtest[f] | y = label)
        likelihood = 1.0
        for f in range(n_features):
            matches = np.sum(x_filtered[:,f] == xtest[f])
            likelihood *= matches/float(class_examples)

        prior = class_examples/total_examples
        post_probs.append(likelihood*prior)

    # argmax gives a position in `classes`; map it back to the actual label so
    # the result is also right when labels are not exactly 0..k-1.
    return classes[np.argmax(post_probs)]

In [155]:
# Sanity check: classify one held-out example and compare with its true label.
output = predict(x_train,y_train,x_test[1])
print(output)
print(y_test[1])

1
1


## Accuracy

In [156]:
def score(x_train,y_train,x_test,y_test):
    '''Return the accuracy of the classifier on a test set.

    Every row of x_test is classified independently with predict(), using
    (x_train, y_train) as training data. The result is the fraction of
    rows whose predicted label equals the corresponding entry of y_test.
    '''
    predicted = np.array([predict(x_train,y_train,row) for row in x_test])
    n_correct = np.sum(predicted == y_test)

    return n_correct/y_test.shape[0]

In [157]:
# Fraction of held-out examples that are classified correctly.
print(score(x_train,y_train,x_test,y_test))

0.9987692307692307
