### Goal: Predict the class (`type`) of a mushroom from its categorical features, using a Naive Bayes classifier

#### Load the Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw dataset; the relative path means the CSV must sit in the working directory.
df = pd.read_csv('mushrooms.csv')
# Preview the first rows to check that the columns parsed as expected.
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(8124, 23)

In [4]:
# Column names; `type` is the target that is split off from the features later on.
df.columns

Index(['type', 'cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_type', 'veil_color', 'ring_number',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')

In [5]:
# Count missing values per column.
df.isnull().sum()

type                        0
cap_shape                   0
cap_surface                 0
cap_color                   0
bruises                     0
odor                        0
gill_attachment             0
gill_spacing                0
gill_size                   0
gill_color                  0
stalk_shape                 0
stalk_root                  0
stalk_surface_above_ring    0
stalk_surface_below_ring    0
stalk_color_above_ring      0
stalk_color_below_ring      0
veil_type                   0
veil_color                  0
ring_number                 0
ring_type                   0
spore_print_color           0
population                  0
habitat                     0
dtype: int64

In [6]:
# Number of samples per target class, to check the class balance.
df['type'].value_counts()

e    4208
p    3916
Name: type, dtype: int64

#### Encode the Categorical Data into Numerical Data

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# Encoder that maps each distinct category value to an integer code.
le = LabelEncoder()

In [9]:
# Encode every column to integer codes, fitting (and keeping) one LabelEncoder
# per column. Re-fitting the single shared `le` on each column in turn would
# leave it holding only the last column's classes, so no other column could be
# decoded afterwards. With `encoders`, any column can be mapped back with
# encoders[column_name].inverse_transform(codes).

encoders = {col: LabelEncoder().fit(df[col]) for col in df.columns}
ds = df.apply(lambda column: encoders[column.name].transform(column))

In [10]:
# Preview the integer-encoded frame.
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


#### Convert to a NumPy array and split the dataset into train and test sets

In [11]:
# Pull the encoded values out of the DataFrame as a plain NumPy array (one row per sample).
data = ds.values
data

array([[1, 5, 2, ..., 2, 3, 5],
       [0, 5, 2, ..., 3, 2, 1],
       [0, 0, 2, ..., 3, 2, 3],
       ...,
       [0, 2, 2, ..., 0, 1, 2],
       [1, 3, 3, ..., 7, 4, 2],
       [0, 5, 2, ..., 4, 1, 2]])

In [12]:
# Column 0 is the encoded target (`type`); every remaining column is a feature.
y_data = data[:, 0]
x_data = data[:, 1:]

print(x_data.shape, y_data.shape)

(8124, 22) (8124,)


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=0,
)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(6499, 22) (1625, 22) (6499,) (1625,)


In [15]:
# Distinct class labels present in the training split.
np.unique(y_train)

array([0, 1])

In [16]:
# Peek at the encoded test features (NumPy elides the middle of large arrays when displaying).
x_test

array([[5, 3, 4, ..., 2, 3, 5],
       [2, 3, 3, ..., 2, 5, 0],
       [5, 3, 9, ..., 3, 2, 3],
       ...,
       [2, 3, 2, ..., 7, 4, 4],
       [0, 2, 8, ..., 2, 2, 3],
       [5, 3, 4, ..., 3, 4, 0]])

#### Building the Naive Bayes Classifier from scratch

In [17]:
def prior_prob(y_train , label):
    """Return the prior P(y = label).

    This is the fraction of entries in `y_train` that equal `label`.
    `y_train` must be a NumPy array (its `.shape` is used for the total count).
    """
    n_matching = np.sum(y_train == label)   # rows belonging to the requested class
    n_total = y_train.shape[0]              # all training rows

    return n_matching / float(n_total)

In [18]:
# Sanity check: prior of class 1 in the training split.
prior_prob(y_train , 1)

0.48361286351746424

In [19]:
def cond_prob(x_train , y_train , feature_col, feature_val , label, alpha=0.0):
    """Estimate the conditional probability P(x[feature_col] == feature_val | y == label).

    Parameters
    ----------
    x_train : 2-D NumPy array of training features (rows are samples).
    y_train : 1-D NumPy array of training labels, aligned with the rows of `x_train`.
    feature_col : index of the feature column to inspect.
    feature_val : feature value whose probability is requested.
    label : class label to condition on.
    alpha : additive (Laplace) smoothing strength. The default 0.0 gives the plain
        relative frequency. With alpha > 0 the estimate is
        (count + alpha) / (class_count + alpha * n_values), where n_values is the
        number of distinct values of the feature in `x_train`, so values never seen
        with a class no longer get probability exactly 0.

    Returns
    -------
    The estimated probability as a float. If `label` has no training rows and no
    smoothing is applied, 0.0 is returned instead of dividing by zero.

    Raises
    ------
    ValueError : if `alpha` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Values of the requested feature, restricted to rows of the requested class.
    class_column = x_train[y_train == label][:, feature_col]

    match_count = np.sum(class_column == feature_val)
    class_count = class_column.shape[0]

    # Only pay for np.unique when smoothing is actually requested.
    n_values = np.unique(x_train[:, feature_col]).shape[0] if alpha > 0 else 0

    denominator = class_count + alpha * n_values
    if denominator == 0:
        # Class absent from the training data: avoid 0/0 (nan + RuntimeWarning).
        return 0.0

    return (match_count + alpha) / float(denominator)

In [20]:
# Encoded feature vector of the first test sample.
print(x_test[0])

[5 3 4 1 6 1 0 1 7 0 3 2 2 7 7 0 2 1 4 2 3 5]


#### Compute Posterior Probability for each test sample and make predictions

In [21]:
# given a new mushroom with n features , predict its class

def predict(x_train , y_train , x_test):
    """Predict the class of a single sample with Naive Bayes.

    Parameters
    ----------
    x_train : 2-D NumPy array of training features (rows are samples).
    y_train : 1-D NumPy array of training labels, aligned with the rows of `x_train`.
    x_test : 1-D sequence holding one sample's feature values, indexed by feature column.

    Returns
    -------
    (pred, posterior_probs)
        pred is the predicted class label, taken from the labels found in `y_train`.
        posterior_probs is a list with one unnormalised posterior
        (likelihood * prior) per class, in the order of np.unique(y_train).
    """
    classes = np.unique(y_train)     # sorted distinct labels, e.g. 0 and 1
    n_features = x_train.shape[1]    # number of feature columns

    posterior_probs = []             # one unnormalised posterior per class

    for label in classes:

        # Naive independence assumption: the likelihood is the product of the
        # per-feature conditional probabilities.
        likelihood = 1.0
        for f in range(n_features):
            likelihood *= cond_prob(x_train , y_train , f , x_test[f] , label)

        prior = prior_prob(y_train , label)
        posterior_probs.append(likelihood*prior)

    # np.argmax yields a position in `classes`, not a label; map it back so the
    # result is also correct when the labels are not exactly 0..k-1.
    # On ties (e.g. every posterior is 0) argmax picks the first class.
    pred = classes[np.argmax(posterior_probs)]
    return pred , posterior_probs

In [22]:
# Single-sample check: run predict() on the first test row and print the
# returned (prediction, per-class posterior list) next to the true label.
output = predict(x_train , y_train , x_test[0])
print(output)
print(y_test[0])

(1, [0.0, 1.8344429953101966e-12])
1


#### Checking Accuracy

In [23]:
def model_score(x_train , y_train , x_test , y_test):
    """Return the accuracy on the test set.

    Every row of `x_test` is classified with predict(); the result is the
    fraction of rows whose prediction equals the matching entry of `y_test`.
    """
    # predict() returns (prediction, posteriors); only the prediction is needed.
    predicted = np.array([
        predict(x_train , y_train , x_test[row])[0]
        for row in range(x_test.shape[0])
    ])

    n_correct = np.sum(predicted == y_test)    # element-wise comparison of two vectors
    return n_correct / y_test.shape[0]

In [25]:
# Fraction of test samples classified correctly, printed as a percentage

accuracy = model_score(x_train , y_train , x_test , y_test)
print(accuracy*100)

99.93846153846154
