### Load Dataset

In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('mushrooms.csv')

In [5]:
df.shape

(8124, 23)

In [6]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


### Converting (Encoding) Categorical Data into Numerical Data
- One way is to use a dictionary that maps every categorical value 
  to a numerical value.

- Another way is to use the scikit-learn library (`LabelEncoder`).

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
le = LabelEncoder()

In [10]:
# Encode each column independently: DataFrame.apply hands the columns to the
# encoder one at a time, and fit_transform re-fits on every call.
# Note: because one encoder instance is shared, after this cell `le` only
# remembers the mapping of the last column it was fitted on.

ds = df.apply(lambda column: le.fit_transform(column))

In [19]:
# Preview the encoded frame. Only the last expression of a cell is displayed,
# so a bare `type(ds)` placed before it had no visible effect and was removed.
ds.head()


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [20]:
np.unique(ds['class'])

array([0, 1])

In [13]:
type(ds)

pandas.core.frame.DataFrame

#### Converting the data into a NumPy array

In [17]:
# Take the NumPy array that backs the encoded DataFrame and preview its first five rows.
data = ds.values
print(data[:5])



[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


### Split the Data into Train and Test Sets

In [22]:
# Positional slicing such as data[:, 1:] needs the NumPy array taken from ds.values.
# Column 0 holds the encoded class label; every remaining column is a feature.

data_y = data[:, 0]
data_x = data[:, 1:]

In [83]:
data_y

array([1, 0, 0, ..., 0, 1, 0])

In [25]:
data_y.shape

(8124,)

- Train/test split using scikit-learn

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Fix the random seed so the split, and every score computed from it, is
# reproducible after Restart Kernel -> Run All.
x_train,x_test,y_train,y_test = train_test_split(data_x,data_y,test_size=0.2,random_state=42)

In [28]:
x_train.shape,y_train.shape

((6499, 22), (6499,))

In [29]:
x_test.shape,y_test.shape

((1625, 22), (1625,))

# Building Our Classifier (Naive Bayes)

In [82]:
y_train

array([1, 0, 0, ..., 0, 1, 1])

In [45]:
y_train.shape

(6499,)

In [56]:
def prior_prob(y_train,label):
    '''Return the prior P(y == label): the fraction of entries in y_train equal to label.

    y_train must be a NumPy array (its .shape is read); label is compared
    element-wise against it.
    '''
    n_samples = float(y_train.shape[0])
    n_matching = np.sum(y_train == label)
    return n_matching / n_samples

In [69]:
x=np.array([0,0,0,0,1,1,1,0,0,1])
prior_prob(x,0)

0.6

In [74]:
# Conditional probability: P(x_i = v | y = c) = count(x_i = v and y = c) / count(y = c)


def conditional_prob(x_train,y_train,feature_col,feature_val,label):
    '''Return P(x[feature_col] == feature_val | y == label) estimated from the training set.

    The estimate is a plain relative frequency: among the rows of x_train whose
    entry in y_train equals label, the share whose column feature_col equals
    feature_val. No smoothing is applied, so a value never seen together with
    the label yields 0.
    '''
    in_class = (y_train == label)

    # Values of the requested feature, restricted to rows of the requested class.
    class_column = x_train[in_class][:, feature_col]
    n_matching = np.sum(class_column == feature_val)

    return n_matching / float(np.sum(in_class))

# Make Prediction

In [75]:
def predict(x_train,y_train,x_test):
    '''Predict the class label of one test point with categorical Naive Bayes.

    Parameters
    ----------
    x_train : 2-D array-like, one row per training sample, one column per feature.
    y_train : 1-D array-like with the class label of every training row.
    x_test  : 1-D array-like, a single test point with one value per feature.

    Returns
    -------
    The label (an element of np.unique(y_train)) with the highest posterior,
    where posterior = prior * product of per-feature conditional probabilities.
    On ties the first label in sorted order wins.

    Raises
    ------
    ValueError if the training set is empty.
    '''
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    classes = np.unique(y_train)  # distinct labels, sorted
    if classes.size == 0:
        raise ValueError("predict() needs at least one training sample")

    log_post = []  # log-posterior of every class, in the order of `classes`
    for label in classes:
        in_class = (y_train == label)

        # Filter the training rows once per class instead of once per feature.
        x_filtered = x_train[in_class]

        # Per feature: share of this class's rows that match the test value.
        cond = np.mean(x_filtered == x_test, axis=0)
        prior = np.mean(in_class)

        # Sum logs instead of multiplying probabilities so that many features
        # cannot underflow every posterior to 0. A zero probability becomes
        # -inf, which still ranks below any finite score.
        with np.errstate(divide='ignore'):
            log_post.append(np.log(prior) + np.sum(np.log(cond)))

    # Map the winning position back to the actual label; returning the bare
    # argmax index is only correct when the labels happen to be 0..k-1.
    return classes[np.argmax(log_post)]

In [76]:
output = predict(x_train,y_train,x_test[1])
print(output)
print(y_test[1])

1
1


In [79]:
def score(x_train,y_train,x_test,y_test):
    '''Return the accuracy of predict() on a test set.

    Every row of x_test is classified against the training data, and the
    result is the number of predictions equal to the matching entry of
    y_test divided by the number of entries in y_test.
    '''
    # One prediction per test row, in row order.
    predicted = np.array([predict(x_train, y_train, sample) for sample in x_test])

    n_correct = np.sum(predicted == y_test)
    return n_correct / y_test.shape[0]

In [81]:
score(x_train,y_train,x_test,y_test)

0.9975384615384615