# Mushroom Classification
**The goal is to predict the class of a mushroom, given some of its features. We will use a Naive Bayes model for this classification.**

## Importing Required Libraries

In [0]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Data Preparation

#### Loading DataSet

In [2]:
# Read the mushroom dataset from a CSV file located in the working directory.
df=pd.read_csv('mushrooms.csv')
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


#### Converting categorical features into numerical values

In [3]:
# df.apply calls le.fit_transform once per column, so every column gets its own
# integer codes. Because the same encoder object is re-fit each time, `le` only
# retains the classes of the last column afterwards and cannot be used to decode
# the earlier columns.
le=LabelEncoder()
ds=df.apply(le.fit_transform)
# Preview the encoded frame.
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,0,3,2,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,0,2,2,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,0,2,2,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,0,3,2,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,1,3,2,2,7,7,0,2,1,0,3,0,1


The first column (`type`) holds the y values (the class labels); the remaining columns are the features.

In [0]:
# Work on the raw NumPy matrix: column 0 is the target, the rest are features.
data = ds.to_numpy()
data_y, data_x = data[:, 0], data[:, 1:]

#### Breaking Data into train and test

In [5]:
# Hold out 20% of the rows for testing. The split is seeded so that a fresh
# Restart & Run All reproduces the same partition (and therefore the same
# accuracy) instead of a different random split on every run.
RANDOM_STATE=42
xtrain,xtest,ytrain,ytest=train_test_split(data_x,data_y,test_size=0.2,random_state=RANDOM_STATE)
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)

(6499, 22)
(6499,)
(1625, 22)
(1625,)


## Naive Bayes Classifier

In [0]:
def prior_prob(y_train,label):
    """Return the prior probability P(y = label).

    Parameters
    ----------
    y_train : np.ndarray
        Array of training labels.
    label :
        The class whose relative frequency is wanted.

    Returns
    -------
    float
        Number of entries equal to `label` divided by the number of entries.
    """
    n_matching = np.sum(y_train == label)
    n_total = float(y_train.shape[0])
    return n_matching / n_total
def cond_prob(x_train,y_train,feature_col,feature_val,label):
    """Return the conditional probability P(x[feature_col] = feature_val | y = label).

    Parameters
    ----------
    x_train : np.ndarray
        Training feature matrix, indexed as [row, column].
    y_train : np.ndarray
        Training labels, one per row of `x_train`.
    feature_col : int
        Index of the feature column to inspect.
    feature_val :
        Feature value whose frequency within the class is wanted.
    label :
        Class to condition on.

    Returns
    -------
    float
        Among the rows labelled `label`, the fraction whose `feature_col`
        entry equals `feature_val`.
    """
    in_class = (y_train == label)
    class_column = x_train[in_class][:, feature_col]
    n_matching = np.sum(class_column == feature_val)
    return n_matching / float(np.sum(in_class))

In [0]:
#Prediction
def predict(x,y,xtest):
  """Predict the class label of one sample with a categorical Naive Bayes rule.

  For every class present in `y` the unnormalised posterior
  P(label) * prod_i P(xtest[i] | label) is estimated from frequency counts
  in the training data, and the class with the largest value is returned.

  Parameters
  ----------
  x : np.ndarray
      Training feature matrix, indexed as [row, feature].
  y : np.ndarray
      Training labels, one per row of `x`.
  xtest : sequence
      Feature values of the single sample to classify, one per column of `x`.

  Returns
  -------
  The predicted class label, i.e. an element of `np.unique(y)`. Ties go to
  the smallest label because `np.argmax` returns the first maximum.
  """
  classes=np.unique(y)
  nofeatures=x.shape[1]
  total_examples=float(y.shape[0])
  prob=[]
  for label in classes:
    # Build the class mask and the filtered rows once per class instead of
    # once per feature; they do not depend on the feature index.
    in_class=(y==label)
    x_label=x[in_class]
    class_examples=float(np.sum(in_class))
    likelihood=1.0
    for i in range(nofeatures):
      # Fraction of this class's rows that share the sample's value for feature i.
      likelihood*=np.sum(x_label[:,i]==xtest[i])/class_examples
    priorprob=class_examples/total_examples
    prob.append(likelihood*priorprob)
  # np.argmax yields a position inside `classes`; map it back to the actual
  # label so the result is also correct when labels are not 0..k-1.
  return classes[np.argmax(prob)]

In [0]:
def score(xtrain,ytrain,xtest,ytest):
  """Return the accuracy of `predict` on a test set.

  Each row of `xtest` is classified with `predict(xtrain, ytrain, row)` and
  the result is compared with the matching entry of `ytest`.

  Returns
  -------
  The number of matching predictions divided by `ytest.shape[0]`.
  """
  n_samples=xtest.shape[0]
  predicted=np.array([predict(xtrain,ytrain,xtest[row]) for row in range(n_samples)])
  n_correct=np.sum(predicted==ytest)
  return n_correct/ytest.shape[0]

In [9]:
# Report the fraction of held-out test rows that the classifier labels correctly.
print(score(xtrain,ytrain,xtest,ytest))

0.9981538461538462


In [0]:
# Classify every test row, then write the predicted labels to a CSV file.
pred=np.array([predict(xtrain,ytrain,sample) for sample in xtest])
pd.DataFrame(pred).to_csv('predictions.csv')