In [42]:
import pandas as  pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the mushroom dataset from a CSV file that sits next to the notebook.
df = pd.read_csv("./mushrooms.csv")

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(8124, 23)

In [5]:
# Preview the first rows of the raw data; every column is a single-letter code.
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [6]:
# Single encoder instance, used first for a toy demo and then on the dataset.
le = LabelEncoder()

In [7]:
# Toy list of string labels used to demonstrate what the encoder does.
li = ['c','a','b','a','b','b','c','b','a']

In [8]:
# Each distinct label is replaced by an integer code (here a -> 0, b -> 1, c -> 2).
le.fit_transform(li)

array([2, 0, 1, 0, 1, 1, 2, 1, 0])

In [9]:
# Encode the whole frame: DataFrame.apply hands one column at a time to
# le.fit_transform, so every column gets its own letter -> integer mapping.
# NOTE(review): this overwrites the raw frame, and `le` is re-fitted on each
# column, so afterwards it presumably only remembers the classes of the last
# column -- confirm before relying on le.inverse_transform.
df =df.apply(le.fit_transform)

In [10]:
# Same preview after encoding: every column now holds integer codes.
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [11]:
# Convert the encoded frame to a plain NumPy array for positional slicing.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# after `df` has already been converted is a no-op; the `.values` attribute
# exists only on the DataFrame and would raise AttributeError on a re-run.
df = np.asarray(df)

In [19]:
# Column 0 is the encoded class label (`type`); all remaining columns are features.
X = df[:, 1:]
y = df[:, 0]
# y is left 1-D, shape (m,), rather than reshaped into an (m, 1) column vector.

In [20]:
# Sanity check: the feature matrix and the label vector must have the same number of rows.
X.shape, y.shape

((8124, 22), (8124,))

### Train and Test Split

In [21]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Naive Bayes Classifier
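 - posterior: P(class | features), proportional to prior × likelihood
 - prior: P(class), the share of training examples that belong to the class
 - likelihood: P(features | class)
  - product of the per-feature conditional probabilities (the "naive" independence assumption)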


In [58]:
def prior_probability(y_train, label):
    """
    Prior P(label): the fraction of training labels equal to `label`.

    y_train : array of training labels.
    label   : the class value whose relative frequency is wanted.

    Returns a plain Python float.
    """
    n_total = y_train.shape[0]
    n_matching = (y_train == label).sum()
    return float(n_matching / n_total)

In [59]:
# Prior of class 1 in the training split.
prior_probability(y_train, 1)

0.48222803508232037

In [66]:
def cond_probability(x_train, y_train, feature_col, feature_val, label, alpha=0.0):
    """
    Conditional probability P(feature == feature_val | class == label).

    Parameters
    ----------
    x_train : 2-D array of categorical training features, one row per example.
    y_train : 1-D array of training labels, aligned with the rows of x_train.
    feature_col : index of the feature column to look at.
    feature_val : the value of that feature whose probability is wanted.
    label : the class to condition on.
    alpha : additive (Laplace) smoothing strength. The default 0.0 gives the
        raw relative frequency. With alpha > 0 the estimate becomes
        (count + alpha) / (class_size + alpha * K), where K is the number of
        distinct values the feature takes in x_train, so a value never seen
        together with `label` no longer gets probability exactly zero.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If alpha is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Build the class mask once and reuse it for both the count and the class size.
    in_class = y_train == label
    column = x_train[:, feature_col]

    num = np.sum(column[in_class] == feature_val)
    denom = np.sum(in_class)

    if alpha:
        # Smooth over every category observed for this feature in the training data.
        n_categories = np.unique(column).shape[0]
        num = num + alpha
        denom = denom + alpha * n_categories

    return float(num / denom)

In [67]:
def predict(x_train, y_tain, x_test):
    """
    Predict the class of a single example with categorical Naive Bayes.

    x_test is just one example.

    Parameters
    ----------
    x_train : 2-D array of training features, one row per example.
    y_tain : 1-D array of training labels aligned with x_train. The
        misspelled name is kept so the signature stays unchanged for callers.
    x_test : 1-D array with the feature values of the example to classify.

    Returns
    -------
    The class label (an element of np.unique(y_tain)) whose unnormalised
    posterior, prior * product of per-feature conditionals, is largest.
    """
    # Always work from the labels that were passed in, never from a
    # notebook-level variable with a similar name.
    labels = np.asarray(y_tain)

    classes = np.unique(labels)
    n_features = x_train.shape[1]
    posterior_probab = []

    # computing post probab for all the classes
    for label in classes:

        # likelihood = product of the per-feature conditional probabilities
        likelihood = 1.0
        for fea in range(n_features):
            likelihood *= cond_probability(x_train, labels, fea, x_test[fea], label)

        prior = prior_probability(labels, label)
        posterior_probab.append(likelihood * prior)

    # argmax gives a position inside `classes`; map it back to the actual label
    # so the result is also correct when labels are not exactly 0..k-1.
    return classes[np.argmax(posterior_probab)]

In [74]:
# Classify a single held-out example (row 3 of the test split).
predict(X_train, y_train, X_test[3])

0

### Accuracy Measure

In [81]:
def accuracy(x_train, y_train, x_test, y_test):
    """
    Fraction of test examples that predict() classifies correctly.

    x_test : m examples, one per row
    y_test : the m true labels, in the same order
    """
    n_examples = x_test.shape[0]

    # Classify every test row independently against the same training data.
    y_pred = np.array(
        [predict(x_train, y_train, x_test[idx]) for idx in range(n_examples)]
    )

    n_correct = np.sum(y_pred == y_test)
    return n_correct / y_pred.shape[0]

In [83]:
# Accuracy of the hand-written classifier on the held-out split.
acc =  accuracy(X_train, y_train, X_test, y_test)
acc

[0 1 1 ... 1 1 1] [0 1 1 ... 1 1 1]


0.9963076923076923

## Sklearn Naive Bayes

In [92]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB, CategoricalNB

In [93]:
# One instance of each scikit-learn Naive Bayes variant, all with default settings.
mnb = MultinomialNB()
bnb = BernoulliNB()
gnb = GaussianNB()
cnb = CategoricalNB()

In [96]:
# Fit all four models on the same training split.
mnb.fit(X_train, y_train)
bnb.fit(X_train, y_train)
gnb.fit(X_train, y_train)
cnb.fit(X_train, y_train)

CategoricalNB()

In [89]:
# Accuracy of MultinomialNB on the held-out split.
mnb.score(X_test, y_test)

0.8073846153846154

In [90]:
# Accuracy of BernoulliNB on the held-out split.
bnb.score(X_test, y_test)

0.8418461538461538

In [91]:
# Accuracy of GaussianNB on the held-out split.
gnb.score(X_test, y_test)

0.9218461538461539

In [97]:
# Accuracy of CategoricalNB on the held-out split.
cnb.score(X_test, y_test)

0.9507692307692308