In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the mushroom dataset from a CSV file located in the working directory.
df = pd.read_csv("mushrooms.csv")

In [4]:
df.shape

(8124, 23)

In [8]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


### Encode Categorical data into numbers

In [18]:
le = LabelEncoder()

In [20]:
# Integer-encode every column. The single shared encoder is refit on each
# column in turn, so afterwards it only remembers the mapping of the last one.
df = df.apply(lambda column: le.fit_transform(column))

In [21]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [22]:
data = df.values

In [26]:
data[:, 0]

array([1, 0, 0, ..., 0, 1, 0])

In [29]:
# First column is the target (shown as `type` in the frame above);
# every remaining column is used as a feature.
y = data[:, 0]
X = data[:, 1:]

In [30]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Naive Bayes Classifier (from scratch)

In [34]:
def prior_prob(y_train, label):
    """Return the empirical prior P(y = label).

    Parameters
    ----------
    y_train : array-like
        One-dimensional collection of class labels.
    label : scalar
        The class value whose relative frequency is wanted.

    Returns
    -------
    float
        Fraction of entries in ``y_train`` equal to ``label``;
        ``0.0`` when ``y_train`` is empty.
    """
    # Accept plain lists as well as arrays so the comparison below is element-wise.
    y_train = np.asarray(y_train)
    total_examples = y_train.shape[0]

    # Avoid a 0/0 division when there is no data at all.
    if total_examples == 0:
        return 0.0

    class_examples = np.sum(y_train == label)
    return class_examples / float(total_examples)

In [36]:
prior_prob(y_train, 1)

0.48222803508232037

In [40]:
prior_prob(y_train , 0)

0.5177719649176796

In [46]:
a = np.array([1, 0,0,0,0,1,1,0,0,0 ])
prior_prob(a, 1)

0.3

In [49]:
def cond_prob(X_train, y_train, feature_col, feature_val, label, alpha=0.0):
    """Return the empirical conditional probability P(x[feature_col] = feature_val | y = label).

    Parameters
    ----------
    X_train : 2-D ndarray
        Training features, one row per example.
    y_train : 1-D ndarray
        Class label of each training row.
    feature_col : int
        Column index of the feature being examined.
    feature_val : scalar
        Feature value whose frequency within the class is wanted.
    label : scalar
        Class to condition on.
    alpha : float, optional
        Additive (Laplace) smoothing strength. The default ``0.0`` gives the
        plain relative frequency; a positive value keeps feature values never
        seen with this class from getting probability zero.

    Returns
    -------
    float
        ``(count + alpha) / (class_count + alpha * n_values)`` where
        ``n_values`` is the number of distinct values in the column.
        Returns ``0.0`` when that denominator is zero (the class has no
        training rows and no smoothing is applied).
    """
    class_mask = (y_train == label)
    class_count = np.sum(class_mask)

    # The number of distinct values only matters when smoothing is enabled.
    n_values = np.unique(X_train[:, feature_col]).size if alpha else 0
    denom = class_count + alpha * n_values

    # Guard against 0/0 for a class that never occurs in the training data.
    if denom == 0:
        return 0.0

    numerator = np.sum(X_train[class_mask, feature_col] == feature_val)
    return (numerator + alpha) / float(denom)

## Compute: Likelihood and Posterior Probability

In [56]:
def predict(X_train, y_train, x_query):
    """Predict the class of one query point with categorical Naive Bayes.

    For every class present in ``y_train`` the unnormalised posterior
    ``prior * product of per-feature conditional probabilities`` is computed,
    and the class with the largest value is returned.

    Parameters
    ----------
    X_train : 2-D ndarray
        Training features, one row per example.
    y_train : 1-D ndarray
        Class label of each training row.
    x_query : 1-D sequence
        A single testing point with the same number of features as
        ``X_train`` has columns.

    Returns
    -------
    scalar
        The predicted class label (an element of ``np.unique(y_train)``).
    """
    classes = np.unique(y_train)
    n_features = X_train.shape[1]
    post_prob = []

    for label in classes:
        # Likelihood: product of the per-feature conditional probabilities.
        # NOTE: a plain product can underflow for very many features;
        # summing logs would be the more robust form.
        likelihood = 1.0
        for f in range(n_features):
            likelihood *= cond_prob(X_train, y_train, f, x_query[f], label)

        # Posterior (up to a constant) = likelihood * prior.
        post_prob.append(likelihood * prior_prob(y_train, label))

    # argmax yields a position in `classes`; map it back to the actual label
    # so the result is correct even when labels are not 0..k-1.
    return classes[np.argmax(post_prob)]

In [60]:
predict(X_train, y_train, X_test[79])

1

In [61]:
y_test[79]

1

In [64]:
X_test.shape

(1625, 22)

In [65]:
def accuracy(X_train, y_train, X_test, y_test):
    """Return the fraction of test rows whose predicted label equals ``y_test``.

    Each row of ``X_test`` is classified independently with ``predict``
    using ``X_train`` / ``y_train`` as the training data.
    """
    predictions = np.array(
        [predict(X_train, y_train, X_test[idx]) for idx in range(X_test.shape[0])]
    )

    n_correct = np.sum(predictions == y_test)
    return n_correct / y_test.shape[0]

In [66]:
accuracy(X_train, y_train, X_test, y_test)

0.9963076923076923

# Gaussian Naive Bayes

In [67]:
from sklearn.datasets import load_iris 
iris = load_iris() 

In [69]:
# Feature matrix and class targets taken from the loaded iris dataset.
X, y = iris.data, iris.target

In [72]:
# Keep 40% of the samples aside for evaluation; seeded for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [73]:
from sklearn.naive_bayes import GaussianNB

In [74]:
gnb = GaussianNB()

In [75]:
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [76]:
gnb.predict(X_test)

array([0, 1, 1, 0, 2, 2, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       2, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 1, 2,
       1, 2, 0, 0, 0, 1, 0, 0, 2, 2, 2, 2, 2, 1, 2, 1])

In [77]:
gnb.score(X_test, y_test)

0.95

In [79]:
X_test[0]

array([5.8, 4. , 1.2, 0.2])

# Digits data (scikit-learn `load_digits`, not the full MNIST set)

In [80]:
from sklearn.datasets import load_digits

In [81]:
digit = load_digits()

In [82]:
# Feature matrix and class targets taken from the loaded digits dataset.
X, y = digit.data, digit.target

In [83]:
# Keep 40% of the samples aside for evaluation; seeded for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [84]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [85]:
# One estimator per Naive Bayes variant, each with its default settings.
gnb = GaussianNB()
bnb = BernoulliNB()
mnb = MultinomialNB()

In [86]:
# Fit all three estimators on the same training split.
mnb.fit(X_train, y_train)
gnb.fit(X_train, y_train)
# Last expression of the cell, so its repr is what the notebook displays.
bnb.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [88]:
mnb.score(X_test, y_test)

0.8970792767732962

In [89]:
bnb.score(X_test, y_test)

0.8567454798331016

In [90]:
gnb.score(X_test, y_test)

0.8303198887343533