In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [9]:
# Load the mushrooms CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../datasets/bayes/mushrooms.csv")

In [10]:
df.shape

(8124, 23)

In [11]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [13]:
# One shared encoder instance: used for the toy demo and then refit on each dataframe column.
le = LabelEncoder()

In [14]:
# Sanity check: string categories come back as integer codes.
le.fit_transform(["a", "a", "b", "a"])

array([0, 0, 1, 0])

In [15]:
# Integer-encode every column (the `type` target included). `apply` calls
# `le.fit_transform` once per column, so the shared encoder is refit each time.
# NOTE: this rebinds `df`; the raw string frame is no longer available afterwards.
df = df.apply(le.fit_transform)

In [17]:
# Switch to a plain NumPy array so columns can be sliced by position.
data = df.values

In [18]:
# Column 0 is the encoded `type` column (the target); the remaining columns are features.
y = data[:, 0]
X = data[:, 1:]

In [19]:
X.shape, y.shape

((8124, 22), (8124,))

In [20]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [24]:
np.unique(y_train)

array([0, 1])

# Classifier

In [25]:
# Toy label vector (six 1s, four 0s) for checking prior_prob by hand.
a = np.array([1,0,0,0,1,1,1,1,0,1])

In [26]:
def prior_prob(y_train, label):
    """Estimate the class prior P(y = label) as a relative frequency.

    Args:
        y_train: NumPy array of class labels.
        label: The class whose share of ``y_train`` is wanted.

    Returns:
        The number of entries equal to ``label`` divided by ``len(y_train)``.
    """
    n_total = len(y_train)
    is_label = y_train == label
    return is_label.sum() / n_total

In [30]:
# Check against the toy vector: 6 of 10 entries are 1 and 4 of 10 are 0.
prior_prob(a, 1), prior_prob(a, 0)

(0.6, 0.4)

In [32]:
# Class priors estimated from the training split.
prior_prob(y_train, 0), prior_prob(y_train, 1)

(0.5199338600036745, 0.4800661399963256)

In [36]:
def cond_prob(X_train, y_train, feature_col, feature_val, label, alpha=0.0):
    """Estimate P(x[feature_col] == feature_val | y == label) from counts.

    Args:
        X_train: 2-D NumPy array of encoded features, one row per example.
        y_train: 1-D NumPy array of class labels aligned with the rows of
            ``X_train``.
        feature_col: Column index of the feature being scored.
        feature_val: Value of that feature to score.
        label: Class to condition on.
        alpha: Additive (Laplace) smoothing strength. The default ``0.0``
            returns the raw relative frequency. A positive value keeps a
            feature value never observed together with ``label`` from
            receiving probability zero, which would otherwise zero out the
            whole likelihood product.

    Returns:
        ``(count + alpha) / (n_label + alpha * n_values)`` where ``count`` is
        the number of ``label`` rows whose feature equals ``feature_val``,
        ``n_label`` is the number of ``label`` rows, and ``n_values`` is the
        number of distinct values the feature takes in ``X_train``. With
        ``alpha == 0`` and no rows of class ``label`` the result is ``nan``.

    Raises:
        ValueError: If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    in_class = y_train == label
    # Index the row mask and the column together so only one column is copied.
    numerator = (X_train[in_class, feature_col] == feature_val).sum()
    denominator = in_class.sum()

    if alpha == 0:
        # Unsmoothed maximum-likelihood estimate.
        return numerator / denominator

    n_values = np.unique(X_train[:, feature_col]).size
    return (numerator + alpha) / (denominator + alpha * n_values)

In [34]:
X_train[(y_train == 1)].shape

(2613, 22)

# Compute posterior probability / prediction

In [39]:
def predict(X_train, y_train, xtest):
    """Classify a single sample with categorical Naive Bayes.

    Args:
        X_train: 2-D NumPy array of encoded training features.
        y_train: 1-D NumPy array of training labels.
        xtest: A single test point holding one value per feature column.

    Returns:
        The class label (an element of ``np.unique(y_train)``) with the
        highest posterior score. Ties go to the first such class.
    """
    classes = np.unique(y_train)
    n_features = X_train.shape[1]
    log_posteriors = []

    # A zero probability legitimately maps to a -inf score, so the
    # divide-by-zero warning from np.log is silenced here.
    with np.errstate(divide="ignore"):
        for label in classes:
            # Sum logs rather than multiplying probabilities, so a large
            # number of features cannot underflow the product to 0.0.
            log_post = np.log(prior_prob(y_train, label))
            for f in range(n_features):
                log_post += np.log(cond_prob(X_train, y_train, f, xtest[f], label))
            log_posteriors.append(log_post)

    # argmax gives a position within `classes`; map it back to the label so
    # the result is correct even when labels are not exactly 0..k-1.
    return classes[np.argmax(log_posteriors)]

In [42]:
# Classify one held-out sample (row 2 of the test split).
predict(X_train, y_train, X_test[2])

1