In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn import naive_bayes
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('mushrooms.csv')

In [3]:
data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [4]:
data.shape

(8124, 23)

In [5]:
labels = data['class']

In [6]:
labels

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object

In [7]:
features = data.copy()
features = features.drop(['class'], axis=1)

In [8]:
# Remove unnecessary features
removed_features = \
    ['gill-attachment', 'stalk-shape', 'stalk-root', 'veil-type', 'veil-color', 'ring-number']

In [9]:
features = features.drop(removed_features, axis=1)

In [10]:
features

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-spacing,gill-size,gill-color,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,c,n,k,s,s,w,w,p,k,s,u
1,x,s,y,t,a,c,b,k,s,s,w,w,p,n,n,g
2,b,s,w,t,l,c,b,n,s,s,w,w,p,n,n,m
3,x,y,w,t,p,c,n,n,s,s,w,w,p,k,s,u
4,x,s,g,f,n,w,b,k,s,s,w,w,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,c,b,y,s,s,o,o,p,b,c,l
8120,x,s,n,f,n,c,b,y,s,s,o,o,p,b,v,l
8121,f,s,n,f,n,c,b,n,s,s,o,o,p,b,c,l
8122,k,y,n,f,y,c,n,b,s,k,w,w,e,w,v,l


In [11]:
# Since data is nominal, we should use dummy variables to pass into a Naive Bayes classifier
_features = features.copy()
_features = pd.get_dummies(_features)
_labels = labels.copy()
_features

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
8120,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
8121,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
8122,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0


In [12]:
# Make the outcome class binary by mapping 'p' to 0 and 'e' to 1
a = {'p': 0, 'e': 1}
binary_labels = _labels.map(a)

In [13]:
binary_labels

0       0
1       1
2       1
3       0
4       1
       ..
8119    1
8120    1
8121    1
8122    0
8123    1
Name: class, Length: 8124, dtype: int64

In [14]:
# Split data into training and validation sets, where the validation set contains 30% of the total dataset, or 2438 observations
# Note that we will be using K-fold cross validation so there is no need to split data into three separate groups.

In [15]:
x_train, x_val, y_train, y_val = train_test_split(
    _features, binary_labels, test_size=0.3, random_state=42) 

In [16]:
print("Training shape: ", x_train.shape)
print("Validation shape: ", x_val.shape)

Training shape:  (5686, 100)
Validation shape:  (2438, 100)


In [17]:
cv = KFold(n_splits=5, random_state=42, shuffle=True)

In [18]:
multinomialNB = naive_bayes.MultinomialNB()

In [19]:
multinomialNB

MultinomialNB()

In [20]:
multinomialNB_f1scores = cross_val_score(
    multinomialNB, x_train, y_train, cv=cv, scoring='f1')

In [21]:
multinomialNB_f1scores

array([0.96661367, 0.96045198, 0.95631454, 0.96147404, 0.94276094])

In [22]:
multinomialNB_f1scores.mean()

0.9575230329709064

In [23]:
categoricalNB = naive_bayes.CategoricalNB()

In [24]:
categoricalNB

CategoricalNB()

In [25]:
categoricalNB_f1scores = cross_val_score(
    categoricalNB, x_train, y_train, cv=cv, scoring='f1')

In [26]:
categoricalNB_f1scores

array([0.95382166, 0.95092518, 0.94413847, 0.94684385, 0.93178037])

In [27]:
categoricalNB_f1scores.mean()

0.9455019061169206

In [28]:
# Try imposing a different topological structure onto the data i.e. label encoding

In [29]:
le = preprocessing.LabelEncoder()

In [30]:
encoded_features = pd.DataFrame()
copy_features = features.copy()
for column in copy_features.columns: 
    encoded_features[column] = le.fit_transform(copy_features[column])
#encoded_features = le.fit_transform(encoded_features)
#encoded_features

In [31]:
encoded_features

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-spacing,gill-size,gill-color,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,0,1,4,2,2,7,7,4,2,3,5
1,5,2,9,1,0,0,0,4,2,2,7,7,4,3,2,1
2,0,2,8,1,3,0,0,5,2,2,7,7,4,3,2,3
3,5,3,8,1,6,0,1,5,2,2,7,7,4,2,3,5
4,5,2,3,0,5,1,0,4,2,2,7,7,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,3,2,4,0,5,0,0,11,2,2,5,5,4,0,1,2
8120,5,2,4,0,5,0,0,11,2,2,5,5,4,0,4,2
8121,2,2,4,0,5,0,0,5,2,2,5,5,4,0,1,2
8122,3,3,4,0,8,0,1,0,2,1,7,7,0,7,4,2


In [32]:
encoded_labels = labels.copy()
encoded_labels = le.fit_transform(encoded_labels)

In [33]:
encoded_labels

array([1, 0, 0, ..., 0, 1, 0])

In [34]:
xenc_train, xenc_val, yenc_train, yenc_val = train_test_split(
    encoded_features, encoded_labels, test_size=0.3, random_state=42) 

In [35]:
print("Train shape: ", xenc_train.shape)
print("Val shape: ", xenc_val.shape)

Train shape:  (5686, 16)
Val shape:  (2438, 16)


In [36]:
enc_multinomialNB = naive_bayes.MultinomialNB()

In [37]:
enc_multinomialNB_f1scores = cross_val_score(
    enc_multinomialNB, xenc_train, yenc_train, cv=cv, scoring='f1')

In [38]:
enc_multinomialNB_f1scores

array([0.79574468, 0.77848101, 0.7615894 , 0.77333333, 0.75203252])

In [39]:
enc_multinomialNB_f1scores.mean()

0.7722361902282676

In [40]:
enc_categoricalNB = naive_bayes.CategoricalNB()

In [41]:
enc_categoricalNB_f1scores = cross_val_score(
    enc_categoricalNB, xenc_train, yenc_train, cv=cv, scoring='f1')

In [42]:
enc_categoricalNB_f1scores

array([0.95874263, 0.952657  , 0.94581281, 0.95740741, 0.9373849 ])

In [43]:
enc_categoricalNB_f1scores.mean()

0.9504009502887861