In [1]:
# For a much more robust algorithm exploration and visualization with this dataset, I'd check out the post by raghuchaudhary at this
# link: https://www.kaggle.com/raghuchaudhary/mushroom-classification
# I got a lot of the sklearn stuff from this notebook from there.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import util

In [3]:
from sklearn.naive_bayes import GaussianNB

In [4]:
# Load in the breast cancer train and test sets with pandas.
mushroom_train_full = pd.read_csv("../data/mushroom_train.csv")
mushroom_test_full = pd.read_csv("../data/mushroom_test.csv")

In [5]:
mushroom_train_full.head()

Unnamed: 0,label,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,x,y,n,t,n,f,c,b,p,...,s,w,g,p,w,o,p,n,y,d
1,e,b,y,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m
2,e,x,f,w,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
3,e,x,f,n,t,n,f,c,b,u,...,s,p,w,p,w,o,p,n,v,d
4,e,x,y,e,t,n,f,c,b,p,...,s,p,w,p,w,o,p,n,y,d


In [6]:
mushroom_train_full.shape

(2999, 23)

In [7]:
mushroom_train_full.describe()

Unnamed: 0,label,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,2999,2999,2999,2999,2999,2999,2999,2999,2999,2999,...,2999,2999,2999,2999,2999,2999,2999,2999,2999,2999
unique,2,6,4,8,2,7,2,2,2,9,...,4,7,7,1,2,3,4,6,6,6
top,e,x,y,g,t,n,f,c,b,p,...,s,w,w,p,w,o,p,n,v,d
freq,1856,1510,1165,865,1679,1461,2992,2445,2629,719,...,1858,1666,1644,2999,2993,2921,1843,1006,1118,1288


In [8]:
# Split off the label column to make the y train vector. The iloc indexes mean: "Take all rows, and only the first column, to make a new
# dataframe."
y_train = mushroom_train_full.iloc[:, 0]
# Split of the rest for the feature matrix. Take all rows, and every column but the first.
x_train = mushroom_train_full.iloc[:, 1:]

# Repeat for the test sets.
#y_test = mushroom_test_full.iloc[:, 0]
#x_test = mushroom_test_full.iloc[:, 1:]

In [9]:
y_train

0       e
1       e
2       e
3       e
4       e
       ..
2994    e
2995    e
2996    e
2997    p
2998    p
Name: label, Length: 2999, dtype: object

In [10]:
# In this step, we are using a label encoder to transform the character features (letters like "p" and "e") to numbers.
# This will allow the sklearn algorithms to handle the data. Just note that it also can impose an order on the data.
# For example, if we transform the column ['yellow', 'red', 'white', 'pink'] to [0, 1, 2, 3], there is an implication that
# pink is greater than white, and so on. With categories like 'bad', 'medium', 'good', this is okay, but in this case, we
# don't want to leave it this way.
# Note: if you're coding you're own Naive Bayes classifier, and just calculating the probabilities from the dataset, you'll be
# able to avoid this while keeping the data in this form. But with other algorithms, it may be best to use one-hot encoding,
# explained in the next cell.
from sklearn.preprocessing import LabelEncoder
encoder_x = LabelEncoder() 
for col in x_train.columns:
    x_train[col] = encoder_x.fit_transform(x_train[col])
encoder_y = LabelEncoder()
y_train = encoder_y.fit_transform(y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[col] = encoder_x.fit_transform(x_train[col])


In [11]:
# Now, we'll transform the data again with one-hot encoding. This means if there are n unique discrete values a feature/column can take,
# we divide the column into n columns with either a 0 or 1 indicating the value. So if a column can assume the values 0, 1, 2, 3, a 3 looks
# like [0][0][0][1]. This eliminates the risk of categorical values becoming ordinal after numerical encoding.
x_train = pd.get_dummies(x_train, columns = x_train.columns,drop_first=True)
x_train.head()

Unnamed: 0,cap-shape_1,cap-shape_2,cap-shape_3,cap-shape_4,cap-shape_5,cap-surface_1,cap-surface_2,cap-surface_3,cap-color_1,cap-color_2,...,population_1,population_2,population_3,population_4,population_5,habitat_1,habitat_2,habitat_3,habitat_4,habitat_5
0,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0


In [12]:
from sklearn.model_selection import train_test_split
# We're actually going to split our train dataset into train and test splits again here. So we'll replace the old train split with the new,
# smaller one.
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.3, random_state=17)

In [13]:
# Instantiate a Gaussian Naive Bayes model, train it, then check how it performs on the test set.
gnb = GaussianNB()
y_pred = gnb.fit(x_train, y_train).predict(x_test)
# I tried these two statements to show the results. Not sure why they show differing results on logisitic regression.
print("Number of mislabeled points out of a total %d points : %d"% (x_test.shape[0], (y_test != y_pred).sum()))
print('Accuracy with Gaussian Naive Bayes: {0}'.format(gnb.score(x_test, y_test)))

Number of mislabeled points out of a total 900 points : 0
Accuracy with Gaussian Naive Bayes: 1.0


In [14]:
# Now let's see how a Decision Tree performs.
from sklearn.tree import DecisionTreeClassifier as DT

decision_tree = DT(criterion='entropy',random_state=17)
decision_tree.fit(x_train,y_train)
y_pred = decision_tree.predict(x_test)
# I tried these two statements to show the results. Not sure why they show differing results on logisitic regression.
print("Number of mislabeled points out of a total %d points : %d"% (x_test.shape[0], (y_test != y_pred).sum()))
print('Accuracy with Decision Tree: {0}'.format(decision_tree.score(x_test, y_test)))

Number of mislabeled points out of a total 900 points : 0
Accuracy with Decision Tree: 1.0


In [15]:
# We'll compare to logistic regression as well.
from sklearn.linear_model import LogisticRegression as sklogreg
sklearn_logreg = sklogreg(fit_intercept=True, max_iter=5000)
sklearn_logreg.fit(x_train, y_train)
# I tried these two statements to show the results. Not sure why they show differing results on logisitic regression.
print("Number of mislabeled points out of a total %d points : %d"% (x_test.shape[0], (y_test != y_pred).sum()))
print('Accuracy with sk-learn Logistic Regression: {0}'.format(sklearn_logreg.score(x_test, y_test)))

Number of mislabeled points out of a total 900 points : 0
Accuracy with sk-learn Logistic Regression: 0.9988888888888889
