In [4]:
#Imports 
import numpy as np 
import pandas as pd
# visualize
import matplotlib.pyplot as plt
import seaborn as sns 
import graphviz
from graphviz import Graph
# turn off pink warning boxes
import warnings
warnings.filterwarnings("ignore")
# .py modules to acquire and prep the data


# hypothesis tests for data exploration
from scipy.stats import chi2_contingency
from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind
# train, validate, test
from sklearn.model_selection import train_test_split
# evaluating models
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support 
# creating models for classification ML:
# Decision Tree  
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# K-Nearest Neighbor(KNN)  
from sklearn.neighbors import KNeighborsClassifier
# Logistic Regression
from sklearn.linear_model import LogisticRegression

## Acquire Data

In [5]:
# Acquire dataset: read mushrooms.csv (path is relative to the notebook's
# working directory) into `df`, then preview the first five rows.
df = pd.read_csv('mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [6]:
# Dimensions of the freshly loaded frame as (rows, columns)
df.shape

(8124, 23)

In [7]:
# Column labels: the `class` target followed by the descriptive attributes
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [8]:
# Per-column dtypes as inferred by read_csv (every column is read as object here)
df.dtypes

class                       object
cap-shape                   object
cap-surface                 object
cap-color                   object
bruises                     object
odor                        object
gill-attachment             object
gill-spacing                object
gill-size                   object
gill-color                  object
stalk-shape                 object
stalk-root                  object
stalk-surface-above-ring    object
stalk-surface-below-ring    object
stalk-color-above-ring      object
stalk-color-below-ring      object
veil-type                   object
veil-color                  object
ring-number                 object
ring-type                   object
spore-print-color           object
population                  object
habitat                     object
dtype: object

In [13]:
# Target variable - Edible (e) or Poisonous (p); check how balanced the classes are

df['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [12]:
# Count missing (NaN/None) entries in each column.
# Placeholder codes such as '?' are ordinary strings and are NOT counted here.
df.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [14]:
# Print the frequency of every category code, one column at a time
for _, column_values in df.items():
    print(column_values.value_counts())

e    4208
p    3916
Name: class, dtype: int64
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: cap-shape, dtype: int64
y    3244
s    2556
f    2320
g       4
Name: cap-surface, dtype: int64
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
r      16
u      16
Name: cap-color, dtype: int64
f    4748
t    3376
Name: bruises, dtype: int64
n    3528
f    2160
s     576
y     576
l     400
a     400
p     256
c     192
m      36
Name: odor, dtype: int64
f    7914
a     210
Name: gill-attachment, dtype: int64
c    6812
w    1312
Name: gill-spacing, dtype: int64
b    5612
n    2512
Name: gill-size, dtype: int64
b    1728
p    1492
w    1202
n    1048
g     752
h     732
u     492
k     408
e      96
y      86
o      64
r      24
Name: gill-color, dtype: int64
t    4608
e    3516
Name: stalk-shape, dtype: int64
b    3776
?    2480
e    1120
c     556
r     192
Name: stalk-root, dtype: int64
s    5176
k    2372
f     552
y      24
Name: stalk-surf

In [15]:
#Re-assign all binary columns to 0 and 1
# bruises: 't' -> 1, 'f' -> 0.
# The explicit cast gives a real integer column (recoding alone leaves an
# object column holding Python ints) and raises if an unexpected code is present.
# replace() leaves already-recoded 0/1 values untouched, so the cell is safe to re-run.
df['bruises'] = df['bruises'].replace({'t': 1, 'f': 0}).astype('int64')

In [17]:
# Sanity check on the recode: only the values 0 and 1 should remain
df['bruises'].value_counts()

0    4748
1    3376
Name: bruises, dtype: int64

In [19]:
# gill-attachment: 'f' -> 0, 'a' -> 1.
# Cast to int64 so the column is truly numeric rather than object dtype;
# replace() ignores values that are already 0/1, so re-running is harmless.
df['gill-attachment'] = df['gill-attachment'].replace({'f': 0, 'a': 1}).astype('int64')

In [20]:
# gill-spacing: 'c' -> 0, 'w' -> 1.
# Cast to int64 so the column is truly numeric rather than object dtype;
# replace() ignores values that are already 0/1, so re-running is harmless.
df['gill-spacing'] = df['gill-spacing'].replace({'c': 0, 'w': 1}).astype('int64')

In [21]:
# gill-size: 'b' -> 0, 'n' -> 1.
# Cast to int64 so the column is truly numeric rather than object dtype;
# replace() ignores values that are already 0/1, so re-running is harmless.
df['gill-size'] = df['gill-size'].replace({'b': 0, 'n': 1}).astype('int64')

In [22]:
# stalk-shape: 'e' -> 0, 't' -> 1.
# Cast to int64 so the column is truly numeric rather than object dtype;
# replace() ignores values that are already 0/1, so re-running is harmless.
df['stalk-shape'] = df['stalk-shape'].replace({'e': 0, 't': 1}).astype('int64')

In [23]:
# '?' is the placeholder used for a missing stalk-root; relabel it as the
# explicit category 'other' so it is kept as its own level.
df['stalk-root'] = df['stalk-root'].replace('?', 'other')

In [24]:
#Dropping veil-type column since values are all the same and will not be used in the model
# errors='ignore' makes the cell safe to re-run: once the column is gone a
# plain drop would raise KeyError. Rebinding `df` replaces the in-place mutation.

df = df.drop(columns='veil-type', errors='ignore')

In [25]:
#re-assign ring-number to actual ring number, 0, 1, 2  numerical data
# 'n' -> 0, 'o' -> 1, 't' -> 2.
# The int64 cast is what actually makes the column numerical (recoding alone
# leaves object dtype) and raises if any unmapped code is left behind.
# replace() ignores values that are already integers, so re-running is harmless.
df['ring-number'] = df['ring-number'].replace({'n': 0, 'o': 1, 't': 2}).astype('int64')

In [27]:
# Re-check dimensions after the prep steps: same row count, one column fewer (veil-type removed)
df.shape

(8124, 22)