Data on North American mushrooms from https://archive.ics.uci.edu/ml/datasets/Mushroom. We're answering questions based on the data:
* Can a machine learning model reliably identify poisonous mushrooms based on the data?
* Does any one feature reliably classify mushroom toxicity?
* Can we formulate simple, memorizable rules for reliably classifying mushroom toxicity?

In [16]:
import os
from functools import reduce

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz, DecisionTreeClassifier

We are importing the "expanded" data file, which contains more samples than the single-character version.

In [17]:
# The expanded data file has no header row of its own, so the column names are supplied here
names = [
    # Target class
    'Toxicity',
    # Cap, bruising and odor
    'Cap Shape', 'Cap Surface', 'Cap Color',
    'Bruises?',
    'Odor',
    # Gills
    'Gill Attachment', 'Gill Spacing', 'Gill Size', 'Gill Color',
    # Stalk
    'Stalk Shape', 'Stalk Root',
    'Stalk Surface Above Ring', 'Stalk Surface Below Ring',
    'Stalk Color Above Ring', 'Stalk Color Below Ring',
    # Veil and ring
    'Veil Type', 'Veil Color',
    'Ring Number', 'Ring Type',
    # Spores and environment
    'Spore Print Color',
    'Population',
    'Habitat',
]

# skiprows/skipfooter discard the first 9 lines and the last line of the file
# (presumably non-data text surrounding the records -- confirm against the file).
# skipfooter is only supported by the Python parsing engine, hence engine='python'.
df = pd.read_csv(
    'data/expanded',
    names=names,
    skiprows=9,
    skipfooter=1,
    index_col=None,
    engine='python',
)
df

Unnamed: 0,Toxicity,Cap Shape,Cap Surface,Cap Color,Bruises?,Odor,Gill Attachment,Gill Spacing,Gill Size,Gill Color,...,Stalk Surface Below Ring,Stalk Color Above Ring,Stalk Color Below Ring,Veil Type,Veil Color,Ring Number,Ring Type,Spore Print Color,Population,Habitat
0,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
1,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
2,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
3,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
4,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,BROWN,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BROWN,CLUSTERED,LEAVES
8412,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,SEVERAL,LEAVES
8413,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,CLUSTERED,LEAVES
8414,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BUFF,SEVERAL,LEAVES


In [18]:
# Count the distinct values in every column to see how much each feature varies
df.describe().loc['unique', :]

Toxicity                     2
Cap Shape                    6
Cap Surface                  4
Cap Color                   10
Bruises?                     2
Odor                         9
Gill Attachment              2
Gill Spacing                 2
Gill Size                    2
Gill Color                  12
Stalk Shape                  2
Stalk Root                   5
Stalk Surface Above Ring     4
Stalk Surface Below Ring     4
Stalk Color Above Ring       9
Stalk Color Below Ring       9
Veil Type                    1
Veil Color                   4
Ring Number                  3
Ring Type                    5
Spore Print Color            9
Population                   6
Habitat                      7
Name: unique, dtype: object

Veil type has only one value, so we can remove that feature later.

In [19]:
# Count the NaN entries in every column (isnull is an alias of isna)
df.isnull().sum()

Toxicity                    0
Cap Shape                   0
Cap Surface                 0
Cap Color                   0
Bruises?                    0
Odor                        0
Gill Attachment             0
Gill Spacing                0
Gill Size                   0
Gill Color                  0
Stalk Shape                 0
Stalk Root                  0
Stalk Surface Above Ring    0
Stalk Surface Below Ring    0
Stalk Color Above Ring      0
Stalk Color Below Ring      0
Veil Type                   0
Veil Color                  0
Ring Number                 0
Ring Type                   0
Spore Print Color           0
Population                  0
Habitat                     0
dtype: int64

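No column contains NaN values. Note that `isna()` only detects NaN/None, so a placeholder string would not be counted here: the UCI description of this dataset reports missing values encoded as `?` (for the stalk-root attribute), so it is worth checking `df['Stalk Root'].value_counts()` before concluding that nothing is missing.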

In [20]:
# Transform features

# Convert binary bruised state to boolean values
def bool_bruises(dfin):
    """Return a new frame whose 'Bruises?' column holds booleans.

    The column is True where the original value equals the string 'BRUISES'
    and False everywhere else. The input frame is not modified. The converted
    column is appended after all the other columns.
    """
    # Vectorized comparison instead of a per-element apply(lambda ...)
    is_bruised = dfin['Bruises?'] == 'BRUISES'
    other_columns = dfin.drop(columns='Bruises?')
    return pd.concat([other_columns, is_bruised], axis=1)

# Drop veil type, because it has one value
def drop_veil_type(dfin):
    """Return a new frame without the 'Veil Type' column; the input is left untouched."""
    return dfin.drop(columns='Veil Type')

# Convert class to boolean values
def bool_toxicity(dfin):
    """Return a new frame with 'Toxicity' replaced by a boolean 'Toxic?' column.

    'Toxic?' is True where 'Toxicity' equals the string 'POISONOUS' and False
    everywhere else. It is placed first, ahead of the remaining columns. The
    input frame is not modified.
    """
    # Vectorized comparison instead of a per-element apply(lambda ...)
    is_toxic = dfin['Toxicity'] == 'POISONOUS'
    is_toxic.name = 'Toxic?'
    other_columns = dfin.drop(columns='Toxicity')
    return pd.concat([is_toxic, other_columns], axis=1)

# One-hot encode
def one_hot_encode(dfin):
    """Return ``pd.get_dummies(dfin)``: categorical columns expanded into indicator columns."""
    return pd.get_dummies(dfin)

# Run the raw frame through each transformation in turn:
# the output of one step becomes the input of the next
fns = [
    bool_bruises,
    bool_toxicity,
    drop_veil_type,
    one_hot_encode,
]
df_trans = reduce(lambda frame, step: step(frame), fns, df)
df_trans

Unnamed: 0,Toxic?,Bruises?,Cap Shape_BELL,Cap Shape_CONICAL,Cap Shape_CONVEX,Cap Shape_FLAT,Cap Shape_KNOBBED,Cap Shape_SUNKEN,Cap Surface_FIBROUS,Cap Surface_GROOVES,...,Population_SCATTERED,Population_SEVERAL,Population_SOLITARY,Habitat_GRASSES,Habitat_LEAVES,Habitat_MEADOWS,Habitat_PATHS,Habitat_URBAN,Habitat_WASTE,Habitat_WOODS
0,False,True,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,False,True,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,False,True,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,False,True,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,False,True,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,False,False,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8412,False,False,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0
8413,False,False,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8414,False,False,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0


Although other features besides _Bruises?_ may have only two distinct values, _Bruises?_ was the only column treated as binary in the data, so we have transformed it to boolean values to reflect that.

In [21]:
# The mean of a boolean column is the fraction of True values,
# i.e. the share of samples labelled poisonous
df_trans.loc[:, 'Toxic?'].mean()

0.4667300380228137

About 47% of the samples are poisonous, so the edible and poisonous classes are similarly sized.

In [22]:
# Separate the target from the features, hold out a test set, and fit a random forest
y = df_trans['Toxic?']
X = df_trans.drop(columns='Toxic?')

# Default train/test proportions; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(X_train, y_train)
random_forest_model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [23]:
# Score the forest on the held-out test set.
# scikit-learn metrics expect the ground truth first and the predictions second;
# keyword arguments make the roles explicit.
f1_score(y_true=y_test, y_pred=random_forest_model.predict(X_test))

1.0

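That's extremely accurate! An F1 score of 1.0 means the model produced no false positives and no false negatives on the held-out test set.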

In [24]:
# Save the DOT data and convert it to PNG
! mkdir dot
! mkdir images
export_graphviz(random_forest_model.estimators_[0], out_file='dot/subestimatortree.dot', feature_names=X.columns, class_names=['Edible', 'Poisonous'])
! dot -Tpng dot/subestimatortree.dot -o images/subestimatortree.png

A subdirectory or file dot already exists.
A subdirectory or file images already exists.


In [25]:
# Fit a single-split tree (depth 1) on the full dataset to see how far
# one feature test alone can separate the classes
decision_tree_model1 = DecisionTreeClassifier(max_depth=1, random_state=0)
decision_tree_model1.fit(X, y)
decision_tree_model1

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=1, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [26]:
# Save the DOT data and convert it to PNG
export_graphviz(decision_tree_model1, out_file='dot/tree1.dot', feature_names=X.columns, class_names=['Edible', 'Poisonous'])
! dot -Tpng dot/tree1.dot -o images/tree1.png

In [27]:
# Create a decision tree with a depth of 2
decision_tree_model2 = DecisionTreeClassifier(random_state=0, max_depth=2).fit(X, y)
dot_data2 = export_graphviz(decision_tree_model2, out_file='dot/tree2.dot', feature_names=X.columns, class_names=['Edible', 'Poisonous'])
graph2 = graphviz.Source(dot_data2)
! dot -Tpng dot/tree2.dot -o images/tree2.png

In [28]:
# Create a decision tree with a depth of 3
decision_tree_model3 = DecisionTreeClassifier(random_state=0, max_depth=3).fit(X, y)
dot_data3 = export_graphviz(decision_tree_model3, out_file='dot/tree3.dot', feature_names=X.columns, class_names=['Edible', 'Poisonous'])
graph3 = graphviz.Source(dot_data3)
! dot -Tpng dot/tree3.dot -o images/tree3.png

In [29]:
# Odor turned out to be important, so tabulate how often each odor value occurs
df.loc[:, 'Odor'].value_counts()

NONE        3808
FOUL        2160
FISHY        576
SPICY        576
ANISE        400
ALMOND       400
PUNGENT      256
CREOSOTE     192
MUSTY         48
Name: Odor, dtype: int64