Mushroom data from https://archive.ics.uci.edu/ml/datasets/Mushroom. We're answering questions based on the data: Can we reliably identify poisonous mushrooms? Can we reliably identify non-poisonous mushrooms? Can we formulate simple, memorizable rules for either strategy?

In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from functools import reduce
import matplotlib

In [13]:
names = [
    'Toxicity',
    'Cap Shape',
    'Cap Surface',
    'Cap Color',
    'Bruises?',
    'Odor',
    'Gill Attachment',
    'Gill Spacing',
    'Gill Size',
    'Gill Color',
    'Stalk Shape',
    'Stalk Root',
    'Stalk Surface Above Ring',
    'Stalk Surface Below Ring',
    'Stalk Color Above Ring',
    'Stalk Color Below Ring',
    'Veil Type',
    'Veil Color',
    'Ring Number',
    'Ring Type',
    'Spore Print Color',
    'Population',
    'Habitat'
]
df = pd.read_csv('data/expanded', skiprows=9, names=names, index_col=None, engine='python', skipfooter=1); df

Unnamed: 0,Toxicity,Cap Shape,Cap Surface,Cap Color,Bruises?,Odor,Gill Attachment,Gill Spacing,Gill Size,Gill Color,...,Stalk Surface Below Ring,Stalk Color Above Ring,Stalk Color Below Ring,Veil Type,Veil Color,Ring Number,Ring Type,Spore Print Color,Population,Habitat
0,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
1,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
2,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
3,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
4,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,BROWN,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BROWN,CLUSTERED,LEAVES
8412,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,SEVERAL,LEAVES
8413,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,ORANGE,CLUSTERED,LEAVES
8414,EDIBLE,KNOBBED,SMOOTH,BROWN,NO,NONE,ATTACHED,CLOSE,BROAD,BROWN,...,SMOOTH,ORANGE,ORANGE,PARTIAL,BROWN,ONE,PENDANT,BUFF,SEVERAL,LEAVES


In [3]:
# Assess data variability
df.describe().loc['unique']

Toxicity                     2
Cap Shape                    6
Cap Surface                  4
Cap Color                   10
Bruises?                     2
Odor                         9
Gill Attachment              2
Gill Spacing                 2
Gill Size                    2
Gill Color                  12
Stalk Shape                  2
Stalk Root                   5
Stalk Surface Above Ring     4
Stalk Surface Below Ring     4
Stalk Color Above Ring       9
Stalk Color Below Ring       9
Veil Type                    1
Veil Color                   4
Ring Number                  3
Ring Type                    5
Spore Print Color            9
Population                   6
Habitat                      7
Name: unique, dtype: object

Veil type has only one value.

In [4]:
# Assess missing data
df.isna().sum()

Toxicity                    0
Cap Shape                   0
Cap Surface                 0
Cap Color                   0
Bruises?                    0
Odor                        0
Gill Attachment             0
Gill Spacing                0
Gill Size                   0
Gill Color                  0
Stalk Shape                 0
Stalk Root                  0
Stalk Surface Above Ring    0
Stalk Surface Below Ring    0
Stalk Color Above Ring      0
Stalk Color Below Ring      0
Veil Type                   0
Veil Color                  0
Ring Number                 0
Ring Type                   0
Spore Print Color           0
Population                  0
Habitat                     0
dtype: int64

There appears to be no missing data.

In [6]:
# Transform features

# Convert binary bruised state to boolean values
def bool_bruises(dfin):
    dfin_no_bruises = dfin.drop(columns='Bruises?')
    dfin_bool_bruises = dfin['Bruises?'].apply(lambda x: x == 'BRUISES')
    return pd.concat([dfin_no_bruises, dfin_bool_bruises], axis=1)

# Drop veil type, because it has one value
drop_veil_type = lambda dfin: dfin.drop(columns='Veil Type')

fns = [bool_bruises, drop_veil_type]
df_trans = reduce(lambda res, fn: fn(res), fns, df); df_trans

Unnamed: 0,Toxicity,Cap Shape,Cap Surface,Cap Color,Odor,Gill Attachment,Gill Spacing,Gill Size,Gill Color,Stalk Shape,...,Stalk Surface Below Ring,Stalk Color Above Ring,Stalk Color Below Ring,Veil Color,Ring Number,Ring Type,Spore Print Color,Population,Habitat,Bruises?
0,EDIBLE,CONVEX,SMOOTH,WHITE,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,...,SMOOTH,WHITE,WHITE,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,True
1,EDIBLE,CONVEX,SMOOTH,WHITE,ALMOND,FREE,CROWDED,NARROW,WHITE,TAPERING,...,SMOOTH,WHITE,WHITE,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,True
2,EDIBLE,CONVEX,SMOOTH,WHITE,ALMOND,FREE,CROWDED,NARROW,PINK,TAPERING,...,SMOOTH,WHITE,WHITE,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,True
3,EDIBLE,CONVEX,SMOOTH,WHITE,ALMOND,FREE,CROWDED,NARROW,PINK,TAPERING,...,SMOOTH,WHITE,WHITE,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS,True
4,EDIBLE,CONVEX,SMOOTH,WHITE,ALMOND,FREE,CROWDED,NARROW,BROWN,TAPERING,...,SMOOTH,WHITE,WHITE,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8411,EDIBLE,KNOBBED,SMOOTH,BROWN,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,SMOOTH,ORANGE,ORANGE,BROWN,ONE,PENDANT,BROWN,CLUSTERED,LEAVES,False
8412,EDIBLE,KNOBBED,SMOOTH,BROWN,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,SMOOTH,ORANGE,ORANGE,BROWN,ONE,PENDANT,ORANGE,SEVERAL,LEAVES,False
8413,EDIBLE,KNOBBED,SMOOTH,BROWN,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,SMOOTH,ORANGE,ORANGE,BROWN,ONE,PENDANT,ORANGE,CLUSTERED,LEAVES,False
8414,EDIBLE,KNOBBED,SMOOTH,BROWN,NONE,ATTACHED,CLOSE,BROAD,BROWN,ENLARGING,...,SMOOTH,ORANGE,ORANGE,BROWN,ONE,PENDANT,BUFF,SEVERAL,LEAVES,False


Although other categories beside _Bruised?_ may have only two categories, _Bruised?_ was the only column treated as binary in the data, so we have transformed it to boolean values to reflect that.

In [7]:
# Assess distribution of outcomes
df_trans['Toxicity'].value_counts()

EDIBLE       4488
POISONOUS    3928
Name: Toxicity, dtype: int64

We have similarly sized edible and poisonous samples.

In [8]:
# Create machine learning model
X = df_trans.drop(columns='Toxicity')
y = df_trans['Toxicity']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = RandomForestClassifier(random_state=0); model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)