In [44]:
import matplotlib.pyplot as plt 
import seaborn           as sns
import pandas            as pd
import numpy             as np

## Exploratory Data Analysis (EDA)

In [45]:
# Load dataset
df = pd.read_csv('./mushroom-dataset/mushrooms.csv')

### Observing the data

In [46]:
# Format attribute values
class_map                    = {'e':'edible', 'p':'poisonous'}
cap_shape_map                = {'b':'bell', 'c':'conical', 'x':'convex', 'f':'flat', 'k':'knobbed', 's':'sunken'}
cap_surface_map              = {'f':'fibrous', 'g':'grooves', 'y':'scaly', 's':'smooth'}
cap_color_map                = {'n':'brown', 'b':'buff', 'c':'cinnamon', 'g':'gray', 'r':'green', 'p':'pink', 'u':'purple', 'e':'red', 'w':'white', 'y':'yellow'}
gill_attachment_map          = {'a':'attached', 'd':'descending', 'f':'free', 'n':'notched'}
gill_spacing_map             = {'c':'close', 'w':'crowded', 'd':'distant'}
gill_size_map                = {'b':'broad', 'n':'narrow'}
gill_color_map               = {'k':'black', 'n':'brown', 'b':'buff', 'h':'chocolate', 'g':'gray', 'r':'green', 'o':'orange', 'p':'pink', 'u':'purple', 'e':'red', 'w':'white', 'y':'yellow'}
stalk_shape_map              = {'e':'enlarging', 't':'tapering'}
stalk_root_map               = {'b':'bulbous', 'c':'club', 'u':'cup', 'e':'equal', 'z':'rhizomorphs', 'r':'rooted', '?':'missing'}
stalk_surface_above_ring_map = {'f':'fibrous', 'y':'scaly', 'k':'silky', 's':'smooth'}
stalk_surface_below_ring_map = {'f':'fibrous', 'y':'scaly', 'k':'silky', 's':'smooth'}
stalk_color_above_ring_map   = {'n':'brown', 'b':'buff', 'c':'cinnamon', 'g':'gray', 'o':'orange', 'p':'pink', 'e':'red', 'w':'white', 'y':'yellow'}
stalk_color_below_ring_map   = {'n':'brown', 'b':'buff', 'c':'cinnamon', 'g':'gray', 'o':'orange', 'p':'pink', 'e':'red', 'w':'white', 'y':'yellow'}
veil_type_map                = {'p':'partial', 'u':'universal'}
veil_color_map               = {'n':'brown', 'o':'orange', 'w':'white', 'y':'yellow'}
ring_number_map              = {'n':'none', 'o':'one', 't':'two'}
ring_type_map                = {'c':'cobwebby', 'e':'evanescent', 'f':'flaring', 'l':'large', 'n':'none', 'p':'pendant', 's':'sheathing', 'z':'zone'}
bruises_map                  = {'t':'bruises', 'f':'no'}
odor_map                     = {'a':'almond', 'l':'anise', 'c':'creosote', 'y':'fishy', 'f':'foul', 'm':'musty', 'n':'none', 'p':'pungent', 's':'spicy' }
spore_print_color_map        = {'k':'black', 'n':'brown', 'b':'buff', 'h':'chocolate', 'r':'green', 'o':'orange', 'u':'purple', 'w':'white', 'y':'yellow'}
population_map               = {'a':'abundant', 'c':'clustered', 'n':'numerous', 's':'scattered', 'v':'several', 'y':'solitary'}
habitat_map                  = {'g':'grasses', 'l':'leaves', 'm':'meadows', 'p':'paths', 'u':'urban', 'w':'waste', 'd':'woods'}

attr_map = {
    'class':                    class_map,
    'cap-shape':                cap_shape_map,
    'cap-surface':              cap_surface_map,
    'cap-color':                cap_color_map,
    'gill-attachment':          gill_attachment_map,
    'gill-spacing':             gill_spacing_map,
    'gill-size':                gill_size_map,
    'gill-color':               gill_color_map,
    'stalk-shape':              stalk_shape_map,
    'stalk-root':               stalk_root_map,
    'stalk-surface-above-ring': stalk_surface_above_ring_map,
    'stalk-surface-below-ring': stalk_surface_below_ring_map,
    'stalk-color-above-ring':   stalk_color_above_ring_map,
    'stalk-color-below-ring':   stalk_color_below_ring_map,
    'veil-type':                veil_type_map,
    'veil-color':               veil_color_map,
    'ring-number':              ring_number_map,
    'ring-type':                ring_type_map,
    'bruises':                  bruises_map,
    'odor':                     odor_map,
    'spore-print-color':        spore_print_color_map,
    'population':               population_map,
    'habitat':                  habitat_map,
}

for col, attr in attr_map.items():
    df[col] = df[col].replace(attr)

In [47]:
# Graph dataset
# TODO

In [48]:
# View head
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses


In [49]:
# View tail
df.tail()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
8119,edible,knobbed,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,orange,orange,partial,orange,one,pendant,buff,clustered,leaves
8120,edible,convex,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,orange,orange,partial,brown,one,pendant,buff,several,leaves
8121,edible,flat,smooth,brown,no,none,attached,close,broad,brown,...,smooth,orange,orange,partial,orange,one,pendant,buff,clustered,leaves
8122,poisonous,knobbed,scaly,brown,no,fishy,free,close,narrow,buff,...,silky,white,white,partial,white,one,evanescent,white,several,leaves
8123,edible,convex,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,orange,orange,partial,orange,one,pendant,orange,clustered,leaves


In [50]:
# Count rows and columns
rows, cols = df.shape
print('Total rows:',    rows)
print('Total columns:', cols)

Total rows: 8124
Total columns: 23


| Target Attribute | Values
|------------------|---------
| `class`          |`e`=edible, `p`=poisonous

| Cap Features  | Values
|---------------|--------
| `cap-shape`   |`b`=bell,    `c`=conical, `x`=convex,   `f`=flat,  `k`=knobbed, `s`=sunken
| `cap-surface` |`f`=fibrous, `g`=grooves, `y`=scaly,    `s`=smooth
| `cap-color`   |`n`=brown,   `b`=buff,    `c`=cinnamon, `g`=gray,  `r`=green,   `p`=pink, `u`=purple, `e`=red, `w`=white, `y`=yellow

| Gill Features     | Values
|-------------------|--------
| `gill-attachment` |`a`=attached, `d`= descending, `f`=free, `n`=notched
| `gill-spacing`    |`c`=close,    `w`= crowded,    `d`=distant
| `gill-size`       |`b`=broad,    `n`= narrow
| `gill-color`      |`k`=black,    `n`= brown,      `b`=buff, `h`=chocolate, `g`=gray, `r`=green, `o`=orange, `p`=pink, `u`=purple, `e`=red, `w`=white , `y`=yellow

| Stalk Features             | Values
|----------------------------|--------
| `stalk-shape`              |`e`=enlarging, `t`= tapering
| `stalk-root`               |`b`=bulbous,   `c`=club,  `u`=cup,      `e`=equal, `z`=rhizomorphs, `r`=rooted, `?`=missing
| `stalk-surface-above-ring` |`f`=fibrous,   `y`=scaly, `k`=silky,    `s`=smooth
| `stalk-surface-below-ring` |`f`=fibrous,   `y`=scaly, `k`=silky,    `s`=smooth
| `stalk-color-above-ring`   |`n`=brown,     `b`=buff,  `c`=cinnamon, `g`=gray,  `o`=orange, `p`=pink, `e`=red, `w`=white, `y`=yellow
| `stalk-color-below-ring`   |`n`=brown,     `b`=buff,  `c` cinnamon, `g`=gray,  `o`=orange, `p`=pink, `e`=red, `w`=white, `y`=yellow


| Veil Features | Values
|-------------- |--------
| `veil-type`   |`p`=partial,  `u`=universal
| `veil-color`  |`n`=brown,    `o`=orange,     `w`=white,   `y`=yellow
| `ring-number` |`n`=none,     `o`=one,        `t`=two
| `ring-type`   |`c`=cobwebby, `e`=evanescent, `f`=flaring, `l`=large , `n`=none, `p`=pendant, `s`= heathing, `z`=zone



| Miscellaneous Features | Values
|------------------------|--------
| `bruises`              |`t`=yes,      `f`=no
| `odor`                 |`a`=almond,   `l`=anise,     `c`=creosote, `y`=fishy,     `f`=foul,    `m`=musty,   `n`=none,   `p`=pungent, `s`=spicy
| `spore-print-color`    |`k`=black,    `n`=brown,     `b`=buff,     `h`=chocolate, `r`=green,   `o`=orange,  `u`=purple, `w`=white ,  `y`=yellow 
| `population`           |`a`=abundant, `c`=clustered, `n`=numerous, `s`=scattered, `v`=several, `y`=solitary
| `habitat`              |`g`=grasses,  `l`=leaves,    `m`=meadows,  `p`=paths,     `u`=urban,   `w`=waste,   `d`=woods

### Data cleaning

In [51]:
# Check for missing values
df.isnull().sum()

# According to agaricus-lepiota.names file, some missing values were denoted by "?"
# Search for ? symbol in dataset
print(df[df == '?'].count())
print("Total missing values denoted as null/NaN: ", df.isnull().sum().sum())
print("Total missing values denoted as '?': ",  df[df == '?'].count().sum())

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64
Total missing values denoted as null/NaN:  0
Total missing values denoted as '?':  0


The dataset reports 0 null values. However, the agaricus-lepiota.names file makes a note: "Missing Attribute Values: 2480 of them (denoted by "?"), all for attribute #11." \
Attribute #11 is `stalk-root`, which is confirmed in the code above that searches for all the '?' values in the data frame.

In [52]:
# Description of dataset
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,edible,convex,scaly,brown,no,none,free,close,broad,buff,...,smooth,white,white,partial,white,one,pendant,white,several,woods
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [53]:
# Column breakdown
for col in df:
    pct = df[col].value_counts() / rows * 100
    print(round(pct, 2), "\n")

class
edible       51.8
poisonous    48.2
Name: count, dtype: float64 

cap-shape
convex     45.00
flat       38.80
knobbed    10.19
bell        5.56
sunken      0.39
conical     0.05
Name: count, dtype: float64 

cap-surface
scaly      39.93
smooth     31.46
fibrous    28.56
grooves     0.05
Name: count, dtype: float64 

cap-color
brown       28.11
gray        22.65
red         18.46
yellow      13.20
white       12.80
buff         2.07
pink         1.77
cinnamon     0.54
purple       0.20
green        0.20
Name: count, dtype: float64 

bruises
no         58.44
bruises    41.56
Name: count, dtype: float64 

odor
none        43.43
foul        26.59
fishy        7.09
spicy        7.09
almond       4.92
anise        4.92
pungent      3.15
creosote     2.36
musty        0.44
Name: count, dtype: float64 

gill-attachment
free        97.42
attached     2.58
Name: count, dtype: float64 

gill-spacing
close      83.85
crowded    16.15
Name: count, dtype: float64 

gill-size
broad     69.08
na

In [54]:
df.select_dtypes('object').nunique()

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

| Attribute          | Observations
|--------------------|----------------------
| `gill-attachment`  | This is represented with two distinct values, where **97.42%** of the mushrooms have **free (`f`)** gills and **2.58%** have **attached (`a`)** gills. With a large majority of the samples having free gills, there is not enough unique information that this attribute can provide. Thus, we eliminate it from the dataset.
|`veil-type`         | **100%** of the mushrooms are partial (`p`) veil types. Since this attribute only consists of one unique value to observe, it is not a detailed enough category to train our model with. This will also be dropped from the dataset.
|`veil-color`        | **97.54%** of the mushrooms have **brown (`n`)** veil colors while **2.46%** make up the others. With little variation in this feature, we can leave it out of the dataset.





The other features are sufficiently varied and distributed in a way that will effectively train our model.