In [62]:
import pandas as pd
import numpy  as np

## Exploratory Data Analysis (EDA)

In [63]:
# Read the mushroom CSV (path is relative to the notebook's working directory)
csv_path = './mushroom-dataset/mushrooms.csv'
df = pd.read_csv(csv_path)

### Observing the data

In [64]:
# Preview the first five records to check the columns and category codes
df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


In [65]:
# Preview the last five records
df.tail(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
8119,e,k,s,n,f,n,a,c,b,y,e,?,s,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,e,?,s,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,e,?,s,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,t,?,s,k,w,w,p,w,o,e,w,v,l
8123,e,x,s,n,f,n,a,c,b,y,e,?,s,s,o,o,p,o,o,p,o,c,l


**cap-shape**: bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s

**cap-surface**: fibrous=f, grooves=g, scaly=y, smooth=s

**cap-color**: brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y

**bruises**: bruises=t, no=f

**odor**: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s

**gill-attachment**: attached=a, descending=d, free=f, notched=n

**gill-spacing**: close=c, crowded=w, distant=d



### Data cleaning

In [70]:
# Dataset dimensions: number of records and number of columns
rows = len(df.index)
cols = len(df.columns)
for label, value in (('rows:', rows), ('columns:', cols)):
    print(label, value)

# Null count per column. isna() only counts real NaN/None entries; it does
# not flag placeholder strings (e.g. '?') that stand in for missing data.
df.isna().sum()

rows: 8124
columns: 23


class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

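`isnull()` reports 0 null values across all 8,124 rows. Note, however, that `stalk-root` contains `?` entries (visible in the `df.tail()` output above). These appear to be placeholders for missing data that `isnull()` does not count, so that column is not fully populated and needs separate handling.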

In [71]:
# Per-column summary statistics for the dataset
summary = df.describe()
summary

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,2,5,4,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,t,b,s,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,3776,5176,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [91]:
# Column breakdown: percentage share of each category value, per column.
# The denominator comes from df itself rather than from the `rows` variable
# set in an earlier cell, so this cell runs on its own and stays correct
# even if df has changed since `rows` was computed.
n_records = len(df)
for col in df:
    pct = df[col].value_counts() / n_records * 100
    # Trailing "\n" leaves a blank line between the per-column tables.
    print(pct.round(2), "\n")

class
e    51.8
p    48.2
Name: count, dtype: float64 

cap-shape
x    45.00
f    38.80
k    10.19
b     5.56
s     0.39
c     0.05
Name: count, dtype: float64 

cap-surface
y    39.93
s    31.46
f    28.56
g     0.05
Name: count, dtype: float64 

cap-color
n    28.11
g    22.65
e    18.46
y    13.20
w    12.80
b     2.07
p     1.77
c     0.54
u     0.20
r     0.20
Name: count, dtype: float64 

bruises
f    58.44
t    41.56
Name: count, dtype: float64 

odor
n    43.43
f    26.59
y     7.09
s     7.09
a     4.92
l     4.92
p     3.15
c     2.36
m     0.44
Name: count, dtype: float64 

gill-attachment
f    97.42
a     2.58
Name: count, dtype: float64 

gill-spacing
c    83.85
w    16.15
Name: count, dtype: float64 

gill-size
b    69.08
n    30.92
Name: count, dtype: float64 

gill-color
b    21.27
p    18.37
w    14.80
n    12.90
g     9.26
h     9.01
u     6.06
k     5.02
e     1.18
y     1.06
o     0.79
r     0.30
Name: count, dtype: float64 

stalk-shape
t    56.72
e    43.28
Name: 

Since **veil-type** has only one unique value (`p` in all 8,124 rows), it carries no information that could help a model distinguish between classes. \
Therefore, we'll drop this feature from the dataset.