In [26]:
import pandas as pd
import numpy as np
import hvplot.pandas
from scipy.stats import chi2_contingency

In [3]:
mushroom_data = pd.read_csv("mushrooms.csv")
mushroom_df = pd.DataFrame(mushroom_data)

In [22]:
missing_val = mushroom_df[mushroom_df["stalk-root"] == "?"]
missing_val
# stalk root is missing over 2,000 values. Might be a good idea
# to cut out the whole column, or we lose all those rows.
mushroom_df["stalk-root"].value_counts()
# mushroom_df.groupby("stalk-root")["class"].mean()

mushroom_df["label"] = np.where(mushroom_df["class"] == "p", 0, 1)

In [5]:
mushroom_df.columns

Index(['Unnamed: 0', 'id', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat', 'class'],
      dtype='object')

In [6]:
mushroom_df["cap-shape"].value_counts()

cap-shape
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: count, dtype: int64

In [7]:
mushroom_df["cap-surface"].value_counts()

cap-surface
y    3244
s    2556
f    2320
g       4
Name: count, dtype: int64

In [8]:
mushroom_df["cap-color"].value_counts()

cap-color
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: count, dtype: int64

In [9]:
mushroom_df["bruises"].value_counts()

bruises
f    4748
t    3376
Name: count, dtype: int64

In [10]:
mushroom_df["odor"].value_counts()

odor
n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: count, dtype: int64

In [11]:
mushroom_df["gill-attachment"].value_counts()

gill-attachment
f    7914
a     210
Name: count, dtype: int64

In [12]:
mushroom_df["gill-spacing"].value_counts()

gill-spacing
c    6812
w    1312
Name: count, dtype: int64

In [13]:
mushroom_df["gill-size"].value_counts()

gill-size
b    5612
n    2512
Name: count, dtype: int64

In [14]:
mushroom_df["gill-color"].value_counts()

gill-color
b    1728
p    1492
w    1202
n    1048
g     752
h     732
u     492
k     408
e      96
y      86
o      64
r      24
Name: count, dtype: int64

In [15]:
mushroom_df[mushroom_df["odor"] == "a"]

Unnamed: 0.1,Unnamed: 0,id,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
1,1,2,x,s,y,t,a,f,c,b,...,w,w,p,w,o,p,n,n,g,e
5,5,6,x,y,y,t,a,f,c,b,...,w,w,p,w,o,p,k,n,g,e
6,6,7,b,s,w,t,a,f,c,b,...,w,w,p,w,o,p,k,n,m,e
9,9,10,b,s,y,t,a,f,c,b,...,w,w,p,w,o,p,k,s,m,e
11,11,12,x,y,y,t,a,f,c,b,...,w,w,p,w,o,p,k,s,m,e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,2018,2019,f,f,w,t,a,f,w,n,...,w,w,p,w,o,p,n,v,d,e
2045,2045,2046,x,y,y,t,a,f,c,b,...,w,w,p,w,o,p,n,s,p,e
2051,2051,2052,x,y,w,t,a,f,c,b,...,w,w,p,w,o,p,k,s,m,e
2059,2059,2060,x,s,w,t,a,f,c,b,...,w,w,p,w,o,p,k,n,m,e


In [None]:
mushroom_df.groupby("gill-size")["label"].mean()

In [17]:
contingency_table = pd.crosstab(mushroom_df["odor"], mushroom_df["class"])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(p_value)



0.0


In [18]:
mushroom_df["stalk-shape"].value_counts()

stalk-shape
t    4608
e    3516
Name: count, dtype: int64

In [20]:
mushroom_df["stalk-surface-above-ring"].value_counts()

stalk-surface-above-ring
s    5176
k    2372
f     552
y      24
Name: count, dtype: int64

In [27]:
stalk_shape = mushroom_df.groupby("stalk-shape")["label"].mean()
stalk_shape_plt = stalk_shape.hvplot.barh(title = "change later")
stalk_shape_plt

In [29]:
stalk_surface_above_ring = mushroom_df.groupby("stalk-surface-above-ring")["label"].mean()
stalk_surface_above_ring_plt = stalk_surface_above_ring.hvplot.barh(title = "change later")
stalk_surface_above_ring_plt

In [31]:
stalk_surface_below_ring = mushroom_df.groupby("stalk-surface-below-ring")["label"].mean()
stalk_surface_below_ring_plt = stalk_surface_below_ring.hvplot.barh(title = "change later")
stalk_surface_below_ring_plt

In [32]:
stalk_color_above_ring = mushroom_df.groupby("stalk-color-above-ring")["label"].mean()
stalk_color_above_ring_plt = stalk_color_above_ring.hvplot.barh(title = "change later")
stalk_color_above_ring_plt

In [33]:
stalk_color_below_ring = mushroom_df.groupby("stalk-color-below-ring")["label"].mean()
stalk_color_below_ring_plt = stalk_color_below_ring.hvplot.barh(title = "change later")
stalk_color_below_ring_plt

In [34]:
veil_type = mushroom_df.groupby("veil-type")["label"].mean()
veil_type = veil_type.hvplot.barh(title = "change later")
veil_type

In [35]:
veil_color = mushroom_df.groupby("veil-color")["label"].mean()
veil_color = veil_color.hvplot.barh(title = "change later")
veil_color

In [36]:
ring_number = mushroom_df.groupby("ring-number")["label"].mean()
ring_number = ring_number.hvplot.barh(title = "change later")
ring_number