In [17]:
from pathlib import Path

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import hvplot.pandas

In [18]:
# Load the mushroom dataset from a CSV file in the working directory.
mushroom_data = pd.read_csv("mushrooms.csv")
# NOTE(review): read_csv already returns a DataFrame, so this constructor call
# looks redundant; whether the two names share underlying data depends on the
# pandas version -- confirm before simplifying.
mushroom_df = pd.DataFrame(mushroom_data)

In [19]:
# Keep the rows whose stalk-root is recorded as "?" for later inspection.
missing_val = mushroom_df.loc[mushroom_df["stalk-root"].eq("?")]
# Not the last statement of the cell, so this bare expression renders nothing.
missing_val
# Numeric target: 0 where class is "p", 1 for every other class value.
mushroom_df["label"] = np.where(mushroom_df["class"].ne("p"), 1, 0)

In [20]:
# Function to create charts featuring the counts of each feature value.
def count_values(features):
  """Save a horizontal bar chart of value counts for each given column.

  For every column name in ``features``, the value counts of that column in
  the module-level ``mushroom_df`` are plotted with hvplot and written to
  ``plot_htmls/<name>_count_plot.html`` and ``plot_pngs/<name>_count_plot.png``.

  Args:
    features: Iterable of column names present in ``mushroom_df``.
  """
  # Make sure the output folders exist so saving works on a fresh checkout.
  for out_dir in ("plot_htmls", "plot_pngs"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

  for feature in features:
    feature_values = mushroom_df[feature].value_counts()
    values_plot = feature_values.hvplot.barh(title=f"Count of {feature} Values")
    # The same plot object is exported twice: interactive HTML and static PNG.
    hvplot.save(values_plot, f"plot_htmls/{feature}_count_plot.html")
    hvplot.save(values_plot, f"plot_pngs/{feature}_count_plot.png")

In [21]:
# Function to create charts featuring the means of each feature value.
def mean_values(features):
  """Save a horizontal bar chart of the mean ``label`` per value of each column.

  For every column name in ``features``, the module-level ``mushroom_df`` is
  grouped by that column and the mean of its ``label`` column is plotted with
  hvplot. Each chart is written to ``plot_htmls/<name>_mean_plot.html`` and
  ``plot_pngs/<name>_mean_plot.png``.

  Args:
    features: Iterable of column names present in ``mushroom_df``.
  """
  # Make sure the output folders exist so saving works on a fresh checkout.
  for out_dir in ("plot_htmls", "plot_pngs"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

  for feature in features:
    mean_val = mushroom_df.groupby(feature)["label"].mean()
    mean_val_plot = mean_val.hvplot.barh(title=f"Mean of Edibility Value for {feature}")
    # The same plot object is exported twice: interactive HTML and static PNG.
    hvplot.save(mean_val_plot, f"plot_htmls/{feature}_mean_plot.html")
    hvplot.save(mean_val_plot, f"plot_pngs/{feature}_mean_plot.png")

In [22]:
# Identifying the features of the dataset.
# `features` holds every column label of mushroom_df, including the
# non-feature columns ('Unnamed: 0', 'id', 'class', 'label'); the bare
# expression below displays it as the cell output.
features = mushroom_df.columns
features

Index(['Unnamed: 0', 'id', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat', 'class',
       'label'],
      dtype='object')

In [23]:
# Running functions for chart creation (expect 1.5 minutes)
# Select the feature columns by name rather than by a positional slice, so the
# selection does not depend on column order or on variables set by other cells.
non_feature_columns = ["Unnamed: 0", "id", "class", "label"]
plot_features = mushroom_df.columns.drop(non_feature_columns, errors="ignore")
count_values(plot_features)
mean_values(plot_features)

In [28]:
# Function to check the p-value of specific features.
def p_value_check(feature):
  """Print chi-square statistics for one column against ``class``.

  Builds a contingency table of ``mushroom_df[feature]`` versus
  ``mushroom_df["class"]``, passes it to ``chi2_contingency`` and prints the
  p-value, the chi-square statistic, the degrees of freedom and the expected
  frequencies, in that order.

  Args:
    feature: Name of the column in ``mushroom_df`` to test.
  """
  observed = pd.crosstab(index=mushroom_df[feature], columns=mushroom_df["class"])
  statistic, p_val, degrees, expected_freq = chi2_contingency(observed)
  # Insertion order of the dict fixes the order of the printed lines.
  report = {
    "p-value": p_val,
    "chi2": statistic,
    "dof": degrees,
    "expected": expected_freq,
  }
  for name, value in report.items():
    print(f"{name}: {value}")

In [30]:
# Chi-square check of ring-type against class; the statistics are printed below.
p_value_check("ring-type")

p-value: 0.0
chi2: 2956.6192780575316
dof: 4
expected: [[1437.88872477 1338.11127523]
 [  24.86262925   23.13737075]
 [ 671.29098966  624.70901034]
 [  18.64697194   17.35302806]
 [2055.31068439 1912.68931561]]


In [61]:
# Function for grouping together rare feature values (< 200 by default)
def rare_feature_grouping(feature, threshold=200, df=None):
  """Replace infrequent values of a column with the label "Other".

  Values that occur fewer than ``threshold`` times in ``df[feature]`` are
  overwritten in place with ``"Other"``. The value counts of the column are
  printed before and after the replacement.

  Args:
    feature: Name of the column to regroup.
    threshold: Minimum number of occurrences a value needs to be kept as-is.
    df: DataFrame to modify in place. Defaults to the module-level
      ``mushroom_df``.
  """
  if df is None:
    df = mushroom_df

  # Count once up front. value_counts() skips missing values, so NaN entries
  # are never looked up (and never regrouped).
  counts = df[feature].value_counts()
  print(f"Original: {counts}")

  rare_values = counts[counts < threshold].index
  if len(rare_values) > 0:
    # One vectorised pass instead of one replace() call per rare value.
    df[feature] = df[feature].mask(df[feature].isin(rare_values), "Other")

  print(f"Updated: {df[feature].value_counts()}")

In [65]:
# Ran function for grouping rare types of features.
# A dedicated loop variable keeps the `features` Index defined earlier from
# being overwritten, and the id/target columns are skipped because they are
# not features.
feature_columns = mushroom_df.columns.drop(
    ["Unnamed: 0", "id", "class", "label"], errors="ignore"
)
for feature_name in feature_columns:
  rare_feature_grouping(feature_name)

Original: cap-shape
x        3656
f        3152
k         828
b         452
Other      36
Name: count, dtype: int64
Updated: cap-shape
x        3656
f        3152
k         828
b         452
Other      36
Name: count, dtype: int64
Original: cap-surface
y    3244
s    2556
f    2320
g       4
Name: count, dtype: int64
Updated: cap-surface
y        3244
s        2556
f        2320
Other       4
Name: count, dtype: int64
Original: cap-color
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: count, dtype: int64
Updated: cap-color
n        2284
g        1840
e        1500
y        1072
w        1040
Other     388
Name: count, dtype: int64
Original: bruises
f    4748
t    3376
Name: count, dtype: int64
Updated: bruises
f    4748
t    3376
Name: count, dtype: int64
Original: odor
n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: count, dtype: int64
Updated: odor
n        3528
f        2160
y  

In [72]:
# Inspect the odor counts after rare values were merged into "Other".
mushroom_df['odor'].value_counts()

odor
n        3528
f        2160
y         576
s         576
a         400
l         400
p         256
Other     228
Name: count, dtype: int64