In [117]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import hvplot.pandas

In [118]:
# Extracting data for cleaning and preparation.
mushroom_data = pd.read_csv("mushrooms.csv")
mushroom_df = pd.DataFrame(mushroom_data)

In [119]:
# Added a Label class to numerical pinpoint edible vs poisonous.
missing_val = mushroom_df[mushroom_df["stalk-root"] == "?"]
missing_val
mushroom_df["label"] = np.where(mushroom_df["class"] == "p", 0, 1)

In [120]:
# Function to create charts featuring the counts of each feature value.
def count_values(features):
  for x in features:
    feature_values = mushroom_df[x].value_counts()
    values_plot = feature_values.hvplot.barh(title=f"Count of {x} Values")
    hvplot.save(values_plot, f"plot_htmls/{x}_count_plot.html")
    hvplot.save(values_plot, f"plot_pngs/{x}_count_plot.png")

In [121]:
# Function to create charts featuring the means of each feature value.
def mean_values(features):
  for x in features:
    mean_val = mushroom_df.groupby(x)["label"].mean()
    mean_val_plot = mean_val.hvplot.barh(title=f"Mean of Edibility Value for {x}")
    hvplot.save(mean_val_plot, f"plot_htmls/{x}_mean_plot.html")
    hvplot.save(mean_val_plot, f"plot_pngs/{x}_mean_plot.png")

In [122]:

# function to check the p-value of specific features.
def p_value_check(feature):
  contingency_table = pd.crosstab(mushroom_df[feature], mushroom_df["class"])
  chi2, p_value, dof, expected = chi2_contingency(contingency_table)
  print(f"Count of Feature Values: {mushroom_df[feature].value_counts()}")
  print("---------------------------------------------------")
  print(f"p-value: {p_value}")
  print(f"Chi2: {chi2}")
  print(f"DoF: {dof}")
  print(f"Expected: {expected}")

In [123]:
# Function for grouping features that return 100% certainty of poisonous or edible.
# This function may be better ran BEFORE the rare_feature_grouping function.
def certain_edible(feature):
  temp_array = []
  print(f"Original: {mushroom_df[feature].value_counts()}")
  print("---------------------------------------------------")
  for value in feature:
    if mushroom_df.groupby(value)["label"].mean() == 1:
      temp_array.append(value)
  for item in temp_array:
    mushroom_df[feature] = mushroom_df[feature].replace(item, "Certain Edible")
  print(f"Updated: {mushroom_df[feature].value_counts()}")

In [124]:
mushroom_df['cap-color'].value_counts()

cap-color
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: count, dtype: int64

In [136]:
# This is a work in progress! It will not function properly!

temp_array = []
means = mushroom_df.groupby('cap-color')['label'].mean()
print(f"Original: {means}")
print("---------------------------------------------------")


for value in mushroom_df['cap-color'].unique():
  if mushroom_df.groupby(value)['label'].mean() == 1:
    temp_array.append(value)
    print(f"{value} will be replaced with Certain Edible.")
print(f"Items to replace: {temp_array}")
for item in temp_array:
  mushroom_df['cap-color'] = mushroom_df['cap-color'].replace(item, "Certain Edible")
  print(f"{item} has been replaced with Certain Edible.")
print("---------------------------------------------------")
print(f"Updated: {mushroom_df.groupby('cap-color')['label'].mean()}")

Original: cap-color
b    0.285714
c    0.727273
e    0.416000
g    0.560870
n    0.553415
p    0.388889
r    1.000000
u    1.000000
w    0.692308
y    0.373134
Name: label, dtype: float64
---------------------------------------------------


KeyError: 'n'

In [132]:
mushroom_df['cap-color'].value_counts()

cap-color
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: count, dtype: int64

In [84]:
# Function for grouping together rare feature values (< 200)
def rare_feature_grouping(feature):
  temp_array = []
  print(f"Original: {mushroom_df[feature].value_counts()}")
  print("---------------------------------------------------")
  for count in mushroom_df[feature].unique():
    if mushroom_df[feature].value_counts()[count] < 200:
      temp_array.append(count)
      print(f"{count} will be replaced.")
  for item in temp_array:
    mushroom_df[feature] = mushroom_df[feature].replace(item, "Other")
    print(f"{item} has been replaced with Other.")
  print("---------------------------------------------------")
  print(f"Updated: {mushroom_df[feature].value_counts()}")

In [53]:
# Running functions for chart creation (expect 1.5 minutes)
# Ran after grouping features for rarity.
count_values(mushroom_df.columns[2:24])
mean_values(mushroom_df.columns[2:24])

In [34]:
# Ran function for grouping rare types of features.
for features in mushroom_df.columns[2:]:
  rare_feature_grouping(features)

Original: cap-shape
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: count, dtype: int64
Updated: cap-shape
x        3656
f        3152
k         828
b         452
Other      36
Name: count, dtype: int64
Original: cap-surface
y    3244
s    2556
f    2320
g       4
Name: count, dtype: int64
Updated: cap-surface
y        3244
s        2556
f        2320
Other       4
Name: count, dtype: int64
Original: cap-color
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: count, dtype: int64
Updated: cap-color
n        2284
g        1840
e        1500
y        1072
w        1040
Other     388
Name: count, dtype: int64
Original: bruises
f    4748
t    3376
Name: count, dtype: int64
Updated: bruises
f    4748
t    3376
Name: count, dtype: int64
Original: odor
n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: count, dtype: int64
Updated: odor
n        3528
f        2160
y         576

In [47]:
p_value_check("cap-shape")

Count of Feature Values: cap-shape
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: count, dtype: int64
p-value: 1.196456568593578e-103
Chi2: 489.91995361895573
DoF: 5
Expected: [[ 234.12309207  217.87690793]
 [   2.07188577    1.92811423]
 [1632.6459872  1519.3540128 ]
 [ 428.88035451  399.11964549]
 [  16.57508616   15.42491384]
 [1893.70359429 1762.29640571]]
