In [1]:
import os

import hvplot.pandas
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

In [2]:
# Load the raw mushroom data, then work on an independent copy. The cleaning
# functions below modify mushroom_df in place; wrapping the frame in
# pd.DataFrame(...) does not copy it, so those writes could reach mushroom_data.
mushroom_data = pd.read_csv("mushrooms.csv")
mushroom_df = mushroom_data.copy()

In [3]:
# Keep the rows whose stalk-root is "?" (apparently the dataset's marker for a
# missing value) so they can be inspected separately.
missing_val = mushroom_df[mushroom_df["stalk-root"] == "?"]
# Add a numeric label column: 0 when class is "p" (poisonous), 1 for any other class value.
mushroom_df["label"] = np.where(mushroom_df["class"] == "p", 0, 1)

In [4]:
# Function to create charts featuring the counts of each feature value.
def count_values(features):
  """Save a horizontal bar chart of the value counts of every given column.

  Each chart is written twice: as HTML under plot_htmls/ and as PNG under
  plot_pngs/. Both folders are created first if they are missing.

  Args:
    features: iterable of column names of the module-level `mushroom_df`.
  """
  # Create the output folders up front so saving does not depend on them
  # having been made by hand.
  os.makedirs("plot_htmls", exist_ok=True)
  os.makedirs("plot_pngs", exist_ok=True)
  for feature in features:
    feature_values = mushroom_df[feature].value_counts()
    values_plot = feature_values.hvplot.barh(title=f"Count of {feature} Values")
    hvplot.save(values_plot, f"plot_htmls/{feature}_count_plot.html")
    hvplot.save(values_plot, f"plot_pngs/{feature}_count_plot.png")

In [5]:
# Function to create charts featuring the means of each feature value.
def mean_values(features):
  """Save a horizontal bar chart of the mean `label` per value of every given column.

  `label` is 1 for edible and 0 for poisonous rows, so each bar is the share of
  edible mushrooms among the rows carrying that value. Each chart is written as
  HTML under plot_htmls/ and as PNG under plot_pngs/; both folders are created
  first if they are missing.

  Args:
    features: iterable of column names of the module-level `mushroom_df`.
  """
  # Create the output folders up front so saving does not depend on them
  # having been made by hand.
  os.makedirs("plot_htmls", exist_ok=True)
  os.makedirs("plot_pngs", exist_ok=True)
  for feature in features:
    mean_val = mushroom_df.groupby(feature)["label"].mean()
    mean_val_plot = mean_val.hvplot.barh(title=f"Mean of Edibility Value for {feature}")
    hvplot.save(mean_val_plot, f"plot_htmls/{feature}_mean_plot.html")
    hvplot.save(mean_val_plot, f"plot_pngs/{feature}_mean_plot.png")

In [6]:

# Chi-square independence check between one feature and the mushroom class.
def p_value_check(feature):
  """Print a chi-square test report for `feature` against the `class` column.

  Builds the feature-by-class contingency table from the module-level
  `mushroom_df`, runs `chi2_contingency` on it, and prints the feature's value
  counts followed by the p-value, the chi-square statistic, the degrees of
  freedom and the expected frequencies. Nothing is returned.
  """
  column = mushroom_df[feature]
  observed = pd.crosstab(column, mushroom_df["class"])
  statistic, probability, degrees, expected_freq = chi2_contingency(observed)

  report = (
    f"Count of Feature Values: {column.value_counts()}",
    "---------------------------------------------------",
    f"p-value: {probability}",
    f"Chi2: {statistic}",
    f"DoF: {degrees}",
    f"Expected: {expected_freq}",
  )
  for line in report:
    print(line)

In [7]:
# Relabels the feature values that are 100% edible or 100% poisonous.
# It may be better to run this function BEFORE rare_feature_grouping.
def certain_status(feature):
  """Collapse the values of `feature` whose rows are all edible or all poisonous.

  Values whose mean `label` is exactly 1 are renamed "Certain Edible" and
  values whose mean is exactly 0 are renamed "Certain Poisonous". The
  module-level `mushroom_df` is modified in place, and the per-value means are
  printed before and after the change.
  """
  divider = "---------------------------------------------------"
  label_mean = mushroom_df.groupby(feature)['label'].mean()
  print(f"Original: {pd.DataFrame(label_mean)}")
  print(divider)

  # Both groups come from the means taken before any relabelling, so the
  # first replacement cannot influence the second.
  always_edible = label_mean[label_mean == 1].index
  always_poisonous = label_mean[label_mean == 0].index
  mushroom_df.loc[mushroom_df[feature].isin(always_edible), feature] = "Certain Edible"
  mushroom_df.loc[mushroom_df[feature].isin(always_poisonous), feature] = "Certain Poisonous"

  refreshed = pd.DataFrame(mushroom_df.groupby(feature)['label'].mean())
  print(f"Updated: {refreshed}")

In [8]:
# Function for grouping together rare feature values (fewer than `threshold` rows, 200 by default).
def rare_feature_grouping(feature, threshold=200):
  """Replace the rare values of `feature` in `mushroom_df` with "Other".

  A value is rare when it occurs in fewer than `threshold` rows. The
  "Certain Edible" / "Certain Poisonous" labels written by certain_status are
  never regrouped, and missing values (NaN) are left as they are. The
  module-level `mushroom_df` is updated, and the value counts are printed
  before and after the change.

  Args:
    feature: name of the column to regroup.
    threshold: minimum row count a value needs in order to be kept.
  """
  protected = ("Certain Edible", "Certain Poisonous")
  # Count once; the counts do not change while the rare values are collected.
  counts = mushroom_df[feature].value_counts()
  print(f"Original: {counts}")
  print("---------------------------------------------------")

  rare_values = []
  for value in mushroom_df[feature].unique():
    # unique() reports NaN but value_counts() drops it, so looking NaN up
    # in `counts` would raise a KeyError.
    if pd.isna(value) or value in protected:
      continue
    if counts[value] < threshold:
      rare_values.append(value)
      print(f"{value} will be replaced.")

  if rare_values:
    mushroom_df[feature] = mushroom_df[feature].replace(rare_values, "Other")
  for value in rare_values:
    print(f"{value} has been replaced with Other.")
  print("---------------------------------------------------")
  print(f"Updated: {mushroom_df[feature].value_counts()}")

In [9]:
# Run the chart-creation functions (expect about 1.5 minutes).
# These were run after the rare feature values had been grouped.
# count_values(mushroom_df.columns[2:24])
# mean_values(mushroom_df.columns[2:24])

In [10]:
# Relabel the always-edible / always-poisonous values of every selected column.
# NOTE(review): columns[2:23] is a positional slice — confirm it still covers
# exactly the feature columns if the CSV's leading columns ever change.
for features in mushroom_df.columns[2:23]:
  certain_status(features)

Original:               label
cap-shape          
b          0.893805
c          0.000000
f          0.506345
k          0.275362
s          1.000000
x          0.532823
---------------------------------------------------
Updated:                       label
cap-shape                  
Certain Edible     1.000000
Certain Poisonous  0.000000
b                  0.893805
f                  0.506345
k                  0.275362
x                  0.532823
Original:                 label
cap-surface          
f            0.672414
g            0.000000
s            0.447574
y            0.463625
---------------------------------------------------
Updated:                       label
cap-surface                
Certain Poisonous  0.000000
f                  0.672414
s                  0.447574
y                  0.463625
Original:               label
cap-color          
b          0.285714
c          0.727273
e          0.416000
g          0.560870
n          0.553415
p          0.388889
r   

In [11]:
# Group the rare values of every selected column into "Other".
# NOTE(review): columns[2:23] is a positional slice — confirm it still covers
# exactly the feature columns if the CSV's leading columns ever change.
for features in mushroom_df.columns[2:23]:
  rare_feature_grouping(features)

Original: cap-shape
x                    3656
f                    3152
k                     828
b                     452
Certain Edible         32
Certain Poisonous       4
Name: count, dtype: int64
---------------------------------------------------
---------------------------------------------------
Updated: cap-shape
x                    3656
f                    3152
k                     828
b                     452
Certain Edible         32
Certain Poisonous       4
Name: count, dtype: int64
Original: cap-surface
y                    3244
s                    2556
f                    2320
Certain Poisonous       4
Name: count, dtype: int64
---------------------------------------------------
---------------------------------------------------
Updated: cap-surface
y                    3244
s                    2556
f                    2320
Certain Poisonous       4
Name: count, dtype: int64
Original: cap-color
n                 2284
g                 1840
e                 15

In [12]:
# Chi-square test of cap-shape against class; the report is printed below.
p_value_check("cap-shape")

Count of Feature Values: cap-shape
x                    3656
f                    3152
k                     828
b                     452
Certain Edible         32
Certain Poisonous       4
Name: count, dtype: int64
---------------------------------------------------
p-value: 1.196456568593578e-103
Chi2: 489.91995361895573
DoF: 5
Expected: [[  16.57508616   15.42491384]
 [   2.07188577    1.92811423]
 [ 234.12309207  217.87690793]
 [1632.6459872  1519.3540128 ]
 [ 428.88035451  399.11964549]
 [1893.70359429 1762.29640571]]


In [13]:
# Display the rows whose habitat is "Other" (the label rare_feature_grouping gives to rare values).
mushroom_df[mushroom_df['habitat'] == 'Other']

Unnamed: 0.1,Unnamed: 0,id,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,...,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class,label


In [14]:
# Save the cleaned frame as CSV. index=False keeps the default row index out of
# the file, so reading it back does not add another "Unnamed: 0" column.
mushroom_df.to_csv("cleaned_mushroom_df.csv", index=False)

In [15]:
# Export the cleaned frame as JSON as well, using the default to_json layout.
mushroom_df.to_json("cleaned_mushroom_df.json")