## Install Dependencies

In [18]:
!pip install pandas matplotlib s3fs plotly[express]




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Import Packages

In [3]:
# Import Statements
import s3fs
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates as pc
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = "notebook"

## Load & Prepare Data

In [8]:
#data_filepath = "s3a://cap4770-2025-burke/mushroom/agaricus-lepiota.data"  # S3 Data
data_filepath = "agaricus-lepiota.data"  # Local Data
df = pd.read_csv("./data/mushrooms.csv", header=None, na_values="Missing")


# value dictionaries
edibility = {'p': "poisonous", 'e': "edible"}
cap_shape = {'b': "bell", 'c': "conical", 'x': "convex", 'f': "flat", 'k': "knobbed", 's': "sunken"}
cap_surface = {'f': "fibrous", 'g': "grooves", 'y': "scaly", 's': "smooth"}
cap_color = {'n': "brown", 'b': "buff", 'c': "cinnamon", 'g': "gray", 'r': "green", 'p': "pink", 'u': "purple",
                  'e': "red", 'w': "white", 'y': "yellow"}
bruises = {'t': "bruises", 'f': "no"}
odor = {'a': "almond", 'l': "anise", 'c': "creosote", 'y': "fishy", 'f': "foul", 'm': "musty", 'n': "none",
             'p': "pungent", 's': "spicy"}
gill_attachment = {'a': "attached", 'd': "descending", 'f': "free", 'n': "notched"}
gill_spacing = {'c': "close", 'w': "crowded", 'd': "distant"}
gill_size = {'b': "broad", 'n': "narrow"}
gill_color = {'k': "black", 'n': "brown", 'b': "buff", 'h': "chocolate", 'g': "gray", 'r': "green", 'o': "orange",
                   'p': "pink", 'u': "purple", 'e': "red", 'w': "white", 'y': "yellow"}
stalk_shape = {'e': "enlarging", 't': "tapering"}
stalk_root = {'b': "bulbous", 'c': "club", 'u': "cup", 'e': "equal", 'z': "rhizomorphs", 'r': "rooted"}
stalk_surface_above_ring = {'f': "fibrous", 'y': "scaly", 'k': "silky", 's': "smooth"}
stalk_surface_below_ring = {'f': "fibrous", 'y': "scaly", 'k': "silky", 's': "smooth"}
stalk_color_above_ring = {'n': "brown", 'b': "buff", 'c': "cinnamon", 'g': "gray", 'o': "orange", 'p': "pink",
                               'e': "red", 'w': "white", 'y': "yellow"}
stalk_color_below_ring = {'n': "brown", 'b': "buff", 'c': "cinnamon", 'g': "gray", 'o': "orange", 'p': "pink",
                               'e': "red", 'w': "white", 'y': "yellow"}
veil_type = {'p': "partial", 'u': "universal"}
veil_color = {'n': "brown", 'o': "orange", 'w': "white", 'y': "yellow"}
ring_number = {'n': "none", 'o': "one", 't': "two"}
ring_type = {'c': "cobwebby", 'e': "evanescent", 'f': "flaring", 'l': "large", 'n': "none", 'p': "pendant",
                  's': "sheathing", 'z': "zone"}
spore_print_color = {'k': "black", 'n': "brown", 'b': "buff", 'h': "chocolate", 'r': "green", 'o': "orange",
                          'u': "purple", 'w': "white", 'y': "yellow"}
population = {'a': "abundant", 'c': "clustered", 'n': "numerous", 's': "scattered", 'v': "several",
                   'y': "solitary"}
habitat = {'g': "grasses", 'l': "leaves", 'm': "meadows", 'p': "paths", 'u': "urban", 'w': "waste", 'd': "woods"}

# Create named DataFrame from mapping and numeric edibility column
df_named = df.copy()
df_named.columns = list(data_column_maps.keys())
for col, mapping in data_column_maps.items():
    df_named[col] = df_named[col].map(mapping)
df_named['edibility_num'] = df_named['edibility'].map({'edible': 1, 'poisonous': 0})
df_named

Unnamed: 0,edibility,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat,edibility_num
0,,,,,,,,,,,...,,,,,,,,,,
1,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,white,white,partial,white,one,pendant,black,scattered,urban,0.0
2,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,white,white,partial,white,one,pendant,brown,numerous,grasses,1.0
3,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,white,white,partial,white,one,pendant,brown,numerous,meadows,1.0
4,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,white,white,partial,white,one,pendant,black,scattered,urban,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8120,edible,knobbed,smooth,brown,no,none,attached,close,broad,yellow,...,orange,orange,partial,orange,one,pendant,buff,clustered,leaves,1.0
8121,edible,convex,smooth,brown,no,none,attached,close,broad,yellow,...,orange,orange,partial,brown,one,pendant,buff,several,leaves,1.0
8122,edible,flat,smooth,brown,no,none,attached,close,broad,brown,...,orange,orange,partial,orange,one,pendant,buff,clustered,leaves,1.0
8123,poisonous,knobbed,scaly,brown,no,fishy,free,close,narrow,buff,...,white,white,partial,white,one,evanescent,white,several,leaves,0.0


## Histograms

In [19]:
# Make subplot grid, define colors
fig = make_subplots(rows=2, cols=2, subplot_titles=("Edibility", "Cap Shape", "Cap Surface", "Cap Color"))
edible_color, poisonous_color = "#FCDE9C", "#7C1D6F"

# Edibility - get counts and create histogram
edible_counts = df_named[df_named.edibility == 'edible'].edibility.value_counts()
poisonous_counts = df_named[df_named.edibility == 'poisonous'].edibility.value_counts()
edible_plot = go.Bar(x=edible_counts.index, y=edible_counts.values, name='edible', marker_color=edible_color)
poisonous_plot = go.Bar(x=poisonous_counts.index, y=poisonous_counts.values, name='poisonous', marker_color=poisonous_color) 

# Cap Shape - get counts and create plots
cap_shape_edible_counts = df_named[df_named.edibility == 'edible'].cap_shape.value_counts()
cap_shape_poisonous_counts = df_named[df_named.edibility == 'poisonous'].cap_shape.value_counts()
cap_shape_edible_plot = go.Bar(x=cap_shape_edible_counts.index, y=cap_shape_edible_counts.values, name='edible', marker_color=edible_color, showlegend=False)
cap_shape_poisonous_plot = go.Bar(x=cap_shape_poisonous_counts.index, y=cap_shape_poisonous_counts.values, name='poisonous', marker_color=poisonous_color, showlegend=False) 

# Cap Surface - get counts and create plots
cap_surf_edible_counts = df_named[df_named.edibility == 'edible'].cap_surface.value_counts()
cap_surf_poisonous_counts = df_named[df_named.edibility == 'poisonous'].cap_surface.value_counts()
cap_surf_edible_plot = go.Bar(x=cap_surf_edible_counts.index, y=cap_surf_edible_counts.values, name='edible', marker_color=edible_color, showlegend=False)
cap_surf_poisonous_plot = go.Bar(x=cap_surf_poisonous_counts.index, y=cap_surf_poisonous_counts.values, name='poisonous', marker_color=poisonous_color, showlegend=False) 

# Cap Color - get counts and create plots
cap_color_edible_counts = df_named[df_named.edibility == 'edible'].cap_color.value_counts()
cap_color_poisonous_counts = df_named[df_named.edibility == 'poisonous'].cap_color.value_counts()
cap_color_edible_plot = go.Bar(x=cap_color_edible_counts.index, y=cap_color_edible_counts.values, name='edible', marker_color=edible_color, showlegend=False)
cap_color_poisonous_plot = go.Bar(x=cap_color_poisonous_counts.index, y=cap_color_poisonous_counts.values, name='poisonous', marker_color=poisonous_color, showlegend=False) 

# Add traces
fig.add_trace(edible_plot, row=1, col=1)
fig.add_trace(poisonous_plot, row=1, col=1)
fig.add_trace(cap_shape_edible_plot, row=1, col=2)
fig.add_trace(cap_shape_poisonous_plot, row=1, col=2)
fig.add_trace(cap_surf_edible_plot, row=2, col=1)
fig.add_trace(cap_surf_poisonous_plot, row=2, col=1)
fig.add_trace(cap_color_edible_plot, row=2, col=2)
fig.add_trace(cap_color_poisonous_plot, row=2, col=2)

fig.update_layout(
    title='Mushroom Feature Distributions Colored by Edibility',
    barmode='stack',
    legend_title='Edibility',
    template='plotly_dark'
)

fig.show()

In [20]:
# Make subplot grid, define colors
n_rows, n_cols = 6, 4
fig = make_subplots(rows=n_rows, cols=n_cols,
                    subplot_titles=("Edibility", "Cap Shape", "Cap Surface", "Cap Color",
                                    "Bruises", "Odor", "Gill Attachment", "Gill Spacing",
                                    "Gill Size", "Gill Color", "Stalk Shape", "Stalk Root",
                                    "Stalk Surface Above", "Stalk Surface Below",
                                    "Stalk Color Above", "Stalk Color Below", "Veil Type",
                                    "Veil Color", "Ring Number", "Ring Type", "Spore Print Color",
                                    "Population", "Habitat"
                                   ))
edible_color, poisonous_color = "#FCDE9C", "#7C1D6F"

# Define features and their positions in the subplot grid
features = [
    # (feature, row, col, show_legend)
    ('edibility',                1, 1, True),
    ('cap_shape',                1, 2, False),
    ('cap_surface',              1, 3, False),
    ('cap_color',                1, 4, False),
    ('bruises',                  2, 1, False),
    ('odor',                     2, 2, False),
    ('gill_attachment',          2, 3, False),
    ('gill_spacing',             2, 4, False),
    ('gill_size',                3, 1, False),    
    ('gill_color',               3, 2, False),
    ('stalk_shape',              3, 3, False),
    ('stalk_root',               3, 4, False),
    ('stalk_surface_above_ring', 4, 1, False),
    ('stalk_surface_below_ring', 4, 2, False),
    ('stalk_color_above_ring',   4, 3, False),
    ('stalk_color_below_ring',   4, 4, False),
    ('veil_type',                5, 1, False),
    ('veil_color',               5, 2, False),
    ('ring_number',              5, 3, False),
    ('ring_type',                5, 4, False),
    ('spore_print_color',        6, 1, False),
    ('population',               6, 2, False),
    ('habitat',                  6, 3, False),
]


# Create plots for each feature
for feature, row, col, show_legend in features:
    # Get counts for edible and poisonous
    edible_counts = df_named[df_named.edibility == 'edible'][feature].value_counts()
    poisonous_counts = df_named[df_named.edibility == 'poisonous'][feature].value_counts()
    
    # Create and add bar plots
    fig.add_trace(
        go.Bar(x=edible_counts.index, y=edible_counts.values, 
               name='edible', marker_color=edible_color, showlegend=show_legend),
        row=row, col=col
    )
    fig.add_trace(
        go.Bar(x=poisonous_counts.index, y=poisonous_counts.values, 
               name='poisonous', marker_color=poisonous_color, showlegend=show_legend),
        row=row, col=col
    )

fig.update_layout(
    title='Mushroom Feature Distributions Colored by Edibility',
    barmode='stack',
    legend_title='Edibility',
    template='plotly_dark',
    height=1000,
)

fig.show()

## Parallel Coordinates Plot

In [24]:
fig = px.parallel_categories(df_named, 
                            color="edibility_num", 
                            color_continuous_scale=px.colors.sequential.Agsunset,
                            dimensions=["edibility", "cap_surface", "cap_shape", "cap_color"])
fig.update_layout(coloraxis_showscale=False, template='plotly_dark')
fig.show()