# Project 2: mushroom classification
<br>
Use the SHAP analysis to answer the following questions:
<ol>
<li> For the first prediction, which feature has the most significant contribution?
<li> Overall, which feature has the most significant contributions? 
<li> Which odors are associated with poisonous mushrooms? 
</ol>

<b>Dataset:</b> https://www.kaggle.com/datasets/uciml/mushroom-classification

In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier

import shap

from sklearn.metrics import accuracy_score,confusion_matrix

In [None]:
# Read the raw mushroom table from disk
data = pd.read_csv("../data/mushrooms.csv")

# Target: the 'class' column turned into integer category codes
# (codes follow the sorted order of the distinct class labels)
y = data['class'].astype('category').cat.codes

# Features: every remaining column, each turned into integer category codes
X = data.drop(columns='class').apply(lambda col: col.astype('category').cat.codes)

# Report the number of rows, then preview the encoded feature table
print(len(data))
X.head()

# Standard SHAP values

In [4]:
# Train the classifier that is explained below. No other cell defines `model`,
# so without this step the cell fails with a NameError on a fresh kernel.
# NOTE(review): the hyperparameters are a quick-fit choice, not taken from the
# original analysis -- confirm them against the intended setup.
model = CatBoostClassifier(iterations=100,
                           learning_rate=0.1,
                           depth=3,
                           random_seed=0,   # fixed seed so re-runs give the same model
                           verbose=False)   # keep the per-iteration log out of the output
model.fit(X, y)

# get shap values: build an explainer for the fitted model and evaluate it on
# every row of X (the same rows the model was fitted on)
explainer = shap.Explainer(model)
shap_values = explainer(X)

In [None]:
#For the first prediction, which feature has the most significant contribution?
#Answer: odor
# Waterfall plot for the first row of shap_values (index 0);
# max_display=5 limits how many rows the plot shows.
shap.plots.waterfall(shap_values[0],max_display=5)

In [None]:
#Overall, which feature has the most significant contributions?
#Answer: odor
# Bar plot over the SHAP values of all rows.
# show=False: shap does not call plt.show() itself, leaving the figure open
# (the notebook still renders it at the end of the cell).
shap.plots.bar(shap_values,show=False)

In [None]:
# Beeswarm plot of the SHAP values of all rows, to see how the
# contributions of each feature are distributed.
shap.plots.beeswarm(shap_values)

In [None]:
#Which odors are associated with poisonous mushrooms?
#Answer: all the odors with SHAP values > 0 (see the boxplot in the next cell)

In [None]:
# Boxplot of the SHAP values of the 'odor' feature, one box per odor category.

# SHAP values of the odor column; the column position is looked up by name
# instead of being hard-coded.
odor_idx = X.columns.get_loc('odor')
odor_values = shap_values[:, odor_idx].values

# Use the raw odor letters from `data`, not X['odor']: X was converted to integer
# category codes when the data was loaded, while odor_labels below is keyed by
# the single-letter codes. `data` and `X` share the same rows, so the letters
# line up with the SHAP values.
odor_data = data['odor']

# A single, sorted list of categories drives both the groups and the tick
# labels, so every box is guaranteed to sit under its own label.
odor_categories = sorted(odor_data.unique())
unique_odor = set(odor_categories)  # kept so that any later use of this name still works

#split odor shap values based on odor category
odor_groups = [odor_values[(odor_data == o).to_numpy()] for o in odor_categories]

#replace categories with labels
odor_labels = {'a':'almond',
               'l':'anise', 
               'c':'creosote', 
               'y':'fishy', 
               'f':'foul', 
               'm':'musty', 
               'n':'none', 
               'p':'pungent', 
               's':'spicy'}

# Fall back to the raw code when a category has no entry in odor_labels
labels = [odor_labels.get(o, str(o)) for o in odor_categories]

#plot boxplot
fig, ax = plt.subplots(figsize=(8, 5))

ax.boxplot(odor_groups)

# Boxes are drawn at positions 1..N by default; the tick labels are set on the
# axes rather than through boxplot's `labels=` keyword, which recent matplotlib
# versions deprecate.
ax.set_xticks(list(range(1, len(labels) + 1)))
ax.set_xticklabels(labels)

ax.set_ylabel('SHAP values',size=15)
ax.set_xlabel('Odor',size=15)

plt.show()