# Project 1: salary bonus
<br>
Use the SHAP analysis to answer the following questions:
<ol>
<li> Which feature does NOT have a significant relationship with bonus?
<li> What tends to happen to an employee's bonus as they gain more experience?
<li> Are there any potential interactions in the dataset? 
</ol>
<br>
<b>Dataset:</b> https://www.kaggle.com/conorsully1/interaction-dataset

In [None]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor

import shap
# NOTE(review): presumably loads SHAP's JavaScript so interactive plots can render in the notebook — confirm against the SHAP docs
shap.initjs()

# Path template with a "{}" placeholder, presumably for saving figures.
# NOTE(review): this is an absolute path on the author's machine and is not
# referenced by any cell visible here — make it configurable or remove it.
path = "/Users/conorosully/Google Drive/My Drive/Medium/SHAP Interactions/Figures/{}"

## Dataset

In [None]:
# Load the tab-separated interaction dataset
data = pd.read_csv("../data/interaction_dataset.csv", sep="\t")

# Features are every column except the target; the target is `bonus`
X = data.drop(columns='bonus')
y = data['bonus']

# Row count, followed by a preview of the first rows
print(data.shape[0])
data.head()

## Modelling

In [3]:
#Train model
# A fixed random_state makes the forest, and every SHAP value computed from it
# further down, reproducible across kernel restarts.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

#Get predictions
# NOTE: these are in-sample predictions (the same X the model was fitted on),
# so any comparison of y and y_pred reflects fit quality, not generalisation.
y_pred = model.predict(X)

In [None]:
# Model evaluation: actual vs. predicted bonus on a single square axes
fig, ax = plt.subplots(figsize=(8, 8))

ax.scatter(y, y_pred)
# Red reference line from (0, 0) to (400, 400): points on it have predicted == actual
ax.plot([0, 400], [0, 400], color='r', linestyle='-', linewidth=2)

ax.set_ylabel('Predicted', size=20)
ax.set_xlabel('Actual', size=20)

## Standard SHAP values

In [7]:
# Build the explainer for the fitted model, handing it the first 10 rows of X
# as its masker data, then compute SHAP values for every row of X.
explainer = shap.Explainer(model, masker=X[0:10])
shap_values = explainer(X)

In [None]:
# Waterfall plot of the SHAP values for the first observation (index 0)
first_observation = shap_values[0]
shap.plots.waterfall(first_observation)

In [None]:
# Which feature does NOT have a significant relationship with bonus?
# Answer: days_late
# Bar plot built from the SHAP values of all observations.
# NOTE(review): presumably each bar is the mean absolute SHAP value of a feature — confirm against the SHAP docs
shap.plots.bar(shap_values)

In [None]:
# What tends to happen to an employee's bonus as they gain more experience?
# Answer: their bonus increases
# A dependence plot could also have been used to answer this question.
shap.plots.beeswarm(shap_values)

## SHAP interaction values

In [9]:
#Get SHAP interaction values
# Note: this rebinds `explainer`. Unlike the earlier explainer, which was built
# from (model, X[0:10]), no data is passed to the constructor here.
explainer = shap.Explainer(model)
# NOTE(review): presumably yields one (n_features x n_features) matrix per row of X — confirm the shape
shap_interaction = explainer.shap_interaction_values(X)

In [None]:
# Are there any potential interactions in the dataset? 
# Answer: yes - experience x degree and performance x sales

In [None]:
# Mean absolute SHAP interaction value for every pair of features
# (absolute values averaged over the first axis of shap_interaction)
mean_shap = np.abs(shap_interaction).mean(0)
df = pd.DataFrame(mean_shap,index=X.columns,columns=X.columns)

# Double every off-diagonal cell and leave the diagonal untouched.
# The cells are selected by POSITION with an identity mask rather than by
# comparing values against the diagonal, so an off-diagonal value that happens
# to equal a diagonal entry is still doubled. The result is assigned back
# instead of mutating in place.
# NOTE(review): the doubling presumably reflects SHAP splitting each pairwise
# interaction evenly between cells [i, j] and [j, i] — confirm against the SHAP docs.
off_diagonal = ~np.eye(len(df), dtype=bool)
df = df.mask(off_diagonal, df * 2)

# display
plt.figure(figsize=(10, 10), facecolor='w', edgecolor='k')
sns.set(font_scale=1.5)
sns.heatmap(df,cmap='coolwarm',annot=True,fmt='.3g',cbar=False)
plt.yticks(rotation=0)

In [None]:
# Experience-degree dependence plot, drawn from the SHAP interaction values
shap.dependence_plot(
    ind=("experience", "degree"),
    shap_values=shap_interaction,
    features=X,
    display_features=X,
)

In [None]:
# Performance-sales dependence plot, drawn from the SHAP interaction values
shap.dependence_plot(
    ind=("performance", "sales"),
    shap_values=shap_interaction,
    features=X,
    display_features=X,
)