In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt


In [2]:
# Load the raw CSV, then discard its first column and its last two columns
raw = pd.read_csv('data/BankChurners.csv')
df = raw.drop(columns=raw.columns[0])
df = df.drop(columns=df.columns[-2:])

# Recode the target labels as integers: existing customer -> 0, attrited customer -> 1
for label, code in (("Existing Customer", 0), ("Attrited Customer", 1)):
    df.loc[df['Attrition_Flag'] == label, "Attrition_Flag"] = code

# The column still holds Python objects after the recode; cast it to an integer dtype
df["Attrition_Flag"] = df["Attrition_Flag"].astype(int)

In [3]:


# Seed NumPy's global RNG. train_test_split below is called without a
# random_state, so the split relies on this global seed for repeatability.
np.random.seed(42)

# Columns stored with object dtype are treated as categorical
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'O']

# One-hot encode them; the indicator columns are placed ahead of the
# remaining (non-categorical) columns
encoded = pd.get_dummies(df[categorical], prefix=categorical)
df_enc = pd.concat([encoded, df.drop(columns=categorical)], axis=1)

# Separate the target from the predictors
y = df_enc["Attrition_Flag"]
X = df_enc.drop(columns=["Attrition_Flag"])

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
def train_and_store_model(model, X_train, y_train, X_test):
    """Fit ``model`` on the training data and predict on the test features.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        The estimator to train; it is fitted in place.
    X_train, y_train
        Training features and targets passed to ``model.fit``.
    X_test
        Features passed to ``model.predict`` after fitting.

    Returns
    -------
    dict
        ``{'model': <the same model object>, 'predictions': <predict output>}``.
    """
    model.fit(X_train, y_train)
    return {
        'model': model,
        'predictions': model.predict(X_test),
    }

# Estimators to compare, all with their default hyper-parameters
models = {
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
}

# Fit every estimator (in the order listed above) and keep the fitted model
# together with its predictions on the held-out set
models_dict = {
    name: train_and_store_model(estimator, X_train, y_train, X_test)
    for name, estimator in models.items()
}

# Print classification reports
for name, fitted in models_dict.items():
    print(f"Classification Report for {name}:\n")
    print(classification_report(y_test, fitted['predictions']))

Classification Report for RandomForest:

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1699
           1       0.93      0.75      0.83       327

    accuracy                           0.95      2026
   macro avg       0.94      0.87      0.90      2026
weighted avg       0.95      0.95      0.95      2026

Classification Report for DecisionTree:

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1699
           1       0.80      0.78      0.79       327

    accuracy                           0.93      2026
   macro avg       0.88      0.87      0.88      2026
weighted avg       0.93      0.93      0.93      2026

Classification Report for GradientBoosting:

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1699
           1       0.93      0.85      0.89       327

    accuracy                           0.97      2026
 

In [5]:
# Maps model name -> 2-D array of SHAP values for the positive class,
# one row per X_test sample and one column per feature.
shap_values_dict = {}

# Index of the class whose SHAP values are kept (1 = attrited customer).
positive_class = 1

# Calculate SHAP values
for model_name, model_info in models_dict.items():
    model = model_info['model']
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Classifiers with one output per class come back either as a list of
    # per-class 2-D arrays (older shap releases) or as a single 3-D array
    # with the class on the last axis (newer releases). Reduce both to the
    # positive-class slice so every model explains the same outcome and the
    # downstream code always receives a 2-D array. Models that already return
    # a single 2-D array are left untouched.
    if isinstance(shap_values, list):
        shap_values = shap_values[positive_class]
    else:
        shap_values = np.asarray(shap_values)
        if shap_values.ndim == 3:
            shap_values = shap_values[:, :, positive_class]

    shap_values_dict[model_name] = shap_values

In [6]:
# Per-model feature importance: the mean absolute SHAP value of each feature,
# rescaled so that every model's column sums to 1
importance_by_model = {}
for model_name, shap_values in shap_values_dict.items():
    mean_abs = pd.Series(np.abs(shap_values).mean(axis=0))
    importance_by_model[model_name] = mean_abs / mean_abs.sum()

shap_mean_df = pd.DataFrame(importance_by_model)
shap_mean_df.index = X_test.columns

# Order rows by the model columns in turn, largest first: the first column is
# the primary sort key and later columns only break ties
shap_mean_df = shap_mean_df.sort_values(by=shap_mean_df.columns.tolist(), ascending=False)
shap_mean_df.head(10)

Unnamed: 0,RandomForest,DecisionTree,GradientBoosting
Total_Trans_Ct,0.25238,0.355082,0.343311
Total_Revolving_Bal,0.145293,0.141205,0.128633
Total_Trans_Amt,0.127243,0.173111,0.211888
Total_Ct_Chng_Q4_Q1,0.095349,0.065044,0.069666
Total_Relationship_Count,0.08724,0.079089,0.057421
Avg_Utilization_Ratio,0.046004,0.024136,0.001858
Months_Inactive_12_mon,0.039551,0.010668,0.047179
Total_Amt_Chng_Q4_Q1,0.037168,0.033245,0.054486
Contacts_Count_12_mon,0.028271,0.011519,0.030788
Credit_Limit,0.018979,0.01678,0.00559


In [7]:
# Grouped bar chart of the normalised importances (one bar per model for each
# feature), with features ordered by the RandomForest column
ranked_by_rf = shap_mean_df.sort_values(by='RandomForest', ascending=False)
fig = px.bar(ranked_by_rf, title="Top SHAP features", barmode='group')

fig.show()

In [None]:
# Plot the feature dependences
# The loop variable must be `model_name`: the title below reads that name, and
# a differently named loop variable would leave it pointing at whatever value
# an earlier cell's loop last assigned.
for model_name, shap_values in shap_values_dict.items():
    # Create dependence plots for each feature
    for feature_name in X_test.columns:
        # show=False defers rendering so the title can be set before display
        shap.dependence_plot(feature_name, shap_values, X_test, feature_names=X_test.columns,
                             interaction_index='auto', show=False)
        plt.title(f'SHAP Dependence Plot - {model_name} - {feature_name}')
        plt.show()
