In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
import tensorflow as tf
import seaborn as sns
import dalex as dx
import random

from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.feature_selection import RFECV

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical 

from statsmodels.tools.tools import add_constant

from matplotlib import style

# Notebook-wide plot styling: dark canvas, and a colour cycle with a single
# entry so that every plotted series is drawn in red.
plt.style.use('dark_background')
plt.rc('axes', prop_cycle=plt.cycler('color', ['red']))

In [3]:
# Load the raw data and drop the two pre-computed Naive Bayes score columns.
df = pd.read_csv('./BankChurners.csv')
df = df.drop(columns=['Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'])

# Preparing data
#
# Label -> integer code for every column that is encoded in place.
# The codes are assigned back to the frame explicitly (df[col] = ...) rather
# than through a chained `df[col].mask(..., inplace=True)`: the chained form
# only works when df[col] happens to be a view into df, and changes nothing
# under pandas Copy-on-Write.
ORDINAL_ENCODINGS = {
    'Attrition_Flag': {'Existing Customer': 1, 'Attrited Customer': 0},
    'Gender': {'M': 1, 'F': 0},
    # Education sorted from the lowest level to the highest, 'Unknown' in the middle.
    'Education_Level': {
        'Uneducated': -3,
        'High School': -2,
        'College': -1,
        'Unknown': 0,
        'Graduate': 1,
        'Post-Graduate': 2,
        'Doctorate': 3,
    },
    # Card category values sorted from the worst to the best.
    'Card_Category': {'Blue': 0, 'Silver': 1, 'Gold': 2, 'Platinum': 3},
}

for column, mapping in ORDINAL_ENCODINGS.items():
    # Fail here, with a readable message, instead of leaving stray strings
    # (or NaN) in a column that is later fed to a model.
    unexpected = set(df[column].unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            f"Unmapped values in column {column!r}: {sorted(map(str, unexpected))}"
        )
    df[column] = df[column].map(mapping).astype(int)

# Marital status (decided to add binary variables)
df['Marital_Status_Married'] = (df['Marital_Status'] == 'Married').astype(int)
df['Marital_Status_Single'] = (df['Marital_Status'] == 'Single').astype(int)
df['Marital_Status_Divorced'] = (df['Marital_Status'] == 'Divorced').astype(int)

# Binary variables for income level
df['Income_Category_less40'] = (df['Income_Category'] == 'Less than $40K').astype(int)
df['Income_Category_40-60'] = (df['Income_Category'] == '$40K - $60K').astype(int)
df['Income_Category_60-80'] = (df['Income_Category'] == '$60K - $80K').astype(int)
df['Income_Category_80-120'] = (df['Income_Category'] == '$80K - $120K').astype(int)
df['Income_Category_120+'] = (df['Income_Category'] == '$120K +').astype(int)

# Rows with an unknown marital status or income category are discarded.
# The original row labels are kept (no reset_index); .copy() makes the result
# an independent frame rather than a slice of the unfiltered data.
known_rows = (df['Marital_Status'] != 'Unknown') & (df['Income_Category'] != 'Unknown')
df = df[known_rows].copy()

In [4]:
# Columns that are not model inputs: the client identifier, the two raw
# categorical columns, and the target.
non_feature_columns = ['CLIENTNUM', 'Marital_Status', 'Income_Category', 'Attrition_Flag']

X = df.drop(columns=non_feature_columns)
y = df['Attrition_Flag'].astype('int')

# Splitting into train and test sets.
# The split is performed on the row labels (stratified on the target, fixed
# seed); the resulting label sets are then used to select from X and y.
idx_train, idx_test = train_test_split(X.index, stratify=y, random_state=999)

X_train = X.loc[idx_train]
X_test = X.loc[idx_test]
y_train = y.loc[idx_train]
y_test = y.loc[idx_test]

In [35]:
# Seed shared by the forest and by the sampled observation, so that the fitted
# model and the row being explained are the same on every Restart & Run All.
RANDOM_STATE = 999

# Shallow random forest fitted on the training split.
rfc = RandomForestClassifier(
    max_depth=3,
    min_samples_split=5,
    min_samples_leaf=15,
    max_leaf_nodes=11,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

# dalex explainer wrapping the fitted model together with the training data.
rfc_exp = dx.Explainer(rfc, X_train, y_train, label='RandomForest Pipeline')

# One training row (kept as a one-row DataFrame) to explain below.
# No intercept column is added: the forest was fitted on X_train's columns
# only, so the observation must carry exactly those columns.
single_datapoint = X_train.sample(n=1, random_state=RANDOM_STATE)

Preparation of a new explainer is initiated

  -> data              : 6261 rows 25 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 6261 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : RandomForest Pipeline
  -> predict function  : <function yhat_proba_default at 0x0000022A19C8D000> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.242, mean = 0.841, max = 0.971
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.931, mean = 0.000238, max = 0.641
  -> model_info        : package sklearn

A new explainer has been created!



X does not have valid feature names, but RandomForestClassifier was fitted with feature names



In [33]:
# SHAP attribution for the sampled observation.
# The estimate is built from randomly drawn variable orderings (see the B
# column of the result), so the seed is fixed to make the numbers repeatable.
shap_single_datapoint = rfc_exp.predict_parts(single_datapoint, type='shap', random_state=999)


Unnamed: 0,variable,contribution,variable_name,variable_value,sign,label,B
0,Total_Ct_Chng_Q4_Q1 = 0.706,0.016037,Total_Ct_Chng_Q4_Q1,0.706,1.0,RandomForest Pipeline,1
1,Credit_Limit = 4060.0,-0.011343,Credit_Limit,4060.0,-1.0,RandomForest Pipeline,1
2,Months_on_book = 34.0,0.000676,Months_on_book,34,1.0,RandomForest Pipeline,1
3,Marital_Status_Married = 1.0,0.001552,Marital_Status_Married,1,1.0,RandomForest Pipeline,1
4,Contacts_Count_12_mon = 2.0,0.005426,Contacts_Count_12_mon,2,1.0,RandomForest Pipeline,1
...,...,...,...,...,...,...,...
20,Income_Category_40-60 = 0.0,0.000329,Income_Category_40-60,0,1.0,RandomForest Pipeline,0
21,Income_Category_60-80 = 0.0,0.000320,Income_Category_60-80,0,1.0,RandomForest Pipeline,0
22,Income_Category_80-120 = 0.0,0.000203,Income_Category_80-120,0,1.0,RandomForest Pipeline,0
23,Customer_Age = 50.0,-0.000128,Customer_Age,50,-1.0,RandomForest Pipeline,0


In [36]:
# Plot the SHAP attribution computed for the sampled observation.
shap_single_datapoint.plot()

In [38]:
# Variable order passed to the break-down attribution.
breakdown_order = np.array([
    'Total_Trans_Ct',
    'Total_Relationship_Count',
    'Total_Revolving_Bal',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Credit_Limit',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
])

# Break-down attribution for the same sampled observation, then its plot
# limited to ten variables.
bd_single_datapoint = rfc_exp.predict_parts(
    single_datapoint,
    type='break_down',
    order=breakdown_order,
)
bd_single_datapoint.plot(max_vars=10)