In [18]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import dice_ml
from dice_ml import Dice
from dice_ml.utils import helpers  # helper functions

In [19]:
# read in data for the bank churn dataset from data folder
bank_churn_data = pd.read_csv('data/BankChurners.csv')
bank_churn_data.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [20]:
# first column and the last 2 columns are not useful for our task, so we drop them
bank_churn_data.drop(bank_churn_data.columns[0], axis=1, inplace=True)
bank_churn_data.drop(bank_churn_data.columns[-2:], axis=1, inplace=True)

In [21]:
bank_churn_data.columns

Index(['Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count',
       'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
       'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],
      dtype='object')

In [22]:
# change the target variable "Attrition Flag" to 0 and 1
bank_churn_data['Attrition_Flag'] = bank_churn_data['Attrition_Flag'].map(
    {
        'Existing Customer': 0,
        'Attrited Customer': 1
        }
        )

In [23]:
# for our models we don't need these columns: Gender, Education_Level and Marital_Status, so they are dropped
bank_churn_data.drop(['Gender', 'Education_Level', 'Marital_Status'],  axis=1, inplace=True)

bank_churn_data.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Dependent_count,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0,45,3,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,0,49,5,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,0,51,3,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,0,40,4,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,0,40,3,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [24]:
numerical = []
categorical = []
for col in bank_churn_data.columns:
    if bank_churn_data[col].dtypes == 'O':
        categorical.append(col)
    elif col == 'Attrition_Flag':
        continue
    else:
        numerical.append(col)


display(numerical)
display(categorical)

['Customer_Age',
 'Dependent_count',
 'Months_on_book',
 'Total_Relationship_Count',
 'Months_Inactive_12_mon',
 'Contacts_Count_12_mon',
 'Credit_Limit',
 'Total_Revolving_Bal',
 'Avg_Open_To_Buy',
 'Total_Amt_Chng_Q4_Q1',
 'Total_Trans_Amt',
 'Total_Trans_Ct',
 'Total_Ct_Chng_Q4_Q1',
 'Avg_Utilization_Ratio']

['Income_Category', 'Card_Category']

In [25]:
encoded = pd.get_dummies(bank_churn_data[categorical], prefix=categorical)
df_enc = pd.concat([encoded, bank_churn_data], axis=1)
df_enc.drop(categorical, axis=1, inplace=True)
X = df_enc.drop(["Attrition_Flag"], axis=1)
y = df_enc["Attrition_Flag"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [26]:
# train a random forest classifier as one black box model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# get the predictions on test data and show the classification report and plot confusion matrix
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print(cm)

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1693
           1       0.94      0.79      0.86       333

    accuracy                           0.96      2026
   macro avg       0.95      0.89      0.92      2026
weighted avg       0.96      0.96      0.96      2026

[[1675   18]
 [  70  263]]


In [27]:
# train a gradient boosting classifier as another black box model
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

# get the predictions on test data and show the classification report and plot confusion matrix
y_pred = gb.predict(X_test)
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print(cm)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1693
           1       0.93      0.83      0.87       333

    accuracy                           0.96      2026
   macro avg       0.95      0.91      0.92      2026
weighted avg       0.96      0.96      0.96      2026

[[1671   22]
 [  58  275]]


In [28]:
# now we create diverse Counterfactual explanations
# first we create a Dice data object
data_dice = dice_ml.Data(
    dataframe=df_enc, continuous_features=numerical,
    outcome_name='Attrition_Flag'
    )

# then we create a Dice model object for the random forest model
rf_dice = dice_ml.Model(model=rf, backend='sklearn')
explainer = dice_ml.Dice(data_dice, rf_dice, method='random')

In [36]:
# show true labels for first 5 test data points
print(y_test.head())
X_test.head()

2564    0
2008    0
3890    0
410     0
4926    1
Name: Attrition_Flag, dtype: int64


Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
2564,1,0,0,0,0,0,1,0,0,0,...,3,2,27499.0,1188,26311.0,0.544,2202,46,0.394,0.043
2008,0,1,0,0,0,0,1,0,0,0,...,1,3,1614.0,925,689.0,0.598,2659,63,0.75,0.573
3890,0,1,0,0,0,0,1,0,0,0,...,3,1,3812.0,1103,2709.0,0.757,4080,65,0.625,0.289
410,0,1,0,0,0,0,1,0,0,0,...,3,2,3106.0,1675,1431.0,0.781,1512,38,0.407,0.539
4926,0,0,0,1,0,0,1,0,0,0,...,2,4,4428.0,0,4428.0,0.865,2331,36,0.286,0.0


In [37]:
# generate counterfactual explanations for the first instance in the test set
dice_exp = explainer.generate_counterfactuals(
    X_test[0:1], total_CFs=3, desired_class="opposite"
    )

# visualize the counterfactual explanations
dice_exp.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  3.44it/s]

Query instance (original outcome : 0)





Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,1,0,0,0,0,0,1,0,0,0,...,2,27499.0,1188,26311.0,0.544,2202,46,0.394,0.043,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,-,1.0,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,1.0
1,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,0.35,-,-,-,-,1.0
2,0.0,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,1.0


As the origninal prediction was 0, which means it was an existing customers. It can be used for risk detection to avoid losing existing customers

In [38]:
# generate counterfactual explanations for the fifth instance in the test set as it is an attrited customer
dice_exp = explainer.generate_counterfactuals(
    X_test[4:5], total_CFs=3, desired_class="opposite"
    )

# visualize the counterfactual explanations
dice_exp.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  3.19it/s]

Query instance (original outcome : 1)





Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,0,0,0,1,0,0,1,0,0,0,...,4,4428.0,0,4428.0,0.865,2331,36,0.286,0.0,1



Diverse Counterfactual set (new outcome: 0)


Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,98.0,-,-,0.0
1,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,122.0,1.722,-,0.0
2,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,129.0,-,-,0.0


Increasing total transactions count increases the probability of keeping a customer. Using this information, such customers can be offered various rewards for using the card

In [49]:
y_test

2564    0
2008    0
3890    0
410     0
4926    1
       ..
9046    0
8733    0
8404    0
4584    1
1403    0
Name: Attrition_Flag, Length: 2026, dtype: int64

In [50]:
# now we create diverse Counterfactual explanations for another example where the customer is an attrited customer
# reset indes of the test set
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
# get the index of the second attrited customer in the test set
attrited_index = y_test[y_test == 1].index[1]
attrited_index

9

In [51]:
display(X_test.loc[attrited_index].to_frame().T)
y_test.loc[attrited_index]

Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,2.0,15809.0,0.0,15809.0,0.992,3109.0,48.0,0.412,0.0


1

In [53]:
X_test[attrited_index:attrited_index+1]

Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
9,0,1,0,0,0,0,0,0,0,1,...,2,2,15809.0,0,15809.0,0.992,3109,48,0.412,0.0


In [54]:
# generate counterfactual explanations for the second attrited customer in the test set
dice_exp = explainer.generate_counterfactuals(
    X_test[attrited_index:attrited_index+1], total_CFs=3, desired_class="opposite"
    )

# visualize the counterfactual explanations
dice_exp.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  3.28it/s]

Query instance (original outcome : 1)





Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,0,1,0,0,0,0,0,0,0,1,...,2,15809.0,0,15809.0,0.992,3109,48,0.412,0.0,1



Diverse Counterfactual set (new outcome: 0)


Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,82.0,-,-,0.0
1,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,1.62,-,0.0
2,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,118.0,-,-,0.0


In [55]:
# now for the gradient boosting model
gb_dice = dice_ml.Model(model=gb, backend='sklearn')
explainer = dice_ml.Dice(data_dice, gb_dice, method='random')

# generate counterfactual explanations for the first instance in the test set
dice_exp = explainer.generate_counterfactuals(
    X_test[0:1], total_CFs=3, desired_class="opposite"
    )

# visualize the counterfactual explanations
dice_exp.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  4.02it/s]

Query instance (original outcome : 1)





Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,1,0,0,0,0,0,1,0,0,0,...,2,27499.0,1188,26311.0,0.544,2202,46,0.394,0.043,1



Diverse Counterfactual set (new outcome: 0)


Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,3.035,-,-,-,-,0.0
1,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,0.0
2,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,0.0


In [56]:
# generate counterfactual explanations for the fifth instance in the test set as it is an attrited customer
dice_exp = explainer.generate_counterfactuals(
    X_test[4:5], total_CFs=3, desired_class="opposite"
    )

# visualize the counterfactual explanations
dice_exp.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  4.34it/s]

Query instance (original outcome : 1)





Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,0,0,0,1,0,0,1,0,0,0,...,4,4428.0,0,4428.0,0.865,2331,36,0.286,0.0,1



Diverse Counterfactual set (new outcome: 0)


Unnamed: 0,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,-,-,-,-,-,1.0,-,-,-,-,...,-,-,-,-,-,-,103.0,-,-,0.0
1,-,-,-,-,-,-,-,-,-,-,...,-,-,-,12128.9,-,-,77.0,-,-,0.0
2,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,99.0,-,0.5,0.0


In [57]:
# it is possible to give the features that we want to change in the counterfactual explanations (features_to_vary)
# and it is possible to give the range of values that are permitted