### Importing Libraries and Data Frames

In [80]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


In [81]:
# Sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


# DiCE imports
import dice_ml
from dice_ml.utils import helpers  # helper functions

import pickle
from sklearn.metrics import accuracy_score,  recall_score, precision_score, f1_score, roc_auc_score, roc_curve, auc

In [82]:
# Load the dataframe from a csv file
df = pd.read_excel(r'C:\Users\dideu\OneDrive\Documents\DDB\thesis\Thesis_Project\Credit Card Clients\default of credit card clients.xls', header=1)


### Exploring the Data Frames

In [83]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          30000 non-null  int64
 1   LIMIT_BAL                   30000 non-null  int64
 2   SEX                         30000 non-null  int64
 3   EDUCATION                   30000 non-null  int64
 4   MARRIAGE                    30000 non-null  int64
 5   AGE                         30000 non-null  int64
 6   PAY_0                       30000 non-null  int64
 7   PAY_2                       30000 non-null  int64
 8   PAY_3                       30000 non-null  int64
 9   PAY_4                       30000 non-null  int64
 10  PAY_5                       30000 non-null  int64
 11  PAY_6                       30000 non-null  int64
 12  BILL_AMT1                   30000 non-null  int64
 13  BILL_AMT2                   30000 non-null  int64
 14  BILL_A

Print count and percentage of classes variable

In [85]:
# Calculate the counts of unique values in the 'class' column 
class_counts = df['default payment next month'].value_counts()

# Calculate the percentage of each unique value in the 'class' column by dividing 'class_counts' by its sum and then multiplying by 100.
class_percentages = class_counts / class_counts.sum() * 100

print('Class counts:\n' ,class_counts, '\n')
print('Percentage of each class: \n' ,class_percentages)
print('\nTotal number of rows: ', df.shape[0])

# Saving this for future use
a = df.shape[0]

Class counts:
 0    23364
1     6636
Name: default payment next month, dtype: int64 

Percentage of each class: 
 0    77.88
1    22.12
Name: default payment next month, dtype: float64

Total number of rows:  30000


#### Cleaning features dataframe

In [86]:
#Some values are like '?'. Replacing them as NaN 
df[df == '?'] = np.nan

In [87]:
# Dropping the rows with NaN values in  'workclass', 'occupation', 'native-country' for the test dataset
df.dropna( inplace=True)

In [88]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


### Creating X and y sets

In [89]:
# From X set, education column is also dropped as it is similar to Education-num
X = df.drop(['default payment next month'], axis=1)

y = df['default payment next month']

In [90]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ID         30000 non-null  int64
 1   LIMIT_BAL  30000 non-null  int64
 2   SEX        30000 non-null  int64
 3   EDUCATION  30000 non-null  int64
 4   MARRIAGE   30000 non-null  int64
 5   AGE        30000 non-null  int64
 6   PAY_0      30000 non-null  int64
 7   PAY_2      30000 non-null  int64
 8   PAY_3      30000 non-null  int64
 9   PAY_4      30000 non-null  int64
 10  PAY_5      30000 non-null  int64
 11  PAY_6      30000 non-null  int64
 12  BILL_AMT1  30000 non-null  int64
 13  BILL_AMT2  30000 non-null  int64
 14  BILL_AMT3  30000 non-null  int64
 15  BILL_AMT4  30000 non-null  int64
 16  BILL_AMT5  30000 non-null  int64
 17  BILL_AMT6  30000 non-null  int64
 18  PAY_AMT1   30000 non-null  int64
 19  PAY_AMT2   30000 non-null  int64
 20  PAY_AMT3   30000 non-null  int64
 21  PAY_AMT4   3

In [91]:
y.value_counts()

0    23364
1     6636
Name: default payment next month, dtype: int64

### Split data into separate fitting and test set

In [92]:
# Split the dataset into fitting data (80%) and test set (20%)
X_fit, X_test, y_fit, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


#### The percentage of each class in the target variable for each set

In [93]:
# Defined a function to calculate  and print the percentage of each class in the target variable
def calculate_class_percentage(y):
    class_percentage = {}
    total_samples = len(y)
    unique_classes = set(y)
    
    for cls in unique_classes:
        class_count = sum(y == cls)
        percentage = (class_count / total_samples) * 100
        class_percentage[cls] = percentage
    
    return class_percentage

In [94]:
# Calculate class percentages for each dataset
fit_class_percentage = calculate_class_percentage(y_fit)
test_class_percentage = calculate_class_percentage(y_test)

# Print class percentages for each dataset
print("Fit set class percentages:")
print(fit_class_percentage )
print("\nTest set class percentages:")
print(test_class_percentage)
print('\n Number of rows of X fit', X_fit.shape[0], '\n Number of rows of X test', X_test.shape[0],'\n Number of rows of Y fit', y_fit.shape[0],'\n Number of rows of y test', y_test.shape[0],)


Fit set class percentages:
{0: 77.92380952380952, 1: 22.076190476190476}

Test set class percentages:
{0: 77.77777777777779, 1: 22.22222222222222}

 Number of rows of X fit 21000 
 Number of rows of X test 9000 
 Number of rows of Y fit 21000 
 Number of rows of y test 9000


In [95]:
X_fit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21000 entries, 4936 to 29733
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ID         21000 non-null  int64
 1   LIMIT_BAL  21000 non-null  int64
 2   SEX        21000 non-null  int64
 3   EDUCATION  21000 non-null  int64
 4   MARRIAGE   21000 non-null  int64
 5   AGE        21000 non-null  int64
 6   PAY_0      21000 non-null  int64
 7   PAY_2      21000 non-null  int64
 8   PAY_3      21000 non-null  int64
 9   PAY_4      21000 non-null  int64
 10  PAY_5      21000 non-null  int64
 11  PAY_6      21000 non-null  int64
 12  BILL_AMT1  21000 non-null  int64
 13  BILL_AMT2  21000 non-null  int64
 14  BILL_AMT3  21000 non-null  int64
 15  BILL_AMT4  21000 non-null  int64
 16  BILL_AMT5  21000 non-null  int64
 17  BILL_AMT6  21000 non-null  int64
 18  PAY_AMT1   21000 non-null  int64
 19  PAY_AMT2   21000 non-null  int64
 20  PAY_AMT3   21000 non-null  int64
 21  PAY_AMT4 

In [96]:
# Defining the columns' categories
numerical = ['LIMIT_BAL' ,'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
categorical = ['SEX','EDUCATION',  'MARRIAGE','PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
continuous_features = numerical

In [97]:
#Defining Random Forest model
def run_RF (X_fit, y_fit, X_test, y_test, model_name):
        # Preprocessing for cataegorical data - OneHotEncoding
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Preprocessing for numerical data - StandardScaler
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])
    
# Bundle preprocessing for numerical and categorical data
    transformations = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical),  
            ('cat', categorical_transformer, categorical)],
        remainder='passthrough')
    
    clf_RF = Pipeline(steps=[('preprocessor', transformations), ('classifier', RandomForestClassifier())])
    model_RF = clf_RF.fit(X_fit, y_fit)
    y_pred_rf = model_RF.predict(X_test)
    y_prob_rf = model_RF.predict_proba(X_test)[:, 1]

    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    recall_rf = recall_score(y_test, y_pred_rf)
    precision_rf = precision_score(y_test, y_pred_rf)
    f1_rf = f1_score(y_test, y_pred_rf)
    roc_rf = roc_auc_score(y_test, y_pred_rf)
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)

    models_rf = [('RF {}'.format(model_name), accuracy_rf, recall_rf, precision_rf, f1_rf, roc_rf)]
    model_perf_metrics_rf = pd.DataFrame(models_rf, columns = ['Model', 'Accuracy (%)', 'Recall (%)', 'Precision (%)', 'F1 (%)', 'AUC(%)'])
        
    return model_RF, fpr_rf, tpr_rf, model_perf_metrics_rf   

In [98]:
#Defining Logistic Regression without scaling
def run_LR(X_fit, y_fit, X_test, y_test, model_name):
    # Preprocessing for categorical data - OneHotEncoding
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Preprocessing for numerical data - StandardScaler
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    # Bundle preprocessing for numerical and categorical data
    transformations = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical),  
            ('cat', categorical_transformer, categorical)], 
        remainder='passthrough')

# Logistic Regression
    clf_LR = Pipeline(steps=[('preprocessor', transformations), ('classifier', LogisticRegression(max_iter=1000))])
    model_LR = clf_LR.fit(X_fit, y_fit)
    y_pred_lr = model_LR.predict(X_test)
    y_prob_lr = model_LR.predict_proba(X_test)[:, 1]

    accuracy_lr = accuracy_score(y_test, y_pred_lr)
    recall_lr = recall_score(y_test, y_pred_lr)
    precision_lr = precision_score(y_test, y_pred_lr)
    f1_lr = f1_score(y_test, y_pred_lr)
    roc_lr = roc_auc_score(y_test, y_pred_lr)
    fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
    row_num = len(X_fit)

    models_lr = [('LR {}'.format(model_name), accuracy_lr, recall_lr, precision_lr, f1_lr, roc_lr, row_num)]
    model_perf_metric_lr = pd.DataFrame(models_lr, columns = ['Model', 'Accuracy (%)', 'Recall (%)', 'Precision (%)', 'F1 (%)', 'AUC(%)', 'X_fit Size'])

    return model_LR, model_perf_metric_lr

In [99]:
#Running Random Forest
model_RF, fpr_rf, tpr_rf, model_perf_metric_rf = run_RF(X_fit, y_fit, X_test, y_test, 'default')


In [100]:
#Running Logistic Regression
model_LR, model_perf_metric_lr = run_LR(X_fit, y_fit, X_test, y_test, 'default')

In [101]:
# Merging the two dataframes and printing the result
model_perf_metrics_merged_creditcard = model_perf_metric_rf.merge(model_perf_metric_lr, how='outer')
model_perf_metrics_merged_creditcard

Unnamed: 0,Model,Accuracy (%),Recall (%),Precision (%),F1 (%),AUC(%),X_fit Size
0,RF default,0.814444,0.3565,0.650547,0.460594,0.650893,
1,LR default,0.813222,0.323,0.663926,0.434578,0.638143,21000.0


In [102]:
def generate_counterfactuals(X_fit, y_fit, model,continuous_features, sample_size, total_CFs):
    # Create a Data object
    d = dice_ml.Data(dataframe=X_fit.assign(income=y_fit), continuous_features=continuous_features, outcome_name='income')

    # Create a Model object
    m = dice_ml.Model(model=model, backend="sklearn")

    # Generate counterfactuals
    exp = dice_ml.Dice(d, m, method="random")

    e1 = exp.generate_counterfactuals(X_fit[0:sample_size], total_CFs=total_CFs, desired_class="opposite")
    #Commented out the below line as it is not needed to see the changes
    #e1.visualize_as_dataframe(show_only_changes=True)

    cf_df = pd.DataFrame()
    
    for i in range(0, sample_size):
        xd = e1.cf_examples_list[i].final_cfs_df
        cf_df = pd.concat([cf_df, xd])

    else:
        cf_df.reset_index(drop=True, inplace=True)
        new_start_index = 40000*i + len(cf_df)
        cf_df.index += new_start_index 
        cf_df.to_csv('cf_df_creditcard_{}_{}.csv'.format(sample_size, total_CFs))
        X_fit_cf = cf_df.drop(['income'], axis=1)
        y_fit_cf = cf_df['income']

    return e1, X_fit_cf, y_fit_cf

In [104]:
# Experiment 1- low sample size and medium-high total CFs
for j in [10,15]:
    for i in [10, 50]:
        e1, X_fit_cf, y_fit_cf = generate_counterfactuals(X_fit, y_fit, model_RF,continuous_features, j, i)
        new_X_fit = pd.concat([X_fit, X_fit_cf])
        new_y_fit = pd.concat([y_fit, y_fit_cf])
        model_LR_cf, model_perf_metric_lr_cf = run_LR(new_X_fit, new_y_fit, X_test, y_test, '_{}_{}'.format(j, i))
        model_perf_metrics_merged_creditcard = model_perf_metrics_merged_creditcard.append(model_perf_metric_lr_cf, ignore_index=True)

100%|██████████| 10/10 [00:09<00:00,  1.09it/s]
  model_perf_metrics_merged_creditcard = model_perf_metrics_merged_creditcard.append(model_perf_metric_lr_cf, ignore_index=True)
  0%|          | 0/10 [00:30<?, ?it/s]


KeyboardInterrupt: 

In [None]:
model_perf_metrics_merged_creditcard

In [103]:
# Experiment 1- low sample size and medium-high total CFs
for j in [10, 50]:
    for i in [10, 50, 100, 200]:
        e1, X_fit_cf, y_fit_cf = generate_counterfactuals(X_fit, y_fit, model_RF,continuous_features, j, i)
        new_X_fit = pd.concat([X_fit, X_fit_cf])
        new_y_fit = pd.concat([y_fit, y_fit_cf])
        model_LR_cf, model_perf_metric_lr_cf = run_LR(new_X_fit, new_y_fit, X_test, y_test, '_{}_{}'.format(j, i))
        model_perf_metrics_merged_creditcard = model_perf_metrics_merged_creditcard.append(model_perf_metric_lr_cf, ignore_index=True)

 40%|████      | 4/10 [02:56<04:25, 44.19s/it]


KeyboardInterrupt: 

In [None]:
model_perf_metrics_merged_creditcard

In [None]:
# Experiment 2- medium sample size and low-medium total CFs
for j in [100, 500]:
    for i in [20, 30]:
        e2, X_fit_cf, y_fit_cf = generate_counterfactuals(X_fit, y_fit, model_RF,continuous_features, j, i)
        new_X_fit = pd.concat([X_fit, X_fit_cf])
        new_y_fit = pd.concat([y_fit, y_fit_cf])
        model_LR_cf, model_perf_metric_lr_cf = run_LR(new_X_fit, new_y_fit, X_test, y_test, '_{}_{}'.format(j, i))
        model_perf_metrics_merged_creditcard = model_perf_metrics_merged_creditcard.append(model_perf_metric_lr_cf, ignore_index=True)

In [None]:
model_perf_metrics_merged_creditcard