Credit_Card_Approval

In [96]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [97]:
# Sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# DiCE imports
import dice_ml
from dice_ml.utils import helpers  # helper functions
from sklearn.metrics import accuracy_score,  recall_score, precision_score, f1_score, roc_auc_score, roc_curve, auc

In [98]:
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv')

In [99]:
X.head()

Unnamed: 0,A15,A14,A13,A12,A11,A10,A9,A8,A7,A6,A5,A4,A3,A2,A1
0,0,202.0,g,f,1,t,t,1.25,v,w,g,u,0.0,30.83,b
1,560,43.0,g,f,6,t,t,3.04,h,q,g,u,4.46,58.67,a
2,824,280.0,g,f,0,f,t,1.5,h,q,g,u,0.5,24.5,a
3,3,100.0,g,t,5,t,t,3.75,v,w,g,u,1.54,27.83,b
4,0,120.0,s,f,0,f,t,1.71,v,w,g,u,5.625,20.17,b


In [100]:
y.head()

Unnamed: 0,A16
0,+
1,+
2,+
3,+
4,+


In [101]:
y.value_counts()

A16
-      383
+      307
dtype: int64

In [102]:
# Step 1: Remove missing values from X
X.dropna(inplace=True)

# Step 2: Ensure X and y have the same index
y = y.loc[X.index]

In [103]:
y['A16'] = y['A16'].str.replace('+', '1')
y['A16'] = y['A16'].str.replace('-', '0')

y['A16'] = y['A16'].astype(int)

  y['A16'] = y['A16'].str.replace('+', '1')


In [104]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 653 entries, 0 to 689
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A15     653 non-null    int64  
 1   A14     653 non-null    float64
 2   A13     653 non-null    object 
 3   A12     653 non-null    object 
 4   A11     653 non-null    int64  
 5   A10     653 non-null    object 
 6   A9      653 non-null    object 
 7   A8      653 non-null    float64
 8   A7      653 non-null    object 
 9   A6      653 non-null    object 
 10  A5      653 non-null    object 
 11  A4      653 non-null    object 
 12  A3      653 non-null    float64
 13  A2      653 non-null    float64
 14  A1      653 non-null    object 
dtypes: float64(4), int64(2), object(9)
memory usage: 81.6+ KB


In [105]:
y.head()

Unnamed: 0,A16
0,1
1,1
2,1
3,1
4,1


In [106]:
# Split the dataset into fitting data (60%) and test set (40%)
X_fit, X_test, y_fit, y_test = train_test_split(X, y, test_size=0.3, random_state=1155)

In [107]:
X_fit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 628 to 101
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A15     457 non-null    int64  
 1   A14     457 non-null    float64
 2   A13     457 non-null    object 
 3   A12     457 non-null    object 
 4   A11     457 non-null    int64  
 5   A10     457 non-null    object 
 6   A9      457 non-null    object 
 7   A8      457 non-null    float64
 8   A7      457 non-null    object 
 9   A6      457 non-null    object 
 10  A5      457 non-null    object 
 11  A4      457 non-null    object 
 12  A3      457 non-null    float64
 13  A2      457 non-null    float64
 14  A1      457 non-null    object 
dtypes: float64(4), int64(2), object(9)
memory usage: 57.1+ KB


In [108]:
# Defining the columns' categories
numerical = ['A15' ,'A14', 'A11', 'A8', 'A3', 'A2']
categorical = ['A13','A12',  'A10', 'A9', 'A7', 'A6', 'A5', 'A4', 'A1']

In [109]:
#Defining Random Forest model
def run_RF (X_fit, y_fit, X_test, y_test, model_name):
        # Preprocessing for cataegorical data - OneHotEncoding
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical data
    transformations = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical)],
        remainder='passthrough')
    
    clf_RF = Pipeline(steps=[('preprocessor', transformations), ('classifier', RandomForestClassifier())])
    model_RF = clf_RF.fit(X_fit, y_fit)
    y_pred_rf = model_RF.predict(X_test)
    y_prob_rf = model_RF.predict_proba(X_test)[:, 1]

    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    recall_rf = recall_score(y_test, y_pred_rf)
    precision_rf = precision_score(y_test, y_pred_rf)
    f1_rf = f1_score(y_test, y_pred_rf)
    roc_rf = roc_auc_score(y_test, y_pred_rf)
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)

    models_rf = [('RF {}'.format(model_name), accuracy_rf, recall_rf, precision_rf, f1_rf, roc_rf)]
    model_perf_metrics_rf = pd.DataFrame(models_rf, columns = ['Model', 'Accuracy (%)', 'Recall (%)', 'Precision (%)', 'F1 (%)', 'AUC(%)'])
        
    return model_RF, model_perf_metrics_rf

In [110]:
#Defining Logistic Regression without scaling
def run_LR(X_fit, y_fit, X_test, y_test, model_name):
    # Preprocessing for categorical data - OneHotEncoding
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Preprocessing for numerical data - StandardScaler
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    # Bundle preprocessing for numerical and categorical data
    transformations = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical),  
            ('cat', categorical_transformer, categorical)], 
        remainder='passthrough')

# Logistic Regression
    clf_LR = Pipeline(steps=[('preprocessor', transformations), ('classifier', LogisticRegression(max_iter=1000))])
    model_LR = clf_LR.fit(X_fit, y_fit)
    y_pred_lr = model_LR.predict(X_test)

    accuracy_lr = accuracy_score(y_test, y_pred_lr)
    recall_lr = recall_score(y_test, y_pred_lr)
    precision_lr = precision_score(y_test, y_pred_lr)
    f1_lr = f1_score(y_test, y_pred_lr)
    roc_lr = roc_auc_score(y_test, y_pred_lr)
    row_num = len(X_fit)
    row_num_test = len(X_test)

    models_lr = [('LR {}'.format(model_name), accuracy_lr, recall_lr, precision_lr, f1_lr, roc_lr, row_num, row_num_test)]
    model_perf_metric_lr = pd.DataFrame(models_lr, columns = ['Model', 'Accuracy (%)', 'Recall (%)', 'Precision (%)', 'F1 (%)', 'AUC(%)', 'X_fit Size', 'X_test Size'])

    return  model_perf_metric_lr, roc_lr, f1_lr

In [111]:
model_RF, model_perf_metric_rf = run_RF(X_fit, y_fit, X_test, y_test, 'default')
model_perf_metric_lr, auc, f1 = run_LR(X_fit, y_fit, X_test, y_test, 'default')

  return fit_method(estimator, *args, **kwargs)


  y = column_or_1d(y, warn=True)


In [112]:
model_perf_metrics_merged = model_perf_metric_rf.merge(model_perf_metric_lr, how='outer')


In [113]:
model_perf_metrics_merged

Unnamed: 0,Model,Accuracy (%),Recall (%),Precision (%),F1 (%),AUC(%),X_fit Size,X_test Size
0,RF default,0.892857,0.891566,0.860465,0.87574,0.892686,,
1,LR default,0.887755,0.891566,0.850575,0.870588,0.888261,457.0,196.0


In [114]:
def generate_counterfactuals(X_fit, y_fit, model,continuous_features, sample_size, total_CFs, fea_to_vary):
    # Create a Data object
    d = dice_ml.Data(dataframe=X_fit.assign(A16=y_fit), continuous_features=continuous_features, outcome_name='A16')

    # Create a Model object
    m = dice_ml.Model(model=model, backend="sklearn")

    # Generate counterfactuals
    exp = dice_ml.Dice(d, m, method="random")

    e1 = exp.generate_counterfactuals(X_fit[0:sample_size], total_CFs=total_CFs, desired_class="opposite", features_to_vary= fea_to_vary)
    #Commented out the below line as it is not needed to see the changes
    #e1.visualize_as_dataframe(show_only_changes=True)

    cf_df = pd.DataFrame()
    
    for i in range(0, sample_size):
        xd = e1.cf_examples_list[i].final_cfs_df
        cf_df = pd.concat([cf_df, xd])

    else:
        cf_df.reset_index(drop=True, inplace=True)
        new_start_index = 40000*i + len(cf_df)
        cf_df.index += new_start_index 
        cf_df.to_csv('cf_df_{}_{}.csv'.format(sample_size, total_CFs))
        X_fit_cf = cf_df.drop(['A16'], axis=1)
        y_fit_cf = cf_df['A16']

    return X_fit_cf, y_fit_cf

In [115]:
# Experiment running function
def running_exp(X_fit, y_fit, continuous_features, sample_size_list, total_CFs_list, iteration_num, fea_to_vary, num):
    #Running default Random Forest
    model_RF, model_perf_metric_rf = run_RF(X_fit, y_fit, X_test, y_test, 'default')
    #Running default Logistic Regression
    model_perf_metric_lr, auc, f1 = run_LR(X_fit, y_fit, X_test, y_test, 'default')

    # Merging the two dataframes and printing the result
    model_perf_metrics_merged = model_perf_metric_rf.merge(model_perf_metric_lr, how='outer')
    model_perf_metrics_merged_only_for_cfs = model_perf_metric_rf.merge(model_perf_metric_lr, how='outer')

    for j in sample_size_list:
        for i in total_CFs_list:
            total_auc = 0
            total_f1 = 0
            total_auc_cf = 0
            total_f1_cf = 0
            
            for k in range(0,iteration_num):    
                X_fit_cf, y_fit_cf = generate_counterfactuals(X_fit, y_fit, model_RF,continuous_features, j, i, fea_to_vary)
                new_X_fit = pd.concat([X_fit, X_fit_cf])
                new_y_fit = pd.concat([y_fit, y_fit_cf])
            
                model_perf_metric_lr_only_cf, auc_only_cf, f1_only_cf = run_LR(X_fit_cf, y_fit_cf, X_test, y_test, '_{}_sample:{}_cf:{}'.format(k, j, i))
                model_perf_metric_lr_cf, auc_lr_cf, f1_lr_cf = run_LR(new_X_fit, new_y_fit, X_test, y_test, '_{}_sample:{}_cf:{}'.format(k, j, i))
                
                total_auc_cf += auc_only_cf
                total_auc += auc_lr_cf
                total_f1 += f1_lr_cf
                total_f1_cf += f1_only_cf
                model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only_for_cfs.append(model_perf_metric_lr_only_cf, ignore_index=True)
                model_perf_metrics_merged = model_perf_metrics_merged.append(model_perf_metric_lr_cf, ignore_index=True)

            else:
                total_auc_cf /= iteration_num
                total_f1_cf /= iteration_num
                total_auc /= iteration_num
                total_f1 /= iteration_num

                new_row_cf= {'Model': 'LR_avg_of_only_cfs_of{}_sample:{}_cf:{}'.format(k,j, i) , 'Accuracy (%)': '', 'Recall (%)': '', 'Precision (%)': '', 'F1 (%)':total_f1_cf, 'AUC(%)': total_auc_cf, 'X_fit Size': '', 'X_test Size': ''}
                new_row = {'Model': 'LR_avg_of_all{}_sample:{}_cf:{}'.format(k, j, i) , 'Accuracy (%)': '', 'Recall (%)': '', 'Precision (%)': '', 'F1 (%)':total_f1, 'AUC(%)': total_auc, 'X_fit Size': '', 'X_test Size': ''}
                    
                model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only_for_cfs.append(new_row_cf, ignore_index=True)
                model_perf_metrics_merged = model_perf_metrics_merged.append(new_row, ignore_index=True)

    model_perf_metrics_merged_only_for_cfs.to_excel('model_perf_metrics_merged_only_for_cfs_{}.xlsx'.format(num))
    model_perf_metrics_merged.to_excel('model_perf_metrics_merged_{}.xlsx'.format(num))
    
    return  model_perf_metrics_merged_only_for_cfs, model_perf_metrics_merged  

In [116]:
# Selecting the first column of y
y_fit = y_fit.iloc[:, 0]
y_test = y_test.iloc[:, 0]

In [94]:
fea_to_vary = numerical + categorical
sample_size_list = [10, 20, 30, 40]
total_CFs_list = [1, 2, 3, 4]
iteration_num = 15

model_perf_metrics_merged_only_for_cfs, model_perf_metrics_merged = running_exp(X_fit, y_fit, numerical, sample_size_list, total_CFs_list, iteration_num, fea_to_vary, 1)

100%|██████████| 10/10 [00:01<00:00,  7.60it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only_for_cfs.append(model_perf_metric_lr_only_cf, ignore_index=True)
  model_perf_metrics_merged = model_perf_metrics_merged.append(model_perf_metric_lr_cf, ignore_index=True)
100%|██████████| 10/10 [00:01<00:00,  7.55it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only_for_cfs.append(model_perf_metric_lr_only_cf, ignore_index=True)
  model_perf_metrics_merged = model_perf_metrics_merged.append(model_perf_metric_lr_cf, ignore_index=True)
100%|██████████| 10/10 [00:01<00:00,  7.62it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only_for_cfs.append(model_perf_metric_lr_only_cf, ignore_index=True)
  model_perf_metrics_merged = model_perf_metrics_merged.append(model_perf_metric_lr_cf, ignore_index=True)
100%|██████████| 10/10 [00:01<00:00,  7.34it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only

In [117]:
fea_to_vary = numerical + categorical
sample_size_list = [50, 100]
total_CFs_list = [1, 2, 3]
iteration_num = 15

model_perf_metrics_merged_only_for_cfs, model_perf_metrics_merged = running_exp(X_fit, y_fit, numerical, sample_size_list, total_CFs_list, iteration_num, fea_to_vary, 2)

100%|██████████| 50/50 [00:08<00:00,  5.90it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only_for_cfs.append(model_perf_metric_lr_only_cf, ignore_index=True)
  model_perf_metrics_merged = model_perf_metrics_merged.append(model_perf_metric_lr_cf, ignore_index=True)
100%|██████████| 50/50 [00:07<00:00,  6.63it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only_for_cfs.append(model_perf_metric_lr_only_cf, ignore_index=True)
  model_perf_metrics_merged = model_perf_metrics_merged.append(model_perf_metric_lr_cf, ignore_index=True)
100%|██████████| 50/50 [00:07<00:00,  6.62it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only_for_cfs.append(model_perf_metric_lr_only_cf, ignore_index=True)
  model_perf_metrics_merged = model_perf_metrics_merged.append(model_perf_metric_lr_cf, ignore_index=True)
100%|██████████| 50/50 [00:07<00:00,  6.37it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only

In [118]:
fea_to_vary = numerical + categorical
sample_size_list = [200, 300]
total_CFs_list = [1, 2, 3]
iteration_num = 15

model_perf_metrics_merged_only_for_cfs, model_perf_metrics_merged = running_exp(X_fit, y_fit, numerical, sample_size_list, total_CFs_list, iteration_num, fea_to_vary, 2)

100%|██████████| 200/200 [00:33<00:00,  6.05it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only_for_cfs.append(model_perf_metric_lr_only_cf, ignore_index=True)
  model_perf_metrics_merged = model_perf_metrics_merged.append(model_perf_metric_lr_cf, ignore_index=True)
100%|██████████| 200/200 [00:40<00:00,  4.97it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only_for_cfs.append(model_perf_metric_lr_only_cf, ignore_index=True)
  model_perf_metrics_merged = model_perf_metrics_merged.append(model_perf_metric_lr_cf, ignore_index=True)
100%|██████████| 200/200 [00:38<00:00,  5.13it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_merged_only_for_cfs.append(model_perf_metric_lr_only_cf, ignore_index=True)
  model_perf_metrics_merged = model_perf_metrics_merged.append(model_perf_metric_lr_cf, ignore_index=True)
100%|██████████| 200/200 [00:25<00:00,  7.82it/s]
  model_perf_metrics_merged_only_for_cfs = model_perf_metrics_mer

In [119]:
fea_to_vary = numerical + categorical
sample_size_list = [500, 600]
total_CFs_list = [1, 2]
iteration_num = 15

model_perf_metrics_merged_only_for_cfs, model_perf_metrics_merged = running_exp(X_fit, y_fit, numerical, sample_size_list, total_CFs_list, iteration_num, fea_to_vary, 3)

100%|██████████| 457/457 [01:02<00:00,  7.33it/s]


IndexError: list index out of range