# Setup

* Load modules

In [1]:
# Import general pkgs
from tabulate import tabulate
import warnings
import os
import math
import joblib
import pickle
import itertools
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import (confusion_matrix, auc, classification_report, dcg_score, f1_score, precision_recall_curve,
                             precision_score, recall_score, roc_auc_score, roc_curve)
from scipy.stats import norm
import matplotlib.pyplot as plt
%matplotlib inline

# Suppress all Python warnings for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides warnings that may matter
# (e.g. deprecations in the modelling libraries); consider a narrower filter.
warnings.filterwarnings('ignore')

In [3]:
# Import custom pkgs
# The shared utils folder is appended to sys.path so the project-local modules
# below can be imported. The path is relative, so it resolves against the
# kernel's current working directory.
import sys
sys.path.append("../../../utils/")
# Project-local modules (presumably located in the folder added above).
import utils
import dice_util

In [4]:
# interpretml EBM
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import ExplainableBoostingRegressor, LinearRegression, RegressionTree
from interpret.data import Marginal
from interpret.perf import RegressionPerf
from interpret import show

# DiCE
import dice_ml
from dice_ml import Dice

In [5]:
# print(os.listdir('../../../data/wine_quality/raw/'))

* Load data

In [6]:
# Base directory of the wine-quality data (relative to the working directory).
path ='../../../data/wine_quality/'
# NOTE(review): the file carries a .csv extension but is read with joblib.load,
# which expects a joblib/pickle dump -- presumably the processed frame was saved
# with joblib.dump under this name; confirm. Only load such files from trusted
# sources, since unpickling can execute arbitrary code.
df_train = joblib.load(path+'processed/wine_quality_renamed.csv')
# col_name = joblib.load(path+'raw/column_eng_kor.pickle')
# Shared random seed, reused for model training and counterfactual generation.
seed=777

# Train model

In [7]:
# Split the training frame into the target vector ('quality') and the
# feature matrix (every remaining column).
y = df_train['quality']
X = df_train.drop(columns=['quality'])

In [8]:
# Additive-only EBM regressor: interactions=0 means no interaction terms are
# fitted; the shared seed is passed as random_state.
ebm = ExplainableBoostingRegressor(
    random_state=seed,
    interactions=0,
)
# Kept as the last expression so the fitted estimator's repr is displayed.
ebm.fit(X, y)

ExplainableBoostingRegressor(feature_names=['fixed_acidity', 'volatile_acidity',
                                            'citric_acid', 'residual_sugar',
                                            'chlorides', 'free_sulfur_dioxide',
                                            'total_sulfur_dioxide', 'density',
                                            'ph', 'sulphates', 'alcohol'],
                             feature_types=['continuous', 'continuous',
                                            'continuous', 'continuous',
                                            'continuous', 'continuous',
                                            'continuous', 'continuous',
                                            'continuous', 'continuous',
                                            'continuous'],
                             interactions=0, random_state=777)

# DiCE explainer

In [10]:
# DiCE explainer using the 'random' method.
# Data interface: every column except the outcome is declared continuous.
d = dice_ml.Data(
    dataframe=df_train,
    continuous_features=df_train.columns.drop('quality').tolist(),
    outcome_name='quality',
)
# Model interface: wrap the already-fitted EBM as an sklearn-backend regressor.
m = dice_ml.Model(model=ebm, model_type='regressor', backend='sklearn')
# Explainer built from the data interface and the model wrapper.
exp_random = dice_ml.Dice(d, m, method='random')

# Generate Counterfactuals

In [11]:
# If DiCE's generate_counterfactuals call needs fine-grained tuning, use the code
# below directly instead of the helper function (presumably the dice_util wrapper).
# NOTE(review): df_org and df_cfs assigned here are reassigned by the
# tabularize_org_cfs cell that follows.
idx_X = 0  # positional row of X used as the query instance
total_cfs = 5  # number of counterfactuals requested
desired_score = 6  # lower bound of the desired prediction range
# desired_score = ebm.predict(X.iloc[[idx_X]])[0] + 1

# Double brackets keep the query instance as a one-row DataFrame.
df_org = X.iloc[[idx_X]]
# Request counterfactuals whose prediction falls in [desired_score, desired_score + 1].
e_random = exp_random.generate_counterfactuals(
    query_instances=df_org,
    total_CFs=total_cfs,
    desired_range=[desired_score, desired_score + 1],
    random_seed=seed)

# Counterfactuals for the first (and only) query instance.
df_cfs = e_random.cf_examples_list[0].final_cfs_df

100%|██████████| 1/1 [00:00<00:00,  7.59it/s]


## tabularize_org_cfs

In [12]:
# Parameters for the helper call
idx_X = 0  # index of the sample to inspect
df_X = X
model = ebm
dice_explainer = exp_random
total_cfs = 3
# Target score: one point above the model's current prediction for the sample.
desired_score = model.predict(df_X.iloc[[idx_X]])[0] + 1

# Returns the original instance and its counterfactuals as two DataFrames.
df_org, df_cfs = dice_util.tabularize_org_cfs(
    df_X, idx_X, model, dice_explainer, desired_score, total_cfs
)

100%|██████████| 1/1 [00:00<00:00,  7.79it/s]


In [13]:
# Display the original (query) instance returned by tabularize_org_cfs.
df_org

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [14]:
# Display the counterfactuals returned by tabularize_org_cfs
# (per the recorded output, feature columns plus a 'quality' column).
df_cfs

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,2.1,14.8,6.0
1,7.4,0.7,0.0,1.9,0.076,52.5,34.0,0.9978,3.51,0.56,13.2,6.0
2,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,2.1,15.0,6.0


## tabularize_value_gap

In [15]:
# Long-format comparison of the original instance against each counterfactual.
# Per the recorded output, only the features that changed are listed, with the
# original value, the counterfactual value and their gap.
dice_util.tabularize_value_gap(df_org, df_cfs)

Unnamed: 0,cf_no,feature,value_org,value_cf,gap
0,0,alcohol,9.4,14.8,5.4
1,0,sulphates,0.56,2.1,1.5
2,1,free_sulfur_dioxide,11.0,52.5,41.5
3,1,alcohol,9.4,13.2,3.8
4,2,alcohol,9.4,15.0,5.6
5,2,sulphates,0.56,2.1,1.5


## tabularize_dice_pred_result

In [13]:
# Predicted score of the original instance (labelled 'current' in the recorded
# output) next to the score of each counterfactual; the fitted EBM is passed in,
# presumably to produce these predictions.
dice_util.tabularize_dice_pred_result(df_org,df_cfs,ebm)

Unnamed: 0,sort,pred_score
0,current,5.0
1,0,6.0
2,1,6.2
3,2,6.0


## calculate_max_score

In [20]:
# Search for the highest score reachable through counterfactuals for the sample
# under inspection. Per the recorded output, the helper runs DiCE repeatedly
# until no counterfactual is found and then prints the maximum score.
# Uses idx_X rather than a hard-coded 0 so this cell always describes the same
# sample as the cells above (assumes the second positional argument is the
# sample index, mirroring tabularize_org_cfs -- confirm in dice_util).
max_score = dice_util.calculate_max_score(X, idx_X, ebm, exp_random)

100%|██████████| 1/1 [00:00<00:00,  8.46it/s]
100%|██████████| 1/1 [00:00<00:00,  9.51it/s]
100%|██████████| 1/1 [00:00<00:00,  9.38it/s]
100%|██████████| 1/1 [00:00<00:00,  9.60it/s]
100%|██████████| 1/1 [00:00<00:00,  9.14it/s]
100%|██████████| 1/1 [00:00<00:00,  9.02it/s]
100%|██████████| 1/1 [00:00<00:00,  9.10it/s]
100%|██████████| 1/1 [00:00<00:00,  8.85it/s]
100%|██████████| 1/1 [00:00<00:00,  9.07it/s]
100%|██████████| 1/1 [00:00<00:00,  9.43it/s]
100%|██████████| 1/1 [00:00<00:00,  9.77it/s]
100%|██████████| 1/1 [00:00<00:00,  9.31it/s]
100%|██████████| 1/1 [00:00<00:00,  8.17it/s]
100%|██████████| 1/1 [00:00<00:00,  7.34it/s]
100%|██████████| 1/1 [00:00<00:00,  6.24it/s]
100%|██████████| 1/1 [00:00<00:00,  4.52it/s]

No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
6.6 is max score which is changable.





## tabularize_sparse_cfs (Charting을 위해 필요한 데이터프레임 형태임)

In [22]:
# Per-feature table for each counterfactual (one row per feature per cf_no).
# Per the recorded output, value_cf is 0.0 for features the counterfactual left
# unchanged and holds the new value otherwise.
# NOTE(review): value_cf_plus / value_cf_minus / endpoint_for_minus look like
# chart helper columns -- confirm their meaning in dice_util.
dice_util.tabularize_sparse_cfs(df_org, df_cfs)

Unnamed: 0,cf_no,feature,value_org,value_cf,value_cf_plus,value_cf_minus,endpoint_for_minus
0,0,fixed_acidity,7.4,0.0,,,
1,0,volatile_acidity,0.7,0.0,,,
2,0,citric_acid,0.0,0.0,,,
3,0,residual_sugar,1.9,0.0,,,
4,0,chlorides,0.076,0.0,,,
5,0,free_sulfur_dioxide,11.0,0.0,,,
6,0,total_sulfur_dioxide,34.0,0.0,,,
7,0,density,0.9978,0.0,,,
8,0,ph,3.51,0.0,,,
9,0,sulphates,0.56,2.1,2.1,,
