## 데이터 가져오기
## 모델 학습
## 모델 평가
## SHAP 해석
## LIME 해석

In [None]:
# Install shap
!pip install shap

In [None]:
# Install lime
!pip install lime

In [None]:
# Import packages
import pandas as pd
import numpy as np
import shap
import lime
import sklearn
import shap
shap.initjs() # load JS visualization code to notebook
import xgboost
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'white'
# Print the full path of every file found below the Kaggle input directory.
kaggle_input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for kaggle_input_file in kaggle_input_files:
    print(kaggle_input_file)

In [None]:
# Load the data and separate the features X from the target y
heloc = pd.read_csv('data/heloc_dataset_v1 (1).csv')
X = heloc.drop(columns='RiskPerformance')
# Encode the target explicitly: 'Bad' -> 1 (positive class), 'Good' -> 0.
# map() + astype() always produces an integer column, whereas counting on
# Series.replace to downcast an object column to int is deprecated in recent
# pandas. Any label other than 'Bad'/'Good' maps to NaN and makes astype()
# raise here, instead of leaving a string inside y.
y = heloc.RiskPerformance.map({'Bad': 1, 'Good': 0}).astype('int64')

In [89]:
# Split into training and test sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Plain numpy copies of the four splits; the model is trained and evaluated on these
X_train_array, X_test_array, y_train_array, y_test_array = (
    np.array(split_part) for split_part in (X_train, X_test, y_train, y_test)
)

In [90]:
# # Build the XGBoost model via hyperparameter search
# xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)] # Number of trees to be used
# xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)] # Maximum number of levels in tree
# xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)] # Minimum number of instances needed in each node
# xgb_tree_method = ['auto', 'exact', 'approx', 'hist', 'gpu_hist'] # Tree construction algorithm used in XGBoost
# xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)] # Learning rate
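# xgb_gamma = [int(x) for x in np.linspace(0, 0.5, 6)] # Minimum loss reduction required to make further partition
# # NOTE: int(x) truncates every value of np.linspace(0, 0.5, 6) to 0, so this grid only ever tries gamma = 0; drop the int() cast if this search is re-enabled.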
# # # Learning objective used
# # xgb_objective = ['binary:logistic', 'binary:hinge']
# # Create the grid
# xgb_grid = {'n_estimators': xgb_n_estimators,
#             'max_depth': xgb_max_depth,
#             'min_child_weight': xgb_min_child_weight,
#             'tree_method': xgb_tree_method,
#             'eta': xgb_eta,
#             'gamma': xgb_gamma}
# # Create the model to be tuned
# xgb_base = xgboost.XGBClassifier()
# # Create the random search 
# xgb_random = RandomizedSearchCV(estimator = xgb_base, param_distributions = xgb_grid, 
#                                 n_iter = 5, cv = 3, verbose = 2, 
#                                 random_state = 42, n_jobs = -1)
# # Fit the random search model
# xgb_random.fit(X_train_array, y_train_array)
# # Get the optimal parameters
# xgb_random.best_params_

In [None]:
# Train the final XGBoost model
# Early stopping is deliberately not configured here: it requires the
# `early_stopping_rounds` parameter together with a held-out `eval_set` passed
# to fit(). A misspelled key such as `early_stop` is not an XGBoost parameter;
# it is ignored with an "unused parameter" warning and has no effect.
xgb_final = xgboost.XGBClassifier(
    tree_method='hist',
    n_estimators=800,
    min_child_weight=6,
    max_depth=2,
    gamma=0,
    eta=0.4,  # learning rate
    random_state=42,
)
xgb_final.fit(X_train_array, y_train_array)

In [None]:
# Model evaluation
def model_eval(model, title, test_features, test_labels):
    """Score a fitted binary classifier on one labelled data set.

    Args:
        model: Fitted classifier exposing ``predict``. ``predict_proba`` is
            used for the ROC AUC when the model provides it.
        title: Column name given to the returned scores (e.g. ``"train"``).
        test_features: Feature matrix passed to the model.
        test_labels: True binary labels matching ``test_features``.

    Returns:
        A single-column ``pandas.DataFrame`` named ``title`` whose index is
        ``Accuracy Score``, ``ROC_AUC``, ``F1_Score``, ``Precision_Score``
        and ``Recall_Score``.
    """
    predictions = model.predict(test_features)

    # ROC AUC measures how well samples are ranked, so it needs a continuous
    # score. Feeding it hard 0/1 predictions collapses the curve to a single
    # operating point and under-reports the model. Use the probability of the
    # positive class (column 1) when available, else fall back to the labels.
    if hasattr(model, "predict_proba"):
        positive_scores = model.predict_proba(test_features)[:, 1]
    else:
        positive_scores = predictions

    metric_values = {
        'Accuracy Score': accuracy_score(test_labels, predictions),
        'ROC_AUC': roc_auc_score(test_labels, positive_scores),
        'F1_Score': f1_score(test_labels, predictions),
        'Precision_Score': precision_score(test_labels, predictions),
        'Recall_Score': recall_score(test_labels, predictions),
    }
    return pd.DataFrame(
        {title: list(metric_values.values())},
        index=list(metric_values),
    )
# Score the final model on both splits and print the two tables, train first
train_scores = model_eval(xgb_final, "train", X_train_array, y_train_array)
test_scores = model_eval(xgb_final, "test", X_test_array, y_test_array)
print(train_scores, test_scores, sep="\n")

In [93]:
# Build a SHAP explainer for the trained XGBoost model
explainer = shap.TreeExplainer(xgb_final)
# SHAP values for every row of X (the full data set, before the train/test split)
shap_values = explainer.shap_values(X)
# SHAP interaction values for the same rows
shap_interaction_values = explainer.shap_interaction_values(X)

In [None]:
# Show the local attribution of the first sample (row 0 of X) as a force plot
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:], matplotlib=True)

In [None]:
# Predict the class of the first sample (row 0 of X) and print it
first_result = xgb_final.predict(X.iloc[0:1,:])
print(first_result)

In [None]:
# Show the attributions of the first 100 samples of X together in one force plot
shap.force_plot(explainer.expected_value, shap_values[:100,:], X.iloc[:100,:])

In [None]:
# Show the global attribution as a bar chart
shap.summary_plot(shap_values, X, plot_type="bar")

In [None]:
# Summary plot of the SHAP interaction values
shap.summary_plot(shap_interaction_values,X)

In [None]:
# Show the global attribution as a summary (scatter) plot
shap.summary_plot(shap_values, X)

In [None]:
# Show the attribution of a single feature as a dependence scatter plot
shap.dependence_plot("ExternalRiskEstimate", shap_values, X)

In [None]:
# Show the attribution of a single sample (row 0 of X) as a decision plot.
# The feature row must be the same sample as the SHAP row: shap_values[:1] is
# row 0, so the displayed feature values are taken from X.iloc[0] as well.
shap.decision_plot(explainer.expected_value, shap_values[:1], X.iloc[0])

In [None]:
# Show the attribution of a single sample (row 0 of X) as a decision plot,
# including feature interactions.
# The feature row must be the same sample as the SHAP row:
# shap_interaction_values[:1] is row 0, so the features come from X.iloc[0].
shap.decision_plot(
    explainer.expected_value,
    shap_interaction_values[:1],
    X.iloc[0],
    feature_display_range=slice(None, -20, -1),
)

In [None]:
# Call the explainer directly on X and draw a beeswarm plot of the result
shap_explianer_values = explainer(X)
shap.plots.beeswarm(shap_explianer_values)

In [None]:
# Preview the first rows of the test features
X_test.head()

In [None]:
# shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:], matplotlib=True)

# Compute the SHAP values for the first row of X only.
# They are stored under their own name so that the full-dataset `shap_values`
# used by the plotting cells is not overwritten by this single-row array
# (which would break those cells when they are re-run).
# NOTE(review): this uses row 0 of X, while the CSV written from `shap_df` is
# named "first_test_data" and the LIME cells explain X_test.iloc[0] — confirm
# whether X_test.iloc[0:1] was intended here.
first_row_shap_values = explainer.shap_values(X.iloc[0:1])

# Convert the SHAP values to a DataFrame with one column per feature
shap_df = pd.DataFrame(first_row_shap_values, columns=X.columns)

# Print the result
print(shap_df)

In [105]:
# Save the result to a CSV file.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs('result', exist_ok=True)
shap_df.to_csv(f'result/shap_values_first_test_data_{0}.csv', index=False)

In [106]:
# Names of the 23 model features, passed to LIME as `feature_names`.
feature_names = [
    'ExternalRiskEstimate',
    'MSinceOldestTradeOpen',
    'MSinceMostRecentTradeOpen',
    'AverageMInFile',
    'NumSatisfactoryTrades',
    'NumTrades60Ever2DerogPubRec',
    'NumTrades90Ever2DerogPubRec',
    'PercentTradesNeverDelq',
    'MSinceMostRecentDelq',
    'MaxDelq2PublicRecLast12M',
    'MaxDelqEver',
    'NumTotalTrades',
    'NumTradesOpeninLast12M',
    'PercentInstallTrades',
    'MSinceMostRecentInqexcl7days',
    'NumInqLast6M',
    'NumInqLast6Mexcl7days',
    'NetFractionRevolvingBurden',
    'NetFractionInstallBurden',
    'NumRevolvingTradesWBalance',
    'NumInstallTradesWBalance',
    'NumBank2NatlTradesWHighUtilization',
    'PercentTradesWBalance',
]
# Alternative: feature_names = X_test.columns.tolist()

# Class labels passed to LIME as `class_names`: position 0 is 'Good', position 1 is 'Bad'.
target_names = ['Good', 'Bad']

In [107]:
# Build a LIME tabular explainer from the training data.
# `import lime` alone does not load the `lime_tabular` submodule, so import it
# explicitly instead of relying on another package having imported it already.
import lime.lime_tabular

lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.to_numpy(),  # LIME documents training_data as a 2-D numpy array
    feature_names=feature_names,
    class_names=target_names,
    discretize_continuous=False,
)

In [None]:
# Pick a random row position in the test set and display it.
# No random seed is set in this notebook, so the value changes between runs.
i = np.random.randint(0, X_test.shape[0])
i

In [109]:
# Explain the i-th test sample with the LIME explainer
# (num_features=23 equals the number of entries in feature_names)
exp = lime_explainer.explain_instance(X_test.iloc[i], xgb_final.predict_proba, num_features=23)

In [None]:
# Display the attribution result in the notebook
exp.show_in_notebook(show_table=True, show_all=False)

In [None]:
# Extract the contributions of the explained sample (the i-th test row) as text
feature_effects = exp.as_list()
print("Feature contributions:")
# Each entry unpacks into a feature description and its contribution
for feature, effect in feature_effects:
    print(f"{feature}: {effect}")

In [None]:
# Convert the contributions to a DataFrame with 'Feature' and 'Effect' columns
contrib_df = pd.DataFrame(feature_effects, columns=['Feature', 'Effect'])
print(contrib_df)

In [113]:
# Optionally save the contributions to a CSV file named after the sample index i.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs('result', exist_ok=True)
contrib_df.to_csv(f'result/lime_feature_contributions_{i}.csv', index=False)

In [None]:
# Explain the first test sample (row 0 of X_test) with the LIME explainer
exp = lime_explainer.explain_instance(X_test.iloc[0], xgb_final.predict_proba, num_features=23)
# Extract the contributions of that sample as text
feature_effects = exp.as_list()
print("Feature contributions:")
# Each entry unpacks into a feature description and its contribution
for feature, effect in feature_effects:
    print(f"{feature}: {effect}")

In [None]:
# Convert the contributions to a DataFrame with 'Feature' and 'Effect' columns
contrib_df = pd.DataFrame(feature_effects, columns=['Feature', 'Effect'])
print(contrib_df)
# Optionally save the contributions to a CSV file.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs('result', exist_ok=True)
contrib_df.to_csv(f'result/lime_feature_contributions_{0}.csv', index=False)