# Feature importance 평가를 위한 SHAP
- 5-fold cross-validation 실행 후, 각 fold의 test set에 대해 SHAP 값을 계산
- 연산에 소요되는 시간 확인을 위해 우선적으로 하나의 model에 대해서만 실행 (아래 코드는 XGBoost의 XGBRegressor를 사용)

In [None]:
혹시 연구실 컴퓨터에 shap이 설치되어 있지 않으면 먼저 설치한 뒤 실행할 것 (예: `pip install shap`)

In [5]:
import pandas as pd
import shap
import numpy as np
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
# Re-create the `np.int` / `np.bool` aliases on the numpy module, pointing them
# at the Python builtins.
# NOTE(review): presumably needed because one of the imported libraries still
# references these aliases, which newer NumPy releases removed — confirm.
for _alias, _builtin in (("int", int), ("bool", bool)):
    setattr(np, _alias, _builtin)

In [6]:
# Input columns used as model features. The trailing space in 'water ' is
# intentional: the name has to match the CSV header exactly.
FEATURE_COLUMNS = [
    'cement',
    'blast furnace slag',
    'fly ash',
    'water ',
    'superplasticizer',
    'coarse aggregate',
    'fine aggregate',
    'age',
]

# Load the table, then split it into the feature matrix X and the target y
# (the 'CCS' column).
df = pd.read_csv("dataset.csv")
X = df[FEATURE_COLUMNS]
y = df['CCS']

In [7]:
# Set up 5-fold cross-validation (shuffled, fixed seed so the splits are
# reproducible) together with the containers used by the SHAP loop.
feature_names = X.columns
shap_values_all = []  # one |SHAP| array per fold is appended to this list
kf = KFold(n_splits=5, random_state=42, shuffle=True)

In [8]:
# SHAP feature importance for XGBoost under 5-fold cross-validation.
# The splitter and the accumulator are (re)initialised here so this cell can
# be re-run on its own without piling results onto a previous run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
shap_values_all = []

# Hyper-parameters shared by the model trained in every fold
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
    verbosity=0,
)

for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}...")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # 1) Fit a fresh XGBoost regressor on this fold's training split
    model = XGBRegressor(**xgb_params)
    model.fit(X_train, y_train)

    # 2) Build a function-based SHAP explainer: it is handed the bare
    #    `model.predict` callable plus the training rows, not the model object
    explainer = shap.Explainer(
        model.predict,
        X_train,
        feature_names=list(X.columns),
    )

    # 3) Compute SHAP values for the held-out rows (an Explanation object)
    explanation = explainer(X_test)

    # 4) Keep only the magnitudes and store them for this fold
    shap_values_all.append(np.absolute(explanation.values))

# 5) Pool the rows of every fold and average |SHAP| per feature
all_shap = np.concatenate(shap_values_all, axis=0)  # (total samples, n features)
mean_shap_per_feature = all_shap.mean(axis=0)

# 6) Print one line per feature
for name, val in zip(X.columns, mean_shap_per_feature):
    print(f"{name:20s}: {val:.4f}")

Fold 1...
Fold 2...


Exact explainer: 207it [00:10,  1.47s/it]                         


Fold 3...
Fold 4...
Fold 5...


Exact explainer: 207it [00:10,  1.30s/it]                         

cement              : 6.9733
blast furnace slag  : 3.5037
fly ash             : 0.4246
water               : 3.9407
superplasticizer    : 2.2885
coarse aggregate    : 0.7759
fine aggregate      : 1.4946
age                 : 7.8605





In [9]:
# Final importance: mean |SHAP| per feature over every held-out sample.
# The per-fold arrays are stacked row-wise instead of being packed into one
# 3-D array with np.array(shap_values_all): that form requires every fold to
# hold the same number of test rows and breaks (ragged input) whenever the
# sample count is not divisible by the number of folds. Pooling the rows also
# weights every sample equally; for equal-sized folds the numbers are the same
# as the earlier fold-wise average.
mean_shap = np.vstack(shap_values_all).mean(axis=0)  # shape: (n_features,)
feature_importance = pd.Series(mean_shap, index=feature_names).sort_values(ascending=False)


In [10]:
# Print the heading followed by the ranked importance table
print("\n Feature Importance (Mean SHAP Values across 5 folds):", feature_importance, sep="\n")


 Feature Importance (Mean SHAP Values across 5 folds):
age                   7.860495
cement                6.973292
water                 3.940749
blast furnace slag    3.503730
superplasticizer      2.288544
fine aggregate        1.494568
coarse aggregate      0.775861
fly ash               0.424646
dtype: float64
