# Feature importance 평가를 위한 SHAP
- k-fold cross-validation 실행 후, 각 fold의 test set에 대해 SHAP 값 계산
- 연산에 소요되는 시간 확인을 위해 우선적으로 catboost model에 대해서만 실행

In [None]:
혹시 연구실 컴퓨터에 shap이 설치되어 있지 않으면, 먼저 설치(`pip install shap`)한 뒤 실행할 것.

In [1]:
import pandas as pd
import shap
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
# Re-create the `np.int` / `np.bool` attributes as the plain Python builtins.
# NOTE(review): presumably a compatibility shim for a shap release that still
# references these aliases, which newer NumPy versions removed — confirm
# against the installed shap/NumPy versions and drop it if no longer needed.
np.int = int
np.bool = bool

  def _pt_shuffle_rec(i, indexes, index_mask, partition_tree, M, pos):
  def delta_minimization_order(all_masks, max_swap_size=100, num_passes=2):
  def _reverse_window(order, start, length):
  def _reverse_window_score_gain(masks, order, start, length):
  def _mask_delta_score(m1, m2):
  def identity(x):
  def _identity_inverse(x):
  def logit(x):
  def _logit_inverse(x):
  def _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _build_fixed_multi_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _init_masks(cluster_matrix, M, indices_row_pos, indptr):
  def _rec_fill_masks(cluster_matrix, indices_row_pos, indptr, indices, M, ind):
  def _single_delta_mask(dind, masked_inputs, last_mask, data, x, noop_code):
  def _delta_masking(masks, x, curr_delta_inds, varying_rows_out,
  def _jit_build_partition_tree(xmin, xmax, ymi

In [2]:
# Read the dataset, then separate the target from the predictors.
df = pd.read_csv("dataset.csv")

# Target column.
y = df['CCS']

# Predictor columns. The trailing space in 'water ' is part of the key used
# here -- do not "fix" it without checking the CSV header.
X = df[
    [
        'cement',
        'blast furnace slag',
        'fly ash',
        'water ',
        'superplasticizer',
        'coarse aggregate',
        'fine aggregate',
        'age',
    ]
]

In [3]:
# Set up 5-fold cross-validation (shuffled, fixed seed) and the containers
# that the per-fold SHAP loop fills in.
feature_names = X.columns
shap_values_all = []
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [4]:
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}...")

    # Slice out this fold's training rows and held-out rows.
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    # 1) Fit a CatBoost regressor on the training rows.
    model = CatBoostRegressor(verbose=0, random_state=42)
    model.fit(X_train, y_train)

    # 2) Build a function-based SHAP explainer around `model.predict`
    #    (used here instead of TreeExplainer); the training rows are passed
    #    as the explainer's second argument.
    explainer = shap.Explainer(
        model.predict,
        X_train,
        feature_names=X.columns.tolist(),
    )

    # 3) Explain the held-out rows (the call returns an Explanation object).
    explanation = explainer(X_test)

    # 4) Keep the absolute SHAP values of this fold.
    #    explanation.values.shape == (n_samples, n_features)
    shap_values_all.append(np.absolute(explanation.values))

# 5) Pool the rows of every fold and average per feature.
all_shap = np.concatenate(shap_values_all, axis=0)  # shape: (total_samples, n_features)
mean_shap_per_feature = np.mean(all_shap, axis=0)

# 6) Print one line per feature: name padded to 20 chars, mean |SHAP| to 4 decimals.
for name, val in zip(X.columns, mean_shap_per_feature):
    print(f"{name:20s}: {val:.4f}")

Fold 1...
Fold 2...
Fold 3...
Fold 4...
Fold 5...
cement              : 7.1096
blast furnace slag  : 3.6861
fly ash             : 0.4135
water               : 4.3935
superplasticizer    : 2.0477
coarse aggregate    : 0.9427
fine aggregate      : 1.4497
age                 : 7.4056


In [5]:
# Final importance = mean absolute SHAP value per feature over all held-out rows.
#
# The per-fold matrices are pooled row-wise instead of being stacked into a
# 3-D array and averaged across folds: rows at the same position in different
# folds are unrelated samples, and stacking only works when every fold has
# exactly the same number of test rows. Pooling gives the same numbers when
# the folds are equal-sized and also works when they are not.
pooled_abs_shap = np.vstack(shap_values_all)  # shape: (total_samples, n_features)
mean_shap = pooled_abs_shap.mean(axis=0)  # shape: (n_features,)

# Label each mean with its feature name and rank from most to least important.
feature_importance = pd.Series(mean_shap, index=feature_names).sort_values(ascending=False)


In [6]:
# Print the header line followed by the sorted importance Series.
print(
    "\n Feature Importance (Mean SHAP Values across 5 folds):",
    feature_importance,
    sep="\n",
)


 Feature Importance (Mean SHAP Values across 5 folds):
age                   7.405642
cement                7.109648
water                 4.393472
blast furnace slag    3.686109
superplasticizer      2.047708
fine aggregate        1.449718
coarse aggregate      0.942669
fly ash               0.413492
dtype: float64
