In [2]:
import pandas as pd
import numpy as np
import shap
import warnings
import time

# 모델 평가 및 분리
from sklearn.model_selection import cross_val_score, train_test_split

# Tree-based Models (Scikit-learn: CPU Multi-core)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    BaggingClassifier,
    ExtraTreesClassifier
)

# 외부 라이브러리 모델
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# --------------------------------------------------------------------------------
# 1. Load the data and split it
# --------------------------------------------------------------------------------
df = pd.read_csv('Dry_Eye_Dataset_preprocessed.csv')

# The label column is separated from the remaining columns, which become the
# feature matrix.
target_col = 'Dry Eye Disease'
y = df[target_col]
X = df.drop(columns=target_col)

# 80/20 hold-out split with a fixed seed so reruns produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --------------------------------------------------------------------------------
# 2. Feature importance (SHAP values from an XGBoost model, GPU preferred)
# --------------------------------------------------------------------------------
print("Calculating Feature Importance using SHAP with XGBoost...")

# Settings shared by the GPU attempt and the CPU fallback, so both paths are
# configured identically and differ only in the device they run on.
# `use_label_encoder` is deliberately not passed: it is a deprecated option
# that the XGBoost releases accepting `device=` no longer use.
_xgb_shap_params = dict(
    random_state=42,
    tree_method='hist',
    eval_metric='logloss',
    verbosity=0,
)

# First attempt: train on the CUDA device.
xgb_for_shap = XGBClassifier(device='cuda', **_xgb_shap_params)

try:
    xgb_for_shap.fit(X_train, y_train)
except Exception as e:
    # Best-effort fallback: any failure of the GPU fit (for example when no
    # usable CUDA device is present) is reported and training is retried on the
    # CPU using all cores. A failure unrelated to the GPU will surface again
    # from the second fit, which is not guarded.
    print(f"GPU Init failed, falling back to CPU: {e}")
    xgb_for_shap = XGBClassifier(device='cpu', n_jobs=-1, **_xgb_shap_params)
    xgb_for_shap.fit(X_train, y_train)

# Explain the fitted XGBoost model on the held-out rows.
# NOTE(review): the ranking below is derived from X_test; if it is then used to
# choose features that are scored on that same split, the reported test
# accuracy may be slightly optimistic -- confirm this is intended.
explainer = shap.TreeExplainer(xgb_for_shap)
shap_values = explainer.shap_values(X_test)

# Some explainers return one array per class as a list; in that case the entry
# at index 1 is used (presumably the positive class -- verify). Otherwise the
# returned array is used as is.
_selected_shap = shap_values[1] if isinstance(shap_values, list) else shap_values

# Global importance of a feature = mean absolute SHAP value over the rows.
feature_importance_vals = np.mean(np.abs(_selected_shap), axis=0)

# Pair each score with its column name and order from most to least important.
# Tuples are compared score-first, so ties are broken by feature name.
feature_names = X_train.columns
feature_importance_list = list(zip(feature_importance_vals, feature_names))
feature_importance_list.sort(reverse=True)

# --------------------------------------------------------------------------------
# 3. Print the ranked feature list
# --------------------------------------------------------------------------------
print("\n[Feature Importance Ranking (SHAP via XGBoost)]")
print(f"{'Rank':<5} | {'Feature Name':<30} | {'SHAP Importance':<15}")
print("-" * 60)

# One table row per feature; ranks are 1-based.
for rank, (importance, feature) in enumerate(feature_importance_list, start=1):
    print(f"{rank:<5} | {feature:<30} | {importance:.5f}")

# Feature names only, kept in ranking order (most important first).
sorted_features = [feature for _, feature in feature_importance_list]

# Visual separator before the next section's output.
print(f"\n{'=' * 85}\n")

# --------------------------------------------------------------------------------
# 4. Per-model evaluation on the top 1-10 features: candidate estimators
# --------------------------------------------------------------------------------

# (display name, unfitted estimator) pairs. Every estimator is seeded with
# random_state=42 so repeated runs are comparable.
models = [
    # 1. scikit-learn models (CPU; n_jobs=-1 wherever the constructor call sets it)
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42, n_jobs=-1)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("Bagging", BaggingClassifier(random_state=42, n_jobs=-1)),
    ("Extra Trees", ExtraTreesClassifier(random_state=42, n_jobs=-1)),

    # 2. XGBoost on the GPU.
    # 'hist' + device='cuda' replaces the older 'gpu_hist' tree method.
    # `use_label_encoder` is deliberately not passed: it is a deprecated option
    # that the XGBoost releases accepting `device=` no longer use.
    # NOTE(review): unlike the SHAP model, this entry has no CPU fallback --
    # confirm a CUDA device is always available where this runs.
    ("XGBoost", XGBClassifier(
        random_state=42,
        tree_method='hist',
        device='cuda',
        eval_metric='logloss',
        verbosity=0,
    )),

    # 3. LightGBM on CPU with all cores.
    # The author notes that LightGBM's GPU mode tends to fail on Colab without
    # a custom build, so multi-core CPU is used as the safe option.
    ("LightGBM", LGBMClassifier(
        random_state=42,
        verbose=-1,
        n_jobs=-1,
    )),
]

print(f"{'Model':<20} | {'N_Feats':<8} | {'CV_Valid_Acc':<12} | {'Test_Acc':<10} | {'Note'}")
print("-" * 85)

# Models that already parallelise internally (n_jobs=-1) or run on the GPU.
# For these, cross-validation is kept single-process so the two levels of
# parallelism do not compete. "Bagging" is included because it is built with
# n_jobs=-1 just like Random Forest and Extra Trees.
_SELF_PARALLEL_MODELS = ("XGBoost", "LightGBM", "Random Forest", "Extra Trees", "Bagging")

# Evaluate the top-1 ... top-10 feature subsets (fewer if fewer features exist).
# This bound is the same for every model, so it is computed once.
max_k = min(10, len(sorted_features))

for model_name, model in models:
    # Decide once per model where the parallelism lives: inside the model
    # (CV n_jobs=1) or in the cross-validation folds (CV n_jobs=-1).
    if any(tag in model_name for tag in _SELF_PARALLEL_MODELS):
        cv_n_jobs = 1
    else:
        cv_n_jobs = -1

    for k in range(1, max_k + 1):
        # The k highest-ranked features, in ranking order.
        current_features = sorted_features[:k]

        X_train_sel = X_train[current_features]
        X_test_sel = X_test[current_features]

        try:
            # 5-fold cross-validated accuracy on the training split.
            cv_scores = cross_val_score(
                model,
                X_train_sel,
                y_train,
                cv=5,
                scoring='accuracy',
                n_jobs=cv_n_jobs,
            )
            cv_acc = cv_scores.mean()

            # Refit on the full training split and score on the hold-out split.
            model.fit(X_train_sel, y_train)
            test_acc = model.score(X_test_sel, y_test)

            print(f"{model_name:<20} | {k:<8} | {cv_acc:.4f}       | {test_acc:.4f}     | Top {k} features")

        except Exception as e:
            # Best-effort sweep: a failing model/k combination is reported as
            # an error row and the loop moves on to the next combination.
            print(f"{model_name:<20} | {k:<8} | Error          | Error          | {e}")

    print("-" * 85)

Calculating Feature Importance using SHAP with XGBoost...

[Feature Importance Ranking (SHAP via XGBoost)]
Rank  | Feature Name                   | SHAP Importance
------------------------------------------------------------
1     | Discomfort Eye-strain          | 0.26283
2     | Itchiness/Irritation in eye    | 0.25358
3     | Redness in eye                 | 0.25239
4     | Average screen time            | 0.14805
5     | Physical activity              | 0.13879
6     | Systolic_BP                    | 0.13239
7     | Weight                         | 0.11887
8     | Sleep duration                 | 0.11734
9     | Daily steps                    | 0.11470
10    | Height                         | 0.11149
11    | Age                            | 0.11047
12    | Diastolic_BP                   | 0.10836
13    | Heart rate                     | 0.10554
14    | Sleep quality                  | 0.06508
15    | Stress level                   | 0.06117
16    | Smart device before bed        |