In [1]:
import pandas as pd
import itertools
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def optuna_tune_xgboost(X, y, combo, n_trials=50):
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
            'random_state': 42,
            'verbosity': 0,
        }

        model = XGBRegressor(**params)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        scores = cross_val_score(model, X[list(combo)], y, cv=kf, scoring='r2')
        return scores.mean()

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    return study.best_trial.value, study.best_trial.params

def evaluate_and_tune_top_combos(X, y, top_n=3, n_trials=50):
    base_results = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    print("🔍 1단계: 전체 조합 성능 평가 중...\n")
    for r in range(1, len(X.columns) + 1):
        for combo in itertools.combinations(X.columns, r):
            X_subset = X[list(combo)]
            model = XGBRegressor(n_estimators=300, learning_rate=0.1, max_depth=6, random_state=42, verbosity=0)
            scores = cross_val_score(model, X_subset, y, cv=kf, scoring='r2')
            base_results.append({
                'features': combo,
                'mean_r2': scores.mean()
            })

    top_combos = sorted(base_results, key=lambda x: x['mean_r2'], reverse=True)[:top_n]

    print("⚙️ 2단계: 상위 조합에 대해 Optuna 하이퍼파라미터 튜닝 중...\n")

    for i, combo_info in enumerate(top_combos):
        combo = combo_info['features']
        print(f"⏳ {i+1}/{top_n} 조합 튜닝 중: {combo}")
        best_r2, best_params = optuna_tune_xgboost(X, y, combo, n_trials=n_trials)

        print(f"\n✅ 결과 {i+1}:")
        print(f"📌 Feature 조합: {combo}")
        print(f"📈 최적 R²: {best_r2:.4f}")
        print(f"⚙️ 최적 하이퍼파라미터: {best_params}")
        print("-" * 60)

# 데이터 불러오기
df = pd.read_csv('dataset.csv')
X = df[['cement', 'blast furnace slag', 'fly ash', 'water ', 'superplasticizer', 'coarse aggregate', 'fine aggregate', 'age']]
y = df['CCS']

# 실행
evaluate_and_tune_top_combos(X, y, top_n=3, n_trials=50)

🔍 1단계: 전체 조합 성능 평가 중...



[I 2025-07-17 15:03:55,474] A new study created in memory with name: no-name-c1fb53b8-cf87-4414-8c05-523daf615c09


⚙️ 2단계: 상위 조합에 대해 Optuna 하이퍼파라미터 튜닝 중...

⏳ 1/3 조합 튜닝 중: ('cement', 'blast furnace slag', 'fly ash', 'water ', 'superplasticizer', 'fine aggregate', 'age')


[I 2025-07-17 15:03:57,156] Trial 0 finished with value: 0.9350336238771035 and parameters: {'n_estimators': 764, 'learning_rate': 0.0785903755627758, 'max_depth': 3, 'subsample': 0.9104160076181422, 'colsample_bytree': 0.5308164806421753, 'reg_alpha': 4.608790659650684, 'reg_lambda': 4.009511276088777}. Best is trial 0 with value: 0.9350336238771035.
[I 2025-07-17 15:03:59,266] Trial 1 finished with value: 0.9346344233500627 and parameters: {'n_estimators': 773, 'learning_rate': 0.10281268042579744, 'max_depth': 4, 'subsample': 0.5387337517427757, 'colsample_bytree': 0.5579889139853266, 'reg_alpha': 6.0071678301691644, 'reg_lambda': 8.48900324727874}. Best is trial 0 with value: 0.9350336238771035.
[I 2025-07-17 15:04:02,852] Trial 2 finished with value: 0.9252413510984077 and parameters: {'n_estimators': 435, 'learning_rate': 0.033214624016941224, 'max_depth': 10, 'subsample': 0.9942156138913327, 'colsample_bytree': 0.7234006965161763, 'reg_alpha': 6.513683835043505, 'reg_lambda': 6.


✅ 결과 1:
📌 Feature 조합: ('cement', 'blast furnace slag', 'fly ash', 'water ', 'superplasticizer', 'fine aggregate', 'age')
📈 최적 R²: 0.9389
⚙️ 최적 하이퍼파라미터: {'n_estimators': 883, 'learning_rate': 0.10380678815712259, 'max_depth': 4, 'subsample': 0.6731320200522151, 'colsample_bytree': 0.7583214548913666, 'reg_alpha': 9.2259842322436, 'reg_lambda': 6.554939857411285}
------------------------------------------------------------
⏳ 2/3 조합 튜닝 중: ('cement', 'blast furnace slag', 'water ', 'superplasticizer', 'fine aggregate', 'age')


[I 2025-07-17 15:05:58,670] Trial 0 finished with value: 0.8805512311248338 and parameters: {'n_estimators': 456, 'learning_rate': 0.012209094801724678, 'max_depth': 3, 'subsample': 0.5773456499647331, 'colsample_bytree': 0.8564766468984039, 'reg_alpha': 6.0953914026117975, 'reg_lambda': 3.7454122324222086}. Best is trial 0 with value: 0.8805512311248338.
[I 2025-07-17 15:06:00,766] Trial 1 finished with value: 0.931459337420003 and parameters: {'n_estimators': 508, 'learning_rate': 0.025603852572397136, 'max_depth': 6, 'subsample': 0.6325086582543642, 'colsample_bytree': 0.8219306251569833, 'reg_alpha': 3.766986122463968, 'reg_lambda': 0.1258616554580183}. Best is trial 1 with value: 0.931459337420003.
[I 2025-07-17 15:06:08,345] Trial 2 finished with value: 0.9278483673602442 and parameters: {'n_estimators': 970, 'learning_rate': 0.011828397574959971, 'max_depth': 10, 'subsample': 0.8863963828720027, 'colsample_bytree': 0.7994993811628175, 'reg_alpha': 2.1577330575003106, 'reg_lambda


✅ 결과 2:
📌 Feature 조합: ('cement', 'blast furnace slag', 'water ', 'superplasticizer', 'fine aggregate', 'age')
📈 최적 R²: 0.9398
⚙️ 최적 하이퍼파라미터: {'n_estimators': 925, 'learning_rate': 0.2056362744709152, 'max_depth': 4, 'subsample': 0.9625213975754185, 'colsample_bytree': 0.8264731817374069, 'reg_alpha': 0.8724521302946711, 'reg_lambda': 8.085434379243221}
------------------------------------------------------------
⏳ 3/3 조합 튜닝 중: ('cement', 'blast furnace slag', 'water ', 'superplasticizer', 'coarse aggregate', 'fine aggregate', 'age')


[I 2025-07-17 15:07:52,828] Trial 0 finished with value: 0.8967036966866668 and parameters: {'n_estimators': 268, 'learning_rate': 0.030813440108358583, 'max_depth': 3, 'subsample': 0.8863925576997085, 'colsample_bytree': 0.9337913230747558, 'reg_alpha': 0.8889322660541732, 'reg_lambda': 7.27151992336816}. Best is trial 0 with value: 0.8967036966866668.
[I 2025-07-17 15:07:56,032] Trial 1 finished with value: 0.9340531915236389 and parameters: {'n_estimators': 933, 'learning_rate': 0.02812466236207612, 'max_depth': 5, 'subsample': 0.7743664920055489, 'colsample_bytree': 0.8040629617015687, 'reg_alpha': 6.332596530616894, 'reg_lambda': 4.697535770471046}. Best is trial 1 with value: 0.9340531915236389.
[I 2025-07-17 15:07:58,998] Trial 2 finished with value: 0.9361820625072742 and parameters: {'n_estimators': 713, 'learning_rate': 0.0987154310478302, 'max_depth': 8, 'subsample': 0.5259290154074474, 'colsample_bytree': 0.5385290729642939, 'reg_alpha': 9.820964167061074, 'reg_lambda': 8.6


✅ 결과 3:
📌 Feature 조합: ('cement', 'blast furnace slag', 'water ', 'superplasticizer', 'coarse aggregate', 'fine aggregate', 'age')
📈 최적 R²: 0.9401
⚙️ 최적 하이퍼파라미터: {'n_estimators': 429, 'learning_rate': 0.1377010711273984, 'max_depth': 4, 'subsample': 0.8419644063736891, 'colsample_bytree': 0.5907939005767037, 'reg_alpha': 1.2744800113014643, 'reg_lambda': 5.492233873170676}
------------------------------------------------------------
