In [4]:
import pandas as pd
import optuna
import itertools
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor


In [2]:
df = pd.read_csv('dataset.csv')
X = df[['cement', 'blast furnace slag', 'fly ash', 'superplasticizer', 'coarse aggregate', 'age']]
y = df['CCS']

# 학습/테스트 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial, X_train, y_train, X_test, y_test):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 3, 10),
        'random_seed': 42,
        'verbose': 0
    }
    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return r2_score(y_test, y_pred)

def optuna_tune_catboost(X_train, y_train, X_test, y_test, features, n_trials=50):
    def tuned_objective(trial):
        return objective(trial, X_train[features], y_train, X_test[features], y_test)

    study = optuna.create_study(direction='maximize')
    study.optimize(tuned_objective, n_trials=n_trials)

    best_params = study.best_params
    best_model = CatBoostRegressor(**best_params, random_seed=42, verbose=0)
    best_model.fit(X_train[features], y_train)
    y_pred = best_model.predict(X_test[features])
    r2 = r2_score(y_test, y_pred)

    return r2, best_params

# 지정 피처
selected_features = ['cement', 'blast furnace slag', 'fly ash', 'superplasticizer', 'coarse aggregate', 'age']

# 실행
r2, best_params = optuna_tune_catboost(X_train, y_train, X_test, y_test, selected_features, n_trials=50)

print(f"📌 지정 피처: {selected_features}")
print(f"📈 테스트 세트 기준 R²: {r2:.4f}")
print(f"⚙️ 최적 하이퍼파라미터: {best_params}")

[I 2025-07-30 17:38:06,098] A new study created in memory with name: no-name-3c56ac52-3dc1-4432-981c-00408f258b16


[I 2025-07-30 17:38:09,021] Trial 0 finished with value: 0.9497034818115826 and parameters: {'iterations': 746, 'learning_rate': 0.1257236240236609, 'depth': 8}. Best is trial 0 with value: 0.9497034818115826.
[I 2025-07-30 17:38:10,030] Trial 1 finished with value: 0.9461303034238112 and parameters: {'iterations': 602, 'learning_rate': 0.13058574815609694, 'depth': 6}. Best is trial 0 with value: 0.9497034818115826.
[I 2025-07-30 17:38:13,499] Trial 2 finished with value: 0.9536688557912703 and parameters: {'iterations': 967, 'learning_rate': 0.08183881227618986, 'depth': 8}. Best is trial 2 with value: 0.9536688557912703.
[I 2025-07-30 17:38:16,833] Trial 3 finished with value: 0.9423960767681697 and parameters: {'iterations': 266, 'learning_rate': 0.18593074206887825, 'depth': 10}. Best is trial 2 with value: 0.9536688557912703.
[I 2025-07-30 17:38:17,065] Trial 4 finished with value: 0.8986250203770284 and parameters: {'iterations': 208, 'learning_rate': 0.0979872744761593, 'depth'

📌 지정 피처: ['cement', 'blast furnace slag', 'fly ash', 'superplasticizer', 'coarse aggregate', 'age']
📈 테스트 세트 기준 R²: 0.9559
⚙️ 최적 하이퍼파라미터: {'iterations': 931, 'learning_rate': 0.1252659009325331, 'depth': 5}


In [5]:
# 1) 최적 파라미터로 모델 생성
model = CatBoostRegressor(**best_params, random_seed=42, verbose=0)

# 2) 5‑폴드 교차검증용 설정
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# 3) 교차검증으로 R² 점수 계산
cv_scores = cross_val_score(
    model,
    X_train[selected_features],
    y_train,
    cv=cv,
    scoring='r2'
)

# 4) 교차검증 결과 출력
print("📊 교차검증 R² 점수들:", cv_scores)
print(f"▶️ 평균 R²: {cv_scores.mean():.4f}")
print(f"▶️ 표준편차 (std): {cv_scores.std():.4f}")
print(f"▶️ 분산 (var): {cv_scores.var():.4f}")

# 5) 전체 학습 데이터로 모델 재학습 후, 테스트 세트 평가
model.fit(X_train[selected_features], y_train)
y_test_pred = model.predict(X_test[selected_features])
test_r2 = r2_score(y_test, y_test_pred)
print(f"\n🧪 테스트 세트 R²: {test_r2:.4f}")

📊 교차검증 R² 점수들: [0.94912553 0.89685658 0.89493999 0.9114179  0.93547398]
▶️ 평균 R²: 0.9176
▶️ 표준편차 (std): 0.0214
▶️ 분산 (var): 0.0005

🧪 테스트 세트 R²: 0.9559
