In [3]:
# ===========================================
# 1. 라이브러리 및 설정
# ===========================================
import os
import json
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupShuffleSplit

# 프로젝트 내부 모듈
from load_data import load_data
from features import get_features
from feature_eng import apply_feature_engineering
from config import TARGET, GROUP_COL, CAT_COLS, SEED

# 결과 저장 경로
RESULT_DIR = "./results/final_model"
os.makedirs(RESULT_DIR, exist_ok=True)

# ===========================================
# 2. 데이터 로드 및 피처 엔지니어링
# ===========================================
df = load_data()
X_base, y, groups, base_num, base_cat = get_features(df)

USE_ENGINEERED = True  # 엔지니어드 피처 사용 여부 설정

if USE_ENGINEERED:
    df_eng = apply_feature_engineering(df.copy())
    # base_num + engineered numeric + categorical columns 사용
    engineered_cols = [
        c for c in df_eng.columns
        if c not in X_base.columns and c not in [TARGET, GROUP_COL]
    ]
    use_cols = base_num + engineered_cols + base_cat
    X = df_eng[use_cols]
else:
    X = X_base

print(f"[INFO] 데이터셋 크기: {X.shape}, 타겟: {TARGET}")

# ===========================================
# 3. Train / Valid / Test 분리 (Group 유지)
# ===========================================
def group_split_three(X, y, groups,
                      train_ratio=0.6, valid_ratio=0.2, test_ratio=0.2,
                      seed=SEED):
    assert abs(train_ratio + valid_ratio + test_ratio - 1.0) < 1e-8, \
        "split 비율 합은 1이어야 합니다."

    # 1) test 분리
    gss1 = GroupShuffleSplit(n_splits=1, test_size=test_ratio, random_state=seed)
    tr_idx, te_idx = next(gss1.split(X, y, groups))
    X_trv, y_trv, g_trv = X.iloc[tr_idx], y.iloc[tr_idx], groups.iloc[tr_idx]
    X_te, y_te, g_te = X.iloc[te_idx], y.iloc[te_idx], groups.iloc[te_idx]

    # 2) train / valid 분리
    valid_ratio_in_trv = valid_ratio / (1.0 - test_ratio)
    gss2 = GroupShuffleSplit(n_splits=1, test_size=valid_ratio_in_trv, random_state=seed)
    tr_idx2, va_idx = next(gss2.split(X_trv, y_trv, g_trv))

    X_tr, y_tr, g_tr = X_trv.iloc[tr_idx2], y_trv.iloc[tr_idx2], g_trv.iloc[tr_idx2]
    X_va, y_va, g_va = X_trv.iloc[va_idx], y_trv.iloc[va_idx], g_trv.iloc[va_idx]

    return X_tr, y_tr, X_va, y_va, X_te, y_te

X_tr, y_tr, X_va, y_va, X_te, y_te = group_split_three(X, y, groups)
print(f"[INFO] Train: {X_tr.shape}, Valid: {X_va.shape}, Test: {X_te.shape}")

# ===========================================
# 4. Optuna 최적 하이퍼파라미터 불러오기
# ===========================================
with open("./results/eng_cpu_fast/best_params.json", "r") as f:
    best_params = json.load(f)

print("[INFO] Best Params:")
print(json.dumps(best_params, indent=2))

# ===========================================
# 5. 최종 학습 (Train + Valid 전체)
# ===========================================
X_train_full = pd.concat([X_tr, X_va])
y_train_full = pd.concat([y_tr, y_va])

# ✅ cat_features를 컬럼명 리스트로 전달 (중요!)
cat_cols_used = [c for c in CAT_COLS if c in X_train_full.columns]

model = CatBoostRegressor(
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=SEED,
    verbose=200,
    **best_params
)

train_pool = Pool(X_train_full, y_train_full, cat_features=cat_cols_used)
model.fit(train_pool)

# ===========================================
# 6. 테스트 세트 평가
# ===========================================
test_pool = Pool(X_te, y_te, cat_features=cat_cols_used)
y_pred = model.predict(test_pool)
test_rmse = np.sqrt(mean_squared_error(y_te, y_pred))
print(f"\n[RESULT] Test RMSE: {test_rmse:.4f}")

# ===========================================
# 7. 모델 및 피처 중요도 저장
# ===========================================
model_path = os.path.join(RESULT_DIR, "catboost_model_final.cbm")
model.save_model(model_path)

feature_importance = pd.DataFrame({
    "feature": X_train_full.columns,
    "importance": model.get_feature_importance()
}).sort_values("importance", ascending=False)

feature_importance_path = os.path.join(RESULT_DIR, "feature_importance_final.csv")
feature_importance.to_csv(feature_importance_path, index=False, encoding="utf-8-sig")

print(f"\n[INFO] 최종 모델 저장 완료: {model_path}")
print(f"[INFO] 피처 중요도 저장 완료: {feature_importance_path}")


[INFO] 데이터셋 크기: (2000, 37), 타겟: cycles_to_80pct_SOH
[INFO] Train: (1000, 37), Valid: (500, 37), Test: (500, 37)
[INFO] Best Params:
{
  "n_estimators": 1300,
  "learning_rate": 0.03452954575964436,
  "depth": 4,
  "l2_leaf_reg": 0.018535499106848145,
  "random_strength": 0.22177886638046754,
  "border_count": 89,
  "bootstrap_type": "Bayesian",
  "bagging_temperature": 3.003096688762532
}


CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=7]="CELL0751": Cannot convert 'CELL0751' to float