머신러닝 모델: XGBoost 구현

In [None]:
# ===== 1️⃣ 기본 설정 =====
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# ===== 2️⃣ 연도별 XGBoost 학습 함수 =====
def train_xgboost_by_year(BASE_PATH, year):
    """Train and evaluate an XGBoost regressor on one year's CSV data.

    All ``*.csv`` files directly inside ``<BASE_PATH>/<year>`` are read and
    concatenated. The ``date`` column is parsed into ``year``/``month``/
    ``day``/``hour`` features, rows without a ``generation`` value are
    dropped, and a model is fitted on a random 80/20 train/test split.
    MAE, RMSE and R² on the test split are printed, and the actual versus
    predicted test values are written to
    ``<BASE_PATH>/xgboost_result_<year>.csv``.

    Args:
        BASE_PATH: Directory containing one sub-directory per year.
        year: Year to train on; converted with ``str()`` to form the
            sub-directory name and the output file name.

    Returns:
        The fitted ``XGBRegressor``, or ``None`` (after printing a warning)
        when the year directory is missing, holds no CSV files, lacks the
        required ``date``/``generation`` columns, or has fewer than two
        usable rows.
    """
    year_path = os.path.join(BASE_PATH, str(year))
    # isdir rather than exists: a regular file that happens to be named like
    # the year would otherwise pass the check and make os.listdir fail.
    if not os.path.isdir(year_path):
        print(f"⚠ {year_path} 폴더가 존재하지 않습니다. 데이터를 먼저 다운로드하세요.")
        return None

    # Collect the CSV files. os.listdir returns entries in arbitrary order, so
    # sort them: the row order after concatenation feeds the seeded split and
    # must be stable for results to be reproducible.
    files = sorted(
        os.path.join(year_path, name)
        for name in os.listdir(year_path)
        if name.endswith(".csv")
    )
    if not files:
        print(f"⚠ {year}년 폴더에 CSV 파일이 존재하지 않습니다.")
        return None

    frames = [pd.read_csv(path, encoding='utf-8-sig') for path in files]
    data = pd.concat(frames, ignore_index=True)

    # Fail softly, like the checks above, when the expected schema is absent.
    missing_cols = [c for c in ('date', 'generation') if c not in data.columns]
    if missing_cols:
        print(f"⚠ {year}년 데이터에 필수 컬럼이 없습니다: {missing_cols}")
        return None

    # Preprocessing: derive calendar features from the timestamp.
    # Unparseable dates become NaT, so their derived features are NaN; such
    # rows are kept and the NaNs are passed to the model as missing values.
    data['date'] = pd.to_datetime(data['date'], errors='coerce')
    data['year'] = data['date'].dt.year
    data['month'] = data['date'].dt.month
    data['day'] = data['date'].dt.day
    data['hour'] = data['date'].dt.hour
    data = data.dropna(subset=['generation'])

    # A train/test split needs at least one row on each side.
    if len(data) < 2:
        print(f"⚠ {year}년 데이터에 학습 가능한 행이 부족합니다. (유효 행 수: {len(data)})")
        return None

    # Weather columns are optional; use only the ones present in the data.
    feature_cols = ['year','month','day','hour','temperature','humidity','radiation']
    feature_cols = [c for c in feature_cols if c in data.columns]
    X = data[feature_cols]
    y = data['generation']

    # NOTE(review): this is a shuffled random split of time-indexed rows;
    # confirm whether a chronological hold-out is wanted instead.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the XGBoost model.
    model = XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    # Predict on the held-out rows and compute the evaluation metrics.
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"\n📊 {year}년 모델 평가 결과")
    print(f"MAE  : {mae:.4f}")
    print(f"RMSE : {rmse:.4f}")
    print(f"R²   : {r2:.4f}")

    # Save actual vs. predicted values for the test split.
    OUTPUT_PATH = os.path.join(BASE_PATH, f"xgboost_result_{year}.csv")
    pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred}).to_csv(OUTPUT_PATH, index=False, encoding='utf-8-sig')
    print(f"💾 {year}년 예측 결과 저장 완료 → {OUTPUT_PATH}")

    return model