In [None]:
# Scratch cell: prints a single empty line; nothing below depends on it.
print()

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np

# Load the training and test tables from CSV.
# NOTE(review): both locations are absolute, machine-specific paths, so this
# cell only runs where that exact folder exists.
train_csv_path = r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량\train.csv'
test_csv_path = r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량\test.csv'
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path)
)

# Handle missing values.
#
# Each column is filled and then assigned back to the frame. The previous form,
# `frame[col].fillna(..., inplace=True)`, operates on a column selection:
# pandas 2.x emits a FutureWarning for it and with Copy-on-Write enabled the
# frame itself is left unchanged, so the NaNs would silently survive.
#
# Training frame: rainfall / sunshine / solar radiation default to 0, while
# wind speed and humidity use the column mean (NaNs are skipped by `.mean()`).
# All fill values are computed up front, before any column is modified.
train_fill_values = {
    '강수량(mm)': 0,
    '풍속(m/s)': train_data['풍속(m/s)'].mean(),
    '습도(%)': train_data['습도(%)'].mean(),
    '일조(hr)': 0,
    '일사(MJ/m2)': 0,
}
for column_name, fill_value in train_fill_values.items():
    train_data[column_name] = train_data[column_name].fillna(fill_value)

# Test frame: only rainfall is imputed; wind speed and humidity are left as-is.
test_data['강수량(mm)'] = test_data['강수량(mm)'].fillna(0)

# Sunshine and solar radiation are set to a constant 0 for every test row.
# NOTE(review): presumably these columns are absent from test.csv and are
# created here so the test features match the training features — confirm.
test_data['일조(hr)'] = 0
test_data['일사(MJ/m2)'] = 0

# Derive calendar features from the timestamp column.
# The same steps apply to both frames: parse '일시' (format 'YYYYMMDD HH') into
# datetimes, then append year / month / day / hour columns in that order.
for frame in (train_data, test_data):
    parsed_timestamps = pd.to_datetime(frame['일시'], format='%Y%m%d %H')
    frame['일시'] = parsed_timestamps
    frame['연'] = parsed_timestamps.dt.year
    frame['월'] = parsed_timestamps.dt.month
    frame['일'] = parsed_timestamps.dt.day
    frame['시간'] = parsed_timestamps.dt.hour

# Remove columns that are not used as model inputs: the row identifier and the
# raw timestamp (its year/month/day/hour parts were extracted above).
unused_columns = ['num_date_time', '일시']
for frame in (train_data, test_data):
    frame.drop(columns=unused_columns, inplace=True)

# Separate the regression target from the input features.
target_column = '전력소비량(kWh)'
y_train = train_data[target_column]
X_train = train_data.drop(columns=[target_column])

# Standardize the features: the scaler is fitted on the training features and
# the same transformation is then applied to the test features.
#
# The test features are selected explicitly in training-column order.
# scikit-learn validates that a DataFrame passed to `transform` carries the
# same feature names, in the same order, as the frame used for `fit`; the test
# frame's columns are assembled separately (some are assigned by hand), so the
# alignment is made explicit here. A column missing from the test frame now
# fails with a KeyError naming it.
# NOTE(review): the scaler is fitted before the train/validation split below,
# so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(test_data[X_train.columns])

# Hold out 20% of the scaled training rows as a validation set (seeded split).
holdout_parts = train_test_split(
    X_train_scaled,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train_split, X_val_split, y_train_split, y_val_split = holdout_parts

# Hyperparameter grid for the search: 3 x 3 x 1 x 2 x 2 = 36 combinations.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    n_estimators=[5000],
    subsample=[0.7, 0.9],
    colsample_bytree=[0.7, 0.9],
)

# Build the XGBoost regressor.
# Early stopping is configured on the estimator: passing
# `early_stopping_rounds` to `fit()` has been deprecated since XGBoost 1.6 and
# is no longer accepted by recent releases, while the constructor argument is
# the supported form. GridSearchCV clones the estimator with its parameters,
# so every candidate keeps the same early-stopping setting.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    early_stopping_rounds=10,
)

# Exhaustive search over `param_grid` with 3-fold cross-validation, scored by
# negative RMSE, using all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_root_mean_squared_error',
)

# Run the search. The extra keyword arguments are forwarded to the regressor's
# `fit`, so each fit is monitored (and early-stopped) on the held-out
# validation split.
# NOTE(review): that same split is later used to report the validation RMSE,
# so the reported number is likely optimistic.
grid_search.fit(
    X_train_split,
    y_train_split,
    eval_set=[(X_val_split, y_val_split)],
    verbose=True,
)

# Report the winning hyperparameter combination.
print("Best Hyperparameters:", grid_search.best_params_)

# Score the best estimator found by the search on the validation split (RMSE).
best_model = grid_search.best_estimator_
y_val_pred_best = best_model.predict(X_val_split)
val_mse_best = mean_squared_error(y_val_split, y_val_pred_best)
val_rmse_best = np.sqrt(val_mse_best)
print("Validation RMSE with Best Model:", val_rmse_best)

# Predict on the scaled test features with the best estimator and write the
# predictions to CSV as a single column, without the index.
# NOTE(review): the output holds only the prediction column (the row identifier
# was dropped earlier) — confirm this matches the expected submission format.
submission_path = 'C:/Users/dlwks/OneDrive/바탕 화면/VSCode/DACON_전력사용량/xgb0811-3.csv'
y_test_pred_best = grid_search.best_estimator_.predict(X_test_scaled)
test_predictions_best = pd.DataFrame({'전력소비량(kWh)': y_test_pred_best})
test_predictions_best.to_csv(submission_path, index=False)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


KeyboardInterrupt: 