In [None]:
import random
import pandas as pd
import numpy as np
import os
import time
import datetime

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [None]:
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`.

    PYTHONHASHSEED is written to ``os.environ`` as a string; the two RNGs are
    seeded through their module-level ``seed`` functions.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(42)  # fix the seed for reproducibility

In [None]:
# Load the training and test tables from their local CSV files.
train_csv_path = r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량\train.csv'
test_csv_path = r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량\test.csv'
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

In [None]:
# Print the training frame's column dtypes and non-null counts.
train_df.info()

In [None]:
def SMAPE(y, pred):
    """Symmetric mean absolute percentage error, in percent (0-200).

    Each element contributes ``|y - pred| / ((|y| + |pred|) / 2) * 100``.
    When both the actual value and the prediction are exactly 0 the ratio is
    0/0; that pair is a perfect prediction, so it contributes 0 instead of
    turning the whole mean into NaN.

    Parameters
    ----------
    y : array-like
        Actual values.
    pred : array-like
        Predicted values; compared with ``y`` position by position.

    Returns
    -------
    float
        Mean of the per-element percentages.
    """
    y = np.asarray(y, dtype=float)
    pred = np.asarray(pred, dtype=float)

    abs_err = np.abs(y - pred)
    half_scale = (np.abs(y) + np.abs(pred)) / 2

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0 they were initialised with.
    ratio = np.divide(abs_err, half_scale, out=np.zeros_like(abs_err), where=half_scale != 0)

    return np.mean(ratio * 100)

def mae(y, pred):
    """Mean absolute error: the average of ``|y - pred|`` over all elements."""
    abs_err = abs(y - pred)
    return np.mean(abs_err)

In [None]:
# Handle missing values.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection: that form is chained assignment,
# which pandas deprecates and which leaves the frame unchanged once
# Copy-on-Write is enabled.

# Train: wind speed, humidity and precipitation get their column mean.
for col in ['풍속(m/s)', '습도(%)', '강수량(mm)']:
    train_df[col] = train_df[col].fillna(train_df[col].mean())

# Train: missing sunshine / solar radiation values are treated as 0.
for col in ['일조(hr)', '일사(MJ/m2)']:
    train_df[col] = train_df[col].fillna(0)

# Test: sunshine and solar radiation are set to 0 for every row;
# precipitation gaps get the test-set column mean.
test_df['일조(hr)'] = 0
test_df['일사(MJ/m2)'] = 0
test_df['강수량(mm)'] = test_df['강수량(mm)'].fillna(test_df['강수량(mm)'].mean())

In [None]:
def weekday(s):
    """Return the day of week (Monday=0 ... Sunday=6) for a timestamp string.

    Only the first whitespace-separated token of ``s`` is used, and it must be
    a date in ``YYYYMMDD`` form.
    """
    date_token = s.split()[0]
    return datetime.datetime.strptime(date_token, '%Y%m%d').weekday()

In [None]:
# Public holidays in the data period, as YYYYMMDD strings: 06-01, 06-06, 08-15.
specialday = [f'2022{month_day}' for month_day in ('0601', '0606', '0815')]

In [None]:
def _add_calendar_and_thi_features(df):
    """Add calendar, day/night, holiday and THI columns to ``df`` in place.

    ``df['일시']`` is read as a string laid out as ``YYYYMMDD HH``: month, day
    and hour are sliced from fixed character positions, and the date is the
    first whitespace-separated token.

    Columns are added in a fixed order so that the train and test frames end
    up with identically ordered feature columns.
    """
    timestamp = df['일시']
    date_token = timestamp.str.split().str[0]

    df['month'] = timestamp.str[4:6].astype('int64')
    df['day'] = timestamp.str[6:8].astype('int64')
    df['time'] = timestamp.str[9:11].astype('int64')

    # One indicator column per month covered by the data (June-August).
    for month_no in (6, 7, 8):
        df[f'{month_no}월'] = (df['month'] == month_no).astype('int64')

    # NOTE(review): hour 19 is flagged as both day and night (8 <= h <= 19 and
    # h >= 19). Kept unchanged; confirm whether the overlap is intended.
    df['낮'] = df['time'].between(8, 19).astype('int64')
    df['밤'] = ((df['time'] < 8) | (df['time'] >= 19)).astype('int64')

    df['요일'] = timestamp.apply(weekday)

    # Match on the date token only. Comparing the full timestamp (which also
    # carries the hour) against date-only strings never matches, so the flag
    # would be 0 on every row.
    df['specialday'] = date_token.isin(specialday).astype('int64')

    # Temperature-humidity (discomfort) index:
    #   THI = 9/5*T - 0.55 * (1 - RH/100) * (9/5*T - 26) + 32
    # The last factor uses temperature, not humidity.
    scaled_temp = 9 / 5 * df['기온(C)']
    df['THI'] = scaled_temp - 0.55 * (1 - df['습도(%)'] / 100) * (scaled_temp - 26) + 32


# Apply the same feature engineering to both frames.
_add_calendar_and_thi_features(train_df)
_add_calendar_and_thi_features(test_df)

In [None]:
# Display the training frame (notebook rich repr) to inspect its columns.
train_df

In [None]:
# Row-to-row change of each weather variable within a building: the value
# minus the previous row's value for the same building number. The first row
# of every building has no predecessor and is filled with the mean gap.
# groupby().diff() covers every building present in the frame, and the result
# is assigned back instead of using fillna(inplace=True) on a column
# selection (chained assignment, which pandas deprecates).
for gap_col, source_col in [('기온_gap', '기온(C)'), ('풍속_gap', '풍속(m/s)'), ('습도_gap', '습도(%)')]:
    gap = train_df.groupby('건물번호')[source_col].diff()
    train_df[gap_col] = gap.fillna(gap.mean())

train_df

In [None]:
# Row-to-row change of each weather variable within a building for the test
# frame: the value minus the previous row's value for the same building
# number. The first row of every building has no predecessor and is filled
# with the mean gap of the test frame.
# groupby().diff() covers every building present in the frame, and the result
# is assigned back instead of using fillna(inplace=True) on a column
# selection (chained assignment, which pandas deprecates).
for gap_col, source_col in [('기온_gap', '기온(C)'), ('풍속_gap', '풍속(m/s)'), ('습도_gap', '습도(%)')]:
    gap = test_df.groupby('건물번호')[source_col].diff()
    test_df[gap_col] = gap.fillna(gap.mean())

test_df

In [None]:
# Date ranges (inclusive, YYYYMMDD) for the time-based train/validation split.
start_date_train = '20220601'
end_date_train = '20220821'
end_date_val = '20220824'

# Build boolean row masks for each range.
# Compare only the date token of the timestamp. The raw value also carries the
# hour, so a string such as '20220821 00' sorts AFTER '20220821'; comparing
# the raw value would push the last training day into validation and drop the
# last validation day from both sets.
date_token = train_df['일시'].str.split().str[0]
train_indices = (date_token >= start_date_train) & (date_token <= end_date_train)
val_indices = (date_token > end_date_train) & (date_token <= end_date_val)

In [None]:
# Identifier, raw timestamp and sunshine/radiation columns are not used as
# model inputs; the power-consumption column is the regression target.
non_feature_cols = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)']
target_col = '전력소비량(kWh)'

y_train = train_df[target_col]
X_train = train_df.drop(columns=non_feature_cols + [target_col])
test_df.drop(columns=non_feature_cols, inplace = True)

In [None]:
# Preview the first rows of the feature matrix.
X_train.head()

In [None]:
# 훈련 및 검증 데이터 분할
X_train_split, y_train_split = X_train.loc[train_indices], y_train.loc[train_indices]
X_val_split, y_val_split = X_train.loc[val_indices], y_train.loc[val_indices]

In [None]:
# Hyper-parameters for each regressor in the ensemble. Each dict is unpacked
# into the matching estimator's constructor.

# XGBRegressor
xgb_params = dict(
    learning_rate=0.07,
    max_depth=50,
    colsample_bytree=0.8,
    min_child_weight=10,
    n_estimators=100,
)

# LGBMRegressor
lgbm_params = dict(
    learning_rate=0.07,
    max_depth=50,
    min_child_samples=5,
    num_leaves=100,
    n_estimators=100,
)

# RandomForestRegressor
rf_params = dict(
    n_estimators=100,
    max_depth=50,
    min_samples_split=3,
    min_samples_leaf=3,
    random_state=42,
)

# DecisionTreeRegressor
dt_params = dict(
    max_depth=50,
    min_samples_split=3,
    min_samples_leaf=3,
    random_state=42,
)

In [None]:
def _fit_models_for_building(features, target):
    """Fit one XGBoost, LightGBM, random-forest and decision-tree regressor.

    Returns a dict keyed 'xgb', 'lgbm', 'rf', 'dt' holding the fitted
    estimators, all trained on the same ``features`` / ``target``.
    """
    fitted = {
        'xgb': XGBRegressor(objective='reg:squarederror', random_state=42, **xgb_params),
        'lgbm': LGBMRegressor(objective='regression', random_state=42, **lgbm_params),
        'rf': RandomForestRegressor(**rf_params),
        'dt': DecisionTreeRegressor(**dt_params),
    }
    for estimator in fitted.values():
        estimator.fit(features, target)
    return fitted


# Train a separate set of four models for every building number 1..100,
# using only that building's rows from the training split.
models_by_building = {}
for building_num in range(1, 101):
    in_building = X_train_split['건물번호'] == building_num
    models_by_building[building_num] = _fit_models_for_building(
        X_train_split[in_building], y_train_split[in_building]
    )

print("Training completed for all buildings.")

# Predict the validation split building by building; the ensemble output is
# the unweighted average of the four models' predictions.
ensemble_preds_by_building = {}
for building_num, fitted in models_by_building.items():
    val_rows = X_val_split[X_val_split['건물번호'] == building_num]

    xgb_preds = fitted['xgb'].predict(val_rows)
    lgbm_preds = fitted['lgbm'].predict(val_rows)
    rf_preds = fitted['rf'].predict(val_rows)
    dt_preds = fitted['dt'].predict(val_rows)

    ensemble_preds_by_building[building_num] = (rf_preds + xgb_preds + lgbm_preds + dt_preds) / 4

print("Prediction completed for all buildings.")

In [None]:
# Validation score: SMAPE is computed per building on the validation split and
# then averaged over the buildings that were scored.
building_nums = range(1, 101)
val_smapes = [
    SMAPE(y_val_split[X_val_split['건물번호'] == building_num], ensemble_preds_by_building[building_num])
    for building_num in building_nums
]
val_smape_best = sum(val_smapes) / len(val_smapes)
print("Average Validation SMAPE:", val_smape_best)

# Test predictions: each building's ensemble output is written back to the
# positions of that building's rows, so the result stays aligned with
# test_df's row order whatever order the buildings appear in. Rows of a
# building without a trained model would remain NaN.
y_test_pred_best = np.full(len(test_df), np.nan)
for building_num in building_nums:
    row_mask = (test_df['건물번호'] == building_num).to_numpy()
    test_building_data = test_df[row_mask]

    xgb_preds = models_by_building[building_num]['xgb'].predict(test_building_data)
    lgbm_preds = models_by_building[building_num]['lgbm'].predict(test_building_data)
    rf_preds = models_by_building[building_num]['rf'].predict(test_building_data)
    dt_preds = models_by_building[building_num]['dt'].predict(test_building_data)

    # Unweighted average of the four models.
    y_test_pred_best[row_mask] = (rf_preds + xgb_preds + lgbm_preds + dt_preds) / 4

In [None]:
# Read the sample submission file and display it.
submission_csv_path = r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량\sample_submission.csv'
submission = pd.read_csv(submission_csv_path)
submission

In [None]:
# Write the test-set ensemble predictions into the submission's answer column.
# NOTE(review): this is a positional assignment, so it assumes the prediction
# array is in the same row order as the submission file — confirm.
submission['answer'] = y_test_pred_best

In [None]:
# Inspect the last 60 rows of the filled submission.
submission.tail(60)

In [None]:
# submission.to_csv('C:/Users/dlwks/OneDrive/바탕 화면/VSCode/DACON_전력사용량/0828-2.csv', index=False)