In [1]:
import random
import pandas as pd
import numpy as np
import os
import time
import datetime

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [3]:
train_df = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량\train.csv')
test_df = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량\test.csv')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204000 entries, 0 to 203999
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   num_date_time  204000 non-null  object 
 1   건물번호           204000 non-null  int64  
 2   일시             204000 non-null  object 
 3   기온(C)          204000 non-null  float64
 4   강수량(mm)        43931 non-null   float64
 5   풍속(m/s)        203981 non-null  float64
 6   습도(%)          203991 non-null  float64
 7   일조(hr)         128818 non-null  float64
 8   일사(MJ/m2)      116087 non-null  float64
 9   전력소비량(kWh)     204000 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 15.6+ MB


In [5]:
def SMAPE(y, pred):
    smape = abs((y - pred)) / ((abs(y) + abs(pred)) / 2) * 100
    smape = np.mean(smape)

    return smape

def mae(y, pred):

    return np.mean(abs(y - pred))

In [6]:
# 결측치 처리
train_df['풍속(m/s)'].fillna(train_df['풍속(m/s)'].mean(), inplace = True)
train_df['습도(%)'].fillna(train_df['습도(%)'].mean(), inplace = True)
train_df['강수량(mm)'].fillna(train_df['강수량(mm)'].mean(), inplace = True)
train_df['일조(hr)'].fillna(0, inplace = True)
train_df['일사(MJ/m2)'].fillna(0, inplace = True)

test_df['일조(hr)'] = 0
test_df['일사(MJ/m2)'] = 0
test_df['강수량(mm)'].fillna(test_df['강수량(mm)'].mean(), inplace = True)

In [7]:
def weekday(s):
    s = s.split()[0]
    date = datetime.datetime.strptime(s, '%Y%m%d')
    weekday = date.weekday()

    return weekday

In [8]:
# 공휴일 : [0601, 0606, 0815]
specialday = ['20220601', '20220606', '20220815']

In [9]:
train_df['month'] = train_df['일시'].apply(lambda x : int(x[4:6]))
train_df['day'] = train_df['일시'].apply(lambda x : int(x[6:8]))
train_df['time'] = train_df['일시'].apply(lambda x : int(x[9:11]))
train_df['6월'] = train_df['month'].apply(lambda x: 1 if x == 6 else 0)
train_df['7월'] = train_df['month'].apply(lambda x: 1 if x == 7 else 0)
train_df['8월'] = train_df['month'].apply(lambda x: 1 if x == 8 else 0)
train_df['낮'] = train_df['time'].apply(lambda x: 1 if (x >= 8 and x <= 19) else 0)
train_df['밤'] = train_df['time'].apply(lambda x: 1 if (x < 8 or x >= 19) else 0)
train_df['요일'] = train_df['일시'].apply(weekday)
train_df['specialday'] = train_df['일시'].apply(lambda x: 1 if x in specialday else 0)
train_df['THI'] = 9 / 5 * train_df['기온(C)'] - 0.55 * (1 - train_df['습도(%)'] / 100) * (9 / 5 * train_df['습도(%)'] - 26) + 32

test_df['month'] = test_df['일시'].apply(lambda x : int(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : int(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : int(x[9:11]))
test_df['6월'] = test_df['month'].apply(lambda x: 1 if x == 6 else 0)
test_df['7월'] = test_df['month'].apply(lambda x: 1 if x == 7 else 0)
test_df['8월'] = test_df['month'].apply(lambda x: 1 if x == 8 else 0)
test_df['낮'] = test_df['time'].apply(lambda x: 1 if (x >= 8 and x <= 19) else 0)
test_df['밤'] = test_df['time'].apply(lambda x: 1 if (x < 8 or x >= 19) else 0)
test_df['요일'] = test_df['일시'].apply(weekday)
test_df['specialday'] = 0
test_df['THI'] = 9 / 5 * test_df['기온(C)'] - 0.55 * (1 - test_df['습도(%)'] / 100) * (9 / 5 * test_df['습도(%)'] - 26) + 32

In [10]:
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),...,day,time,6월,7월,8월,낮,밤,요일,specialday,THI
0,1_20220601 00,1,20220601 00,18.6,2.04083,0.9,42.0,0.0,0.0,1085.28,...,1,0,1,0,0,0,1,2,0,49.6576
1,1_20220601 01,1,20220601 01,18.0,2.04083,1.1,45.0,0.0,0.0,1047.36,...,1,1,1,0,0,0,1,2,0,47.7625
2,1_20220601 02,1,20220601 02,17.7,2.04083,1.5,45.0,0.0,0.0,974.88,...,1,2,1,0,0,0,1,2,0,47.2225
3,1_20220601 03,1,20220601 03,16.7,2.04083,1.4,48.0,0.0,0.0,953.76,...,1,3,1,0,0,0,1,2,0,44.7856
4,1_20220601 04,1,20220601 04,18.4,2.04083,2.8,43.0,0.0,0.0,986.40,...,1,4,1,0,0,0,1,2,0,49.0061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,2.04083,0.9,86.0,0.5,0.0,881.04,...,24,19,0,0,1,1,1,2,0,63.6624
203996,100_20220824 20,100,20220824 20,22.4,2.04083,1.3,86.0,0.0,0.0,798.96,...,24,20,0,0,1,0,1,2,0,62.4024
203997,100_20220824 21,100,20220824 21,21.3,2.04083,1.0,92.0,0.0,0.0,825.12,...,24,21,0,0,1,0,1,2,0,64.1976
203998,100_20220824 22,100,20220824 22,21.0,2.04083,0.3,94.0,0.0,0.0,640.08,...,24,22,0,0,1,0,1,2,0,65.0744


In [13]:
for i in range(1, 101):
    df = train_df[train_df['건물번호'] == i]
    train_df.loc[df.index, '기온_gap'] = df['기온(C)'] - df.shift(1)['기온(C)']
    train_df.loc[df.index, '풍속_gap'] = df['풍속(m/s)'] - df.shift(1)['풍속(m/s)']
    train_df.loc[df.index, '습도_gap'] = df['습도(%)'] - df.shift(1)['습도(%)']

train_df['기온_gap'].fillna(train_df['기온_gap'].mean(), inplace = True)
train_df['풍속_gap'].fillna(train_df['풍속_gap'].mean(), inplace = True)
train_df['습도_gap'].fillna(train_df['습도_gap'].mean(), inplace = True)
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),...,7월,8월,낮,밤,요일,specialday,THI,기온_gap,풍속_gap,습도_gap
0,1_20220601 00,1,20220601 00,18.6,2.04083,0.9,42.0,0.0,0.0,1085.28,...,0,0,0,1,2,0,49.6576,0.002102,0.00004,0.015316
1,1_20220601 01,1,20220601 01,18.0,2.04083,1.1,45.0,0.0,0.0,1047.36,...,0,0,0,1,2,0,47.7625,-0.600000,0.20000,3.000000
2,1_20220601 02,1,20220601 02,17.7,2.04083,1.5,45.0,0.0,0.0,974.88,...,0,0,0,1,2,0,47.2225,-0.300000,0.40000,0.000000
3,1_20220601 03,1,20220601 03,16.7,2.04083,1.4,48.0,0.0,0.0,953.76,...,0,0,0,1,2,0,44.7856,-1.000000,-0.10000,3.000000
4,1_20220601 04,1,20220601 04,18.4,2.04083,2.8,43.0,0.0,0.0,986.40,...,0,0,0,1,2,0,49.0061,1.700000,1.40000,-5.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,2.04083,0.9,86.0,0.5,0.0,881.04,...,0,1,1,1,2,0,63.6624,-2.000000,-0.40000,12.000000
203996,100_20220824 20,100,20220824 20,22.4,2.04083,1.3,86.0,0.0,0.0,798.96,...,0,1,0,1,2,0,62.4024,-0.700000,0.40000,0.000000
203997,100_20220824 21,100,20220824 21,21.3,2.04083,1.0,92.0,0.0,0.0,825.12,...,0,1,0,1,2,0,64.1976,-1.100000,-0.30000,6.000000
203998,100_20220824 22,100,20220824 22,21.0,2.04083,0.3,94.0,0.0,0.0,640.08,...,0,1,0,1,2,0,65.0744,-0.300000,-0.70000,2.000000


In [14]:
for i in range(1, 101):
    df = test_df[test_df['건물번호'] == i]
    test_df.loc[df.index, '기온_gap'] = df['기온(C)'] - df.shift(1)['기온(C)']
    test_df.loc[df.index, '풍속_gap'] = df['풍속(m/s)'] - df.shift(1)['풍속(m/s)']
    test_df.loc[df.index, '습도_gap'] = df['습도(%)'] - df.shift(1)['습도(%)']

test_df['기온_gap'].fillna(test_df['기온_gap'].mean(), inplace = True)
test_df['풍속_gap'].fillna(test_df['풍속_gap'].mean(), inplace = True)
test_df['습도_gap'].fillna(test_df['습도_gap'].mean(), inplace = True)
test_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),month,...,7월,8월,낮,밤,요일,specialday,THI,기온_gap,풍속_gap,습도_gap
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72,0,0,8,...,0,1,0,1,3,0,58.3456,-0.009401,-0.004305,0.062455
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72,0,0,8,...,0,1,0,1,3,0,57.4456,-0.500000,-1.300000,0.000000
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75,0,0,8,...,0,1,0,1,3,0,57.8725,-0.300000,0.600000,3.000000
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78,0,0,8,...,0,1,0,1,3,0,57.9376,-0.600000,-0.200000,3.000000
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77,0,0,8,...,0,1,0,1,3,0,56.9961,-0.300000,-0.300000,-1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16795,100_20220831 19,100,20220831 19,22.5,0.0,0.9,84,0,0,8,...,0,1,1,1,2,0,61.4824,-1.300000,0.000000,7.000000
16796,100_20220831 20,100,20220831 20,20.7,0.0,0.4,95,0,0,8,...,0,1,0,1,2,0,65.2725,-1.800000,-0.500000,11.000000
16797,100_20220831 21,100,20220831 21,20.2,0.0,0.4,98,0,0,8,...,0,1,0,1,2,0,66.7056,-0.500000,0.000000,3.000000
16798,100_20220831 22,100,20220831 22,20.1,0.0,1.1,97,0,0,8,...,0,1,0,1,2,0,65.7281,-0.100000,0.700000,-1.000000


In [15]:
# 날짜 범위 설정
start_date_train = '20220601'
end_date_train = '20220820'
end_date_val = '20220824'

# 날짜 범위에 해당하는 인덱스 추출
train_indices = (train_df['일시'] >= start_date_train) & (train_df['일시'] <= end_date_train)
val_indices = (train_df['일시'] > end_date_train) & (train_df['일시'] <= end_date_val)

In [16]:
X_train = train_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'])
y_train = train_df['전력소비량(kWh)']
test_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)'], inplace = True)

In [17]:
X_train.head()

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time,6월,7월,8월,낮,밤,요일,specialday,THI,기온_gap,풍속_gap,습도_gap
0,1,18.6,2.04083,0.9,42.0,6,1,0,1,0,0,0,1,2,0,49.6576,0.002102,4e-05,0.015316
1,1,18.0,2.04083,1.1,45.0,6,1,1,1,0,0,0,1,2,0,47.7625,-0.6,0.2,3.0
2,1,17.7,2.04083,1.5,45.0,6,1,2,1,0,0,0,1,2,0,47.2225,-0.3,0.4,0.0
3,1,16.7,2.04083,1.4,48.0,6,1,3,1,0,0,0,1,2,0,44.7856,-1.0,-0.1,3.0
4,1,18.4,2.04083,2.8,43.0,6,1,4,1,0,0,0,1,2,0,49.0061,1.7,1.4,-5.0


In [18]:
# 훈련 및 검증 데이터 분할
X_train_split = X_train[train_indices]
X_val_split = X_train[val_indices]
y_train_split = y_train[train_indices]
y_val_split = y_train[val_indices]

In [19]:
# XGBRegressor
xgb_params = {
    'learning_rate': 0.07,
    'max_depth': 50,
    'colsample_bytree' : 0.8,
    'min_child_weight': 10,
    'n_estimators': 100
}

# LGBMRegressor
lgbm_params = {
    'learning_rate': 0.07,
    'max_depth': 50,
    'min_child_samples': 5,
    'num_leaves' : 100,
    'n_estimators': 100
}

# RandomForestRegressor
rf_params = {
    'n_estimators': 100,
    'max_depth': 50,
    'min_samples_split': 3,
    'min_samples_leaf': 3,
    'random_state': 42
}

# DecisionTreeRegressor
dt_params = {
    'max_depth': 50,
    'min_samples_split': 3,
    'min_samples_leaf': 3,
    'random_state': 42
}

In [20]:
models_by_building = {}

for building_num in range(1, 101):
    building_data = X_train_split[X_train_split['건물번호'] == building_num]
    building_target = y_train_split[X_train_split['건물번호'] == building_num]

    # 각 모델별로 생성 및 학습
    xgb_model = XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        **xgb_params
    )
    xgb_model.fit(building_data, building_target)

    lgbm_model = LGBMRegressor(
        objective='regression',
        random_state=42,
        **lgbm_params
    )
    lgbm_model.fit(building_data, building_target)

    rf_model = RandomForestRegressor(**rf_params)
    rf_model.fit(building_data, building_target)

    dt_model = DecisionTreeRegressor(**dt_params)
    dt_model.fit(building_data, building_target)

    models_by_building[building_num] = {
        'xgb': xgb_model,
        'lgbm': lgbm_model,
        'rf': rf_model,
        'dt': dt_model
    }


print("Training completed for all buildings.")

ensemble_preds_by_building = {}
for building_num in range(1, 101):
    building_data = X_val_split[X_val_split['건물번호'] == building_num]

    xgb_preds = models_by_building[building_num]['xgb'].predict(building_data)
    lgbm_preds = models_by_building[building_num]['lgbm'].predict(building_data)
    rf_preds = models_by_building[building_num]['rf'].predict(building_data)
    dt_preds = models_by_building[building_num]['dt'].predict(building_data)

    ensemble_preds = (rf_preds + xgb_preds + lgbm_preds + dt_preds) / 4
    ensemble_preds_by_building[building_num] = ensemble_preds

print("Prediction completed for all buildings.")

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 862
[LightGBM] [Info] Number of data points in the train set: 1920, number of used features: 17
[LightGBM] [Info] Start training from score 2643.372748
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 859
[LightGBM] [Info] Number of data points in the train set: 1920, number of used features: 17
[LightGBM] [Info] Start training from score 1676.138344
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 862
[LightGBM] [Info] Number of data points in the train set: 1920, number of used features: 17
[LightGBM] [Info] Start training from score 1448.643469
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 862


In [23]:
val_smape_best = 0
for building_num in range(1, 101):
    val_smape = SMAPE(y_val_split[X_val_split['건물번호'] == building_num], ensemble_preds_by_building[building_num])
    val_smape_best += val_smape

val_smape_best /= 100  # 전체 건물 수로 나누어 평균 계산
print("Average Validation SMAPE:", val_smape_best)

y_test_pred_best = np.array([])
for building_num in range(1, 101):
    test_building_data = test_df[test_df['건물번호'] == building_num]

    xgb_preds = models_by_building[building_num]['xgb'].predict(test_building_data)
    lgbm_preds = models_by_building[building_num]['lgbm'].predict(test_building_data)
    rf_preds = models_by_building[building_num]['rf'].predict(test_building_data)
    dt_preds = models_by_building[building_num]['dt'].predict(test_building_data)

    ensemble_preds = (rf_preds + xgb_preds + lgbm_preds + dt_preds) / 4
    y_test_pred_best = np.concatenate((y_test_pred_best, ensemble_preds))

Average Validation SMAPE: 5.979990655749459


In [24]:
submission = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량\sample_submission.csv')
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0
...,...,...
16795,100_20220831 19,0
16796,100_20220831 20,0
16797,100_20220831 21,0
16798,100_20220831 22,0


In [25]:
submission['answer'] = y_test_pred_best

In [26]:
submission.tail(60)

Unnamed: 0,num_date_time,answer
16740,100_20220829 12,859.552403
16741,100_20220829 13,832.340795
16742,100_20220829 14,858.639772
16743,100_20220829 15,877.114769
16744,100_20220829 16,889.770858
16745,100_20220829 17,900.797117
16746,100_20220829 18,858.18326
16747,100_20220829 19,897.362835
16748,100_20220829 20,757.162724
16749,100_20220829 21,707.937676


In [27]:
# submission.to_csv('C:/Users/dlwks/OneDrive/바탕 화면/VSCode/DACON_전력사용량/0828-2.csv', index=False)