In [1]:
import random
import pandas as pd
import numpy as np
import os
import time
import datetime

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from tqdm import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [3]:
train_df = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량\train.csv')
test_df = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량\test.csv')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204000 entries, 0 to 203999
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   num_date_time  204000 non-null  object 
 1   건물번호           204000 non-null  int64  
 2   일시             204000 non-null  object 
 3   기온(C)          204000 non-null  float64
 4   강수량(mm)        43931 non-null   float64
 5   풍속(m/s)        203981 non-null  float64
 6   습도(%)          203991 non-null  float64
 7   일조(hr)         128818 non-null  float64
 8   일사(MJ/m2)      116087 non-null  float64
 9   전력소비량(kWh)     204000 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 15.6+ MB


In [5]:
def SMAPE(y, pred):
    smape = abs((y - pred)) / ((abs(y) + abs(pred)) / 2) * 100
    smape = np.mean(smape)

    return smape

def mae(y, pred):

    return np.mean(abs(y - pred))

In [6]:
# 결측치 처리
train_df['풍속(m/s)'].fillna(train_df['풍속(m/s)'].mean(), inplace = True)
train_df['습도(%)'].fillna(train_df['습도(%)'].mean(), inplace = True)
train_df['강수량(mm)'].fillna(train_df['강수량(mm)'].mean(), inplace = True)
train_df['일조(hr)'].fillna(0, inplace = True)
train_df['일사(MJ/m2)'].fillna(0, inplace = True)

test_df['일조(hr)'] = 0
test_df['일사(MJ/m2)'] = 0
test_df['강수량(mm)'].fillna(test_df['강수량(mm)'].mean(), inplace = True)

In [7]:
def weekday(s):
    s = s.split()[0]
    date = datetime.datetime.strptime(s, '%Y%m%d')
    weekday = date.weekday()

    return weekday

In [8]:
# 공휴일 : [0601, 0606, 0815]
specialday = ['20220601', '20220606', '20220815']

In [9]:
train_df['month'] = train_df['일시'].apply(lambda x : int(x[4:6]))
train_df['day'] = train_df['일시'].apply(lambda x : int(x[6:8]))
train_df['time'] = train_df['일시'].apply(lambda x : int(x[9:11]))
train_df['6월'] = train_df['month'].apply(lambda x: 1 if x == 6 else 0)
train_df['7월'] = train_df['month'].apply(lambda x: 1 if x == 7 else 0)
train_df['8월'] = train_df['month'].apply(lambda x: 1 if x == 8 else 0)
train_df['낮'] = train_df['time'].apply(lambda x: 1 if (x >= 8 and x <= 19) else 0)
train_df['밤'] = train_df['time'].apply(lambda x: 1 if (x < 8 or x >= 19) else 0)
train_df['요일'] = train_df['일시'].apply(weekday)
# train_df['specialday'] = train_df['일시'].apply(lambda x: 1 if x in specialday else 0)

test_df['month'] = test_df['일시'].apply(lambda x : int(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : int(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : int(x[9:11]))
test_df['6월'] = test_df['month'].apply(lambda x: 1 if x == 6 else 0)
test_df['7월'] = test_df['month'].apply(lambda x: 1 if x == 7 else 0)
test_df['8월'] = test_df['month'].apply(lambda x: 1 if x == 8 else 0)
test_df['낮'] = test_df['time'].apply(lambda x: 1 if (x >= 8 and x <= 19) else 0)
test_df['밤'] = test_df['time'].apply(lambda x: 1 if (x < 8 or x >= 19) else 0)
test_df['요일'] = test_df['일시'].apply(weekday)
# test_df['specialday'] = 0

In [10]:
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),month,day,time,6월,7월,8월,낮,밤
0,1_20220601 00,1,20220601 00,18.6,2.04083,0.9,42.0,0.0,0.0,1085.28,6,1,0,1,0,0,0,1
1,1_20220601 01,1,20220601 01,18.0,2.04083,1.1,45.0,0.0,0.0,1047.36,6,1,1,1,0,0,0,1
2,1_20220601 02,1,20220601 02,17.7,2.04083,1.5,45.0,0.0,0.0,974.88,6,1,2,1,0,0,0,1
3,1_20220601 03,1,20220601 03,16.7,2.04083,1.4,48.0,0.0,0.0,953.76,6,1,3,1,0,0,0,1
4,1_20220601 04,1,20220601 04,18.4,2.04083,2.8,43.0,0.0,0.0,986.40,6,1,4,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,2.04083,0.9,86.0,0.5,0.0,881.04,8,24,19,0,0,1,1,1
203996,100_20220824 20,100,20220824 20,22.4,2.04083,1.3,86.0,0.0,0.0,798.96,8,24,20,0,0,1,0,1
203997,100_20220824 21,100,20220824 21,21.3,2.04083,1.0,92.0,0.0,0.0,825.12,8,24,21,0,0,1,0,1
203998,100_20220824 22,100,20220824 22,21.0,2.04083,0.3,94.0,0.0,0.0,640.08,8,24,22,0,0,1,0,1


In [11]:
train_df.head(60)

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),month,day,time,6월,7월,8월,낮,밤
0,1_20220601 00,1,20220601 00,18.6,2.04083,0.9,42.0,0.0,0.0,1085.28,6,1,0,1,0,0,0,1
1,1_20220601 01,1,20220601 01,18.0,2.04083,1.1,45.0,0.0,0.0,1047.36,6,1,1,1,0,0,0,1
2,1_20220601 02,1,20220601 02,17.7,2.04083,1.5,45.0,0.0,0.0,974.88,6,1,2,1,0,0,0,1
3,1_20220601 03,1,20220601 03,16.7,2.04083,1.4,48.0,0.0,0.0,953.76,6,1,3,1,0,0,0,1
4,1_20220601 04,1,20220601 04,18.4,2.04083,2.8,43.0,0.0,0.0,986.4,6,1,4,1,0,0,0,1
5,1_20220601 05,1,20220601 05,17.2,2.04083,2.1,46.0,0.0,0.0,1087.2,6,1,5,1,0,0,0,1
6,1_20220601 06,1,20220601 06,16.3,2.04083,1.0,50.0,0.0,0.05,1314.72,6,1,6,1,0,0,0,1
7,1_20220601 07,1,20220601 07,17.4,2.04083,1.3,50.0,1.0,0.55,1684.8,6,1,7,1,0,0,0,1
8,1_20220601 08,1,20220601 08,20.6,2.04083,1.8,44.0,1.0,1.29,1976.16,6,1,8,1,0,0,1,0
9,1_20220601 09,1,20220601 09,23.2,2.04083,1.7,41.0,1.0,2.01,2289.12,6,1,9,1,0,0,1,0


In [12]:
train_df.tail(60)

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),month,day,time,6월,7월,8월,낮,밤
203940,100_20220822 12,100,20220822 12,29.7,2.04083,2.3,66.0,0.8,0.0,1250.88,8,22,12,0,0,1,1,0
203941,100_20220822 13,100,20220822 13,30.5,2.04083,1.8,62.0,0.7,0.0,1237.68,8,22,13,0,0,1,1,0
203942,100_20220822 14,100,20220822 14,31.4,2.04083,1.5,60.0,0.8,0.0,1272.0,8,22,14,0,0,1,1,0
203943,100_20220822 15,100,20220822 15,30.7,2.04083,2.1,62.0,0.6,0.0,1305.12,8,22,15,0,0,1,1,0
203944,100_20220822 16,100,20220822 16,29.1,2.04083,1.7,71.0,0.0,0.0,1309.44,8,22,16,0,0,1,1,0
203945,100_20220822 17,100,20220822 17,29.4,2.04083,2.4,67.0,0.9,0.0,1290.72,8,22,17,0,0,1,1,0
203946,100_20220822 18,100,20220822 18,28.0,2.04083,1.0,73.0,0.5,0.0,1139.52,8,22,18,0,0,1,1,0
203947,100_20220822 19,100,20220822 19,26.8,2.04083,1.2,79.0,0.0,0.0,1010.64,8,22,19,0,0,1,1,1
203948,100_20220822 20,100,20220822 20,26.3,2.04083,0.8,81.0,0.0,0.0,982.8,8,22,20,0,0,1,0,1
203949,100_20220822 21,100,20220822 21,25.3,2.04083,0.2,87.0,0.0,0.0,863.28,8,22,21,0,0,1,0,1


In [13]:
# 날짜 범위 설정
start_date_train = '20220601'
end_date_train = '20220812'
end_date_val = '20220824'

# 날짜 범위에 해당하는 인덱스 추출
train_indices = (train_df['일시'] >= start_date_train) & (train_df['일시'] <= end_date_train)
val_indices = (train_df['일시'] > end_date_train) & (train_df['일시'] <= end_date_val)

In [14]:
X_train = train_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', 'month', '전력소비량(kWh)'])
y_train = train_df['전력소비량(kWh)']
test_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', 'month'], inplace = True)

In [15]:
X_train.head()

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),day,time,6월,7월,8월,낮,밤
0,1,18.6,2.04083,0.9,42.0,1,0,1,0,0,0,1
1,1,18.0,2.04083,1.1,45.0,1,1,1,0,0,0,1
2,1,17.7,2.04083,1.5,45.0,1,2,1,0,0,0,1
3,1,16.7,2.04083,1.4,48.0,1,3,1,0,0,0,1
4,1,18.4,2.04083,2.8,43.0,1,4,1,0,0,0,1


In [16]:
# scaler = MinMaxScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(test_df[X_train.columns])

In [17]:
# 훈련 및 검증 데이터 분할
X_train_split = X_train[train_indices]
X_val_split = X_train[val_indices]
y_train_split = y_train[train_indices]
y_val_split = y_train[val_indices]

In [18]:
# xgb_params ={
#     'learning_rate': 0.05,
#     'max_depth': 50,
#     'min_child_weight': 15,
#     'n_estimators': 150,
# }

# xgb_model = XGBRegressor(
#     objective='reg:squarederror',
#     learning_rate = xgb_params['learning_rate'],
#     max_depth = xgb_params['max_depth'],
#     min_child_weight = xgb_params['min_child_weight'],
#     n_estimators = xgb_params['n_estimators'],
#     # subsample=params['subsample'],
#     # colsample_bytree=params['colsample_bytree'],
#     # gamma=params['gamma'],
#     # reg_alpha=params['reg_alpha'],
#     # reg_lambda=params['reg_lambda'],
#     random_state=42
# )

In [19]:
# models_by_building = {}

# for building_num in range(1, 101):
#     building_data = X_train_split[X_train_split['건물번호'] == building_num]
#     building_target = y_train_split[X_train['건물번호'] == building_num]

In [20]:
# # XGBRegressor
xgb_params = {
    'learning_rate': 0.1,
    'max_depth': 50,
    'min_child_weight': 15,
    'n_estimators': 100
}
# xgb_model = XGBRegressor(
#     objective='reg:squarederror',
#     random_state=42,
#     **xgb_params
# )

# # LGBMRegressor
lgbm_params = {
    'learning_rate': 0.1,
    'max_depth': 50,
    'min_child_samples': 15,
    'num_leaves' : 100,
    'n_estimators': 100
}
# lgbm_model = LGBMRegressor(
#     objective='regression',
#     random_state=42,
#     **lgbm_params
# )

# # CatBoostRegressor
# catboost_params = {
#     'learning_rate': 0.1,
#     'depth': 15,
#     'iterations': 100
# }
# catboost_model = CatBoostRegressor(
#     loss_function='RMSE',
#     random_seed=42,
#     **catboost_params
# )

rf_params = {
    'n_estimators': 100,
    'max_depth': 50,
    'min_samples_split': 5,
    'min_samples_leaf': 5,
    'random_state': 42
}
# rf_model = RandomForestRegressor(**rf_params)

In [21]:
# # xgb_model.fit(X_train_split, y_train_split)
# # lgbm_model.fit(X_train_split, y_train_split)
# # catboost_model.fit(X_train_split, y_train_split)
# # rf_model.fit(X_train_split, y_train_split)

# xgb_model.fit(building_data, building_target)
# lgbm_model.fit(building_data, building_target)
# catboost_model.fit(building_data, building_target) 
# rf_model.fit(building_data, building_target)

# models_by_building[building_num] = {
#     'xgb' : xgb_model,
#     'lgbm' : lgbm_model,
#     'catboost' : catboost_model,
#     'rf' : rf_model
# }

In [22]:
# # xgb_preds = xgb_model.predict(X_val_split)
# # lgbm_preds = lgbm_model.predict(X_val_split)
# # catboost_preds = catboost_model.predict(X_val_split)
# # rf_preds = rf_model.predict(X_val_split)

# # ensemble_preds = (rf_preds + xgb_preds + lgbm_preds + catboost_preds) / 4

# ensemble_preds_by_building = {}
# for building_num in range(1, 101):
#     building_data = X_val_split[X_val_split['건물번호'] == building_num]
    
#     xgb_preds = models_by_building[building_num]['xgb'].predict(building_data)
#     lgbm_preds = models_by_building[building_num]['lgbm'].predict(building_data)
#     catboost_preds = models_by_building[building_num]['catboost'].predict(building_data)
#     rf_preds = models_by_building[building_num]['rf'].predict(building_data)
    
#     ensemble_preds = (rf_preds + xgb_preds + lgbm_preds + catboost_preds) / 4
#     ensemble_preds_by_building[building_num] = ensemble_preds

# print("Prediction completed for all buildings.")

In [23]:
models_by_building = {}

for building_num in range(1, 101):
    building_data = X_train_split[X_train_split['건물번호'] == building_num]
    building_target = y_train_split[X_train_split['건물번호'] == building_num]

    # 각 모델별로 생성 및 학습
    xgb_model = XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        **xgb_params
    )
    xgb_model.fit(building_data, building_target)

    lgbm_model = LGBMRegressor(
        objective='regression',
        random_state=42,
        **lgbm_params
    )
    lgbm_model.fit(building_data, building_target)

    # catboost_model = CatBoostRegressor(
    #     loss_function='RMSE',
    #     random_seed=42,
    #     **catboost_params
    # )
    # catboost_model.fit(building_data, building_target)

    rf_model = RandomForestRegressor(**rf_params)
    rf_model.fit(building_data, building_target)

    models_by_building[building_num] = {
        'xgb': xgb_model,
        'lgbm': lgbm_model,
        # 'catboost': catboost_model,
        'rf': rf_model
    }

print("Training completed for all buildings.")

ensemble_preds_by_building = {}
for building_num in range(1, 101):
    building_data = X_val_split[X_val_split['건물번호'] == building_num]

    xgb_preds = models_by_building[building_num]['xgb'].predict(building_data)
    lgbm_preds = models_by_building[building_num]['lgbm'].predict(building_data)
    # catboost_preds = models_by_building[building_num]['catboost'].predict(building_data)
    rf_preds = models_by_building[building_num]['rf'].predict(building_data)

    ensemble_preds = (rf_preds + xgb_preds + lgbm_preds) / 3
    ensemble_preds_by_building[building_num] = ensemble_preds

print("Prediction completed for all buildings.")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 409
[LightGBM] [Info] Number of data points in the train set: 1728, number of used features: 11
[LightGBM] [Info] Start training from score 2615.237498
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 405
[LightGBM] [Info] Number of data points in the train set: 1728, number of used features: 11
[LightGBM] [Info] Start training from score 1658.905104
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 409
[LightGBM] [Info] Number of data points in the train set: 1728, number of used features: 11
[LightGBM] [Info] Start training from score 1440.353022
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 409
[LightGBM] [Info] Number of data points in the train set: 1728, number of used features: 11
[LightGBM] [Info] Start training from score 984.578698
You can set `force_col_wise=true` to remove t

In [24]:
val_smape_best = 0
for building_num in range(1, 101):
    val_smape = SMAPE(y_val_split[X_val_split['건물번호'] == building_num], ensemble_preds_by_building[building_num])
    val_smape_best += val_smape

val_smape_best /= 100  # 전체 건물 수로 나누어 평균 계산
print("Average Validation SMAPE:", val_smape_best)

y_test_pred_best = np.array([])
for building_num in range(1, 101):
    test_building_data = test_df[test_df['건물번호'] == building_num]

    xgb_preds = models_by_building[building_num]['xgb'].predict(test_building_data)
    lgbm_preds = models_by_building[building_num]['lgbm'].predict(test_building_data)
    # catboost_preds = models_by_building[building_num]['catboost'].predict(test_building_data)
    rf_preds = models_by_building[building_num]['rf'].predict(test_building_data)

    ensemble_preds = (rf_preds + xgb_preds + lgbm_preds) / 3
    y_test_pred_best = np.concatenate((y_test_pred_best, ensemble_preds))

Average Validation SMAPE: 12.405176920047603


In [25]:
# # y_val_pred_best = xgb_model.predict(X_val_split)

# val_smape_best = SMAPE(y_val_split, ensemble_preds)
# print(val_smape_best)

In [26]:
# y_test_pred_best = xgb_model.predict(test_df)

In [27]:
submission = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_전력사용량\sample_submission.csv')
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0
...,...,...
16795,100_20220831 19,0
16796,100_20220831 20,0
16797,100_20220831 21,0
16798,100_20220831 22,0


In [28]:
submission['answer'] = y_test_pred_best
submission.head(60)

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1921.089249
1,1_20220825 01,1842.800642
2,1_20220825 02,1797.927501
3,1_20220825 03,1811.086884
4,1_20220825 04,1838.076136
5,1_20220825 05,1931.928343
6,1_20220825 06,2161.651042
7,1_20220825 07,2348.825122
8,1_20220825 08,2707.828465
9,1_20220825 09,3080.359583


In [29]:
submission.tail(60)

Unnamed: 0,num_date_time,answer
16740,100_20220829 12,943.068396
16741,100_20220829 13,960.162589
16742,100_20220829 14,966.635062
16743,100_20220829 15,978.681584
16744,100_20220829 16,983.919839
16745,100_20220829 17,974.934935
16746,100_20220829 18,959.968415
16747,100_20220829 19,950.974852
16748,100_20220829 20,727.648668
16749,100_20220829 21,669.70764


In [30]:
submission.to_csv('C:/Users/dlwks/OneDrive/바탕 화면/VSCode/DACON_전력사용량/0823-2.csv', index=False)