In [157]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [158]:
import os
os.chdir('/content/drive/MyDrive/dacon/2023 전력 사용량 예측')

## Import

In [159]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

## Fixed Random Seed

In [160]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42) # Seed 고정

## Data Load

In [161]:
train_df = pd.read_csv('./train.csv')
building_info = pd.read_csv('./building_info.csv')
test_df = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')

In [162]:
train_df.shape

(204000, 10)

In [163]:
test_df.shape

(16800, 7)

In [164]:
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.4


In [165]:
building_info.head()

Unnamed: 0,건물번호,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,건물기타,110634.0,39570.0,-,-,-
1,2,건물기타,122233.47,99000.0,-,-,-
2,3,건물기타,171243.0,113950.0,40,-,-
3,4,건물기타,74312.98,34419.62,60,-,-
4,5,건물기타,205884.0,150000.0,-,2557,1000


In [166]:
test_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%)
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77


- 일조, 일사량 삭제
- building_info 건물번호 기준 train, test df 와 결합

## DATA **preprocessing**

In [167]:
train_df = train_df.drop(['일조(hr)','일사(MJ/m2)'], axis=1)
train_df.head(24)

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,986.4
5,1_20220601 05,1,20220601 05,17.2,,2.1,46.0,1087.2
6,1_20220601 06,1,20220601 06,16.3,,1.0,50.0,1314.72
7,1_20220601 07,1,20220601 07,17.4,,1.3,50.0,1684.8
8,1_20220601 08,1,20220601 08,20.6,,1.8,44.0,1976.16
9,1_20220601 09,1,20220601 09,23.2,,1.7,41.0,2289.12


In [168]:
train_df.isna().sum()

num_date_time         0
건물번호                  0
일시                    0
기온(C)                 0
강수량(mm)          160069
풍속(m/s)              19
습도(%)                 9
전력소비량(kWh)            0
dtype: int64

In [169]:
# 강수량. 풍속, 습도 결측치 처리
train_df['강수량(mm)'].fillna(0.0, inplace=True)
train_df['풍속(m/s)'].fillna(round(train_df['풍속(m/s)'].mean(), 2), inplace=True)
train_df['습도(%)'].fillna(round(train_df['습도(%)'].mean(), 2), inplace=True)

In [170]:
# 일시 month, day, time 으로 변환
train_df['month'] = train_df['일시'].apply(lambda x:float(x[4:6]))
train_df['day'] = train_df['일시'].apply(lambda x:float(x[6:8]))
train_df['time'] = train_df['일시'].apply(lambda x : float(x[9:11]))

train_df.drop('num_date_time', axis=1, inplace=True)

In [171]:
train_df = train_df[train_df.columns[:7].to_list() + train_df.columns[8:].to_list() + train_df.columns[7:8].to_list()]

In [172]:
train_df.head()

Unnamed: 0,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh),day,time,month
0,1,20220601 00,18.6,0.0,0.9,42.0,1085.28,1.0,0.0,6.0
1,1,20220601 01,18.0,0.0,1.1,45.0,1047.36,1.0,1.0,6.0
2,1,20220601 02,17.7,0.0,1.5,45.0,974.88,1.0,2.0,6.0
3,1,20220601 03,16.7,0.0,1.4,48.0,953.76,1.0,3.0,6.0
4,1,20220601 04,18.4,0.0,2.8,43.0,986.4,1.0,4.0,6.0


In [173]:
test_df.shape

(16800, 7)

In [174]:
test_df['습도(%)'] = test_df['습도(%)'].astype('float64')

# 날짜 데이터 추가
test_df['month'] = test_df['일시'].apply(lambda x : float(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : float(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : float(x[9:11]))

test_df.drop('num_date_time', axis=1, inplace=True)

In [175]:
test_df.head()

Unnamed: 0,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time
0,1,20220825 00,23.5,0.0,2.2,72.0,8.0,25.0,0.0
1,1,20220825 01,23.0,0.0,0.9,72.0,8.0,25.0,1.0
2,1,20220825 02,22.7,0.0,1.5,75.0,8.0,25.0,2.0
3,1,20220825 03,22.1,0.0,1.3,78.0,8.0,25.0,3.0
4,1,20220825 04,21.8,0.0,1.0,77.0,8.0,25.0,4.0



- building_info 와 결합

In [176]:
train_df = pd.merge(train_df, building_info, on='건물번호', how='left')
test_df = pd.merge(test_df, building_info, on='건물번호', how='left')

train_df.drop(['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)'], axis=1, inplace=True)
test_df.drop(['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)'], axis=1, inplace=True)

train_df.head()

Unnamed: 0,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh),day,time,month,건물유형,연면적(m2),냉방면적(m2)
0,1,20220601 00,18.6,0.0,0.9,42.0,1085.28,1.0,0.0,6.0,건물기타,110634.0,39570.0
1,1,20220601 01,18.0,0.0,1.1,45.0,1047.36,1.0,1.0,6.0,건물기타,110634.0,39570.0
2,1,20220601 02,17.7,0.0,1.5,45.0,974.88,1.0,2.0,6.0,건물기타,110634.0,39570.0
3,1,20220601 03,16.7,0.0,1.4,48.0,953.76,1.0,3.0,6.0,건물기타,110634.0,39570.0
4,1,20220601 04,18.4,0.0,2.8,43.0,986.4,1.0,4.0,6.0,건물기타,110634.0,39570.0


In [177]:
test_df.shape

(16800, 12)

In [178]:
from sklearn.preprocessing import OneHotEncoder

building_type_list = [
    '건물기타',
    '공공',
    '대학교',
    '데이터센터',
    '백화점및아울렛',
    '병원',
    '상용',
    '아파트',
    '연구소',
    '지식산업센터',
    '할인마트',
    '호텔및리조트'
]

one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_encoder.fit(train_df[['건물유형']])
building_df = pd.DataFrame(one_hot_encoder.transform(train_df[['건물유형']]), columns=building_type_list)
building_df.head()



Unnamed: 0,건물기타,공공,대학교,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [179]:
train_df.head()

Unnamed: 0,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh),day,time,month,건물유형,연면적(m2),냉방면적(m2)
0,1,20220601 00,18.6,0.0,0.9,42.0,1085.28,1.0,0.0,6.0,건물기타,110634.0,39570.0
1,1,20220601 01,18.0,0.0,1.1,45.0,1047.36,1.0,1.0,6.0,건물기타,110634.0,39570.0
2,1,20220601 02,17.7,0.0,1.5,45.0,974.88,1.0,2.0,6.0,건물기타,110634.0,39570.0
3,1,20220601 03,16.7,0.0,1.4,48.0,953.76,1.0,3.0,6.0,건물기타,110634.0,39570.0
4,1,20220601 04,18.4,0.0,2.8,43.0,986.4,1.0,4.0,6.0,건물기타,110634.0,39570.0


In [180]:
train_df = pd.concat([train_df.drop(['건물유형'], axis=1), building_df], axis=1)

In [181]:
train_df.head(10)

Unnamed: 0,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh),day,time,month,...,대학교,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트
0,1,20220601 00,18.6,0.0,0.9,42.0,1085.28,1.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,20220601 01,18.0,0.0,1.1,45.0,1047.36,1.0,1.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,20220601 02,17.7,0.0,1.5,45.0,974.88,1.0,2.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,20220601 03,16.7,0.0,1.4,48.0,953.76,1.0,3.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,20220601 04,18.4,0.0,2.8,43.0,986.4,1.0,4.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,20220601 05,17.2,0.0,2.1,46.0,1087.2,1.0,5.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1,20220601 06,16.3,0.0,1.0,50.0,1314.72,1.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1,20220601 07,17.4,0.0,1.3,50.0,1684.8,1.0,7.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1,20220601 08,20.6,0.0,1.8,44.0,1976.16,1.0,8.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1,20220601 09,23.2,0.0,1.7,41.0,2289.12,1.0,9.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [182]:
train_df.describe()

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh),day,time,month,연면적(m2),...,대학교,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트
count,204000.0,204000.0,204000.0,204000.0,204000.0,204000.0,204000.0,204000.0,204000.0,204000.0,...,204000.0,204000.0,204000.0,204000.0,204000.0,204000.0,204000.0,204000.0,204000.0,204000.0
mean,50.5,25.543524,0.439489,2.134031,78.716295,2451.036462,14.835294,11.5,6.929412,260887.5,...,0.08,0.05,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08
std,28.866141,3.779523,2.539497,1.358045,15.079775,2440.648858,8.446178,6.922204,0.793923,1273550.0,...,0.271294,0.217945,0.271294,0.271294,0.271294,0.271294,0.271294,0.271294,0.271294,0.271294
min,1.0,10.1,0.0,0.0,13.0,0.0,1.0,0.0,6.0,5578.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25.75,23.4,0.0,1.2,69.0,1085.76,8.0,5.75,6.0,61446.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50.5,25.7,0.0,2.0,81.0,1766.4,15.0,11.5,7.0,92640.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,75.25,28.1,0.0,2.8,91.0,2970.0,22.0,17.25,8.0,184813.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,100.0,37.1,92.2,13.3,100.0,25488.4,31.0,23.0,8.0,12872880.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [183]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204000 entries, 0 to 203999
Data columns (total 24 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   건물번호        204000 non-null  int64  
 1   일시          204000 non-null  object 
 2   기온(C)       204000 non-null  float64
 3   강수량(mm)     204000 non-null  float64
 4   풍속(m/s)     204000 non-null  float64
 5   습도(%)       204000 non-null  float64
 6   전력소비량(kWh)  204000 non-null  float64
 7   day         204000 non-null  float64
 8   time        204000 non-null  float64
 9   month       204000 non-null  float64
 10  연면적(m2)     204000 non-null  float64
 11  냉방면적(m2)    204000 non-null  float64
 12  건물기타        204000 non-null  float64
 13  공공          204000 non-null  float64
 14  대학교         204000 non-null  float64
 15  데이터센터       204000 non-null  float64
 16  백화점및아울렛     204000 non-null  float64
 17  병원          204000 non-null  float64
 18  상용          204000 non-null  float64
 19  아파

In [184]:
test_df.head()

Unnamed: 0,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time,건물유형,연면적(m2),냉방면적(m2)
0,1,20220825 00,23.5,0.0,2.2,72.0,8.0,25.0,0.0,건물기타,110634.0,39570.0
1,1,20220825 01,23.0,0.0,0.9,72.0,8.0,25.0,1.0,건물기타,110634.0,39570.0
2,1,20220825 02,22.7,0.0,1.5,75.0,8.0,25.0,2.0,건물기타,110634.0,39570.0
3,1,20220825 03,22.1,0.0,1.3,78.0,8.0,25.0,3.0,건물기타,110634.0,39570.0
4,1,20220825 04,21.8,0.0,1.0,77.0,8.0,25.0,4.0,건물기타,110634.0,39570.0


In [185]:
test_df.isnull().any()

건물번호        False
일시          False
기온(C)       False
강수량(mm)     False
풍속(m/s)     False
습도(%)       False
month       False
day         False
time        False
건물유형        False
연면적(m2)     False
냉방면적(m2)    False
dtype: bool

In [186]:
test_df.shape

(16800, 12)

In [187]:
test_building_df = pd.DataFrame(one_hot_encoder.transform(test_df[['건물유형']]), columns=building_type_list)
test_df = pd.concat([test_df.drop(['건물유형'], axis=1), test_building_df], axis=1)

In [188]:
test_df.shape

(16800, 23)

In [189]:
test_df.head()

Unnamed: 0,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time,연면적(m2),...,대학교,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트
0,1,20220825 00,23.5,0.0,2.2,72.0,8.0,25.0,0.0,110634.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,20220825 01,23.0,0.0,0.9,72.0,8.0,25.0,1.0,110634.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,20220825 02,22.7,0.0,1.5,75.0,8.0,25.0,2.0,110634.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,20220825 03,22.1,0.0,1.3,78.0,8.0,25.0,3.0,110634.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,20220825 04,21.8,0.0,1.0,77.0,8.0,25.0,4.0,110634.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [190]:
test_df.shape

(16800, 23)

## LSTM modeling

## Hyperparameter Setting

In [191]:
# 하이퍼파라미터
input_size = 22  # feature의 개수
hidden_size = 64
num_layers = 2
output_size = 1
num_epochs = 5
window_size = 24  # 예측에 사용될 시간 윈도우 크기
batch_size = 64
learning_rate = 0.001

In [192]:
class TimeSeriesDataset(Dataset):
  def __init__(self, df, window_size):
    self.df = df
    self.window_size = window_size

  def __len__(self):
    return len(self.df) - self.window_size

  def __getitem__(self, idx):
    x = torch.tensor(self.df[idx:idx+self.window_size, :], dtype=torch.float)

    if self.df.shape[1] > 1:
      y = torch.tensor(self.df[idx+self.window_size, -1], dtype=torch.float)
    else:
      y = None
    return x, y

def create_data_loader(df, window_size, batch_size):
  dataset = TimeSeriesDataset(df, window_size)
  data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
  return data_loader

- train data 전처리

In [193]:
num_scaler = MinMaxScaler()
num_feats = ['기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)', 'month', 'day', 'time', '연면적(m2)', '냉방면적(m2)', '전력소비량(kWh)']
non_num_feats = ['건물번호', '일시'] + building_type_list
train_data = train_df.drop(non_num_feats+['전력소비량(kWh)'], axis=1)
train_data = pd.concat([train_data, train_df[building_type_list], train_df['전력소비량(kWh)']], axis=1)
train_data[num_feats] = num_scaler.fit_transform(train_data[num_feats])

In [194]:
train_data.head()

Unnamed: 0,기온(C),강수량(mm),풍속(m/s),습도(%),day,time,month,연면적(m2),냉방면적(m2),건물기타,...,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트,전력소비량(kWh)
0,0.314815,0.0,0.067669,0.333333,0.0,0.0,0.0,0.008165,0.003616,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042579
1,0.292593,0.0,0.082707,0.367816,0.0,0.043478,0.0,0.008165,0.003616,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041092
2,0.281481,0.0,0.112782,0.367816,0.0,0.086957,0.0,0.008165,0.003616,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038248
3,0.244444,0.0,0.105263,0.402299,0.0,0.130435,0.0,0.008165,0.003616,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037419
4,0.307407,0.0,0.210526,0.344828,0.0,0.173913,0.0,0.008165,0.003616,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0387


In [195]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204000 entries, 0 to 203999
Data columns (total 22 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   기온(C)       204000 non-null  float64
 1   강수량(mm)     204000 non-null  float64
 2   풍속(m/s)     204000 non-null  float64
 3   습도(%)       204000 non-null  float64
 4   day         204000 non-null  float64
 5   time        204000 non-null  float64
 6   month       204000 non-null  float64
 7   연면적(m2)     204000 non-null  float64
 8   냉방면적(m2)    204000 non-null  float64
 9   건물기타        204000 non-null  float64
 10  공공          204000 non-null  float64
 11  대학교         204000 non-null  float64
 12  데이터센터       204000 non-null  float64
 13  백화점및아울렛     204000 non-null  float64
 14  병원          204000 non-null  float64
 15  상용          204000 non-null  float64
 16  아파트         204000 non-null  float64
 17  연구소         204000 non-null  float64
 18  지식산업센터      204000 non-null  float64
 19  할인

In [196]:
train_data.isnull().any()

기온(C)         False
강수량(mm)       False
풍속(m/s)       False
습도(%)         False
day           False
time          False
month         False
연면적(m2)       False
냉방면적(m2)      False
건물기타          False
공공            False
대학교           False
데이터센터         False
백화점및아울렛       False
병원            False
상용            False
아파트           False
연구소           False
지식산업센터        False
할인마트          False
호텔및리조트        False
전력소비량(kWh)    False
dtype: bool

In [197]:
train_loader = create_data_loader(train_data.values, window_size, batch_size)

In [198]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])

        return out

In [199]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"current device: {device}")

model = LSTM(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

current device: cuda


In [200]:
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.unsqueeze(1).to(device)

        # Forward
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 300 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, len(train_loader), loss.item()))

Epoch [1/5], Step [300/3188], Loss: 0.0006
Epoch [1/5], Step [600/3188], Loss: 0.0004
Epoch [1/5], Step [900/3188], Loss: 0.0002
Epoch [1/5], Step [1200/3188], Loss: 0.0018
Epoch [1/5], Step [1500/3188], Loss: 0.0213
Epoch [1/5], Step [1800/3188], Loss: 0.0016
Epoch [1/5], Step [2100/3188], Loss: 0.0000
Epoch [1/5], Step [2400/3188], Loss: 0.0001
Epoch [1/5], Step [2700/3188], Loss: 0.0000
Epoch [1/5], Step [3000/3188], Loss: 0.0045
Epoch [2/5], Step [300/3188], Loss: 0.0008
Epoch [2/5], Step [600/3188], Loss: 0.0004
Epoch [2/5], Step [900/3188], Loss: 0.0004
Epoch [2/5], Step [1200/3188], Loss: 0.0019
Epoch [2/5], Step [1500/3188], Loss: 0.0245
Epoch [2/5], Step [1800/3188], Loss: 0.0039
Epoch [2/5], Step [2100/3188], Loss: 0.0000
Epoch [2/5], Step [2400/3188], Loss: 0.0001
Epoch [2/5], Step [2700/3188], Loss: 0.0002
Epoch [2/5], Step [3000/3188], Loss: 0.0036
Epoch [3/5], Step [300/3188], Loss: 0.0017
Epoch [3/5], Step [600/3188], Loss: 0.0004
Epoch [3/5], Step [900/3188], Loss: 0.00

In [201]:
train_df.head()

Unnamed: 0,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh),day,time,month,...,대학교,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트
0,1,20220601 00,18.6,0.0,0.9,42.0,1085.28,1.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,20220601 01,18.0,0.0,1.1,45.0,1047.36,1.0,1.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,20220601 02,17.7,0.0,1.5,45.0,974.88,1.0,2.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,20220601 03,16.7,0.0,1.4,48.0,953.76,1.0,3.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,20220601 04,18.4,0.0,2.8,43.0,986.4,1.0,4.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [202]:
train_data.head()

Unnamed: 0,기온(C),강수량(mm),풍속(m/s),습도(%),day,time,month,연면적(m2),냉방면적(m2),건물기타,...,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트,전력소비량(kWh)
0,0.314815,0.0,0.067669,0.333333,0.0,0.0,0.0,0.008165,0.003616,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042579
1,0.292593,0.0,0.082707,0.367816,0.0,0.043478,0.0,0.008165,0.003616,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041092
2,0.281481,0.0,0.112782,0.367816,0.0,0.086957,0.0,0.008165,0.003616,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038248
3,0.244444,0.0,0.105263,0.402299,0.0,0.130435,0.0,0.008165,0.003616,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037419
4,0.307407,0.0,0.210526,0.344828,0.0,0.173913,0.0,0.008165,0.003616,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0387


In [203]:
last_train_data = train_data.loc[204000-24:, :]

test_data = test_df.drop(non_num_feats, axis=1)
test_data = pd.concat([test_data, test_df[building_type_list]], axis=1)

final_df = pd.concat((test_data, pd.DataFrame(np.zeros(test_data.shape[0]))), axis=1)
final_df = final_df.rename({0:'전력소비량(kWh)'}, axis=1)

final_df[num_feats] = num_scaler.transform(final_df[num_feats])

test_data = pd.concat([last_train_data, final_df]).reset_index(drop=True)

test_data.head()

#train_df 에서 새로 받아서  preprocessing

Unnamed: 0,기온(C),강수량(mm),풍속(m/s),습도(%),day,time,month,연면적(m2),냉방면적(m2),건물기타,...,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트,전력소비량(kWh)
0,0.444444,0.0,0.045113,0.91954,0.766667,0.0,1.0,0.004035,0.003659,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.017956
1,0.411111,0.0,0.0,0.988506,0.766667,0.043478,1.0,0.004035,0.003659,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.015951
2,0.4,0.0,0.015038,0.977011,0.766667,0.086957,1.0,0.004035,0.003659,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.015405
3,0.422222,0.0,0.090226,0.908046,0.766667,0.130435,1.0,0.004035,0.003659,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.014849
4,0.422222,0.0,0.135338,0.908046,0.766667,0.173913,1.0,0.004035,0.003659,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.014877


In [204]:
test_data.shape

(16824, 22)

In [205]:
test_dataset = TimeSeriesDataset(test_data.values, window_size)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [206]:
model.eval()

test_predictions = []

with torch.no_grad():
    for i in range(test_data.values.shape[0] - window_size):
        x = torch.Tensor(test_data.values[i:i+window_size,:]).to(device)
        new_x = model(x.view(1,window_size,-1))

        test_data.values[i+window_size,-1] = new_x # 입력 업데이트
        test_predictions.append(new_x.detach().cpu().numpy().item()) # 예측 결과 저장

In [207]:
test_data.head()

Unnamed: 0,기온(C),강수량(mm),풍속(m/s),습도(%),day,time,month,연면적(m2),냉방면적(m2),건물기타,...,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트,전력소비량(kWh)
0,0.444444,0.0,0.045113,0.91954,0.766667,0.0,1.0,0.004035,0.003659,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.017956
1,0.411111,0.0,0.0,0.988506,0.766667,0.043478,1.0,0.004035,0.003659,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.015951
2,0.4,0.0,0.015038,0.977011,0.766667,0.086957,1.0,0.004035,0.003659,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.015405
3,0.422222,0.0,0.090226,0.908046,0.766667,0.130435,1.0,0.004035,0.003659,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.014849
4,0.422222,0.0,0.135338,0.908046,0.766667,0.173913,1.0,0.004035,0.003659,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.014877


In [208]:
test_data[num_feats] = num_scaler.inverse_transform(test_data[num_feats])

In [209]:
test_data.head()

Unnamed: 0,기온(C),강수량(mm),풍속(m/s),습도(%),day,time,month,연면적(m2),냉방면적(m2),건물기타,...,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트,전력소비량(kWh)
0,22.1,0.0,0.6,93.0,24.0,0.0,8.0,57497.84,40035.23,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,457.68
1,21.2,0.0,0.0,99.0,24.0,1.0,8.0,57497.84,40035.23,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,406.56
2,20.9,0.0,0.2,98.0,24.0,2.0,8.0,57497.84,40035.23,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,392.64
3,21.5,0.0,1.2,92.0,24.0,3.0,8.0,57497.84,40035.23,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,378.48
4,21.5,0.0,1.8,92.0,24.0,4.0,8.0,57497.84,40035.23,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,379.2


In [215]:
predictions = test_data.values[24:,-1]

In [216]:
predictions.shape

(16800,)

In [217]:
sample_submission['answer'] = predictions
sample_submission.head()

Unnamed: 0,num_date_time,answer
0,1_20220825 00,476.790792
1,1_20220825 01,1044.260448
2,1_20220825 02,1120.820754
3,1_20220825 03,1154.199216
4,1_20220825 04,1256.870003


In [218]:
sample_submission.to_csv('0730lstm_submission.csv', index=False)