<a href="https://colab.research.google.com/github/dnstjr4567/mo2023-1/blob/main/%5BBaseline%5D_LSTM100.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

In [None]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from google.colab import drive
from tqdm.auto import tqdm

## Fixed Random Seed

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42) # Seed 고정

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
building = pd.read_csv('/content/drive/Shareddrives/전력예측/building_info.csv', encoding = 'utf-8')
test_df = pd.read_csv('/content/drive/Shareddrives/전력예측/test.csv', encoding = 'utf-8')
train_df = pd.read_csv('/content/drive/Shareddrives/전력예측/train.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('/content/drive/Shareddrives/전력예측/sample_submission.csv',encoding = 'utf-8')

## Data Load

In [None]:
test_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%)
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77
...,...,...,...,...,...,...,...
16795,100_20220831 19,100,20220831 19,22.5,0.0,0.9,84
16796,100_20220831 20,100,20220831 20,20.7,0.0,0.4,95
16797,100_20220831 21,100,20220831 21,20.2,0.0,0.4,98
16798,100_20220831 22,100,20220831 22,20.1,0.0,1.1,97


## Train Data Pre-processing

In [None]:
# 일조, 일사 열 제거
train_df = train_df.drop(['일조(hr)','일사(MJ/m2)'], axis=1)
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,986.4


In [None]:
# 결측치 확인
train_df.isna().sum()

num_date_time         0
건물번호                  0
일시                    0
기온(C)                 0
강수량(mm)          160069
풍속(m/s)              19
습도(%)                 9
전력소비량(kWh)            0
dtype: int64

In [None]:
# 강수량 결측치 0.0으로 채우기
train_df['강수량(mm)'].fillna(0.0, inplace=True)

# 풍속, 습도 결측치 평균으로 채우고 반올림하기
train_df['풍속(m/s)'].fillna(round(train_df['풍속(m/s)'].mean(),2), inplace=True)
train_df['습도(%)'].fillna(round(train_df['습도(%)'].mean(),2), inplace=True)

In [None]:
train_df['discomfort'] = 1.8*train_df["기온(C)"] - 0.55 * (1-train_df["습도(%)"]) * (1.8 * train_df["기온(C)"] - 26) + 32

In [None]:
train_df['month'] = train_df['일시'].apply(lambda x : float(x[4:6]))
train_df['day'] = train_df['일시'].apply(lambda x : float(x[6:8]))
train_df['time'] = train_df['일시'].apply(lambda x : float(x[9:11]))

In [None]:
# 순서 재배치
train_df = train_df[train_df.columns[:7].to_list() + train_df.columns[8:].to_list() + train_df.columns[7:8].to_list()]
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),discomfort,month,day,time,전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,234.154,6.0,1.0,0.0,1085.28
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,219.28,6.0,1.0,1.0,1047.36
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,205.672,6.0,1.0,2.0,974.88
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,167.011,6.0,1.0,3.0,953.76
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,229.592,6.0,1.0,4.0,986.4


## Hyperparameter Setting

In [None]:
input_size = 9  # feature의 개수
hidden_size = 64
num_layers = 2
output_size = 1
num_epochs = 5
window_size = 24  # 예측에 사용될 시간 윈도우 크기
batch_size = 64
learning_rate = 0.001

## Dataset

In [None]:
class TimeSeriesDataset(Dataset):
    def __init__(self, df, window_size):
        self.df = df
        self.window_size = window_size

    def __len__(self):
        return len(self.df) - self.window_size

    def __getitem__(self, idx):
        x = torch.tensor(self.df[idx:idx+self.window_size, :], dtype=torch.float)
        if self.df.shape[1] > 1:
            y = torch.tensor(self.df[idx+self.window_size, -1], dtype=torch.float)
        else:
            y = None
        return x, y

def create_data_loader(df, window_size, batch_size):
    dataset = TimeSeriesDataset(df, window_size)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    return data_loader

In [None]:
train = []

for i in range(0,len(train_df),2040):
  train.append(train_df[i:i+2041])

print(len(train))

train[0]

100


Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),discomfort,month,day,time,전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,234.154,6.0,1.0,0.0,1085.28
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,219.280,6.0,1.0,1.0,1047.36
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,205.672,6.0,1.0,2.0,974.88
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,167.011,6.0,1.0,3.0,953.76
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,229.592,6.0,1.0,4.0,986.40
...,...,...,...,...,...,...,...,...,...,...,...,...
2036,1_20220824 20,1,20220824 20,25.0,0.0,2.6,66.0,756.250,8.0,24.0,20.0,2208.96
2037,1_20220824 21,1,20220824 21,24.6,0.0,1.3,67.0,739.844,8.0,24.0,21.0,1806.24
2038,1_20220824 22,1,20220824 22,24.1,0.0,2.1,70.0,734.951,8.0,24.0,22.0,1387.20
2039,1_20220824 23,1,20220824 23,23.8,0.0,1.4,70.0,713.918,8.0,24.0,23.0,1925.28


In [None]:
# normalization
scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_df.drop(['num_date_time', '건물번호', '일시'], axis=1).values)
train_loader = create_data_loader(train_data, window_size, batch_size)

In [None]:
scaler = MinMaxScaler()
train_data = []
train_loader = []
for i in range(100):
    scaled_data = scaler.fit_transform(train[i].drop(['num_date_time', '건물번호', '일시'], axis=1).values)
    train_data.append(scaled_data)
    train_loader.append(create_data_loader(scaled_data, window_size, batch_size))

## Model Define

In [None]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])

        return out

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"current device: {device}")
model =[]
for i in range(100):
  model.append( LSTM(input_size, hidden_size, num_layers, output_size).to(device))

criterion = nn.MSELoss()
optimizer = []
for i in range(100):
  optimizer.append(torch.optim.Adam(model[i].parameters(), lr=learning_rate))

current device: cuda


## Train

In [None]:
outputs = []
losses = []

for j in range(100):
    for epoch in range(num_epochs):
        for i, (inputs, labels) in enumerate(train_loader[j]):
            inputs = inputs.to(device)
            labels = labels.unsqueeze(1).to(device)

            # Forward
            output = model[j](inputs)
            loss = criterion(output, labels)

            # Backward and optimize
            optimizer[j].zero_grad()
            loss.backward(retain_graph=True)  # Retain the graph for multiple backward passes
            optimizer[j].step()

            if (i+1) % 300 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                      .format(epoch+1, num_epochs, i+1, len(train_loader[j]), loss.item()))

        # Append loss and output for each epoch
        losses.append(loss.item())
        outputs.append(output)


In [None]:
"""outputs = []
losses = []
for j in range(100):
  for epoch in range(num_epochs):
      for i, (inputs, labels) in enumerate(train_loader[j]):
          inputs = inputs.to(device)
          labels = labels.unsqueeze(1).to(device)

        # Forward
          outputs.append(model(inputs))
          losses.append(criterion(outputs[j], labels))

        # Backward and optimize
          optimizer.zero_grad()
          losses[j].backward()
          optimizer.step()

          if (i+1) % 300 == 0:
              print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                     .format(epoch+1, num_epochs, i+1, len(train_loader[j]), loss.item()))"""

"outputs = []\nlosses = []\nfor j in range(100):\n  for epoch in range(num_epochs):\n      for i, (inputs, labels) in enumerate(train_loader[j]):\n          inputs = inputs.to(device)\n          labels = labels.unsqueeze(1).to(device)\n\n        # Forward\n          outputs.append(model(inputs))\n          losses.append(criterion(outputs[j], labels))\n\n        # Backward and optimize\n          optimizer.zero_grad()\n          losses[j].backward()\n          optimizer.step()\n\n          if (i+1) % 300 == 0:\n              print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'\n                     .format(epoch+1, num_epochs, i+1, len(train_loader[j]), loss.item()))"

## Test Data Pre-processing

In [None]:
# 학습 데이터에서 마지막 행 가져오기


# 실수형 데이터로 변환
test_df['습도(%)'] = test_df['습도(%)'].astype('float64')

# 날짜 데이터 추가
test_df['month'] = test_df['일시'].apply(lambda x : float(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : float(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : float(x[9:11]))
test_df['discomfort'] = 1.8*test_df["기온(C)"] - 0.55 * (1-test_df["습도(%)"]) * (1.8 * test_df["기온(C)"] - 26) + 32
# 전력소비량 열 생성
final_df = pd.concat((test_df.drop(['num_date_time', '건물번호', '일시',], axis=1), pd.DataFrame(np.zeros(test_df.shape[0]))),axis=1)
final_df = final_df.rename({0:'전력소비량(kWh)'},axis=1)

In [None]:
test_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time,discomfort
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72.0,8.0,25.0,0.0,710.815
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72.0,8.0,25.0,1.0,674.770
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75.0,8.0,25.0,2.0,677.662
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78.0,8.0,25.0,3.0,655.363
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77.0,8.0,25.0,4.0,624.672
...,...,...,...,...,...,...,...,...,...,...,...
16795,100_20220831 19,100,20220831 19,22.5,0.0,0.9,84.0,8.0,31.0,19.0,734.425
16796,100_20220831 20,100,20220831 20,20.7,0.0,0.4,95.0,8.0,31.0,20.0,651.402
16797,100_20220831 21,100,20220831 21,20.2,0.0,0.4,98.0,8.0,31.0,21.0,621.066
16798,100_20220831 22,100,20220831 22,20.1,0.0,1.1,97.0,8.0,31.0,22.0,605.684


In [None]:
final_df

Unnamed: 0,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time,discomfort,전력소비량(kWh)
0,23.5,0.0,2.2,72.0,8.0,25.0,0.0,710.815,0.0
1,23.0,0.0,0.9,72.0,8.0,25.0,1.0,674.770,0.0
2,22.7,0.0,1.5,75.0,8.0,25.0,2.0,677.662,0.0
3,22.1,0.0,1.3,78.0,8.0,25.0,3.0,655.363,0.0
4,21.8,0.0,1.0,77.0,8.0,25.0,4.0,624.672,0.0
...,...,...,...,...,...,...,...,...,...
16795,22.5,0.0,0.9,84.0,8.0,31.0,19.0,734.425,0.0
16796,20.7,0.0,0.4,95.0,8.0,31.0,20.0,651.402,0.0
16797,20.2,0.0,0.4,98.0,8.0,31.0,21.0,621.066,0.0
16798,20.1,0.0,1.1,97.0,8.0,31.0,22.0,605.684,0.0


## Test Dataset

In [None]:
# Dataset & DataLoader
"""test_dataset = TimeSeriesDataset(test_data, window_size)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)"""

'test_dataset = TimeSeriesDataset(test_data, window_size)\ntest_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)'

In [None]:
test = []
for i in range(0,len(test_df),168):
  test.append(test_df[i:i+169])

In [None]:

test_data = []
test_loader = []
for i in range(100):
    scaled_data = scaler.fit_transform(test[i].drop(['num_date_time', '건물번호', '일시'], axis=1).values)
    test_data.append(scaled_data)
    test_loader.append(create_data_loader(scaled_data, window_size, batch_size))

## Inference

In [None]:
train_data[0]

array([[0.17475728, 0.        , 0.10227273, ..., 0.        , 0.        ,
        0.07261945],
       [0.14563107, 0.        , 0.125     , ..., 0.        , 0.04347826,
        0.06377979],
       [0.13106796, 0.        , 0.17045455, ..., 0.        , 0.08695652,
        0.04688374],
       ...,
       [0.44174757, 0.        , 0.23863636, ..., 0.76666667, 0.95652174,
        0.14300101],
       [0.42718447, 0.        , 0.15909091, ..., 0.76666667, 1.        ,
        0.2684346 ],
       [0.17475728, 0.        , 0.10227273, ..., 0.        , 0.        ,
        0.09245272]])

In [None]:

test_predictions = []
for j in range(100):
  model[j].eval()
  with torch.no_grad():
      for i in range(test_data[j].shape[0] - window_size):
          x = torch.Tensor(test_data[j][i:i+window_size,:]).to(device)
          new_x = model[j](x.view(1,window_size,-1))

          test_data[i+window_size,-1] = new_x # 입력 업데이트
          test_predictions.append(new_x.detach().cpu().numpy().item()) # 예측 결과 저장

RuntimeError: ignored

## Submit

In [None]:
predictions = scaler.inverse_transform(test_data)[24:,-1] # 원래 scale로 복구

In [None]:
sample_submission['answer'] = predictions
sample_submission

In [None]:
sample_submission.to_csv('lstm_baseline3_submission.csv', index=False)