<a href="https://colab.research.google.com/github/dnstjr4567/mo2023-1/blob/main/%5Bchange%5D_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

In [None]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from google.colab import drive
from tqdm.auto import tqdm

## Fixed Random Seed

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, the ``PYTHONHASHSEED`` environment
    variable, NumPy and PyTorch (CPU and every CUDA device), and configures
    cuDNN for deterministic execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick kernels by timing them, which can change
    # from run to run; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(42)  # fix the seed

In [None]:
# Mount Google Drive at /content/drive; the CSV files read below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the building info, test, train and sample-submission CSV files (UTF-8)
# from the mounted shared drive.
building = pd.read_csv('/content/drive/Shareddrives/전력예측/building_info.csv', encoding = 'utf-8')
test_df = pd.read_csv('/content/drive/Shareddrives/전력예측/test.csv', encoding = 'utf-8')
train_df = pd.read_csv('/content/drive/Shareddrives/전력예측/train.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('/content/drive/Shareddrives/전력예측/sample_submission.csv',encoding = 'utf-8')

## Data Load

In [None]:
# Show the first rows of the training frame.
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.4


## Train Data Pre-processing

In [None]:
# Remove the sunshine-duration and solar-radiation columns.
sunshine_columns = ['일조(hr)', '일사(MJ/m2)']
train_df = train_df.drop(columns=sunshine_columns)
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,986.4


In [None]:
# Discomfort (temperature-humidity) index:
#   DI = 1.8*T - 0.55*(1 - RH)*(1.8*T - 26) + 32, with RH as a fraction in [0, 1].
# The humidity column is a percentage, so it is divided by 100. Missing humidity
# is replaced by the rounded column mean first so the index contains no NaN.
humidity_pct = train_df["습도(%)"].fillna(round(train_df["습도(%)"].mean(), 2))
scaled_temp = 1.8 * train_df["기온(C)"]
train_df['discomfort'] = scaled_temp - 0.55 * (1 - humidity_pct / 100) * (scaled_temp - 26) + 32

In [None]:
# Discomfort (temperature-humidity) index:
#   DI = 1.8*T - 0.55*(1 - RH)*(1.8*T - 26) + 32, with RH as a fraction in [0, 1].
# The humidity column is a percentage, so it is divided by 100.
test_scaled_temp = 1.8 * test_df["기온(C)"]
test_df['discomfort'] = test_scaled_temp - 0.55 * (1 - test_df["습도(%)"] / 100) * (test_scaled_temp - 26) + 32

In [None]:
# Count the missing values in each column.
train_df.isna().sum()

num_date_time         0
건물번호                  0
일시                    0
기온(C)                 0
강수량(mm)          160069
풍속(m/s)              19
습도(%)                 9
전력소비량(kWh)            0
discomfort            9
dtype: int64

In [None]:
# Rainfall: fill missing readings with 0.0.
# Assigning the result back (instead of fillna(inplace=True) on a column
# selection) avoids chained assignment, which does not reliably update the frame.
train_df['강수량(mm)'] = train_df['강수량(mm)'].fillna(0.0)

# Wind speed and humidity: fill with the column mean rounded to two decimals.
train_df['풍속(m/s)'] = train_df['풍속(m/s)'].fillna(round(train_df['풍속(m/s)'].mean(), 2))
train_df['습도(%)'] = train_df['습도(%)'].fillna(round(train_df['습도(%)'].mean(), 2))

# The discomfort index is derived from humidity, so it is missing wherever
# humidity was missing; fill it too, otherwise the NaN values reach the model input.
train_df['discomfort'] = train_df['discomfort'].fillna(round(train_df['discomfort'].mean(), 2))

In [None]:
# Parse month, day and hour out of the fixed-width timestamp string.
for part_name, char_span in (('month', slice(4, 6)), ('day', slice(6, 8)), ('time', slice(9, 11))):
    train_df[part_name] = train_df['일시'].map(lambda stamp, span=char_span: float(stamp[span]))

In [None]:
# Parse month, day and hour out of the fixed-width timestamp string.
for part_name, char_span in (('month', slice(4, 6)), ('day', slice(6, 8)), ('time', slice(9, 11))):
    test_df[part_name] = test_df['일시'].map(lambda stamp, span=char_span: float(stamp[span]))

In [None]:
# 순서 재배치
train_df = train_df[train_df.columns[:7].to_list() + train_df.columns[8:].to_list() + train_df.columns[7:8].to_list()]
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),discomfort,month,day,time,전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,234.154,6.0,1.0,0.0,1085.28
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,219.28,6.0,1.0,1.0,1047.36
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,205.672,6.0,1.0,2.0,974.88
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,167.011,6.0,1.0,3.0,953.76
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,229.592,6.0,1.0,4.0,986.4


In [None]:
# Give the test frame the same column order as the training frame so the
# feature positions fed to the scaler and the model line up. Columns that only
# exist in the test frame are kept at the end.
shared_columns = [col for col in train_df.columns if col in test_df.columns]
extra_columns = [col for col in test_df.columns if col not in train_df.columns]
# .copy() makes test_df an independent frame, so later column assignments do
# not trigger SettingWithCopyWarning.
test_df = test_df[shared_columns + extra_columns].copy()
test_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time,discomfort
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72,8.0,25.0,0.0,710.815
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72,8.0,25.0,1.0,674.77
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75,8.0,25.0,2.0,677.662
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78,8.0,25.0,3.0,655.363
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77,8.0,25.0,4.0,624.672


In [None]:
# Add a placeholder target column filled with 0. assign() returns a new frame,
# so nothing is written into a slice of another frame (the cause of
# SettingWithCopyWarning).
test_df = test_df.assign(**{'전력소비량(kWh)': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['전력소비량(kWh)'] = 0


In [None]:
# Show the first rows of the test frame, now including the placeholder target column.
test_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time,discomfort,전력소비량(kWh)
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72,8.0,25.0,0.0,710.815,0
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72,8.0,25.0,1.0,674.77,0
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75,8.0,25.0,2.0,677.662,0
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78,8.0,25.0,3.0,655.363,0
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77,8.0,25.0,4.0,624.672,0


## Hyperparameter Setting

In [None]:
# Hyperparameters used by the LSTM model, the data loaders and the training loop.
input_size = 9  # number of features
hidden_size = 64
num_layers = 2
output_size = 1
num_epochs = 5
window_size = 24  # size of the time window used for prediction
batch_size = 64
learning_rate = 0.001

## Dataset

In [None]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over a 2-D table of time-ordered rows.

    Item ``idx`` is the pair ``(x, y)``: ``x`` holds the ``window_size`` rows
    starting at ``idx`` (all columns) and ``y`` is the last column of the row
    that immediately follows the window.

    Args:
        df: Array-like of shape (rows, columns): a NumPy array, a nested list
            or a numeric DataFrame. A 1-D input is treated as a single column.
        window_size: Number of consecutive rows in each input window.
    """

    def __init__(self, df, window_size):
        # np.asarray returns an ndarray input unchanged (no copy) and converts
        # nested lists / DataFrames, so the 2-D slicing below always works.
        data = np.asarray(df)
        if data.ndim == 1:
            data = data.reshape(-1, 1)
        self.df = data
        self.window_size = window_size

    def __len__(self):
        # Never report a negative length when there are fewer rows than one window.
        return max(0, len(self.df) - self.window_size)

    def __getitem__(self, idx):
        x = torch.tensor(self.df[idx:idx+self.window_size, :], dtype=torch.float)
        if self.df.shape[1] > 1:
            y = torch.tensor(self.df[idx+self.window_size, -1], dtype=torch.float)
        else:
            # Single-column data has no separate target column.
            # NOTE(review): a None target presumably cannot be batched by the
            # default DataLoader collate function - confirm before relying on it.
            y = None
        return x, y

def create_data_loader(df, window_size, batch_size):
    """Wrap ``df`` in a TimeSeriesDataset and return a DataLoader over it.

    Args:
        df: Data handed to TimeSeriesDataset.
        window_size: Window length handed to TimeSeriesDataset.
        batch_size: Number of samples per batch.

    Returns:
        A DataLoader that yields batches in their original order (no shuffling).
    """
    return DataLoader(
        TimeSeriesDataset(df, window_size),
        batch_size=batch_size,
        shuffle=False,
    )

In [None]:
# normalization: fit the scaler on all training features (id columns removed)
scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_df.drop(['num_date_time', '건물번호', '일시'], axis=1).values)

# Group the training rows by building number.
grouped_data = train_df.groupby('건물번호')

# Slot k holds building k + 1. At least 100 slots are allocated because later
# cells iterate over 100 buildings.
num_buildings = max(100, int(train_df['건물번호'].max()))
train_group = [[] for _ in range(num_buildings)]
train_loader = [[] for _ in range(num_buildings)]
for building_number, group_data in grouped_data:
    building_features = group_data.drop(['num_date_time', '건물번호', '일시'], axis=1).values
    # Scale each building with the scaler fitted above and keep the result as
    # an ndarray: TimeSeriesDataset slices its data with [rows, columns]
    # indexing, which a plain Python list does not support.
    train_group[building_number - 1] = scaler.transform(building_features)
    train_loader[building_number - 1] = create_data_loader(train_group[building_number - 1], window_size, batch_size)


In [None]:
# A DataLoader cannot be indexed, so look at the first (window, target) sample
# of the last building through the loader's underlying dataset.
train_loader[99].dataset[0]

TypeError: ignored

In [None]:
# normalization
#scaler = MinMaxScaler()
#train_data = scaler.fit_transform(train_df.drop(['num_date_time', '건물번호', '일시'], axis=1).values)
#train_loader = create_data_loader(train_data, window_size, batch_size)

## Model Define

In [None]:
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the linear projection of the LSTM output at the last step.

        ``x`` is batch-first; its first dimension sizes the initial states.
        """
        # Zero initial hidden and cell states, created on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        hidden_init = torch.zeros(state_shape, device=x.device)
        cell_init = torch.zeros(state_shape, device=x.device)

        sequence_out, _ = self.lstm(x, (hidden_init, cell_init))
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"current device: {device}")


current device: cuda


## Train

In [None]:
# Train one independent LSTM per building. Slot k of train_loader is iterated
# with its own index; the batch counter uses a separate name so it cannot
# overwrite the building index. Trained models are kept in `models`
# (models[k] was trained on train_loader[k]).
models = []
for building_idx in range(len(train_loader)):
    model = LSTM(input_size, hidden_size, num_layers, output_size).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    building_loader = train_loader[building_idx]
    for epoch in range(num_epochs):
        for step, (inputs, labels) in enumerate(building_loader):
            inputs = inputs.to(device)
            # Targets arrive as (batch,); add a trailing dimension to match the model output.
            labels = labels.unsqueeze(1).to(device)

            # Forward
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (step+1) % 300 == 0:
                print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                       .format(epoch+1, num_epochs, step+1, len(building_loader), loss.item()))
    models.append(model)

## Test Data Pre-processing

In [None]:
# 학습 데이터에서 마지막 행 가져오기
last_train_data = train_df.drop(['num_date_time', '건물번호', '일시',], axis=1).loc[204000-24:,:]

# 실수형 데이터로 변환
test_df['습도(%)'] = test_df['습도(%)'].astype('float64')

# 날짜 데이터 추가
test_df['month'] = test_df['일시'].apply(lambda x : float(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : float(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : float(x[9:11]))

# 전력소비량 열 생성
final_df = pd.concat((test_df.drop(['num_date_time', '건물번호', '일시',], axis=1), pd.DataFrame(np.zeros(test_df.shape[0]))),axis=1)
final_df = final_df.rename({0:'전력소비량(kWh)'},axis=1)

## Test Dataset

In [None]:
# Stack the training history on top of the test features. The result gets its
# own name so `test_df` keeps its id columns, which the following cells still
# drop and group by.
test_concat_df = pd.concat((last_train_data, final_df)).reset_index(drop=True)
test_data = scaler.transform(test_concat_df.values) # same scaling as the training data
test_data.shape

InvalidIndexError: ignored

In [None]:
# normalization: reuse the scaler that was fitted on the training data.
# Refitting on the test frame would put train and test on different scales and
# would make inverse_transform of the predictions meaningless, because the test
# target column is only a placeholder.
feature_columns = train_df.drop(['num_date_time', '건물번호', '일시'], axis=1).columns
# reindex puts the features in training order and adds a zero target if it is absent.
test_data = scaler.transform(test_df.reindex(columns=feature_columns, fill_value=0.0).values.astype(float))

# Group the test rows by building number.
grouped_data = test_df.groupby('건물번호')

# Slot k holds building k + 1.
test_group = [[] for _ in range(100)]
test_loader = [[] for _ in range(100)]
for building_number, group_data in grouped_data:
    # Prepend the building's last `window_size` training rows so that the first
    # test row already has a full input window.
    history_rows = train_df.loc[train_df['건물번호'] == building_number, feature_columns].tail(window_size)
    future_rows = group_data.reindex(columns=feature_columns, fill_value=0.0)
    building_sequence = pd.concat([history_rows, future_rows]).values.astype(float)
    # Keep the scaled data as an ndarray; TimeSeriesDataset slices it with
    # [rows, columns] indexing.
    test_group[building_number - 1] = scaler.transform(building_sequence)
    test_loader[building_number-1] = create_data_loader(test_group[building_number -1], window_size, batch_size)


In [None]:
# Dataset & DataLoader
test_dataset = TimeSeriesDataset(test_data, window_size)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

## Inference

In [None]:
# 결과를 저장할 리스트 초기화
test_results = []

for i in range(1,101):
    model = LSTM(input_size, hidden_size, num_layers, output_size).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        for j, (inputs, labels) in enumerate(train_loader[i]):
            inputs = inputs.to(device)
            labels = labels.unsqueeze(1).to(device)

            # Forward
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (j+1) % 300 == 0:
                print('Training: Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                      .format(epoch+1, num_epochs, j+1, len(train_loader[i+1]), loss.item()))

    # 건물 번호 i+1 에 대한 테스트 데이터 로더 생성


    # 테스트 데이터로 모델 평가
    model.eval()

    test_predictions = []

    with torch.no_grad():
        for t in range(grouped_test_loaders[i].shape[0] - window_size):
            x = torch.Tensor(grouped_test_loaders[i][t:t+window_size,:]).to(device)
            new_x = model(x.view(1,window_size,-1))

            grouped_test_loaders[i][t+window_size,-1] = new_x # 입력 업데이트
            test_predictions.append(new_x.detach().cpu().numpy().item()) # 예측 결과 저장



In [None]:
"""model.eval()

test_predictions = []

with torch.no_grad():
    for i in range(test_data.shape[0] - window_size):
        x = torch.Tensor(test_data[i:i+window_size,:]).to(device)
        new_x = model(x.view(1,window_size,-1))

        test_data[i+window_size,-1] = new_x # 입력 업데이트
        test_predictions.append(new_x.detach().cpu().numpy().item()) # 예측 결과 저장"""

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan]

## Submit

In [None]:
# Restore the original scale, skip the leading history rows (one window) and
# keep the last column, which holds the target.
predictions = scaler.inverse_transform(test_data)[window_size:,-1]

In [None]:
# inverse_transform needs one 2-D array at a time, so undo the scaling building
# by building and join the results in building order.
# NOTE(review): assumes grouped_test_loaders holds one scaled 2-D sequence per
# building (a dict keyed by building number, or a list in building order),
# each starting with `window_size` history rows - confirm against the inference cell.
if isinstance(grouped_test_loaders, dict):
    building_sequences = [grouped_test_loaders[key] for key in sorted(grouped_test_loaders)]
else:
    building_sequences = list(grouped_test_loaders)
predictions = np.concatenate([
    scaler.inverse_transform(np.asarray(sequence, dtype=float))[window_size:, -1] # restore the original scale
    for sequence in building_sequences
    if len(sequence) > 0
])

TypeError: ignored

In [None]:
# Validate the predictions before they go into the submission: one finite value
# per submission row. A NaN here means training or scaling went wrong upstream.
answer_values = np.asarray(predictions, dtype=float)
if len(answer_values) != len(sample_submission):
    raise ValueError(
        f"expected {len(sample_submission)} predictions, got {len(answer_values)}"
    )
if np.isnan(answer_values).any():
    raise ValueError("predictions contain NaN values")
sample_submission['answer'] = answer_values
sample_submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,388.100787
1,1_20220825 01,305.343174
2,1_20220825 02,250.823813
3,1_20220825 03,197.580982
4,1_20220825 04,194.601208
...,...,...
16795,100_20220831 19,473.298469
16796,100_20220831 20,416.697957
16797,100_20220831 21,352.821297
16798,100_20220831 22,288.323843


In [None]:
# Write the submission to a CSV file in the working directory, without the index column.
sample_submission.to_csv('lstm_baseline2_submission.csv', index=False)