## Preprocess

In [128]:
import pandas as pd
import datetime as dt

In [129]:
df = pd.read_csv("train.csv")
sample = pd.read_csv("sample_submission.csv")

In [130]:
df['일시'] = pd.to_datetime(df['일시'])

In [131]:
# 년-달-일 분리
df['년'] = df['일시'].dt.year
df['달'] = df['일시'].dt.month
df['일'] = df['일시'].dt.day

In [6]:
df = df.loc[df['일시'] >= '2000-01-01']
df.reset_index(drop = True, inplace = True)

In [132]:
# 결측치 채우기
null_cols = ['최고기온', '최저기온', '평균풍속', '일조합', '일사합','일조율']
for col in null_cols:
  df[col].interpolate(method='linear', inplace=True)

df['일교차'] = df['최고기온'] - df['최저기온']
df.loc[df['강수량'].isnull(), '강수량'] = 0.0
df.bfill(axis = 0, inplace = True)

## model

In [133]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader 

from tqdm.auto import tqdm

In [134]:
train.values

array([[ 9.9,  1.8,  8.1, ...,  1. ,  1. ,  5.5],
       [ 6.9, -0.9,  7.8, ...,  1. ,  2. ,  4.2],
       [ 0.1, -4.6,  4.7, ...,  1. ,  3. , -2.2],
       ...,
       [ 2.1, -7.8,  9.9, ..., 12. , 29. , -2.9],
       [ 2.3, -4.4,  6.7, ..., 12. , 30. , -1.8],
       [ 2.1, -5.1,  7.2, ..., 12. , 31. , -1.2]])

In [164]:
class TimeDataset(Dataset):
    def __init__(self, df, window_size):
        self.df = df
        self.window_size = window_size
        
    def __len__(self):
        return len(self.df) - self.window_size
        
    def __getitem__(self, idx):
        x = torch.tensor(self.df[idx:idx+self.window_size, :], dtype=torch.float)
        
        # 다음 시점의 시퀀스를 레이블로 사용
        if self.df.shape[1] > 1:
            y = torch.tensor(self.df[idx+self.window_size, -1], dtype=torch.float)
        else:
            y = None
            
        return x, y
        
def create_data_loader(df, window_size, batch_size):
    dataset = TimeDataset(df, window_size)
    data_loader = DataLoader(dataset, batch_size = batch_size, shuffle = False)
    return data_loader

In [165]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True)
        self.fc = nn.Linear(hidden_size, output_size)
        # self.layer_1 = nn.Linear(hidden_size, 256)
        # self.layer_2 = nn.Linear(256,256)
        # self.layer_3 = nn.Linear(256,128)
        # self.layer_4 = nn.Linear(128,output_size)
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device) 
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        print("HI")
        out, _ = self.lstm(x, (h0, c0))
        # out, (hn, cn) = self.lstm(x, (h0, c0))
        # hn = hn.view(-1, self.hidden_size) # 1차원으로 변환해줌
        # print("Hn shape:",hn.shape)
        # output = self.fc(hn)
        # out = self.fc(out)
        # print(out[:,-1,:])
        # print("Shape", out.shape)
        # print("Predict", out[0,-1,:])
        out = self.fc(out[:,-1,:])
        return out

In [None]:
# 하이퍼파라미터
input_size = 13  # feature의 개수
hidden_size = 128
num_layers = 2
output_size = 1
num_epochs = 1
window_size = 24  # 예측에 사용될 시간 윈도우 크기
batch_size = 32
learning_rate = 0.001

train = df.drop('일시',axis = 1)
train = train[['최고기온', '최저기온', '일교차', '강수량', '평균습도', '평균풍속', '일조합', '일사합', '일조율',
       '년', '달', '일','평균기온']]

23011*0.8

train = train[:18408]
validation = train[18408:]

# Dataset
train_data = train.values
train_loader = create_data_loader(train_data, window_size, batch_size)

train.values[0]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"current device: {device}")

model = LSTM(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.unsqueeze(1).to(device)
        
        # Forward
        outputs = model(inputs)
        print("output: ",outputs.shape)
        print("label: ",labels.shape)
        print("Output Data: ",outputs[:6,:])
        print("Label Data: ",labels[:6,:])
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad() #초기화
        loss.backward()
        optimizer.step()

        if (i+1) % 5 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, len(train_loader), loss.item()))

### inference

In [105]:
# tt = pd.read_csv("train.csv")
test = train[-358:].copy()

In [86]:
# test.drop("평균기온",axis = 1, inplace = True)

In [106]:
test.reset_index(drop  = True,inplace= True)

In [107]:
test_data = test.values

In [108]:
test_data

array([[ 3.7, -5. ,  8.7, ...,  1. ,  8. ,  0.3],
       [ 5.6, -1.2,  6.8, ...,  1. ,  9. ,  1.3],
       [ 4.4, -3.4,  7.8, ...,  1. , 10. ,  0.5],
       ...,
       [ 2.1, -7.8,  9.9, ..., 12. , 29. , -2.9],
       [ 2.3, -4.4,  6.7, ..., 12. , 30. , -1.8],
       [ 2.1, -5.1,  7.2, ..., 12. , 31. , -1.2]])

In [110]:
s = torch.tensor(test_data, dtype=torch.float)

In [114]:
s.unsqueeze(0).shape

torch.Size([1, 358, 13])

array([[ 3.7, -5. ,  8.7, ...,  1. ,  8. ,  0.3],
       [ 5.6, -1.2,  6.8, ...,  1. ,  9. ,  1.3],
       [ 4.4, -3.4,  7.8, ...,  1. , 10. ,  0.5],
       ...,
       [ 2.1, -7.8,  9.9, ..., 12. , 29. , -2.9],
       [ 2.3, -4.4,  6.7, ..., 12. , 30. , -1.8],
       [ 2.1, -5.1,  7.2, ..., 12. , 31. , -1.2]])

In [None]:
test_dataset = TimeDataset(test_data, 1)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [121]:
test_data.shape

(358, 13)

In [125]:
test_window = 10
model.eval()

test_predictions = []

with torch.no_grad():
    for i in range(test_data.shape[0] - test_window):
        x = torch.Tensor(test_data[i:i+test_window,:]).to(device)
        print("X:  ",x.shape)
        new_x = model(x.view(1,test_window,-1))
        
        test_data[i+test_window,-1] = new_x # 입력 업데이트
        test_predictions.append(new_x.detach().cpu().numpy().item()) # 예측 결과 저장

X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 10, 64])
X:   torch.Size([10, 13])
Shape torch.Size([1, 1

In [115]:
# 입력 데이터에 대한 추론
with torch.no_grad():
    input_sequence = s.unsqueeze(0)  # 배치 차원 추가
    output_sequence = model(input_sequence)
    output_sequence = output_sequence.squeeze(0)  # 배치 차원 제거

Shape torch.Size([1, 358, 64])


In [118]:
output_sequence

tensor([8.7889])

In [None]:
model.eval()
predictions = []
with torch.no_grad():
    for inputs, _  in test_loader:
        # print(inputs)
        outputs = model(inputs)
        predictions.append(outputs.item())  # 예측 결과를 리스트에 추가

In [None]:
output_sequence