In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [79]:
import torch 
import torch.optim as optim
import torch.nn as nn

from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error 

In [80]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split

In [81]:
from tqdm import tqdm

In [82]:
# Load the fruit-set count dataset (comment translated from the original).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
df = pd.read_csv('C:/Users/chaeh/bigcon/dataset/0910 착과수_컬럼추가.csv')

In [85]:
# Remove four columns that the modelling code below never selects.
df = df.drop(
    labels=['EXTN_TPRT', 'ABSLT_HMDT', 'EXTN_SRQT', 'WATER_LACK_VL'],
    axis='columns',
)

In [86]:
# Display (rows, columns) of the frame after the column drop.
df.shape

(26064, 9)

In [87]:
# Display the remaining column labels.
df.columns

Index(['STRG_DT', 'FRST_TREE_CNT', 'PFBS_NTRO_CBDX_CTRN', 'SPL_TPRT_1',
       'HTNG_TPRT_1', 'SPL_TPRT_2', 'AVE_INNER_HMDT_1_2',
       'SKLT_OPDR_RATE_1_RIGHT', 'GDD'],
      dtype='object')

## 전처리

In [88]:
# Fix PyTorch's RNG seed so that re-running the training repeats the experiment
# with the same random numbers.
torch.manual_seed(0)

<torch._C.Generator at 0x1d177844210>

In [89]:
# Device selection: use the first CUDA GPU when PyTorch reports one, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [90]:
# Shared configuration values.
# Batch size should really be tuned as a hyperparameter as well; it is fixed
# here because the DataLoaders need a value up front.
batch_size = 128
output_dim = 1  # output dimension
data_dim = 8    # total number of feature columns

In [91]:
def make_dataset(data, seq_len):
    """Cut a 2-D array into sliding windows and next-step targets.

    Window ``i`` is ``data[i:i + seq_len, :]`` (all columns); its target is the
    last column of the row that immediately follows the window,
    ``data[i + seq_len, -1]``.

    Args:
        data: Array-like of shape ``(n_rows, n_columns)``.
        seq_len: Number of consecutive rows per window.

    Returns:
        Tuple ``(data_X, data_Y)`` with shapes
        ``(n_rows - seq_len, seq_len, n_columns)`` and ``(n_rows - seq_len, 1)``.
        When ``n_rows <= seq_len`` both arrays have zero samples but keep the
        trailing dimensions, so downstream shape logic still works.
    """
    data = np.asarray(data)
    n_windows = len(data) - seq_len

    if n_windows <= 0:
        # Not enough rows for a single window: return correctly shaped empties
        # instead of the shapeless (0,) arrays np.array([]) would give.
        empty_X = np.empty((0, seq_len) + data.shape[1:], dtype=data.dtype)
        empty_Y = np.empty((0, 1), dtype=data.dtype)
        return empty_X, empty_Y

    # Row i of window_rows holds the row numbers i, i+1, ..., i+seq_len-1.
    window_rows = np.arange(n_windows)[:, None] + np.arange(seq_len)[None, :]
    data_X = data[window_rows]

    # Targets: last column of every row from seq_len onwards, kept 2-D.
    data_Y = data[seq_len:, [-1]]

    return data_X, data_Y

def data_result(df, seq_length):
    """Build train/validation DataLoaders and test tensors for the LSTM.

    Steps:
      1. Keep the seven input columns plus ``FRST_TREE_CNT``, which is placed
         last so that ``make_dataset`` uses it as the target. Because
         ``make_dataset`` windows over all columns, the target's past values
         are also part of the model input.
      2. Split the rows in order (no shuffling) into 60% train, 20% validation
         and the remaining rows as test.
      3. Standardise inputs and target with scalers fitted on the training
         split only.
      4. Cut every split into sliding windows of ``seq_length`` rows.

    Relies on the module-level ``device`` and ``batch_size``.

    Args:
        df: DataFrame containing at least the eight columns selected below.
        seq_length: Number of consecutive rows per input window.

    Returns:
        Tuple ``(dataloader, dataloader_val, testX_tensor, testY_tensor)``:
        unshuffled DataLoaders (incomplete last batch dropped) for the train
        and validation windows, and the test windows/targets as tensors on
        ``device``. All values are in standardised units.
    """
    columns = ['PFBS_NTRO_CBDX_CTRN', 'SPL_TPRT_1', 'HTNG_TPRT_1',
               'SPL_TPRT_2', 'AVE_INNER_HMDT_1_2', 'SKLT_OPDR_RATE_1_RIGHT', 'GDD', 'FRST_TREE_CNT']

    # Work on a float64 array rather than writing scaled values back into the
    # DataFrame: assigning floats into integer columns through iloc relies on
    # silent upcasting, which recent pandas versions deprecate.
    values = df[columns].to_numpy(dtype=np.float64)

    # Chronological train / validation / test split.
    n_rows = len(values)
    train_end = int(n_rows * 0.6)
    val_end = train_end + int(0.2 * n_rows)
    train_set = values[:train_end]
    val_set = values[train_end:val_end]
    test_set = values[val_end:]

    # Fit the scalers on the training rows only so that no validation/test
    # statistics leak into training.
    scaler_x = StandardScaler().fit(train_set[:, :-1])
    scaler_y = StandardScaler().fit(train_set[:, [-1]])

    def scale(split):
        # Inputs and target are scaled separately, then re-joined with the target last.
        return np.hstack([scaler_x.transform(split[:, :-1]),
                          scaler_y.transform(split[:, [-1]])])

    def to_tensors(split):
        # Window the scaled split and move the float32 tensors to the active device.
        windows, targets = make_dataset(scale(split), seq_length)
        return torch.FloatTensor(windows).to(device), torch.FloatTensor(targets).to(device)

    trainX_tensor, trainY_tensor = to_tensors(train_set)
    valX_tensor, valY_tensor = to_tensors(val_set)
    testX_tensor, testY_tensor = to_tensors(test_set)

    # TensorDataset pairs each input window with its target.
    dataset = TensorDataset(trainX_tensor, trainY_tensor)
    dataset_val = TensorDataset(valX_tensor, valY_tensor)

    # shuffle=False keeps the time order; drop_last=True discards a final batch
    # that is smaller than batch_size.
    dataloader = DataLoader(dataset, batch_size = batch_size, shuffle = False, drop_last = True)
    dataloader_val = DataLoader(dataset_val, batch_size = batch_size, shuffle = False, drop_last = True)

    return dataloader, dataloader_val, testX_tensor, testY_tensor

## modeling

In [92]:
# LSTM model definition.

class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        seq_length: Window length; stored and used only by ``reset_hidden_state``.
        output_dim: Number of values produced by the final linear layer.
        layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, seq_length, output_dim, layers):
        super().__init__()

        # Keep the constructor arguments available as plain attributes.
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.seq_length = seq_length
        self.layers = layers
        self.output_dim = output_dim

        # batch_first=True: tensors are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim, bias=True)

    def reset_hidden_state(self):
        """Store a fresh pair of all-zero tensors in ``self.hidden``.

        Each tensor has shape ``(layers, seq_length, hidden_dim)``.

        NOTE(review): ``forward`` calls the LSTM without passing ``self.hidden``,
        so this attribute does not influence the model output.
        """
        state_shape = (self.layers, self.seq_length, self.hidden_dim)
        self.hidden = tuple(torch.zeros(state_shape) for _ in range(2))

    def forward(self, x):
        """Run the LSTM over ``x`` and map its last time step through ``fc``."""
        lstm_out, _ = self.lstm(x)
        final_step = lstm_out[:, -1]
        return self.fc(final_step)

## hyperparameter tuning

In [17]:
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error

In [18]:
# Candidate values for the exhaustive hyperparameter search (3 x 3 x 3 x 3 = 81 combinations).
param_grid = dict(
    hidden_dim=[64, 128, 256],
    seq_length=[36, 72, 144],
    layers=[1, 2, 3],
    lr=[0.001, 0.01, 0.1],
)

In [19]:
# Exhaustive grid search: train one LSTM per hyperparameter combination and keep
# the combination with the lowest validation MSE (measured on the scaled target).
n_epochs = 10  # training epochs per combination
best_mse = float('inf')
best_params = {}

for params in tqdm(ParameterGrid(param_grid), desc = 'gridsearch start'):
    # The windowed datasets depend on seq_length, so they are rebuilt per combination.
    dataloader, dataloader_val, testX_tensor, testY_tensor = data_result(df, params['seq_length'])

    # Fresh model per combination. data_result places its tensors on `device`,
    # so the model must be moved there as well; otherwise the forward pass
    # fails as soon as CUDA is available.
    model = LSTMModel(data_dim, params['hidden_dim'], params['seq_length'], output_dim, params['layers']).to(device)
    optimizer = optim.Adam(model.parameters(), lr=params['lr'])
    criterion = nn.MSELoss()

    # Training
    for epoch in range(n_epochs):
        model.train()

        for x_train, y_train in dataloader:
            model.reset_hidden_state()

            # Forward pass and loss.
            outputs = model(x_train)
            loss = criterion(outputs, y_train)

            # PyTorch accumulates gradients, so clear them before each backward pass.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Validation. Only the score after the final epoch is compared below, so the
    # model is evaluated once here instead of after every epoch.
    model.eval()
    predictions = []
    targets = []

    with torch.no_grad():
        for x_val, y_val in dataloader_val:
            model.reset_hidden_state()
            output = model(x_val)
            predictions.extend(output.tolist())
            targets.extend(y_val.tolist())
    mse = mean_squared_error(targets, predictions)

    # Keep the best-scoring combination seen so far.
    if mse < best_mse:
        best_mse = mse
        best_params = params

print("Best Hyperparameters:", best_params)
print("Best Validation MSE:", best_mse)

gridsearch start: 100%|███████████████████████████████████████████████████████████| 81/81 [24:18:30<00:00, 1080.38s/it]

Best Hyperparameters: {'hidden_dim': 256, 'layers': 1, 'lr': 0.001, 'seq_length': 144}
Best Validation MSE: 0.25332796162734894



