In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
torch.manual_seed(1015)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from sklearn.metrics import mean_squared_error
from math import sqrt

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# validation을 위한 함수


## 데이터 분할

class train_test_split_ts:
    def __init__(self, cum=1, n_split=5, test_size=1):
        """       
        - cum : train을 첫번째 데이터부터 누적해서 사용할지 일부 구간만 사용할지 
        - n_split : number of splits
        - test_size : size of the test set
        """

        self.cum = cum
        self.n_split = n_split
        self.test_size = test_size

    
    def split(self, window_x, window_y):
        """
        전체 데이터 x, y 준비
        """

        cum = self.cum
        n_split = self.n_split
        test_size = self.test_size

        n_data = len(window_x)
        idx = [i for i in range(n_data)]

        data_range = n_data // n_split # cross-validation을 위한 구간
        train_start_indice = [i for i in range(0, n_data, n_data//n_split)][:n_split]
        
        for i in range(n_split):
            train_start = train_start_indice[i] # train_start_index


            if cum == 0: # train 일부 구간
                if i == n_split-1: # 맨 마지막 구간은 남아있는 데이터 모두 train으로 넣기
                    yield (idx[train_start:n_data-test_size], idx[-test_size:])

                else:
                    yield (idx[train_start:train_start+data_range-test_size],
                        idx[train_start+data_range-test_size:train_start+data_range])
                

            elif cum == 1:  # train 누적 구간
                if i == n_split-1: # 맨 마지막 구간은 남아있는 데이터 모두 train으로 넣기
                    yield (idx[0:n_data-test_size], idx[-test_size:])

                else:
                    yield (idx[0:train_start+data_range-test_size],
                        idx[train_start+data_range-test_size:train_start+data_range])

            else:
                raise ValueError("cum=0 or 1")

In [None]:
# # 사용 예)
# ts_split = train_test_split_ts(cum=1, n_split=5, test_size=1)
# for train_idx, test_idx in ts_split.split(window_x, window_y):
#     print("Train:", train_idx)
#     print("Test:", test_idx)
#     print('-'*50)

In [3]:
# 모델
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size = input_size,
                            hidden_size = hidden_size,
                            batch_first=True)
        self.hidden_lstm = nn.LSTM(input_size = hidden_size,
                                   hidden_size = hidden_size,
                                   batch_first=True)
        
        self.time_fc = nn.Linear(hidden_size, 4)
    
    def forward(self, x_time):
    
        out_time, (h, c) = self.lstm(x_time)

        out_time, _ = self.hidden_lstm(out_time)

        out_time, _ = self.hidden_lstm(out_time) # [30, 30]

        out_time, _ = self.hidden_lstm(out_time) # [30, 4]

        out_time = self.time_fc(out_time[:,-7:, :]) 

        
        return out_time.view(-1,7,4)


# 모델 학습
def train_model(model, X_train, y_train):
  
    X_tensor = torch.tensor(X_train).float().to(device)
    y_tensor = torch.tensor(y_train).float().to(device)

    # Train model
    optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2)
    criterion = nn.MSELoss(size_average = True)
    num_epochs  = 500

    train_error = []
    for t in range(1, num_epochs+1):
        train_pred = model(X_tensor)
        loss = criterion(train_pred, y_tensor) ### trend
        train_error.append(loss)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if t == num_epochs:
            print(f"{t} Epochs train MSE: {loss.item():1.5f}")


# valid loss
def get_test_errors(model, X_test, y_test):
    X_tensor = torch.tensor(X_test).float().to(device)
    y_pred_tensor = model(X_tensor)
    y_pred = y_pred_tensor.cpu().detach().numpy()
    
    
    error1 = sqrt(mean_squared_error(y_test[:, 0], y_pred[:, 0])) # 사용자
    error2 = sqrt(mean_squared_error(y_test[:, 1], y_pred[:, 1])) # 세션
    error3 = sqrt(mean_squared_error(y_test[:, 2], y_pred[:, 2])) # 신규방문자
    error4 = sqrt(mean_squared_error(y_test[:, 3], y_pred[:, 3])) # 페이지뷰
    error = np.mean([error1, error2, error3, error4])
        
    return error1, error2, error3, error4, error

# 데이터 로드 

In [4]:
all = pd.read_csv("/content/drive/MyDrive/dacon/daconcup/Data/all_preped_01.csv")
print(all.shape)
print(all.columns)
all_raw = all.copy()

(555, 10)
Index(['date', '사용자', '세션', '신규방문자', '페이지뷰', 'cnt_signin', 'cnt_login',
       'cnt_sub', 'total_participants', 'isTrain'],
      dtype='object')


In [5]:
# window_x, window_y (스케일링 전)
train = all.loc[all['isTrain']==1, :] 
train = all.loc[all['isTrain']==1, [col for col in all.columns if col != 'isTrain']] 

In [7]:
# window_x, window_y (스케일링 전)
train = all.loc[all['isTrain']==1, :] 
train = all.loc[all['isTrain']==1, [col for col in all.columns if col != 'isTrain']] 
train = train.iloc[:, [0, 1, 2, 3, 4, -1]]

input_window = 30
output_window = 7

window_x = np.zeros((train.shape[0] - (input_window + output_window), input_window, 5)) # (데이터 수, time sequence, input_dim)
window_y = np.zeros((train.shape[0] - (input_window + output_window), output_window, 4))  # (데이터 수, time sequence, output_dim)

for start in range(train.shape[0] - (input_window + output_window)):
    end = start + input_window    
    window_x[start,:, :] = train.iloc[start : end                , 1: ].values
    window_y[start,:, :] = train.iloc[end   : end + output_window, 1:5].values

# Validation

- 스케일링 된 상태에서 error를 봐야함. 왜냐하면 시간의 흐름에 따라 상승곡선을 그리고 있어서 스케일링 하지 않으면 2018년보다 2020년의 error가 무조건 크게 나옴. 이러면 validation을 했을 때 마지막 fold 영향만 받게 되어 의미가 없어짐.

# 방법1) 누적




In [8]:
errors = {'사용자': [], '세션':[], '신규방문자':[], '페이지뷰':[], 'total':[]}

ts_split = train_test_split_ts(cum=1, n_split=5, test_size=1)
i = 1
for train_idx, test_idx in ts_split.split(window_x, window_y):
    print(f"{i}fold cv---------------------------------")
    X_train, X_test = window_x[train_idx], window_x[test_idx]
    y_train, y_test = window_y[train_idx], window_y[test_idx]

    # 스케일링
    mini = X_train.min(axis=0)
    size = X_train.max(axis=0) - X_train.min(axis=0)
    X_train = (X_train - mini) / size
    X_test = (X_test - mini) / size
    y_train = (y_train - mini[:7, :4]) / size[:7, :4]
    y_test = (y_test - mini[:7, :4]) / size[:7, :4]
    print('Done scaling...')

    # 모델 학습
    model = LSTM(input_size = 5, hidden_size = 64).to(device)
    train_model(model, X_train, y_train)
    print('Done training...')

    # 모델 평가
    err_tuple = get_test_errors(model, X_test, y_test)

    for j, k in enumerate(errors.keys()):
        errors[k].append(err_tuple[j])
    print('Done Validation...')
    
    i += 1


1fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00008
Done training...
Done Validation...
2fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00070
Done training...
Done Validation...
3fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00030
Done training...
Done Validation...
4fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00449
Done training...
Done Validation...
5fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00206
Done training...
Done Validation...


In [9]:
pd.DataFrame(errors)

Unnamed: 0,사용자,세션,신규방문자,페이지뷰,total
0,0.010219,0.006185,0.006769,0.035645,0.014704
1,0.027711,0.028292,0.039629,0.025947,0.030395
2,0.015868,0.009414,0.010443,0.02227,0.014499
3,0.204973,0.106044,0.070048,0.082674,0.115935
4,0.016492,0.03599,0.064067,0.066962,0.045878


In [10]:
pd.DataFrame(errors).mean(axis=0)

사용자      0.055053
세션       0.037185
신규방문자    0.038191
페이지뷰     0.046700
total    0.044282
dtype: float64

# 방법2) 일부 구간

In [15]:
errors = {'사용자': [], '세션':[], '신규방문자':[], '페이지뷰':[], 'total':[]}

ts_split = train_test_split_ts(cum=0, n_split=5, test_size=1)
i = 1
for train_idx, test_idx in ts_split.split(window_x, window_y):
    print(f"{i}fold cv---------------------------------")
    X_train, X_test = window_x[train_idx], window_x[test_idx]
    y_train, y_test = window_y[train_idx], window_y[test_idx]

    # 스케일링
    mini = X_train.min(axis=0)
    size = X_train.max(axis=0) - X_train.min(axis=0)
    X_train = (X_train - mini) / size
    X_test = (X_test - mini) / size
    y_train = (y_train - mini[:7, :4]) / size[:7, :4]
    y_test = (y_test - mini[:7, :4]) / size[:7, :4]
    print('Done scaling...')

    # 모델 학습
    model = LSTM(input_size = 5, hidden_size = 64).to(device)
    train_model(model, X_train, y_train)
    print('Done training...')

    # 모델 평가
    err_tuple = get_test_errors(model, X_test, y_test)

    for j, k in enumerate(errors.keys()):
        errors[k].append(err_tuple[j])
    print('Done Validation...')
    
    i += 1

1fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00009
Done training...
Done Validation...
2fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00017
Done training...
Done Validation...
3fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00042
Done training...
Done Validation...
4fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00030
Done training...
Done Validation...
5fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00001
Done training...
Done Validation...


In [16]:
pd.DataFrame(errors)

Unnamed: 0,사용자,세션,신규방문자,페이지뷰,total
0,0.007188,0.002885,0.002534,0.007992,0.00515
1,0.046715,0.028525,0.025049,0.015124,0.028853
2,0.014152,0.019774,0.033031,0.04995,0.029227
3,0.00579,0.030406,0.009606,0.028803,0.018651
4,0.003209,0.002475,0.003793,0.006252,0.003932


In [17]:
pd.DataFrame(errors).mean(axis=0)

사용자      0.015411
세션       0.016813
신규방문자    0.014803
페이지뷰     0.021624
total    0.017163
dtype: float64

- 최근 데이터를 사용해서 학습한 경우의 validation loss가 더 낮음
    - 누적 total loss 평균 : 0.044282
    - 최근 일부 total loss 평균 : 0.017163

- 그래프의 자기 상관성을 확인했을 때에도 4개월 이내의 자기 상관성이 크게 나타났음

- 결론 !!
    - 최근 데이터를 가져와서 학습한 뒤에 예측을 하자