In [150]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
torch.manual_seed(1015)
from sklearn.metrics import mean_squared_error
from math import sqrt

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [85]:
# validation을 위한 함수


## 데이터 분할

class train_test_split_ts:
    def __init__(self, cum=1, n_split=5, test_size=1):
        """       
        - cum : train을 첫번째 데이터부터 누적해서 사용할지 일부 구간만 사용할지 
        - n_split : number of splits
        - test_size : size of the test set
        """

        self.cum = cum
        self.n_split = n_split
        self.test_size = test_size

    
    def split(self, window_x, window_y):
        """
        전체 데이터 x, y 준비
        """

        cum = self.cum
        n_split = self.n_split
        test_size = self.test_size

        n_data = len(window_x)
        idx = [i for i in range(n_data)]

        data_range = n_data // n_split # cross-validation을 위한 구간
        train_start_indice = [i for i in range(0, n_data, n_data//n_split)][:n_split]
        
        for i in range(n_split):
            train_start = train_start_indice[i] # train_start_index


            if cum == 0: # train 일부 구간
                if i == n_split-1: # 맨 마지막 구간은 남아있는 데이터 모두 train으로 넣기
                    yield (idx[train_start:n_data-test_size], idx[-test_size:])

                else:
                    yield (idx[train_start:train_start+data_range-test_size],
                        idx[train_start+data_range-test_size:train_start+data_range])
                

            elif cum == 1:  # train 누적 구간
                if i == n_split-1: # 맨 마지막 구간은 남아있는 데이터 모두 train으로 넣기
                    yield (idx[0:n_data-test_size], idx[-test_size:])

                else:
                    yield (idx[0:train_start+data_range-test_size],
                        idx[train_start+data_range-test_size:train_start+data_range])

            else:
                raise ValueError("cum=0 or 1")

In [86]:
# 사용 예)
ts_split = train_test_split_ts(cum=1, n_split=5, test_size=1)
for train_idx, test_idx in ts_split.split(window_x, window_y):
    print("Train:", train_idx)
    print("Test:", test_idx)
3    print('-'*50)

Train: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
Test: [90]
--------------------------------------------------
Train: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 13

In [165]:
# 모델
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size = input_size,
                            hidden_size = hidden_size,
                            batch_first=True)
        self.hidden_lstm = nn.LSTM(input_size = hidden_size,
                                   hidden_size = hidden_size,
                                   batch_first=True)
        
        self.time_fc = nn.Linear(hidden_size, 4)
    
    def forward(self, x_time):
    
        out_time, (h, c) = self.lstm(x_time)

        out_time, _ = self.hidden_lstm(out_time)

        out_time, _ = self.hidden_lstm(out_time) # [30, 30]

        out_time, _ = self.hidden_lstm(out_time) # [30, 4]

        out_time = self.time_fc(out_time[:,-7:, :]) 

        
        return out_time.view(-1,7,4)


# 모델 학습
def train_model(model, X_train, y_train):
  
    X_tensor = torch.tensor(X_train).float().to(device)
    y_tensor = torch.tensor(y_train).float().to(device)

    # Train model
    optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2)
    criterion = nn.MSELoss(size_average = True)
    num_epochs  = 500

    train_error = []
    for t in range(1, num_epochs+1):
        train_pred = model(X_tensor)
        loss = criterion(train_pred, y_tensor) ### trend
        train_error.append(loss)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if t == num_epochs:
            print(f"{t} Epochs train MSE: {loss.item():1.5f}")


# valid loss
def get_test_errors(model, X_test, y_test):
    X_tensor = torch.tensor(X_test).float().to(device)
    y_pred_tensor = model(X_tensor)
    y_pred = y_pred_tensor.cpu().detach().numpy()
    
    
    error1 = sqrt(mean_squared_error(y_test[:, 0], y_pred[:, 0])) # 사용자
    error2 = sqrt(mean_squared_error(y_test[:, 1], y_pred[:, 1])) # 세션
    error3 = sqrt(mean_squared_error(y_test[:, 2], y_pred[:, 2])) # 신규방문자
    error4 = sqrt(mean_squared_error(y_test[:, 3], y_pred[:, 3])) # 페이지뷰
    error = np.mean([error1, error2, error3, error4])
        
    return error1, error2, error3, error4, error

# 데이터 로드 

In [104]:
all = pd.read_csv("/content/drive/MyDrive/dacon/daconcup/Data/all_preped_01.csv")
print(all.shape)
print(all.columns)
all_raw = all.copy()

(555, 10)
Index(['date', '사용자', '세션', '신규방문자', '페이지뷰', 'cnt_signin', 'cnt_login',
       'cnt_sub', 'total_participants', 'isTrain'],
      dtype='object')


In [105]:
# window_x, window_y (스케일링 전)
train = all.loc[all['isTrain']==1, :] 
train = all.loc[all['isTrain']==1, [col for col in all.columns if col != 'isTrain']] 

input_window = 30
output_window = 7

window_x = np.zeros((train.shape[0] - (input_window + output_window), input_window, 8)) # (데이터 수, time sequence, input_dim)
window_y = np.zeros((train.shape[0] - (input_window + output_window), output_window, 4))  # (데이터 수, time sequence, output_dim)

for start in range(train.shape[0] - (input_window + output_window)):
    end = start + input_window    
    window_x[start,:, :] = train.iloc[start : end                , 1: ].values
    window_y[start,:, :] = train.iloc[end   : end + output_window, 1:5].values

# Validation

- 스케일링 된 상태에서 error를 봐야함. 왜냐하면 시간의 흐름에 따라 상승곡선을 그리고 있어서 스케일링 하지 않으면 2018년보다 2020년의 error가 무조건 크게 나옴. 이러면 validation을 했을 때 마지막 fold 영향만 받게 되어 의미가 없어짐.

# 방법1) 누적




In [170]:
errors = {'사용자': [], '세션':[], '신규방문자':[], '페이지뷰':[], 'total':[]}

ts_split = train_test_split_ts(cum=1, n_split=5, test_size=1)
i = 1
for train_idx, test_idx in ts_split.split(window_x, window_y):
    print(f"{i}fold cv---------------------------------")
    X_train, X_test = window_x[train_idx], window_x[test_idx]
    y_train, y_test = window_y[train_idx], window_y[test_idx]

    # 스케일링
    mini = X_train.min(axis=0)
    size = X_train.max(axis=0) - X_train.min(axis=0)
    X_train = (X_train - mini) / size
    X_test = (X_test - mini) / size
    y_train = (y_train - mini[:7, :4]) / size[:7, :4]
    y_test = (y_test - mini[:7, :4]) / size[:7, :4]
    print('Done scaling...')

    # 모델 학습
    model = LSTM(input_size = 8, hidden_size = 64).to(device)
    train_model(model, X_train, y_train)
    print('Done training...')

    # 모델 평가
    err_tuple = get_test_errors(model, X_test, y_test)

    for j, k in enumerate(errors.keys()):
        errors[k].append(err_tuple[j])
    print('Done Validation...')
    
    i += 1


1fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00028
Done training...
Done Validation...
2fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00042
Done training...
Done Validation...
3fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00039
Done training...
Done Validation...
4fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00750
Done training...
Done Validation...
5fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00057
Done training...
Done Validation...


In [172]:
pd.DataFrame(errors)

Unnamed: 0,사용자,세션,신규방문자,페이지뷰,total
0,0.01135,0.003409,0.009569,0.006094,0.007606
1,0.034905,0.011524,0.016547,0.023156,0.021533
2,0.025726,0.013574,0.001232,0.030175,0.017677
3,0.611371,0.273544,0.24069,0.080229,0.301459
4,0.013107,0.016314,0.039735,0.02054,0.022424


# 방법2) 일부 구간

In [173]:
%%time
errors = {'사용자': [], '세션':[], '신규방문자':[], '페이지뷰':[], 'total':[]}

ts_split = train_test_split_ts(cum=0, n_split=5, test_size=1)
i = 1
for train_idx, test_idx in ts_split.split(window_x, window_y):
    print(f"{i}fold cv---------------------------------")
    X_train, X_test = window_x[train_idx], window_x[test_idx]
    y_train, y_test = window_y[train_idx], window_y[test_idx]

    # 스케일링
    mini = X_train.min(axis=0)
    size = X_train.max(axis=0) - X_train.min(axis=0)
    X_train = (X_train - mini) / size
    X_test = (X_test - mini) / size
    y_train = (y_train - mini[:7, :4]) / size[:7, :4]
    y_test = (y_test - mini[:7, :4]) / size[:7, :4]
    print('Done scaling...')

    # 모델 학습
    model = LSTM(input_size = 8, hidden_size = 64).to(device)
    train_model(model, X_train, y_train)
    print('Done training...')

    # 모델 평가
    err_tuple = get_test_errors(model, X_test, y_test)

    for j, k in enumerate(errors.keys()):
        errors[k].append(err_tuple[j])
    print('Done Validation...')
    
    i += 1

1fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00006
Done training...
Done Validation...
2fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00072
Done training...
Done Validation...
3fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00021
Done training...
Done Validation...
4fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00011
Done training...
Done Validation...
5fold cv---------------------------------
Done scaling...
500 Epochs train MSE: 0.00003
Done training...
Done Validation...
CPU times: user 3min 27s, sys: 1.29 s, total: 3min 29s
Wall time: 3min 29s


In [174]:
pd.DataFrame(errors)

Unnamed: 0,사용자,세션,신규방문자,페이지뷰,total
0,0.015331,0.030582,0.00853,0.009367,0.015952
1,0.021673,0.015743,0.031324,0.024342,0.023271
2,0.024326,0.015495,0.034807,0.045808,0.030109
3,0.041817,0.08766,0.024049,0.017432,0.04274
4,0.007467,0.013275,0.008781,0.001645,0.007792
