In [118]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
torch.manual_seed(1015)
# define 'device' to upload tensor in gpu
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [119]:
train = pd.read_csv("/content/drive/MyDrive/dacon/daconcup/Data/train.csv", encoding = 'euc-kr')
train['DateTime'] = pd.to_datetime(train.DateTime)
train['date'] = train.DateTime.dt.date
train  = train.groupby('date').sum().reset_index()
train['date'] = pd.to_datetime(train['date'] )

In [120]:
# x에 is_holiday 변수 추가
import os
os.chdir('/content/drive/MyDrive/dacon/daconcup/')
import utils
train = utils.add_isHoliday_column(train)

In [121]:
train

Unnamed: 0,date,사용자,세션,신규방문자,페이지뷰,isHoliday
0,2018-09-09,281,266,73,1826,1
1,2018-09-10,264,247,51,2092,0
2,2018-09-11,329,310,58,1998,0
3,2018-09-12,300,287,45,2595,0
4,2018-09-13,378,344,50,3845,0
...,...,...,...,...,...,...
787,2020-11-04,4516,4472,1196,112683,0
788,2020-11-05,4155,4037,1044,102901,0
789,2020-11-06,3663,3576,825,88015,0
790,2020-11-07,2472,2417,531,57386,1


In [126]:
window_x

tensor([[[0.0517, 0.0491, 0.0426, 0.0117],
         [0.0486, 0.0456, 0.0296, 0.0134],
         [0.0606, 0.0572, 0.0337, 0.0128],
         ...,
         [0.0183, 0.0178, 0.0302, 0.0031],
         [0.0126, 0.0124, 0.0136, 0.0016],
         [0.0000, 0.0000, 0.0000, 0.0000]],

        [[0.0486, 0.0456, 0.0296, 0.0134],
         [0.0606, 0.0572, 0.0337, 0.0128],
         [0.0552, 0.0530, 0.0260, 0.0167],
         ...,
         [0.0126, 0.0124, 0.0136, 0.0016],
         [0.0098, 0.0098, 0.0130, 0.0014],
         [0.0000, 0.0000, 0.0000, 0.0000]],

        [[0.0606, 0.0572, 0.0337, 0.0128],
         [0.0552, 0.0530, 0.0260, 0.0167],
         [0.0696, 0.0635, 0.0290, 0.0247],
         ...,
         [0.0098, 0.0098, 0.0130, 0.0014],
         [0.0166, 0.0156, 0.0166, 0.0035],
         [0.0000, 0.0000, 0.0000, 0.0000]],

        ...,

        [[0.3088, 0.3032, 0.2036, 0.2543],
         [0.2339, 0.2317, 0.1343, 0.1947],
         [0.2719, 0.2669, 0.1604, 0.2021],
         ...,
         [0.6299, 0.6

In [43]:
# minmax scaling
mini = train.iloc[:,1:-1].min()
size = train.iloc[:,1:-1].max() - train.iloc[:,1:-1].min()
train.iloc[:,1:-1] = (train.iloc[:,1:-1] -  mini) / size

input_window = 30 # 30일씩 데이터를 넣음
output_window = 7 # 이후 7일을 예측

# (a, b, c) -> a: 데이터 수, b: input (이전 30일 데이터, isHoliday), c: 예측 layer
# 사용자 수 -> 
window_x = np.zeros((train.shape[0] - (input_window + output_window), input_window + 1, 4)) # 공휴일여부 추가
window_y = np.zeros((train.shape[0] - (input_window + output_window), output_window, 4))

for start in range(train.shape[0] - (input_window + output_window)):
    end = start + input_window    
    window_x[start,:-1, :] = train.iloc[start : end                , 1: -1].values # 1~30일 사용자수,...4가지
    window_x[start, -1, :] = train.iloc[end, -1]                                   # 31일의 isholiday 
    window_y[start,:, :] = train.iloc[end   : end + output_window, 1: -1].values   # 1일~7일 사용자수,....4가지

In [44]:
window_x.shape, window_y.shape

((755, 31, 4), (755, 7, 4))

In [45]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size = input_size,
                            hidden_size = hidden_size,
                            batch_first=True)
        self.hidden_lstm = nn.LSTM(input_size = hidden_size,
                                   hidden_size = hidden_size,
                                   batch_first=True)
        
        self.time_fc = nn.Linear(hidden_size, 4)
    
    def forward(self, x_time):
    
        out_time, _ = self.lstm(x_time)
        out_time, _ = self.hidden_lstm(out_time)
        out_time, _ = self.hidden_lstm(out_time)
        out_time, _ = self.hidden_lstm(out_time)
        
        
        out_time = self.time_fc(out_time[:,-7:, :])
        
        return out_time.view(-1,7,4)
    
model = LSTM(input_size = 4, hidden_size = 30).to(device)

In [48]:
window_x = torch.tensor(window_x).float().to(device)
window_y = torch.tensor(window_y).float().to(device)

# Train model
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.MSELoss(size_average = True)
num_epochs  = 800
train_error = []
for t in range(num_epochs):
    train_pred = model(window_x)
    loss = criterion(train_pred, window_y) ### trend
    train_error.append(loss)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if t % 20 == 0 and t !=0:
        print(f"{t} Epochs train MSE: {loss.item():1.5f}")

20 Epochs train MSE: 0.00234
40 Epochs train MSE: 0.00225
60 Epochs train MSE: 0.00219
80 Epochs train MSE: 0.00214
100 Epochs train MSE: 0.00210
120 Epochs train MSE: 0.00205
140 Epochs train MSE: 0.00198
160 Epochs train MSE: 0.00191
180 Epochs train MSE: 0.00182
200 Epochs train MSE: 0.00174
220 Epochs train MSE: 0.00167
240 Epochs train MSE: 0.00160
260 Epochs train MSE: 0.00154
280 Epochs train MSE: 0.00149
300 Epochs train MSE: 0.00145
320 Epochs train MSE: 0.00140
340 Epochs train MSE: 0.00138
360 Epochs train MSE: 0.00135
380 Epochs train MSE: 0.00131
400 Epochs train MSE: 0.00129
420 Epochs train MSE: 0.00126
440 Epochs train MSE: 0.00123
460 Epochs train MSE: 0.00122
480 Epochs train MSE: 0.00120
500 Epochs train MSE: 0.00116
520 Epochs train MSE: 0.00115
540 Epochs train MSE: 0.00112
560 Epochs train MSE: 0.00110
580 Epochs train MSE: 0.00110
600 Epochs train MSE: 0.00107
620 Epochs train MSE: 0.00106
640 Epochs train MSE: 0.00104
660 Epochs train MSE: 0.00102
680 Epochs tra

In [66]:
train

Unnamed: 0,date,사용자,세션,신규방문자,페이지뷰,isHoliday
0,2018-09-09,0.051689,0.049083,0.042604,0.011735,1.0
1,2018-09-10,0.048551,0.045564,0.029586,0.013446,0.0
2,2018-09-11,0.060550,0.057233,0.033728,0.012842,0.0
3,2018-09-12,0.055197,0.052973,0.026036,0.016682,0.0
4,2018-09-13,0.069596,0.063530,0.028994,0.024724,0.0
...,...,...,...,...,...,...
787,2020-11-04,0.833487,0.828116,0.707101,0.724949,0.0
788,2020-11-05,0.766845,0.747546,0.617160,0.662015,0.0
789,2020-11-06,0.676020,0.662160,0.487574,0.566244,0.0
790,2020-11-07,0.456157,0.447490,0.313609,0.369188,1.0


In [127]:
submission = pd.read_csv("/content/drive/MyDrive/dacon/daconcup/Data/submission.csv", encoding = 'euc-kr')
sub_copy = submission.copy()
sub_copy['date'] = pd.to_datetime(sub_copy['DateTime'])
sub_copy = utils.add_isHoliday_column(sub_copy)
sub_holiday_lst = sub_copy.isHoliday.tolist()

In [128]:
submission

Unnamed: 0,DateTime,사용자,세션,신규방문자,페이지뷰
0,2020-11-09,0,0,0,0
1,2020-11-10,0,0,0,0
2,2020-11-11,0,0,0,0
3,2020-11-12,0,0,0,0
4,2020-11-13,0,0,0,0
...,...,...,...,...,...
56,2021-01-04,0,0,0,0
57,2021-01-05,0,0,0,0
58,2021-01-06,0,0,0,0
59,2021-01-07,0,0,0,0


In [124]:
#last_month = torch.tensor(window_x[-1,:,:][np.newaxis,...]).float().to(device) <- 수정 전
last_month = np.zeros((1, 31, 4))
last_month[:, :-1, :] = train.iloc[-30:,1:-1].values[np.newaxis,...] # <- 수정 후
last_month[:, -1, :] = sub_copy.iloc[0, -1]
last_month = torch.tensor(last_month).float().to(device) # <- 수정 후

for start in range((len(submission) - output_window)//7 + 2): # 0, ..., 8
    start = start * 7
    next_week = model(last_month)
    #last_month = torch.cat([last_month[-7:], next_week], axis = 1) <- 수정 전
    last_month = torch.cat([last_month[:,7:-1,:], next_week], axis = 1) # tensor # <- 수정 후

    isholiday_arr = np.array([[[sub_holiday_lst[start]]*4]])
    isholiday = torch.tensor(isholiday_arr).float().to(device)
    last_month = torch.cat([last_month, isholiday], axis=1)

    pred_week = next_week.cpu().detach().numpy().reshape(output_window,4)
    pred_week = pred_week * size.values + mini.values # scaling 역함수
    pred_week = pred_week.astype(int)
    
    if start/7 == (len(submission) - output_window)//7 + 1:
        submission.iloc[start :, 1:] = pred_week[-submission.iloc[start :, 1:].shape[0]:,:]
    else:
        submission.iloc[start : start + output_window, 1:] = pred_week
submission

Unnamed: 0,DateTime,사용자,세션,신규방문자,페이지뷰
0,2020-11-09,2655,2561,673,64783
1,2020-11-10,2492,2316,570,55228
2,2020-11-11,2927,2836,742,65354
3,2020-11-12,2658,2613,621,58269
4,2020-11-13,2098,2046,406,44571
...,...,...,...,...,...
56,2021-01-04,2621,2643,591,62643
57,2021-01-05,2185,2145,495,47199
58,2021-01-06,1816,1743,405,39866
59,2021-01-07,1446,1371,325,32619


In [125]:
submission.to_csv('/content/drive/MyDrive/dacon/daconcup/submission/01_baseline_plus_isholiday.csv', index = False, encoding = 'euc-kr')