# library import

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
torch.manual_seed(1015)
# define 'device' to upload tensor in gpu
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 데이터 불러오기

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train = pd.read_csv("/content/drive/MyDrive/dacon/daconcup/Data/raw/train.csv", encoding = 'euc-kr')
train['DateTime'] = pd.to_datetime(train.DateTime)
train['date'] = train.DateTime.dt.date
train  = train.groupby('date').sum().reset_index()
train_raw = train.copy()

# 데이터 수정

In [4]:
# scaling
mini = train.iloc[:,1:].min()
size = train.iloc[:,1:].max() - train.iloc[:,1:].min()
train.iloc[:,1:] = (train.iloc[:,1:] -  mini) / size

input_window = 30
output_window = 7

window_x = np.zeros((train.shape[0] - (input_window + output_window), input_window, 4))
window_y = np.zeros((train.shape[0] - (input_window + output_window), output_window, 4))

for start in range(train.shape[0] - (input_window + output_window)):
    end = start + input_window    
    window_x[start,:, :] = train.iloc[start : end                , 1: ].values
    window_y[start,:, :] = train.iloc[end   : end + output_window, 1: ].values

# model 생성

In [5]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size = input_size,
                            hidden_size = hidden_size,
                            batch_first=True)
        self.hidden_lstm = nn.LSTM(input_size = hidden_size,
                                   hidden_size = hidden_size,
                                   batch_first=True)
        
        self.time_fc = nn.Linear(hidden_size, 4)
    
    def forward(self, x_time):
    
        out_time, _ = self.lstm(x_time)
        out_time, _ = self.hidden_lstm(out_time)
        out_time, _ = self.hidden_lstm(out_time)
        out_time, _ = self.hidden_lstm(out_time)
        
        out_time = self.time_fc(out_time[:,-7:, :])
        
        return out_time.view(-1,7,4)
    
model = LSTM(input_size = 4, hidden_size = 30).to(device)

# 학습

In [6]:
window_x = torch.tensor(window_x).float().to(device)
window_y = torch.tensor(window_y).float().to(device)

# Train model
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2)
criterion = nn.MSELoss(size_average = True)
num_epochs  = 1000
train_error = []
for t in range(num_epochs):
    train_pred = model(window_x)
    loss = criterion(train_pred, window_y) ### trend
    train_error.append(loss)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if t % 100 == 0 and t !=0:
        print(f"{t} Epochs train MSE: {loss.item():1.5f}")

100 Epochs train MSE: 0.00776
200 Epochs train MSE: 0.00548
300 Epochs train MSE: 0.00517
400 Epochs train MSE: 0.00380
500 Epochs train MSE: 0.00214
600 Epochs train MSE: 0.00162
700 Epochs train MSE: 0.00123
800 Epochs train MSE: 0.00231
900 Epochs train MSE: 0.00095


# 예측

In [7]:
submission = pd.read_csv("/content/drive/MyDrive/dacon/daconcup/Data/raw/submission.csv", encoding = 'euc-kr')

#last_month = torch.tensor(window_x[-1,:,:][np.newaxis,...]).float().to(device) <- 수정 전
last_month = train.iloc[-30:,1:].values[np.newaxis,...] # <- 수정 후
last_month = torch.tensor(last_month).float().to(device) # <- 수정 후

for start in range((len(submission) - output_window)//7 + 2):
    start = start * 7
    next_week = model(last_month)
    #last_month = torch.cat([last_month[-7:], next_week], axis = 1) <- 수정 전
    last_month = torch.cat([last_month[:,7:,:], next_week], axis = 1)# <- 수정 후

    pred_week = next_week.cpu().detach().numpy().reshape(output_window,4)
    pred_week = pred_week * size.values + mini.values
    pred_week = pred_week.astype(int)
    
    if start/7 == (len(submission) - output_window)//7 + 1:
        submission.iloc[start :, 1:] = pred_week[-submission.iloc[start :, 1:].shape[0]:,:]
    else:
        submission.iloc[start : start + output_window, 1:] = pred_week
submission

Unnamed: 0,DateTime,사용자,세션,신규방문자,페이지뷰
0,2020-11-09,3669,3700,873,110042
1,2020-11-10,2395,2409,537,71829
2,2020-11-11,2428,2476,546,72408
3,2020-11-12,2206,2230,467,65190
4,2020-11-13,1981,1976,393,57185
...,...,...,...,...,...
56,2021-01-04,2548,2457,547,85765
57,2021-01-05,2461,2379,547,81531
58,2021-01-06,2283,2198,480,74574
59,2021-01-07,1591,1500,266,53684


In [8]:
submission_raw = submission.copy()
submission.columns = ['date', '사용자', '세션', '신규방문자', '페이지뷰']
df_all = pd.concat([train_raw, submission], axis=0).reset_index(drop=True)

In [9]:
# epochs = 1000
fig = make_subplots(shared_xaxes=True,rows=4, cols=1)
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['사용자'], name='users'), row=1, col=1)
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['세션'], name='sessions'), row=2, col=1)
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['신규방문자'], name='new_users'), row=3, col=1)
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['페이지뷰'], name='page views'), row=4, col=1)

# 예측파일 저장

In [None]:
submission.to_csv('submission.csv', index = False, encoding = 'euc-kr') # 3.87 (채 제출)