# baseline code + 8개 변수 사용

# library import

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
torch.manual_seed(1015)
# define 'device' to upload tensor in gpu
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 데이터 불러오기

In [3]:
all = pd.read_csv("/content/drive/MyDrive/dacon/daconcup/Data/all_preped_01.csv")
print(all.shape)
print(all.columns)
all_raw = all.copy()

(555, 10)
Index(['date', '사용자', '세션', '신규방문자', '페이지뷰', 'cnt_signin', 'cnt_login',
       'cnt_sub', 'total_participants', 'isTrain'],
      dtype='object')


# 데이터 수정

In [None]:
all.loc[all['isTrain']==1, :]

Unnamed: 0,date,사용자,세션,신규방문자,페이지뷰,cnt_signin,cnt_login,cnt_sub,total_participants,isTrain
0,2019-07-04,625,632,232,4693,7.0,83.0,2.0,786,1
1,2019-07-05,596,600,175,5950,5.0,119.0,6.0,786,1
2,2019-07-06,526,540,140,5309,5.0,115.0,10.0,786,1
3,2019-07-07,422,453,86,5374,14.0,144.0,5.0,786,1
4,2019-07-08,505,545,105,6402,16.0,179.0,14.0,786,1
...,...,...,...,...,...,...,...,...,...,...
489,2020-11-04,4516,4472,1196,112683,114.0,460.0,445.0,3513,1
490,2020-11-05,4155,4037,1044,102901,98.0,378.0,521.0,3513,1
491,2020-11-06,3663,3576,825,88015,66.0,367.0,388.0,3513,1
492,2020-11-07,2472,2417,531,57386,34.0,224.0,324.0,3513,1


In [None]:
# scaling (train을 사용한 scaler)
train = all.loc[all['isTrain']==1, :] # 스케일링 전

mini = train.iloc[:,1:-1].min()
size = train.iloc[:,1:-1].max() - train.iloc[:,1:-1].min()

all.iloc[:,1:-1] = (all.iloc[:,1:-1] -  mini) / size
train = all.loc[all['isTrain']==1, [col for col in all.columns if col != 'isTrain']] # 스케일링 후

input_window = 30
output_window = 7

window_x = np.zeros((train.shape[0] - (input_window + output_window), input_window, 8)) # (데이터 수, time sequence, input_dim)
window_y = np.zeros((train.shape[0] - (input_window + output_window), output_window, 4))  # (데이터 수, time sequence, output_dim)

for start in range(train.shape[0] - (input_window + output_window)):
    end = start + input_window    
    window_x[start,:, :] = train.iloc[start : end                , 1: ].values
    window_y[start,:, :] = train.iloc[end   : end + output_window, 1:5].values

# model 생성

In [None]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size = input_size,
                            hidden_size = hidden_size,
                            batch_first=True)
        self.hidden_lstm = nn.LSTM(input_size = hidden_size,
                                   hidden_size = hidden_size,
                                   batch_first=True)
        
        self.time_fc = nn.Linear(hidden_size, 4)
    
    def forward(self, x_time):
    
        out_time, _ = self.lstm(x_time)
        out_time, _ = self.hidden_lstm(out_time)
        out_time, _ = self.hidden_lstm(out_time)
        out_time, _ = self.hidden_lstm(out_time)
        
        
        out_time = self.time_fc(out_time[:,-7:, :])
        
        return out_time.view(-1,7,4)
    
model = LSTM(input_size = 8, hidden_size = 30).to(device)

In [None]:
print(model)

LSTM(
  (lstm): LSTM(8, 30, batch_first=True)
  (hidden_lstm): LSTM(30, 30, batch_first=True)
  (time_fc): Linear(in_features=30, out_features=4, bias=True)
)


# 학습

In [None]:
window_x = torch.tensor(window_x).float().to(device)
window_y = torch.tensor(window_y).float().to(device)

# Train model
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2)
criterion = nn.MSELoss(size_average = True)
num_epochs  = 1000
train_error = []
for t in range(num_epochs):
    train_pred = model(window_x)
    loss = criterion(train_pred, window_y) ### trend
    train_error.append(loss)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if t % 50 == 0 and t !=0:
        print(f"{t} Epochs train MSE: {loss.item():1.5f}")

50 Epochs train MSE: 0.01205
100 Epochs train MSE: 0.01209
150 Epochs train MSE: 0.00894
200 Epochs train MSE: 0.00728
250 Epochs train MSE: 0.00424
300 Epochs train MSE: 0.00259
350 Epochs train MSE: 0.00191
400 Epochs train MSE: 0.00143
450 Epochs train MSE: 0.00120
500 Epochs train MSE: 0.00094
550 Epochs train MSE: 0.00074
600 Epochs train MSE: 0.00068
650 Epochs train MSE: 0.00058
700 Epochs train MSE: 0.00077
750 Epochs train MSE: 0.00032
800 Epochs train MSE: 0.00036
850 Epochs train MSE: 0.00023
900 Epochs train MSE: 0.00038
950 Epochs train MSE: 0.00019


# 예측

In [None]:
test = all.loc[all['isTrain']==0, [col for col in all.columns if col != 'isTrain']].dropna().reset_index(drop=True) # 한달만 예측(추가 데이터가 나와야 뒤의 한달을 예측할 수 있음)
submission = pd.read_csv("/content/drive/MyDrive/dacon/daconcup/Data/raw/submission.csv", encoding = 'euc-kr')

# df -> array -> tensor
last_month = train.iloc[-30:,1:].values[np.newaxis,...] # <- 수정 후 array (30, 8) -> (1, 30, 8) using np.newaxis
last_month = torch.tensor(last_month).float().to(device) # <- 수정 후 tensor (1, 30, 8)

for start in range((len(test) - output_window)//7 + 2):
    """ 
    i) train 뒤에서 30일을 뽑아서 test의 7일을 예측
    ii) 예측한 값을 train 뒤에 붙임
    위 과정 반복
    """
    start = start * 7
    next_week = model(last_month) # (1, 7, 4)

    if start/7 == (len(test) - output_window)//7 + 1: # 마지막 예측 부분(7일이 채 안됨)
        test.iloc[start : , 1:5] = next_week.cpu().detach().numpy().reshape(-1, 4)[-test.iloc[start:, :].shape[0]:, :]
        next_week = test.iloc[start :, 1:].values[np.newaxis,...] # (1, x, 8) array
    else:
        test.iloc[start : start + output_window, 1:5] = next_week.cpu().detach().numpy().reshape(output_window,4)
        next_week = test.iloc[start : start + 7, 1:].values[np.newaxis,...] # (1, 7, 8) array


    next_week = torch.tensor(next_week).float().to(device)  # (1, 7, 8) tensor
    last_month = torch.cat([last_month[:,7:,:], next_week], axis = 1)# <- 수정 후 (1, 23, 8) + (1, 7, 8) = (1, 30, 8)
    
    pred_week = next_week.cpu().detach().numpy()[0]
    pred_week = pred_week * size.values + mini.values
    pred_week = pred_week.astype(int)

    
    if start/7 == (len(test) - output_window)//7 + 1: # 마지막 예측 부분(7일이 채 안됨)
        submission.iloc[start : start + test.iloc[start :, 1:].shape[0], 1:] = pred_week[-test.iloc[start :, 1:].shape[0]:,:4] # 남는 날짜만큼만 추가
    else:
        submission.iloc[start : start + output_window, 1:] = pred_week[:, :4]
submission

Unnamed: 0,DateTime,사용자,세션,신규방문자,페이지뷰
0,2020-11-09,3234,3192,748,74250
1,2020-11-10,2691,2647,609,61547
2,2020-11-11,2784,2761,628,66564
3,2020-11-12,2478,2453,529,60445
4,2020-11-13,2358,2379,509,57056
...,...,...,...,...,...
56,2021-01-04,0,0,0,0
57,2021-01-05,0,0,0,0
58,2021-01-06,0,0,0,0
59,2021-01-07,0,0,0,0


In [None]:
submission_raw = submission.copy()
submission.columns = ['date', '사용자', '세션', '신규방문자', '페이지뷰']
df_all = pd.concat([all_raw[['date', '사용자', '세션', '신규방문자', '페이지뷰']].iloc[:-61, :], submission], axis=0).reset_index(drop=True)

In [None]:
fig = make_subplots(shared_xaxes=True,rows=4, cols=1)
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['사용자'], name='users'), row=1, col=1)
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['세션'], name='sessions'), row=2, col=1)
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['신규방문자'], name='new_users'), row=3, col=1)
fig.add_trace(go.Scatter(x=df_all['date'], y=df_all['페이지뷰'], name='page views'), row=4, col=1)

# 예측파일 저장

In [None]:
submission_raw

Unnamed: 0,DateTime,사용자,세션,신규방문자,페이지뷰
0,2020-11-09,3234,3192,748,74250
1,2020-11-10,2691,2647,609,61547
2,2020-11-11,2784,2761,628,66564
3,2020-11-12,2478,2453,529,60445
4,2020-11-13,2358,2379,509,57056
...,...,...,...,...,...
56,2021-01-04,0,0,0,0
57,2021-01-05,0,0,0,0
58,2021-01-06,0,0,0,0
59,2021-01-07,0,0,0,0


In [None]:
submission_raw.to_csv('/content/drive/MyDrive/dacon/daconcup/submission/submission_using_baseline+vars.csv', index = False, encoding = 'euc-kr')