In [1]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import math
import matplotlib.pyplot as plt

In [None]:
# Install the Nanum font package, rebuild the fontconfig font cache, and
# delete matplotlib's cache directory.
# NOTE(review): presumably the cache is removed so matplotlib rescans fonts and
# can find the newly installed ones; a kernel restart may be needed - confirm.
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

In [2]:
# Use 'NanumBarunGothic' as matplotlib's default font family.
# NOTE(review): presumably chosen so Korean labels render; requires that font
# to be installed on the machine running the notebook.
plt.rc('font', family='NanumBarunGothic') 


# 모델 해석

- 1) baseline(lstm) + isholiday(row) -> 3.8145142142
    - lstm 모델 -> 트렌드 잡아냄
    - 11-09 ~ 11-16, 12-04에 과소추정함

- 2) baseline(lstm) + isholiday(row) + earlystopping -> 5.9840038661 
    - 과소 추정
    - trend 못 잡음

- 3) rf_reg -> 5fold 써서 4번째 fold로 학습 -> 6.5870634604
    - 과소 추정
    - trend 못 잡음

- 4) lstm6층 + 변수8개 -> 4.1833511924
    - 전반적으로 과소추정
    - pageviews는 잘 맞춤. 약간 올리면 좋을 듯.

- 5) facebook prophet -> 2.8862838808 - best
    - 변동성 잡지 못함
    - 회귀식처럼 아주 적절한 평균값을 구해냄

- 6) submission_04_input7.csv -> 5.1455137603
    - 변동성을 잡지 못하고 전반적으로 예측 성능이 좋지 않음

In [10]:
from sklearn.metrics import mean_squared_error

def train_preprocess(data):
    """
    Aggregate the raw traffic table into one row per day.

    The day is taken as the first 10 characters of each 'DateTime' string
    (assumes values start with a 'YYYY-MM-DD'-style prefix - confirm against
    the raw CSV), and the four target columns are summed within each day.

    The input frame is NOT modified: the day key is derived as a separate
    Series instead of overwriting ``data['DateTime']``, so the function can be
    re-run on the same frame and always gives the same result.

    > input:
        data : DataFrame with a string 'DateTime' column and the numeric
               columns '사용자', '세션', '신규방문자', '페이지뷰'
    > output:
        DataFrame with columns ['DateTime', '사용자', '세션', '신규방문자',
        '페이지뷰'], one row per distinct day, sorted by day
    """
    y_cols = ['사용자', '세션', '신규방문자', '페이지뷰']

    # Build the daily key without touching the caller's DataFrame. The Series
    # keeps the name 'DateTime', so reset_index() yields a 'DateTime' column.
    day = data['DateTime'].str[:10]

    # Selecting y_cols fixes both the set and the order of output columns.
    df = data.groupby(day)[y_cols].sum().reset_index()

    return df



def vis_model(train_path, train_add_p, sub_path):
    """
    Plot a submission against the answer data and return its per-column RMSE.

    All three CSV files are read with cp949 encoding. The training file and the
    answer file are aggregated to daily rows with train_preprocess; the
    submission file is used as read.
    NOTE(review): the submission is assumed to already be daily, with the date
    in its first column followed by the four targets in the order
    사용자, 세션, 신규방문자, 페이지뷰 - confirm against the submission format.

    > input:
        train_path  : path to train.csv
        train_add_p : path to the CSV holding the actual values for the
                      forecast period (the "answer")
        sub_path    : path to the submission.csv to evaluate (the prediction)
    > output:
        dict mapping each target column name to the RMSE between the answer
        and the first len(answer) rows of the submission
    > side effect:
        shows a 4-row plotly figure (train + submission overlaid with
        train + answer), zoomed to 2020-11-09 .. 2020-12-08
    > raises:
        ValueError if the submission has fewer rows than the answer data
    """
    # (column name in the data, legend label), in subplot-row order.
    targets = [
        ('사용자', 'users'),
        ('세션', 'sessions'),
        ('신규방문자', 'new_users'),
        ('페이지뷰', 'page views'),
    ]

    train = pd.read_csv(train_path, encoding='cp949')
    train_add = pd.read_csv(train_add_p, encoding='cp949') # answer
    sub = pd.read_csv(sub_path, encoding='cp949')          # pred

    train = train_preprocess(train)
    train_add = train_preprocess(train_add)

    # Prepend the training history so both curves are drawn over the full period.
    sub_all = pd.concat([train, sub], axis=0).reset_index(drop=True)
    ans_all = pd.concat([train, train_add], axis=0).reset_index(drop=True)

    fig = make_subplots(shared_xaxes=True, rows=len(targets), cols=1)

    # Prediction traces first, then answer traces, so the legend keeps the
    # "all predictions, then all answers" ordering.
    for row, (col, label) in enumerate(targets, start=1):
        fig.add_trace(
            go.Scatter(x=sub_all['DateTime'], y=sub_all[col], name=label, mode='lines+markers'),
            row=row, col=1,
        )
    for row, (col, label) in enumerate(targets, start=1):
        fig.add_trace(
            go.Scatter(
                x=ans_all['DateTime'], y=ans_all[col], name=label + '_answer',
                mode='lines+markers', line=dict(color='firebrick', width=1),
            ),
            row=row, col=1,
        )

    fig.update_xaxes(range=['2020-11-09', '2020-12-08'])

    fig.show()

    # Validation loss: compare the answer with the leading rows of the
    # submission. The horizon follows the answer data instead of a fixed 30.
    horizon = len(train_add)
    if len(sub) < horizon:
        raise ValueError(
            'submission has %d rows but the answer data has %d' % (len(sub), horizon)
        )

    errors = {}
    for position, (col, _) in enumerate(targets, start=1):
        # sub is indexed by position (column 0 is the date) so that a
        # submission whose headers differ from the training data still scores.
        predicted = sub.iloc[:horizon, position]
        errors[col] = math.sqrt(mean_squared_error(train_add[col], predicted))

    return errors

In [11]:
# 1) baseline (LSTM) + isholiday (row) -> 3.8145142142 (score noted by the author)
# Plot this submission against the answer data and print the per-column
# errors returned by vis_model.
train_p = "/content/drive/MyDrive/dacon/daconcup/Data/raw/train.csv"
train_add_p = '/content/drive/MyDrive/dacon/daconcup/Data/raw/add/2차_train.csv'
sub_p = '/content/drive/MyDrive/dacon/daconcup/submission/01_baseline_plus_isholiday.csv'
print(vis_model(train_p, train_add_p, sub_p))

{'사용자': 917.5054949880864, '세션': 913.8181620723749, '신규방문자': 234.4750875181981, '페이지뷰': 27036.510187152482}


In [12]:
# 4) 6-layer LSTM + 8 variables -> 4.1833511924 (score noted by the author)
# Plot this submission against the answer data and print the per-column
# errors returned by vis_model.
train_p = "/content/drive/MyDrive/dacon/daconcup/Data/raw/train.csv"
train_add_p = '/content/drive/MyDrive/dacon/daconcup/Data/raw/add/2차_train.csv'
sub_p = '/content/drive/MyDrive/dacon/daconcup/submission/submission_using_baseline+vars_lstm6.csv'
print(vis_model(train_p, train_add_p, sub_p))

{'사용자': 1067.5939615165807, '세션': 1062.8513379897804, '신규방문자': 336.40174395108795, '페이지뷰': 23645.87209359525}


- 전반적으로 과소추정하고 있음. 

- page views는 잘 맞는 편임. 예측값을 약간 상향 보정하면 더 잘 맞을 것으로 보임.

In [13]:
# 5) facebook prophet -> 2.8862838808 - best (score noted by the author)
# Plot this submission against the answer data and print the per-column
# errors returned by vis_model.
train_p = "/content/drive/MyDrive/dacon/daconcup/Data/raw/train.csv"
train_add_p = '/content/drive/MyDrive/dacon/daconcup/Data/raw/add/2차_train.csv'
sub_p = '/content/drive/MyDrive/dacon/daconcup/submission/submission (14).csv'
print(vis_model(train_p, train_add_p, sub_p))

{'사용자': 743.6056877548882, '세션': 725.4077190252606, '신규방문자': 176.74439314808163, '페이지뷰': 21339.723342907164}


- 변동성은 잡아내지 못했지만, 회귀식처럼 평균적인 값을 잘 찾아냄. 