In [None]:
# basic packages
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import warnings
warnings.filterwarnings('ignore')
import os
os.chdir('/content/drive/MyDrive/dacon/daconcup/')
import utils # is_holiday 변수

# packages for models
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib


# preprocessing
---

- 변수 

    1. 이전 2주동안의 y들 -> 14*4=56

    2. y들의 통계량 -> 표준편차, 평균 -> 2*4=8

    3. is_holiday -> 1

    4. 대회 참가자 수 -> 1

    → 총 변수 66




In [None]:
# Raw inputs, all stored as cp949-encoded CSV: traffic log, competition metadata, submission template.
csv_paths = (
    '/content/drive/MyDrive/dacon/daconcup/Data/train.csv',
    '/content/drive/MyDrive/dacon/daconcup/Data/info_competition.csv',
    '/content/drive/MyDrive/dacon/daconcup/Data/submission.csv',
)
data, info_cpt, submission = (pd.read_csv(csv_path, encoding='cp949') for csv_path in csv_paths)

In [None]:
def info_cpt_preprocess(data):
    """Build a daily table of competition participants.

    Parameters
    ----------
    data : pandas.DataFrame
        Competition metadata with the columns ``name``, ``period_start``,
        ``period_end``, ``merge_deadline`` and ``participants``.  The three
        date columns are converted to datetime **in place** (unchanged from
        the previous behaviour), so the caller's frame holds parsed dates
        after the call.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day from the earliest ``period_start`` to the
        latest ``period_end``.  Columns, in order: ``date``, one column per
        competition name (its participant count on the days it is running,
        0 otherwise) and ``total_participants`` (row-wise sum of the
        competition columns).
    """
    # Parse the date columns; done on the caller's frame for backward compatibility.
    for date_col in ('period_start', 'period_end', 'merge_deadline'):
        data[date_col] = pd.to_datetime(data[date_col])

    # Work with whole days only.  Without this, a start time such as 10:00
    # would shift every generated date off midnight, so no day would ever be
    # matched to a competition (and a later merge on a daily `date` would miss).
    starts = data['period_start'].dt.normalize()
    ends = data['period_end'].dt.normalize()

    dates = pd.Series(pd.date_range(starts.min(), ends.max()), name='date')

    # Collect the columns first and build the frame once at the end.
    columns = {'date': dates}
    for start, end, name, participants in zip(starts, ends, data['name'], data['participants']):
        # Participant count on the days the competition runs (inclusive), else 0.
        contribution = np.where(dates.between(start, end), participants, 0)
        if name in columns:
            # A competition name that appears more than once is accumulated
            # instead of wiping the earlier entry.
            columns[name] = columns[name] + contribution
        else:
            columns[name] = contribution

    cnt_cpt_df = pd.DataFrame(columns)
    cnt_cpt_df['total_participants'] = cnt_cpt_df.iloc[:, 1:].sum(axis=1)

    return cnt_cpt_df  # date | one column per competition name | total_participants
    



def train_preprocess(data, cnt_cpt_df):
    """Aggregate the raw traffic log to one row per day and attach the model features.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw log with a string ``DateTime`` column and the count columns
        ``사용자``, ``세션``, ``신규방문자`` and ``페이지뷰``.  A ``date`` column
        is added to this frame in place (unchanged from the previous behaviour).
    cnt_cpt_df : pandas.DataFrame
        Daily competition table; only its ``date`` and ``total_participants``
        columns are used.

    Returns
    -------
    pandas.DataFrame
        Daily frame with ``date``, the four targets ``y_user``, ``y_sess``,
        ``y_new``, ``y_views``, whatever ``utils.add_isHoliday_column`` adds
        (presumably an ``isHoliday`` flag -- defined outside this file),
        14 lag columns per target, the mean and standard deviation of those
        lags per target, and ``total_participants``.  Rows whose lags reach
        before the first day contain NaN; dropping them is left to the caller.
    """
    n_lags = 14
    raw_cols = ['사용자', '세션', '신규방문자', '페이지뷰']
    y_cols = ['y_user', 'y_sess', 'y_new', 'y_views']

    # Daily totals: the first 10 characters of DateTime are the calendar date.
    data['date'] = pd.to_datetime(data['DateTime'].str[:10])
    df = data.groupby('date')[raw_cols].sum()
    df.columns = y_cols
    df = df.reset_index()

    # Holiday indicator (project helper).
    df = utils.add_isHoliday_column(df)

    # Lag 1..14 of every target, named "<factor>_lag<k>" (factor = target name without "y_").
    for col in y_cols:
        factor = col[2:]
        for lag in range(1, n_lags + 1):
            df[f'{factor}_lag{lag}'] = df[col].shift(lag)

    # Mean and sample standard deviation (ddof=1) of each target's lag window.
    # The lag columns are addressed by their exact names, so unrelated columns
    # that merely share the prefix can never leak into the statistics.
    for col in y_cols:
        factor = col[2:]
        lag_cols = [f'{factor}_lag{lag}' for lag in range(1, n_lags + 1)]
        df[factor + '_lag_mean'] = df[lag_cols].mean(axis=1)
        df[factor + '_lag_std'] = df[lag_cols].std(axis=1)

    # Number of competition participants on each day.
    df = df.merge(cnt_cpt_df.loc[:, ['date', 'total_participants']], on='date', how='left')

    return df

In [None]:
# Keep a copy of the raw log; `data` is rebound to the daily feature table below.
data_raw = data.copy()

cnt_cpt_df = info_cpt_preprocess(info_cpt)

# Build the features, then drop every row that still has a missing value
# (e.g. the first days, whose lagged values do not exist yet).
data = (
    train_preprocess(data_raw, cnt_cpt_df)
    .dropna()
    .reset_index(drop=True)
)
print('data shape:', data.shape)

data shape: (778, 71)


# Modeling
- 랜덤 포레스트(Random Forest): 타깃 4개(사용자, 세션, 신규방문자, 페이지뷰)마다 하나씩, 총 4개의 모델을 만들어야 함

In [None]:
# 5-fold, chunk-wise validation:
#   the rows are cut into 5 consecutive chunks and only the last 7 days of each chunk are predicted.

def rf_reg_cv(data, y_name, n_splits=5, test_size=7, random_state=None):
    """Validate a random-forest regressor chunk by chunk, then save and return the last fitted model.

    The rows of ``data`` are split, in their existing order, into ``n_splits``
    consecutive chunks.  In every chunk a fresh model is trained on all rows
    but the last ``test_size`` (plus the previous chunk's test rows) and scored
    on those last ``test_size`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table.  Columns from position 5 onward are used as features;
        ``y_name`` must be one of the columns.
    y_name : str
        Target column, e.g. ``'y_user'``.  The part after the first two
        characters is used in log messages and in the saved file name.
    n_splits : int, default 5
        Number of consecutive chunks (folds).
    test_size : int, default 7
        Number of trailing rows of each chunk held out for scoring.
    random_state : int or None, default None
        Passed to ``RandomForestRegressor``.

    Returns
    -------
    errors : list of float
        One value per fold: RMSE divided by the mean of the fold's true targets.
    rf_reg : RandomForestRegressor
        The model fitted in the last fold (also dumped with joblib).

    Raises
    ------
    ValueError
        If a chunk is not larger than ``test_size`` (nothing would be left to train on).
    """
    X = data.iloc[:, 5:]
    y = data[y_name]

    n_rows = data.shape[0]
    chunk_size = n_rows // n_splits
    if chunk_size <= test_size:
        raise ValueError(
            f'need more than {test_size} rows per fold, got {chunk_size} '
            f'({n_rows} rows / {n_splits} folds)'
        )

    errors = []
    rf_reg = None
    # Folds are numbered 1..n_splits inclusive, so the most recent chunk is covered too.
    for fold in range(1, n_splits + 1):
        # Start test_size rows early: the rows used as the previous fold's test set
        # are reused as training data here.
        start_idx = max(chunk_size * (fold - 1) - test_size, 0)
        last_idx = min(chunk_size * fold, n_rows)
        split_idx = last_idx - test_size

        # Positional slicing, independent of the frame's index labels.
        X_train, X_test = X.iloc[start_idx:split_idx], X.iloc[split_idx:last_idx]
        y_train, y_test = y.iloc[start_idx:split_idx], y.iloc[split_idx:last_idx]

        rf_reg = RandomForestRegressor(min_samples_leaf=3, bootstrap=False, random_state=random_state)
        rf_reg.fit(X_train, y_train)
        y_pred = rf_reg.predict(X_test)

        # RMSE via an explicit square root (no reliance on the `squared` keyword).
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        # The target level differs between periods, so the RMSE is reported
        # relative to the mean of the true values in the same period.
        errors.append(rmse / np.mean(y_test))

    # NOTE(review): the saved/returned model is the one trained in the last fold only,
    # i.e. on a single chunk of the data -- confirm this is intended before relying on it.
    print('Modeling Complete for ' + y_name[2:])
    path = '/content/drive/MyDrive/dacon/daconcup/models/rf_reg_'+y_name[2:]+'.pkl'
    joblib.dump(rf_reg, path)
    print(f'Model 저장 - {path}')
    # Average over all folds (not just the last fold's error).
    print('validation error for '+ y_name[2:] +': ' + str(np.mean(errors)))

    return errors, rf_reg

In [None]:
# Number of users (target column 'y_user')
errors_user, rf_reg_user = rf_reg_cv(data, 'y_user')

Modeling Complete foruser
Model 저장 - /content/drive/MyDrive/dacon/daconcup/models/rf_reg_user.pkl
validation error for user: 0.18544947753303118


In [None]:
# Ten most important features of the y_user model, largest first.
x, y = zip(*sorted(zip(rf_reg_user.feature_importances_, data.columns[5:]), reverse=True)[:10])

# Draw the bars smallest-first so the most important feature ends up at the top.
bar_values, bar_labels = x[::-1], y[::-1]

trace = go.Bar(
    x=bar_values,
    y=bar_labels,
    marker=dict(
        # Colour each bar by its own value: same sequence, same order as the bars.
        color=bar_values,
        colorscale='Viridis',
    ),
    name='Random Forest Feature importance',
    orientation='h',
)

fig = go.Figure(data=[trace])
fig.update_xaxes(range=[0, 1])
fig.update_layout(
    width=800,
    height=500,
    title="y_user feature_importance_ by rf_reg",
)
fig.show()

In [None]:
# Number of sessions (target column 'y_sess')
errors_sess, rf_reg_sess = rf_reg_cv(data, 'y_sess')

Modeling Complete forsess
Model 저장 - /content/drive/MyDrive/dacon/daconcup/models/rf_reg_sess.pkl
validation error for sess: 0.19434366268552414


In [None]:
# Ten most important features of the y_sess model, largest first.
x, y = zip(*sorted(zip(rf_reg_sess.feature_importances_, data.columns[5:]), reverse=True)[:10])

# Draw the bars smallest-first so the most important feature ends up at the top.
bar_values, bar_labels = x[::-1], y[::-1]

trace = go.Bar(
    x=bar_values,
    y=bar_labels,
    marker=dict(
        # Colour each bar by its own value: same sequence, same order as the bars.
        color=bar_values,
        colorscale='Viridis',
    ),
    name='Random Forest Feature importance',
    orientation='h',
)

fig = go.Figure(data=[trace])
fig.update_xaxes(range=[0, 1])
fig.update_layout(
    width=800,
    height=500,
    title="y_sess feature_importance_ by rf_reg",
)
fig.show()

In [None]:
# Number of new users (target column 'y_new')
errors_new, rf_reg_new = rf_reg_cv(data, 'y_new')

Modeling Complete fornew
Model 저장 - /content/drive/MyDrive/dacon/daconcup/models/rf_reg_new.pkl
validation error for new: 0.3196820347048656


In [None]:
# Ten most important features of the y_new model, largest first.
x, y = zip(*sorted(zip(rf_reg_new.feature_importances_, data.columns[5:]), reverse=True)[:10])

# Draw the bars smallest-first so the most important feature ends up at the top.
bar_values, bar_labels = x[::-1], y[::-1]

trace = go.Bar(
    x=bar_values,
    y=bar_labels,
    marker=dict(
        # Colour each bar by its own value: same sequence, same order as the bars.
        color=bar_values,
        colorscale='Viridis',
    ),
    name='Random Forest Feature importance',
    orientation='h',
)

fig = go.Figure(data=[trace])
fig.update_xaxes(range=[0, 1])
fig.update_layout(
    width=800,
    height=500,
    title="y_new feature_importance_ by rf_reg",
)
fig.show()

In [None]:
# Number of page views (target column 'y_views')
errors_views, rf_reg_views = rf_reg_cv(data, 'y_views')

Modeling Complete forviews
Model 저장 - /content/drive/MyDrive/dacon/daconcup/models/rf_reg_views.pkl
validation error for views: 0.4264382538552711


In [None]:
# Ten most important features of the y_views model, largest first.
x, y = zip(*sorted(zip(rf_reg_views.feature_importances_, data.columns[5:]), reverse=True)[:10])

# Draw the bars smallest-first so the most important feature ends up at the top.
bar_values, bar_labels = x[::-1], y[::-1]

trace = go.Bar(
    x=bar_values,
    y=bar_labels,
    marker=dict(
        # Colour each bar by its own value: same sequence, same order as the bars.
        color=bar_values,
        colorscale='Viridis',
    ),
    name='Random Forest Feature importance',
    orientation='h',
)

fig = go.Figure(data=[trace])
fig.update_xaxes(range=[0, 1])
fig.update_layout(
    width=800,
    height=500,
    title="y_views feature_importance_ by rf_reg",
)
fig.show()

### 랜덤포레스트 모델
- rf_reg_user : 사용자
- rf_reg_sess : 세션
- rf_reg_new : 신규 방문자
- rf_reg_views : 페이지뷰


# 예측

In [None]:
# Prediction setup
#   days are forecast one at a time; each forecast is fed back in as input for the following days
submission_raw = pd.read_csv(
    '/content/drive/MyDrive/dacon/daconcup/Data/submission.csv',
    encoding='cp949',
    parse_dates=['DateTime'],
)

# Same rows under the training column names, so they can be stacked below the training frame.
submission = submission_raw.set_axis(['date', 'y_user', 'y_sess', 'y_new', 'y_views'], axis=1)
df_for_sub = pd.concat([data, submission], axis=0)

# Rebuild the holiday flag and the participant count over the combined date range.
df_for_sub = df_for_sub.drop(columns=['isHoliday', 'total_participants'])
df_for_sub = utils.add_isHoliday_column(df_for_sub)
df_for_sub = df_for_sub.merge(
    cnt_cpt_df[['date', 'total_participants']],
    how='left',
    on='date',
)

In [None]:
# Walk from 2020-11-09 to 2021-01-08 one day at a time:
#   1) build that day's lag features, 2) predict the four targets,
#   3) write the predictions into the target columns so later days can use them as lags.
target_cols = ['y_user', 'y_sess', 'y_new', 'y_views']
factors = [target[2:] for target in target_cols]
n_lags = 14

# Feature columns addressed by name, in the same order in which the values are generated below.
lag_feature_cols = [f'{factor}_lag{lag}' for factor in factors for lag in range(1, n_lags + 1)]
stat_feature_cols = [f'{factor}_lag_{stat}' for factor in factors for stat in ('mean', 'std')]

# The models were fitted on data.iloc[:, 5:].  In df_for_sub the holiday and participant
# columns were dropped and re-added, so its column order is not guaranteed to match the
# training frame; selecting by the training column names keeps the feature order identical.
feature_cols = list(data.columns[5:])
models = [rf_reg_user, rf_reg_sess, rf_reg_new, rf_reg_views]

# Predictions are fractional; make the target columns float up front so writing them
# back never depends on implicit dtype upcasting.
df_for_sub[target_cols] = df_for_sub[target_cols].astype(float)

for day in pd.date_range('2020-11-09', '2021-01-08'):
    is_day = df_for_sub['date'] == day

    # Targets of the previous 14 days, target by target (lag 1 first).
    lag_values = []
    for target in target_cols:
        for lag in range(1, n_lags + 1):
            lag_day = day - pd.Timedelta(days=lag)
            lag_values.append(df_for_sub.loc[df_for_sub['date'] == lag_day, target].values[0])

    # Mean and standard deviation of every target's lag window.
    # ddof=1 matches the sample standard deviation used for the training features.
    stat_values = []
    for k in range(len(target_cols)):
        window = lag_values[k * n_lags:(k + 1) * n_lags]
        stat_values += [np.mean(window), np.std(window, ddof=1)]

    df_for_sub.loc[is_day, lag_feature_cols + stat_feature_cols] = lag_values + stat_values

    X = df_for_sub.loc[is_day, feature_cols]
    y_pred_user, y_pred_sess, y_pred_new, y_pred_views = [model.predict(X)[0] for model in models]

    print(f"{day.date()}의 pred값 : {y_pred_user}, {y_pred_sess}, {y_pred_new}, {y_pred_views}")
    # Store the predictions as this day's targets.
    df_for_sub.loc[is_day, target_cols] = [y_pred_user, y_pred_sess, y_pred_new, y_pred_views]

In [None]:
# Keep only the forecast window and the first five columns (date + the four targets),
# then restore the column names of the submission template.
forecast_days = pd.date_range('2020-11-09', '2021-01-08')
in_forecast_window = df_for_sub['date'].isin(forecast_days)
sub = df_for_sub.loc[in_forecast_window, df_for_sub.columns[:5]].reset_index(drop=True)
sub.columns = submission_raw.columns
sub

Unnamed: 0,DateTime,사용자,세션,신규방문자,페이지뷰
0,2020-11-09,1762.0000,1736.2500,526.000000,58371.200000
1,2020-11-10,1762.0000,1736.2500,526.000000,55054.666667
2,2020-11-11,1762.0000,1736.2500,216.250000,49450.687500
3,2020-11-12,1762.0000,1736.2500,216.250000,55054.666667
4,2020-11-13,1762.0000,1736.2500,217.227333,36760.666667
...,...,...,...,...,...
56,2021-01-04,1470.2875,1501.0075,325.952500,22091.610833
57,2021-01-05,1470.2875,1501.0075,325.952500,22091.610833
58,2021-01-06,1470.2875,1501.0075,325.952500,22091.610833
59,2021-01-07,1470.2875,1501.0075,325.952500,22091.610833


In [None]:
# Write the submission frame to CSV without the index column, encoded as EUC-KR.
sub.to_csv('/content/drive/MyDrive/dacon/daconcup/submission/03_randomforest_reg.csv', index = False, encoding = 'euc-kr')