# Parameter Tuning 
* 2020.09.19

In [32]:
import os
import sys
import joblib
import pickle as pkl

import warnings
from sklearn.exceptions import DataConversionWarning
# Silence every Python warning for the rest of the session: no category is given,
# so the filter matches all warning categories.
# NOTE(review): this also hides warnings emitted by the libraries used further down.
warnings.filterwarnings(action='ignore')
# Explicitly ignore scikit-learn's DataConversionWarning as well; the blanket filter
# above already covers this category.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)


import pandas as pd 
import numpy as np


from IPython.display import display
# Raise pandas' display limits so wide/long frames are rendered without truncation.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_info_columns', 500)

In [33]:
import BoostingModel as model

In [34]:
# Load the feature-engineering bundle (a mapping; it is indexed by key below) from two
# directories above the notebook.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
data_v4 = joblib.load(os.path.join('..', '..', '6th_FE_ver4.pkl'))
# Inject every key of the bundle into the notebook namespace as a variable.
# NOTE(review): names created this way are invisible in the code (hidden state) and can
# silently overwrite existing variables; explicit lookups such as data_v4['X'] are clearer.
locals().update(data_v4)

In [35]:
# Pull the feature matrix and the target out of the loaded bundle, then show their shapes.
X, y = data_v4['X'], data_v4['y']
X.shape, y.shape

((35379, 386), (35379,))

In [36]:
# Train on log(1 + y) instead of the raw target.
# NOTE(review): the exact inverse of log1p is np.expm1, but MAPE_exp and lgbm_model in this
# notebook back-transform with np.exp, which yields y + 1 rather than y — confirm this is intended.
y2 = np.log1p(y)

In [37]:
# Build X from the bundle without the "날짜" (date) column. Deriving it from data_v4['X']
# instead of dropping in place keeps the loaded data intact and makes this cell safe to
# re-run (an in-place drop raises KeyError the second time because the column is gone).
X = data_v4['X'].drop(columns=["날짜"])

In [48]:
# Boolean mask over X's columns: True where the column name starts with "v"
# (presumably the embedding features, going by the variable name — confirm).
embedding = [name.startswith("v") for name in X.columns]

In [52]:
# Names of the columns selected by the boolean mask built above.
emb = X.columns[embedding]

In [53]:
# Feature matrix without the "v"-prefixed columns; X itself is left unchanged.
X2 = X.drop(columns=emb)

In [54]:
# Check how many feature columns remain after removing the "v"-prefixed ones.
X2.shape

(35379, 295)

## LGBM

### lgbm_params1

In [56]:
# LightGBM parameter set 1 (DART booster, MAPE metric), one key per line for easier diffing.
lgbm_params1 = {
    # tree structure
    'num_leaves': 47,
    'max_depth': 8,
    'min_child_samples': 39,
    # learning rate and regularisation
    'learning_rate': 0.03,
    'reg_lambda': 0.3,
    'min_split_gain': 0.01,
    # column / row subsampling and histogram binning
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'subsample_freq': 5,
    'max_bin': 56,
    # booster, objective and evaluation
    'boosting': 'dart',
    'objective': 'regression',
    'metric': 'mape',
    'is_training_metric': True,
    'n_estimators': 10000,
    # runtime / logging
    'force_col_wise': 'true',
    'verbose': -1,
}

In [57]:
# Run the project's LightGBM routine on the reduced feature set (X2) with parameter set 1.
mape1, pred1 = model.lgbm_model(
    X2, y2, lgbm_params1,
    version='0919-1', cv_splits=5, scaling='MinMax', epoch=10000,
)

[10000]	valid_0's mape: 0.004583
[10000]	valid_0's mape: 0.00456599
[10000]	valid_0's mape: 0.0045454
[10000]	valid_0's mape: 0.00460728
[10000]	valid_0's mape: 0.00458216


In [58]:
# Side-by-side table of the validation and test MAPE recorded for each entry of mape1.
pd.DataFrame({'val': mape1['val_mape'], 'test': mape1['test_mape']})

Unnamed: 0,val,test
0,7.25205,7.304394
1,7.236981,7.203601
2,7.207314,7.175047
3,7.292055,7.188623
4,7.245609,7.205809


In [59]:
# Show the overall score stored under 'final_mape' for parameter set 1
# (presumably the MAPE of the fold-averaged test prediction — verify in BoostingModel).
mape1['final_mape']

[7.145051175115399]

In [64]:
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [65]:
# MAPE
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error between two sequences, in percent.

    Both inputs are converted to NumPy arrays and compared element-wise. The error
    is divided by ``y_true``, so zeros in ``y_true`` produce inf/nan in the result.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100


# MAPE_exp
def MAPE_exp(y_true, y_pred):
    """Return the MAPE (in percent) after mapping both inputs through ``np.exp``.

    Intended for values on a natural-log scale: each input is exponentiated first
    and the percentage error is then taken relative to ``exp(y_true)``.
    """
    actual = np.exp(np.array(y_true))
    predicted = np.exp(np.array(y_pred))
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [62]:
# LightGBM
def lgbm_model(X, y, params, cv_splits=5, epoch=10000, test_size=0.2, random_state=77):
    """Train LightGBM with K-fold CV on a train split and score a held-out test split.

    The data is first split into train/test. A K-fold loop over the train part then
    trains one booster per fold and predicts both that fold's validation rows and the
    full test split. All predictions are back-transformed with ``np.exp`` before
    being stored.

    NOTE(review): the notebook builds its target with ``np.log1p`` whose exact inverse
    is ``np.expm1``; with ``np.exp`` the stored predictions are offset by +1. Confirm
    whether this is intended before relying on the absolute values.
    NOTE(review): LightGBM documents ``n_estimators`` as an alias of ``num_iterations``;
    when ``params`` contains it, verify whether it or ``epoch`` decides the number of
    rounds. LightGBM also reports that early stopping is unavailable with
    ``boosting='dart'`` — verify ``early_stopping_rounds`` has any effect for such params.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (``.iloc`` and ``.index`` are used).
    y : pandas.Series
        Target on a log scale, aligned with ``X`` (``.iloc`` and ``.values`` are used).
    params : dict
        Parameters passed unchanged to ``lgb.train``.
    cv_splits : int, default 5
        Number of K-fold splits.
    epoch : int, default 10000
        Boosting-round argument passed to ``lgb.train``.
    test_size : float, default 0.2
        Fraction of rows held out as the test split.
    random_state : int, default 77
        Seed used for both the train/test split and the K-fold shuffle.

    Returns
    -------
    mape : dict
        ``'val_mape'`` / ``'test_mape'``: one ``MAPE_exp`` score per fold;
        ``'final_mape'``: one-element list with the ``MAPE`` of the fold-averaged
        test prediction.
    pred : dict
        ``'val_idx'`` / ``'val_pred'``: per-fold validation index labels and predictions;
        ``'test_idx'`` / ``'test_pred'``: test index labels and per-fold test predictions;
        ``'final_pred'``: one-element list with the fold-averaged test prediction.
    """
    mape = {'val_mape': [], 'test_mape': [], 'final_mape': []}
    pred = {'val_idx': [], 'val_pred': [],
            'test_idx': [], 'test_pred': [],
            'final_pred': []}      # final: mean of the per-fold test-set predictions

    # Hold out the test split; only the remaining rows enter the K-fold loop.
    X_train_, X_test_, y_train_, y_test_ = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    pred['test_idx'].append(X_test_.index)

    # K-fold cross validation on the training part.
    cv = KFold(n_splits=cv_splits, random_state=random_state, shuffle=True)
    for train_pos, val_pos in cv.split(X_train_):
        X_train, X_val = X_train_.iloc[train_pos], X_train_.iloc[val_pos]
        y_train, y_val = y_train_.iloc[train_pos], y_train_.iloc[val_pos]

        # Store index labels, consistent with 'test_idx'. The raw split positions only
        # refer to the intermediate X_train_ frame, which is never returned to the caller.
        pred['val_idx'].append(X_val.index)

        # Wrap the fold in LightGBM datasets and train; progress is logged every 2500 rounds.
        train_T = lgb.Dataset(X_train, label=y_train.values)
        val_T = lgb.Dataset(X_val, label=y_val.values)
        booster = lgb.train(params, train_T, epoch, valid_sets=val_T,
                            verbose_eval=2500, early_stopping_rounds=500)

        # Predictions are produced on the log scale and stored back-transformed.
        val_pred = booster.predict(X_val)
        pred['val_pred'].append(np.exp(val_pred))

        test_pred = booster.predict(X_test_)
        pred['test_pred'].append(np.exp(test_pred))

        # Per-fold scores; MAPE_exp exponentiates both arguments itself.
        mape['val_mape'].append(MAPE_exp(y_val, val_pred))
        mape['test_mape'].append(MAPE_exp(y_test_, test_pred))

    # Final values: average the per-fold test predictions and score the average.
    final_test = np.mean(pred['test_pred'], axis=0)
    final_mape = MAPE(np.exp(y_test_), final_test)

    pred['final_pred'].append(final_test)
    mape['final_mape'].append(final_mape)

    return mape, pred

In [66]:
# Re-run parameter set 1 through the notebook-local lgbm_model for comparison with mape1.
# NOTE(review): mape2 / pred2 are assigned again in the lgbm_params2 section, so which
# result these names hold depends on cell execution order.
mape2, pred2 = lgbm_model(
    X2, y2, lgbm_params1,
    cv_splits=5, epoch=10000,
)

[2500]	valid_0's mape: 0.0182865
[5000]	valid_0's mape: 0.00726642
[7500]	valid_0's mape: 0.004858
[10000]	valid_0's mape: 0.00459553
[2500]	valid_0's mape: 0.0181886
[5000]	valid_0's mape: 0.00719946
[7500]	valid_0's mape: 0.00483586
[10000]	valid_0's mape: 0.00457631
[2500]	valid_0's mape: 0.01822
[5000]	valid_0's mape: 0.00720081
[7500]	valid_0's mape: 0.00482988
[10000]	valid_0's mape: 0.00457054
[2500]	valid_0's mape: 0.0182859
[5000]	valid_0's mape: 0.00727993
[7500]	valid_0's mape: 0.00489526
[10000]	valid_0's mape: 0.00463623
[2500]	valid_0's mape: 0.0182858
[5000]	valid_0's mape: 0.00725649
[7500]	valid_0's mape: 0.00486526
[10000]	valid_0's mape: 0.00460719


In [67]:
# Side-by-side table of the per-fold validation and test MAPE currently held in mape2.
pd.DataFrame({'val': mape2['val_mape'], 'test': mape2['test_mape']})

Unnamed: 0,val,test
0,7.268045,7.321157
1,7.252194,7.231868
2,7.241304,7.203256
3,7.331132,7.215654
4,7.281935,7.236543


### lgbm_params2

In [17]:
# LightGBM parameter set 2: same values as lgbm_params1 but without 'force_col_wise'.
lgbm_params2 = {
    # tree structure
    'num_leaves': 47,
    'max_depth': 8,
    'min_child_samples': 39,
    # learning rate and regularisation
    'learning_rate': 0.03,
    'reg_lambda': 0.3,
    'min_split_gain': 0.01,
    # column / row subsampling and histogram binning
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'subsample_freq': 5,
    'max_bin': 56,
    # booster, objective and evaluation
    'boosting': 'dart',
    'objective': 'regression',
    'metric': 'mape',
    'is_training_metric': True,
    'n_estimators': 10000,
    # logging
    'verbose': -1,
}

In [18]:
%%time
# Run the project's LightGBM routine on the full feature set (X) with parameter set 2.
mape2, pred2 = model.lgbm_model(
    X, y2, lgbm_params2,
    version='0919-2', cv_splits=5, scaling='MinMax', epoch=10000,
)

[10000]	valid_0's mape: 0.00457212
[10000]	valid_0's mape: 0.00456751
[10000]	valid_0's mape: 0.00454316
[10000]	valid_0's mape: 0.00460777
[10000]	valid_0's mape: 0.00457778
CPU times: user 2h 21min 59s, sys: 54.3 s, total: 2h 22min 53s
Wall time: 18min 34s


In [19]:
# Side-by-side table of the validation and test MAPE recorded for each entry of mape2.
pd.DataFrame({'val': mape2['val_mape'], 'test': mape2['test_mape']})

Unnamed: 0,val,test
0,7.238962,7.286363
1,7.239987,7.192826
2,7.205753,7.161827
3,7.290109,7.17796
4,7.241947,7.188442


In [20]:
# Show the overall score stored under 'final_mape' for parameter set 2
# (presumably the MAPE of the fold-averaged test prediction — verify in BoostingModel).
mape2['final_mape']

[7.131422999573787]

### lgbm_params3

In [23]:
# LightGBM parameter set 3: fewer leaves, deeper trees, smaller leaves and a lower
# learning rate than sets 1/2; 'max_bin' and 'force_col_wise' are not set here.
lgbm_params3 = {
    # tree structure
    'num_leaves': 35,
    'max_depth': 16,
    'min_child_samples': 16,
    # learning rate and regularisation
    'learning_rate': 0.02,
    'reg_lambda': 0.3,
    'min_split_gain': 0.01,
    # column / row subsampling
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'subsample_freq': 5,
    # booster, objective and evaluation
    'boosting': 'dart',
    'objective': 'regression',
    'metric': 'mape',
    'is_training_metric': True,
    'n_estimators': 10000,
    # logging
    'verbose': -1,
}

In [26]:
%%time
# Run the project's LightGBM routine on the full feature set (X) with parameter set 3, unscaled.
# NOTE(review): the recorded output logs iteration 20000 although epoch=10000, the validation
# metric is worse at 20000 than at 10000, and the resulting MAPE is ~44% versus ~7% for the
# earlier runs. Both the parameters and scaling differ from those runs, so the cause is not
# isolated by this experiment.
mape3, pred3 = model.lgbm_model(
    X, y2, lgbm_params3,
    version='0919-3', cv_splits=5, scaling=False, epoch=10000,
)

[10000]	valid_0's mape: 0.00655342
[20000]	valid_0's mape: 0.0311102
[10000]	valid_0's mape: 0.00656492
[20000]	valid_0's mape: 0.0173574
[10000]	valid_0's mape: 0.00654964
[20000]	valid_0's mape: 0.0352045
[10000]	valid_0's mape: 0.00660052
[20000]	valid_0's mape: 0.0353653
[10000]	valid_0's mape: 0.00659053
[20000]	valid_0's mape: 0.0313237
CPU times: user 3h 59min 37s, sys: 23.7 s, total: 4h
Wall time: 30min 13s


In [27]:
# Side-by-side table of the validation and test MAPE recorded for each entry of mape3.
pd.DataFrame({'val': mape3['val_mape'], 'test': mape3['test_mape']})

Unnamed: 0,val,test
0,45.569469,45.566178
1,30.840527,30.841481
2,49.220667,49.212339
3,49.09166,49.044979
4,45.35192,45.333955


In [28]:
# Show the overall score stored under 'final_mape' for parameter set 3
# (presumably the MAPE of the fold-averaged test prediction — verify in BoostingModel).
mape3['final_mape']

[43.999786478274494]