# Modeling 

#### BoostingModel.py

```python
mape, pred = model.MODEL(X, y, parameter, version='0919-1', cv_splits=5, scaling='MinMax', epoch=500)
```
* `MODEL` : lgbm_model, xgb_model        
* `version` : set file name       
* `cv_split` : set K (default=5)      
* `scaling` : 'MinMax' or 'Standard' (default = False)       
* `epoch` : n_estimators (default = 30000) 
    

In [1]:
import os
import sys
import joblib
import pickle as pkl

import warnings
warnings.filterwarnings(action='ignore')

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

import pandas as pd 
import numpy as np

from IPython.display import display
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.set_option('display.max_info_columns', 500)

In [2]:
import BoostingModel as model

In [3]:
data_v4 = joblib.load(os.path.join('..', '..', '6th_FE_ver4.pkl'))
locals().update(data_v4)

In [4]:
X = data_v4['X'] ; y = data_v4['y']
X.shape, y.shape

((35379, 386), (35379,))

In [5]:
y2 = np.log1p(y)

In [6]:
X.drop(["날짜"], axis=1, inplace=True)

In [35]:
# test values
np.exp(y_test_)

8734     18316001.0
7885      8569001.0
2413     15895001.0
12475    17194001.0
4571      6467001.0
            ...    
14158    27605001.0
28909    13095001.0
34363    31087001.0
26499    39283001.0
4615     12456001.0
Name: 취급액, Length: 7076, dtype: float64

## LGBM

In [7]:
lgbm_params4 = {'num_leaves': 47, 'max_depth': 8, 'min_child_samples': 39, 
                'learning_rate': 0.03, 'reg_lambda': 0.3, 'min_split_gain': 0.01, 
                'colsample_bytree': 0.8, 'subsample': 0.8, 'subsample_freq': 5, 'max_bin': 56, 
                'boosting': 'dart', 'objective': 'regression', 'metric': 'mape', 'is_training_metric': True, 'n_estimators' : 500, 
                'force_col_wise' : 'true', 'verbose' : -1}

In [28]:
mape1, pred1 = model.lgbm_model(X, y2, lgbm_params4, version='0919-1', cv_splits=5, scaling='MinMax', epoch=500)

In [29]:
mape1

{'val_mape': [61.267718058711004,
  61.22249016163258,
  61.28174756269321,
  61.28356784561595,
  61.25151624295878],
 'test_mape': [61.28675975138324,
  61.21705560927788,
  61.22266178058934,
  61.23126122711551,
  61.219881235079946],
 'final_mape': [61.23552392068918]}

In [30]:
pred1

{'val_idx': [array([    0,     2,    13, ..., 28273, 28282, 28290]),
  array([    1,     7,    15, ..., 28295, 28299, 28300]),
  array([    3,    10,    11, ..., 28287, 28289, 28297]),
  array([    4,    36,    43, ..., 28294, 28301, 28302]),
  array([    5,     6,     8, ..., 28293, 28296, 28298])],
 'val_pred': [array([ 2062615.01528272,  6844272.34430122, 14191667.97187097, ...,
          9584616.76231178,  1996739.36433966, 17717672.65089969]),
  array([1084230.85021373, 3169955.2083374 , 2329935.30976519, ...,
         2593848.07032626, 1616499.85540889, 3237168.10112791]),
  array([12260711.03915866,  8533380.48457289,  2609808.9003501 , ...,
          5938023.82507422, 16863506.97109747,  9466756.88217323]),
  array([ 1783953.33291051, 23547273.66516244, 13286369.54617228, ...,
         19965048.29789498, 14121373.90086333, 17813302.72414961]),
  array([15633570.9191316 ,   567547.7649327 , 20598581.47715246, ...,
         18948061.81098977,   871888.00051773, 22064481.44131113]

## xgboost

In [11]:
xgb_params1 = {'min_child_weight': 1, 'eta': 0.02, 'colsample_bytree': 0.9, 'n_estimators' : 500,
               'max_depth': 20, 'subsample': 0.9, 'lambda': 1. , 'booster' : 'gbtree', 'silent': 1,
               'objective': 'reg:linear', 'metrics':'mae', 'gamma' : 0.01}

In [12]:
mape2, pred2 = model.xgb_model(X, y2, xgb_params1, version='0919-1', cv_splits=5, scaling=False, epoch=500)

Parameters: { metrics, n_estimators, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:15.76526	valid-rmse:15.74770
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
[499]	train-rmse:0.01928	valid-rmse:0.03165
Parameters: { metrics, n_estimators, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:15.76154	valid-rmse:15.76279
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rm

In [21]:
mape2

{'val_mape': [2.097651163113353,
  2.086478012165073,
  2.208367815501897,
  2.1161489323382505,
  2.0750105581570555],
 'test_mape': [2.106844319702098,
  2.103160719746881,
  2.1700155841151427,
  2.0838209635103997,
  2.07438958202026],
 'final_mape': [1.9278161007864127]}

In [14]:
pred2

{'val_idx': [array([    0,     2,    13, ..., 28273, 28282, 28290]),
  array([    1,     7,    15, ..., 28295, 28299, 28300]),
  array([    3,    10,    11, ..., 28287, 28289, 28297]),
  array([    4,    36,    43, ..., 28294, 28301, 28302]),
  array([    5,     6,     8, ..., 28293, 28296, 28298])],
 'val_pred': [array([ 5359891.5, 18453320. , 36154252. , ..., 27063412. ,  5039183. ,
         47967540. ], dtype=float32),
  array([2306969.8, 7339278.5, 5357295.5, ..., 6636197.5, 3970481.2,
         7674997. ], dtype=float32),
  array([32312788., 23288104.,  6344466., ..., 15488979., 46671652.,
         25048716.], dtype=float32),
  array([ 4075938.2, 69799690. , 36145424. , ..., 59171490. , 35634350. ,
         50069652. ], dtype=float32),
  array([41828780. ,  1401671.9, 55853004. , ..., 51479730. ,  2102784.8,
         65832644. ], dtype=float32)],
 'test_idx': [Int64Index([ 8734,  7885,  2413, 12475,  4571,  9835, 35907, 14833, 10316,
              31140,
              ...
         

## Pred Load

In [32]:
with open('xgb_mape.pickle0919-1', 'rb') as f:
    data = pkl.load(f)

In [33]:
data

{'val_mape': [2.097651163113353,
  2.086478012165073,
  2.208367815501897,
  2.1161489323382505,
  2.0750105581570555],
 'test_mape': [2.106844319702098,
  2.103160719746881,
  2.1700155841151427,
  2.0838209635103997,
  2.07438958202026],
 'final_mape': [1.9278161007864127]}

In [37]:
with open('xgb_pred.pickle0919-1', 'rb') as f:
    data2 = pkl.load(f)

In [38]:
data2

{'val_idx': [array([    0,     2,    13, ..., 28273, 28282, 28290]),
  array([    1,     7,    15, ..., 28295, 28299, 28300]),
  array([    3,    10,    11, ..., 28287, 28289, 28297]),
  array([    4,    36,    43, ..., 28294, 28301, 28302]),
  array([    5,     6,     8, ..., 28293, 28296, 28298])],
 'val_pred': [array([ 5359891.5, 18453320. , 36154252. , ..., 27063412. ,  5039183. ,
         47967540. ], dtype=float32),
  array([2306969.8, 7339278.5, 5357295.5, ..., 6636197.5, 3970481.2,
         7674997. ], dtype=float32),
  array([32312788., 23288104.,  6344466., ..., 15488979., 46671652.,
         25048716.], dtype=float32),
  array([ 4075938.2, 69799690. , 36145424. , ..., 59171490. , 35634350. ,
         50069652. ], dtype=float32),
  array([41828780. ,  1401671.9, 55853004. , ..., 51479730. ,  2102784.8,
         65832644. ], dtype=float32)],
 'test_idx': [Int64Index([ 8734,  7885,  2413, 12475,  4571,  9835, 35907, 14833, 10316,
              31140,
              ...
         