# Tuning ver3
* scaling 
* xgboost
* LGBM 

## colab setting

```javascript
function ClickConnect() {
    var buttons = document.querySelectorAll("colab-dialog.yes-no-dialog paper-button#cancel"); 
    buttons.forEach(function(btn) { btn.click(); }); 
    console.log("1분마다 자동 재연결"); 
    document.querySelector("colab-toolbar-button#connect").click(); 
    } 

setInterval(ClickConnect,1000*60);
```

```python
# 코랩 한글 설치 
import matplotlib as mpl
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'

!apt -qq -y install fonts-nanum

import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager._rebuild()
```

```python
# 한글 테스트 
plt.figure(figsize=(5,5))
plt.plot([0,1], [0,1], label='한글테스트용')
plt.legend()
plt.show()
```

In [12]:
import os
import sys
import joblib
import pickle as pkl

import warnings
warnings.filterwarnings(action='ignore')

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

import pandas as pd 
import numpy as np

from IPython.display import display
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.set_option('display.max_info_columns', 500)

import datetime
from tqdm import tqdm

In [2]:
# visualize
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn')
plt.rc('font', family='malgun gothic')
plt.rc('axes', unicode_minus=False)

In [None]:
# !pip install bayesian-optimization
# !pip install optuna

### LGBM / xgboost with gpu
* https://somjang.tistory.com/entry/Ensemble-Colab%EC%97%90%EC%84%9C-LightGBM-%EC%82%AC%EC%9A%A9%ED%95%98%EA%B8%B0

In [None]:
# !git clone https://github.com/Microsoft/LightGBM
# cd LightGBM
# !mkdir build
# !cmake -DUSE_GPU=1
# !make -j$(nproc)
# !sudo apt-get -y install python-pip
# !sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
# %cd /content/LightGBM/python-package
# !sudo python setup.py install
# !pip3 uninstall scikit-learn
# !pip3 install scikit-learn==0.21.3

In [None]:
# from tensorflow.python.client import device_lib
# device_lib.list_local_devices()

# Data
* embedding ver4 (90 dim)

In [None]:
# colab 
# cd /content/drive/My Drive/쇼핑광고등어

# data_v4 = joblib.load(os.path.join('1. Data', '05_분석데이터', '5th_FE_ver4.pkl'))
# locals().update(data_v4)

In [3]:
data_v4 = joblib.load(os.path.join('..', '..', '0.Data', '05_분석데이터', '5th_FE_ver4.pkl'))
locals().update(data_v4)

In [4]:
X = data_v4['X'] ; y = data_v4['y']
X = X.drop(["log최근3개월상품군추세"], axis=1)
X.shape, y.shape

((35379, 333), (35379,))

In [5]:
# log(y)
y2 = np.log1p(y)

# Modeling 
* cv = 5, epoch = 30000, earlystopping = 500, random_state=77

In [6]:
def MAPE_exp(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((np.exp(y_true) - np.exp(y_pred)) / np.exp(y_true))) * 100

In [7]:
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from bayes_opt import BayesianOptimization

In [8]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

## LGBM
* scaling : val 과 test 에 동일하게 적용해 줘야 함
* https://stackoverflow.com/questions/58823264/how-to-scale-train-validation-and-test-sets-properly-using-standardscaler

In [10]:
def lgbm_model(X, y, params, n) : 

    mape = {'val_mape' : [], 'test_mape' : []}
    pred = {'val_pred' : [], 'test_idx' : [], 'test_pred' : []}    
    lgbm_feat_imp = [0] * len(X.columns)


    X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y, test_size=0.2, random_state=77)
    pred['test_idx'].append(X_test_.index)


    cv = KFold(n_splits=n, random_state=77, shuffle=True)
    for t,v in cv.split(X_train_):
        X_train , X_val = X_train_.iloc[t] , X_train_.iloc[v]            
        y_train , y_val = y_train_.iloc[t] , y_train_.iloc[v]

        
        # scaling
        scaler = StandardScaler()

        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)


        train_T = lgb.Dataset(X_train, label=y_train.values) 
        val_T   = lgb.Dataset(X_val, label=y_val.values)       
            

        model = lgb.train(params, train_T, 30000, valid_sets = val_T, verbose_eval=10000, early_stopping_rounds=500)
        

        # val
        val_pred = model.predict(X_val)
        pred['val_pred'].append(np.exp(val_pred))              
        mape['val_mape'].append(MAPE_exp(y_val, val_pred))


        # test 
        X_test = scaler.transform(X_test_)

        test_pred = model.predict(X_test)
        pred['test_pred'].append(np.exp(test_pred))
        mape['test_mape'].append(MAPE_exp(y_test_, test_pred))


        lgbm_feat_imp += model.feature_importance() / n
                    
    
    return mape, pred, lgbm_feat_imp

### lgbm_params6
* optuna

In [38]:
def objective_lgbm(trial, X, y):
    
    params = {
        'num_leaves': int(trial.suggest_loguniform('num_leaves', 8, 32)),  
        'max_depth': trial.suggest_int('max_depth', 8, 32), 
        'min_child_samples': trial.suggest_int('min_child_samples', 28, 48),  
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 5.0),  
        'learning_rate': trial.suggest_uniform('learning_rate', 0.001, 0.015),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.01, 1.0), 
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.01, 0.5), 
        'min_split_gain': trial.suggest_loguniform('min_split_gain', 0.01, 0.05),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0), 
        'subsample_for_bin': trial.suggest_int('subsample_for_bin', 10000, 30000),
        'boost_from_average': trial.suggest_categorical('boost_from_average', [True, False]),  # True
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),  
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 7), 
        'max_bin': trial.suggest_int('max_bin', 32, 64),  
        'seed': 77,
        'n_jobs': -1,
        'device_type' : 'gpu', 
        'objective': 'regression',
        'num_iterations': 30000,
        'metric': 'mape',
        'importance_type': 'gain'
    }
    
    mape, pred, lgbm_feat_imp = lgbm_model(X, y, params, 5)

    return np.mean(mape['test_mape'])

In [39]:
%%time

lgbm_study1 = optuna.create_study()
lgbm_study1.optimize(lambda x: objective_lgbm(x, X, y2), timeout=1000, n_jobs=-1)
print(lgbm_study1.best_params, lgbm_study1.best_value)

Training until validation scores don't improve for 500 rounds
Training until validation scores don't improve for 500 rounds
[10000]	valid_0's mape: 0.0222671
[30000]	valid_0's mape: 0.0196938
Did not meet early stopping. Best iteration is:
[30000]	valid_0's mape: 0.0196938
[10000]	valid_0's mape: 0.019235
[20000]	valid_0's mape: 0.0204642
Training until validation scores don't improve for 500 rounds
[20000]	valid_0's mape: 0.0186916
Early stopping, best iteration is:
[22213]	valid_0's mape: 0.0186438
[10000]	valid_0's mape: 0.0189307
Training until validation scores don't improve for 500 rounds
[30000]	valid_0's mape: 0.0195794
Did not meet early stopping. Best iteration is:
[30000]	valid_0's mape: 0.0195794
Training until validation scores don't improve for 500 rounds
[10000]	valid_0's mape: 0.0220064
[10000]	valid_0's mape: 0.0190019
Early stopping, best iteration is:
[16605]	valid_0's mape: 0.0187062
Training until validation scores don't improve for 500 rounds
[10000]	valid_0's map

In [40]:
lgbm_params6 = lgbm_study1.best_params.copy()
lgbm_params6['num_leaves'] = int(lgbm_params6['num_leaves'])
lgbm_params6['n_jobs'] = -1
lgbm_params6['device_type'] = 'gpu', 
lgbm_params6['num_iterations'] = 30000
lgbm_params6['objective'] = 'regression'
lgbm_params6['metric'] = 'mape'
lgbm_params6['is_training_metric'] = True
lgbm_params6['verbose'] = -1

In [41]:
%%time

lgbm_mape6, lgbm_pred6, lgbm_feat_imp6 = lgbm_model(X, y2, lgbm_params6, 5)

Training until validation scores don't improve for 500 rounds
[10000]	valid_0's mape: 0.019218
[20000]	valid_0's mape: 0.0186585
Early stopping, best iteration is:
[25630]	valid_0's mape: 0.0185639
Training until validation scores don't improve for 500 rounds
[10000]	valid_0's mape: 0.0189379
[20000]	valid_0's mape: 0.0182159
Early stopping, best iteration is:
[27695]	valid_0's mape: 0.0180597
Training until validation scores don't improve for 500 rounds
[10000]	valid_0's mape: 0.0195123
[20000]	valid_0's mape: 0.0188799
Early stopping, best iteration is:
[25451]	valid_0's mape: 0.0187909
Training until validation scores don't improve for 500 rounds
[10000]	valid_0's mape: 0.0191862
[20000]	valid_0's mape: 0.0185209
[30000]	valid_0's mape: 0.0183578
Did not meet early stopping. Best iteration is:
[29564]	valid_0's mape: 0.0183571
Training until validation scores don't improve for 500 rounds
[10000]	valid_0's mape: 0.0194571
[20000]	valid_0's mape: 0.0189833
Early stopping, best iterati

In [42]:
pd.DataFrame(lgbm_mape6)

Unnamed: 0,val_mape,test_mape
0,33.99238,33.378532
1,32.851865,33.300977
2,35.008695,33.608347
3,32.60946,33.213336
4,34.691284,33.087795


In [43]:
print(np.mean(lgbm_mape6['val_mape']), np.mean(lgbm_mape6['test_mape']))

33.83073689977428 33.31779742838251


### lgbm_params7
* Bayesian Optimization : https://github.com/fmfn/BayesianOptimization

In [None]:
params_range = {
                'num_leaves': (8, 64),
                'max_depth': (4, 64),
                'min_child_samples': (16, 64),  
                'reg_lambda': (0.01, 0.99), 
                'min_split_gain': (0.005, 0.035),
                'colsample_bytree': (0.5, 0.9), 
                'subsample': (0.7, 0.9), 
                'subsample_freq': (0, 7),
                'max_bin': (30, 75), 
                'learning_rate' : (.001, .01)
               }

In [None]:
def Bayes_running(num_leaves, max_depth, min_child_samples, reg_lambda, min_split_gain, colsample_bytree, subsample, subsample_freq, max_bin, learning_rate):
    params = { 
                'boosting': 'gbdt', 'objective': 'regression', 'metric': 'mape', 'is_training_metric': True, 'n_estimators' : 35000, 
              
                "num_leaves": int(round(num_leaves)),
                "max_depth" : int(round(max_depth)),
                'min_child_samples': int(round(min_child_samples)), 
                'reg_lambda': max(reg_lambda, 0), 
                "min_split_gain": min_split_gain,
                'colsample_bytree': colsample_bytree, 
                'subsample': max(min(subsample,1),0), 
                'subsample_freq': int(round(subsample_freq)),
                'max_bin': int(round(max_bin)), 
              
                "learning_rate": learning_rate, 
                "n_jobs" : -1, 
                'device_type' : 'gpu', 
                'verbose' : -1
    }
    
    print(params)


    mape, pred, lgbm_feat_imp = lgbm_model(X, y2, params, 5)
    return -np.mean(mape['test_mape'])

In [None]:
%%time 

Bayesian = BayesianOptimization(Bayes_running, params_range, random_state = 77)
Bayesian.maximize(init_points = 10, n_iter = 20)

In [None]:
print('Final result: ', Bayesian.max)

In [None]:
lgbm_params7 = Bayesian.max["params"].copy()
lgbm_params7['num_leaves'] = int(lgbm_params7['num_leaves'])
lgbm_params7["max_depth"] = int(round(lgbm_params7["max_depth"]))
lgbm_params7["min_child_samples"] = int(round(lgbm_params7["min_child_samples"]))
lgbm_params7["subsample_freq"] = int(round(lgbm_params7["subsample_freq"]))
lgbm_params7["max_bin"] = int(round(lgbm_params7["max_bin"]))

lgbm_params7['n_jobs'] = -1
lgbm_params7['num_iterations'] = 35000
lgbm_params7['objective'] = 'regression'
lgbm_params7['metric'] = 'mape'
lgbm_params7['is_training_metric'] = True
lgbm_params7['device_type'] = 'gpu'
lgbm_params7['verbose'] = -1

In [None]:
%%time

lgbm_mape7, lgbm_pred7, lgbm_feat_imp7 = lgbm_model(X, y2, lgbm_params7, 5)

In [None]:
pd.DataFrame(lgbm_mape7)

In [None]:
print(np.mean(lgbm_mape7['val_mape']), np.mean(lgbm_mape7['test_mape']))

## xgboost 

In [9]:
def xgb_model(X, y, params, n) : 

    mape = {'val_mape' : [], 'test_mape' : []}
    pred = {'val_pred' : [], 'test_idx' : [], 'test_pred' : []}    


    X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y, test_size=0.2, random_state=77)
    pred['test_idx'].append(X_test_.index)


    cv = KFold(n_splits=n, random_state=77, shuffle=True)
    for t,v in cv.split(X_train_):
        X_train , X_val = X_train_.iloc[t] , X_train_.iloc[v]            
        y_train , y_val = y_train_.iloc[t] , y_train_.iloc[v]


        # scaling
        scaler = StandardScaler()

        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)


        train_T = xgb.DMatrix(X_train, label=y_train) 
        val_T   = xgb.DMatrix(X_val,   label=y_val)     

        watchlist = [(train_T, 'train'), (val_T, 'valid')]  
            

        model = xgb.train(params, train_T, 20000, watchlist, verbose_eval=10000, early_stopping_rounds=500)
        

        # val
        val_pred = model.predict(val_T)
        pred['val_pred'].append(np.exp(val_pred))              
        mape['val_mape'].append(MAPE_exp(y_val, val_pred))


        # test 
        X_test = scaler.transform(X_test_)
        test_T = xgb.DMatrix(X_test) 

        test_pred = model.predict(test_T)
        pred['test_pred'].append(np.exp(test_pred))
        mape['test_mape'].append(MAPE_exp(y_test_, test_pred))

                    
    
    return mape, pred

In [63]:
xgb_params1 = {'min_child_weight': 1, 'eta': 0.02, 'colsample_bytree': 0.9, 'n_estimators' : 10000,
               'max_depth': 20, 'subsample': 0.9, 'lambda': 1. , 'booster' : 'gbtree', 'silent': 1,
               'predictor': 'gpu_predictor', 'objective': 'reg:linear', 'metrics':'mae', 'gamma' : 0.01}

In [64]:
xgb_mape1, xgb_pred1 = xgb_model(X, y2, xgb_params1, 5)

[0]	train-rmse:15.7649	valid-rmse:15.7419
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
Stopping. Best iteration:
[1505]	train-rmse:0.03322	valid-rmse:0.426456

[0]	train-rmse:15.7613	valid-rmse:15.7565
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
Stopping. Best iteration:
[3150]	train-rmse:0.030091	valid-rmse:0.420489

[0]	train-rmse:15.7595	valid-rmse:15.7639
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
Stopping. Best iteration:
[1498]	train-rmse:0.032823	valid-rmse:0.439343

[0]	train-rmse:15.7568	valid-rmse:15.7746
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
Stopping. Best iteration:
[

In [65]:
pd.DataFrame(xgb_mape1)

Unnamed: 0,val_mape,test_mape
0,34.099815,33.774486
1,33.68469,33.544361
2,36.054443,33.788598
3,33.672722,33.864143
4,34.499581,32.908635


In [66]:
print(np.mean(xgb_mape1['val_mape']), np.mean(xgb_mape1['test_mape']))

34.402250361874444 33.576044777386606


### xgb_params2
* optuna

In [13]:
def objective_xgb(trial, X, y):
    
    params = {
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 1.0),
        'max_depth': trial.suggest_int('max_depth', 4, 32),
        'eta': trial.suggest_uniform('eta', 0.001, 0.01),
        'gamma': trial.suggest_loguniform('gamma', 1e-3, 1.0),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'subsample': trial.suggest_uniform('subsample', 1e-3, 1.0),  
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 1e-3, 1.0),  
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 1e-3, 1.0),  
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 10.0), 
        'max_bin': trial.suggest_int('max_bin', 63, 256),  
        'nthread': -1,
        'silent' : 1, 
        'objective': 'reg:squarederror',
        'eval_metric' : 'mae',
        'n_estimators': 20000,
        'seed': 77
    }
    
    mape, pred = xgb_model(X, y, params, 5)

    return np.mean(mape['test_mape'])

In [None]:
%%time

xgb_study1 = optuna.create_study()
xgb_study1.optimize(lambda x: objective_xgb(x, X, y2), timeout=1000, n_jobs=-1)
print(xgb_study1.best_params, xgb_study1.best_value)

[0]	train-mae:15.9255	valid-mae:15.9019
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 500 rounds.
[0]	train-mae:15.9283	valid-mae:15.9048
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 500 rounds.
[0]	train-mae:16.0116	valid-mae:15.988
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 500 rounds.
[0]	train-mae:15.9315	valid-mae:15.908
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 500 rounds.
Stopping. Best iteration:
[2814]	train-mae:0.140114	valid-mae:0.312487

[0]	train-mae:15.9218	valid-mae:15.9162
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 500 rounds.
Stopping. 

In [None]:
xgb_params2 = lgbm_study1.best_params.copy()
xgb_params2['n_jobs'] = -1
xgb_params2['num_iterations'] = 20000
xgb_params2['objective'] = 'reg:squarederror'
xgb_params2['metric'] = 'mae'
xgb_params2['is_training_metric'] = True
xgb_params2['silent'] = 1

In [None]:
%%time

xgb_mape2, xgb_pred2 = xgb_model(X, y2, lgbm_params2, 5)

In [None]:
pd.DataFrame(xgb_mape2)

In [None]:
print(np.mean(xgb_mape2['val_mape']), np.mean(xgb_mape2['test_mape']))

# Results

In [None]:
final = {
    'lgbm_6th' : [lgbm_mape6, lgbm_pred6, lgbm_feat_imp6] , 
   # 'lgbm_7th' : [lgbm_mape7, lgbm_pred7, lgbm_feat_imp7] 
}

In [None]:
with open("0916_pred.pkl", 'wb') as f:
    pkl.dump(final,f)