### Parameter Tuning 
* LGBM
* optuna
* GPU ver.

In [1]:
import os
import sys
import joblib
import pickle as pkl

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore')
warnings.filterwarnings(action='ignore', category=DataConversionWarning)


import pandas as pd 
import numpy as np


from IPython.display import display
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.set_option('display.max_info_columns', 500)

In [2]:
# visualize
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn')
plt.rc('font', family='malgun gothic')
plt.rc('axes', unicode_minus=False)

In [None]:
# !git clone https://github.com/Microsoft/LightGBM
# cd LightGBM
# !mkdir build
# !cmake -DUSE_GPU=1
# !make -j$(nproc)
# !sudo apt-get -y install python-pip
# !sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
# %cd /content/LightGBM/python-package
# !sudo python setup.py install
# !pip3 uninstall scikit-learn
# !pip3 install scikit-learn==0.21.3

In [3]:
cd /content/drive/My Drive/Colab Notebooks

/content/drive/My Drive/Colab Notebooks


In [6]:
import BoostingModel as model
import SHAPvalue as SHAP
import optimization as opt

In [7]:
data_v4 = joblib.load(os.path.join('7th_train_FE.pkl'))
locals().update(data_v4)

In [8]:
X = data_v4["X"] ; y = data_v4["y"]
X.shape, y.shape

((35379, 402), (35379,))

In [9]:
y2 = np.log1p(y)

## Modeling

In [10]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

### lgbm_params6

In [12]:
def objective_lgbm(trial, X, y):
    
    params = {
        'num_leaves': int(trial.suggest_loguniform('num_leaves', 8, 64)),  
        'max_depth': trial.suggest_int('max_depth', 8, 64), 
        'min_child_samples': trial.suggest_int('min_child_samples', 16, 64),  
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 5.0),  
        'learning_rate': trial.suggest_uniform('learning_rate', 0.001, 0.01),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.01, 1.0), 
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.01, 1.0), 
        'min_split_gain': trial.suggest_loguniform('min_split_gain', 0.01, 0.05),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0), 
        'boost_from_average': trial.suggest_categorical('boost_from_average', [True, False]), 
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),  
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 7), 
        'max_bin': trial.suggest_int('max_bin', 32, 64),  

        'seed': 77,
        'n_jobs': -1,
        'device_type' : 'gpu', 
        'objective': 'regression',
        'num_iterations': 30000,
        'metric': 'mape',
        'importance_type': 'gain'
    }
    
    mape, pred = opt.lgbm_model(X, y, params, cv_splits=5, scaling="MinMax", epoch=30000)

    return mape['final_mape'][0]

In [13]:
%%time

lgbm_study2 = optuna.create_study()
lgbm_study2.optimize(lambda x: objective_lgbm(x, X, y2), timeout=1000, n_jobs=-1)
print(lgbm_study2.best_params, lgbm_study2.best_value)

Training until validation scores don't improve for 500 rounds
Training until validation scores don't improve for 500 rounds
[2500]	valid_0's mape: 0.0222207
[2500]	valid_0's mape: 0.020197
[5000]	valid_0's mape: 0.020522
[5000]	valid_0's mape: 0.0190858
[7500]	valid_0's mape: 0.0197958
[10000]	valid_0's mape: 0.01939
[7500]	valid_0's mape: 0.0186492
[12500]	valid_0's mape: 0.0191581
[10000]	valid_0's mape: 0.0184756
[15000]	valid_0's mape: 0.0190184
[12500]	valid_0's mape: 0.018398
[17500]	valid_0's mape: 0.0189143
[15000]	valid_0's mape: 0.0183592
[20000]	valid_0's mape: 0.0188393
[17500]	valid_0's mape: 0.0183433
Early stopping, best iteration is:
[17841]	valid_0's mape: 0.0183386
[22500]	valid_0's mape: 0.0187834
Training until validation scores don't improve for 500 rounds
[25000]	valid_0's mape: 0.0187404
Early stopping, best iteration is:
[25678]	valid_0's mape: 0.0187301
[2500]	valid_0's mape: 0.0201242
Training until validation scores don't improve for 500 rounds
[2500]	valid_0

In [14]:
lgbm_params6 = lgbm_study2.best_params.copy()
lgbm_params6['num_leaves'] = int(lgbm_params6['num_leaves'])
lgbm_params6['n_jobs'] = -1
lgbm_params6['device_type'] = 'gpu', 
lgbm_params6['num_iterations'] = 30000
lgbm_params6['objective'] = 'regression'
lgbm_params6['metric'] = 'mape'
lgbm_params6['is_training_metric'] = True
lgbm_params6['verbose'] = -1

In [15]:
lgbm_params6

{'boost_from_average': True,
 'colsample_bytree': 0.9584009927992173,
 'device_type': ('gpu',),
 'is_training_metric': True,
 'learning_rate': 0.00835697155742915,
 'max_bin': 58,
 'max_depth': 30,
 'metric': 'mape',
 'min_child_samples': 42,
 'min_child_weight': 1.5988122260167432,
 'min_split_gain': 0.0165654210239466,
 'n_jobs': -1,
 'num_iterations': 30000,
 'num_leaves': 24,
 'objective': 'regression',
 'reg_alpha': 0.8797261747326673,
 'reg_lambda': 0.49350813300094204,
 'subsample': 0.9513225021923524,
 'subsample_freq': 2,
 'verbose': -1}

In [16]:
%%time

lgbm_mape6, lgbm_pred6, lgbm_shap6 = SHAP.lgbm_SHAP(X, y2, lgbm_params6, version='0921-lgbm-optuna2', cv_splits=5, scaling="MinMax", epoch=30000)

Training until validation scores don't improve for 500 rounds
[2500]	valid_0's mape: 0.0202573
[5000]	valid_0's mape: 0.019106
[7500]	valid_0's mape: 0.018662
[10000]	valid_0's mape: 0.018475
[12500]	valid_0's mape: 0.0183925
[15000]	valid_0's mape: 0.0183653
Early stopping, best iteration is:
[16474]	valid_0's mape: 0.0183516
Training until validation scores don't improve for 500 rounds
[2500]	valid_0's mape: 0.0201943
[5000]	valid_0's mape: 0.0189757
[7500]	valid_0's mape: 0.0184468
[10000]	valid_0's mape: 0.0181995
[12500]	valid_0's mape: 0.018081
[15000]	valid_0's mape: 0.0180178
[17500]	valid_0's mape: 0.0179847
[20000]	valid_0's mape: 0.0179657
Early stopping, best iteration is:
[20769]	valid_0's mape: 0.0179613
Training until validation scores don't improve for 500 rounds
[2500]	valid_0's mape: 0.0210516
[5000]	valid_0's mape: 0.0196477
[7500]	valid_0's mape: 0.0190719
[10000]	valid_0's mape: 0.0187995
[12500]	valid_0's mape: 0.0186624
[15000]	valid_0's mape: 0.0185938
[17500]	v

In [17]:
lgbm_mape6

{'final_mape': [32.776083490318676],
 'test_mape': [33.70492031621013,
  33.681644910111594,
  33.814545798132414,
  33.26083750989338,
  33.93643016750769],
 'val_mape': [33.70619068919631,
  32.46509661361721,
  34.54264678039285,
  32.86193074852601,
  34.53599985578924]}

### lgbm_params2
* 위에꺼랑 파라미터 똑같은데 Standard scaling 해서 사용 노노 

In [26]:
%%time

lgbm_study1 = optuna.create_study()
lgbm_study1.optimize(lambda x: objective_lgbm(x, X, y2), timeout=1000, n_jobs=-1)
print(lgbm_study1.best_params, lgbm_study1.best_value)

Training until validation scores don't improve for 500 rounds
Training until validation scores don't improve for 500 rounds
[5000]	valid_0's mape: 0.0216781
[2500]	valid_0's mape: 0.0284613
Training until validation scores don't improve for 500 rounds
[2500]	valid_0's mape: 0.0235434
[7500]	valid_0's mape: 0.0206289
[10000]	valid_0's mape: 0.020005
[5000]	valid_0's mape: 0.0251375
[5000]	valid_0's mape: 0.0214603
[12500]	valid_0's mape: 0.0195769
[7500]	valid_0's mape: 0.0237265
[15000]	valid_0's mape: 0.0192772
[7500]	valid_0's mape: 0.0204989
[2500]	valid_0's mape: 0.0229585
[17500]	valid_0's mape: 0.0190547
[10000]	valid_0's mape: 0.0228453
[10000]	valid_0's mape: 0.0199464
[20000]	valid_0's mape: 0.0188874
[22500]	valid_0's mape: 0.0187669
[12500]	valid_0's mape: 0.0222306
[12500]	valid_0's mape: 0.01961
[25000]	valid_0's mape: 0.0186722
Did not meet early stopping. Best iteration is:
[25000]	valid_0's mape: 0.0186722
[15000]	valid_0's mape: 0.0217332
[15000]	valid_0's mape: 0.0193

In [27]:
lgbm_params2 = lgbm_study1.best_params.copy()
lgbm_params2['num_leaves'] = int(lgbm_params2['num_leaves'])
lgbm_params2['n_jobs'] = -1
lgbm_params2['device_type'] = 'gpu', 
lgbm_params2['num_iterations'] = 30000
lgbm_params2['objective'] = 'regression'
lgbm_params2['metric'] = 'mape'
lgbm_params2['is_training_metric'] = True
lgbm_params2['verbose'] = -1

In [29]:
lgbm_params2

{'boost_from_average': False,
 'colsample_bytree': 0.9421645130749837,
 'device_type': ('gpu',),
 'is_training_metric': True,
 'learning_rate': 0.0044486228697341815,
 'max_bin': 35,
 'max_depth': 22,
 'metric': 'mape',
 'min_child_samples': 45,
 'min_child_weight': 2.264542217981155,
 'min_split_gain': 0.03108699091566872,
 'n_jobs': -1,
 'num_iterations': 30000,
 'num_leaves': 15,
 'objective': 'regression',
 'reg_alpha': 0.8820218786083174,
 'reg_lambda': 0.026471295694915672,
 'subsample': 0.8410870101253838,
 'subsample_freq': 4,
 'verbose': -1}

In [33]:
%%time

lgbm_mape2, lgbm_pred2, lgbm_shap2 = SHAP.lgbm_SHAP(X, y2, lgbm_params2, version='0921-lgbm-optuna', cv_splits=5, scaling="Standard", epoch=30000)

Training until validation scores don't improve for 500 rounds
[2500]	valid_0's mape: 0.0235572
[5000]	valid_0's mape: 0.0214479
[7500]	valid_0's mape: 0.02051
[10000]	valid_0's mape: 0.0199514
[12500]	valid_0's mape: 0.0195776
[15000]	valid_0's mape: 0.0193103
[17500]	valid_0's mape: 0.0191312
[20000]	valid_0's mape: 0.0189848
[22500]	valid_0's mape: 0.0188847
[25000]	valid_0's mape: 0.0188017
[27500]	valid_0's mape: 0.0187364
[30000]	valid_0's mape: 0.0186915
Did not meet early stopping. Best iteration is:
[30000]	valid_0's mape: 0.0186915
Training until validation scores don't improve for 500 rounds
[2500]	valid_0's mape: 0.0235625
[5000]	valid_0's mape: 0.0213408
[7500]	valid_0's mape: 0.0203425
[10000]	valid_0's mape: 0.0197302
[12500]	valid_0's mape: 0.0193319
[15000]	valid_0's mape: 0.0190137
[17500]	valid_0's mape: 0.0187706
[20000]	valid_0's mape: 0.0185957
[22500]	valid_0's mape: 0.0184683
[25000]	valid_0's mape: 0.0183503
[27500]	valid_0's mape: 0.0182683
[30000]	valid_0's ma

In [34]:
lgbm_mape2

{'final_mape': [33.66846099230518],
 'test_mape': [34.27476909639607,
  34.34494129084431,
  34.480448985031124,
  34.066059999321816,
  34.3199544625717],
 'val_mape': [34.53151182348836,
  33.15114511547503,
  35.42986782793108,
  33.59155226231744,
  35.146519138391696]}