In [2]:
import os
import sys
import joblib
import pickle as pkl

In [3]:
import pandas as pd 
import numpy as np

import joblib
import pickle as pkl

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore')
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [None]:
# !git clone https://github.com/Microsoft/LightGBM
# cd LightGBM
# !mkdir build
# !cmake -DUSE_GPU=1
# !make -j$(nproc)
# !sudo apt-get -y install python-pip
# !sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
# %cd /content/LightGBM/python-package
# !sudo python setup.py install
# !pip3 install scikit-learn==0.21.3 --upgrade

In [4]:
# MAPE 
def MAPE(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [5]:
# MAPE_exp  
def MAPE_exp(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((np.expm1(y_true) - np.expm1(y_pred)) / np.expm1(y_true))) * 100

# Modeling 

In [6]:
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [8]:
import shap
import optuna 

### lightgbm

In [8]:
# for hyperparameter tuning : lgbm
def lgbm_oof(X, y, params, version, x_test=None, scaling=False, shap=False, cv_splits=5, epoch=30000):

    mape = {'val_mape'   : [], 'final_mape' : []}
    pred = {'val_logpred': [], 'val_pred' : [], 'test_logpred': [], 'test_pred' : []}
    scal = {'train_scaler' : []}
    SHAP = {'shap_values': [], 'expected_values' : []}

    
    oof_pred = np.zeros(len(X))  # val을 통해 예측한 값 
    if x_test is not None:       # hyperparameter tuning된 test 값으로 예측할 때  
        test_pred = np.zeros(len(x_test))
    
    
    # models save 
    models = [] 
    
    
    # K Fold Cross Validation
    cv = KFold(n_splits=cv_splits, random_state=77, shuffle=True)
    for t,v in cv.split(X):
        X_train , X_val = X.iloc[t] , X.iloc[v]            
        y_train , y_val = y.iloc[t] , y.iloc[v]
        
        
        # scaling : MinMax or Standard 
        if scaling : 
            if scaling == 'MinMax' : 
                scaler = MinMaxScaler()
            elif scaling == 'Standard' : 
                scaler = StandardScaler() 

            X_train = scaler.fit_transform(X_train)
            X_val   = scaler.transform(X_val)

            scal.append(scaler)
            
            if x_test is not None : 
                X_test = scaler.transform(x_test)
                
        else : 
            X_train = X_train.values
            X_val = X_val.values
            if x_test is not None : 
                X_test = x_test.values 
            
        

        # modeling 
        train_T = lgb.Dataset(X_train, label=y_train.values) 
        val_T   = lgb.Dataset(X_val,   label=y_val.values)  
            
        
        model = lgb.train(params, train_T, epoch, valid_sets = val_T, verbose_eval=2500, early_stopping_rounds=500)
        oof_pred[v] = model.predict(X_val)
        models.append(model) 
        
        
        mape['val_mape'].append(MAPE_exp(y_val, oof_pred[v]))

        
        if x_test is not None :
            test_pred += model.predict(X_test) / 5
        
        
        
        # SHAP
        if shap : 
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_test)
            
            SHAP['shap_values'].append(shap_values)
            SHAP['expected_values'].append(explainer.expected_value)    
        
    
    
    mape['final_mape'].append(MAPE_exp(y, oof_pred))
       
    # preds
    pred['val_logpred'].append(oof_pred)
    pred['val_pred'].append(np.expm1(oof_pred))
    if x_test is not None : 
        pred['test_logpred'].append(test_pred)
        pred['test_pred'].append(np.expm1(test_pred))
    
    
    # save models
    with open('lgbm' + version + '.pickle', 'wb') as f:
        pkl.dump(models, f, pkl.HIGHEST_PROTOCOL)

    # save preds 
    with open('lgbm_pred' + version + '.pickle', 'wb') as f:
        pkl.dump(pred, f, pkl.HIGHEST_PROTOCOL)
    
    # save scaler 
    if scaling : 
        with open('lgbm_scaler' + version + '.pickle', 'wb') as f:
            pkl.dump(scal, f, pkl.HIGHEST_PROTOCOL)  

    # save shap values 
    if shap : 
        with open('lgbm_shap' + version + '.pickle', 'wb') as f:
            pkl.dump(SHAP, f, pkl.HIGHEST_PROTOCOL)  
    
            
        
    if x_test is None:
        return models, oof_pred, mape, pred, scal, SHAP 
    else:
        return models, oof_pred, mape, pred, scal, SHAP, test_pred

### prediction ! 

In [45]:
# for pred  
def test_pred(X_test, models, version, cv_splits=5, scaling=None, shap=False):

    SHAP = {'shap_values': [], 'expected_values' : []}    
    pred = {'log_pred': [], 'pred' : []}
    test_pred = np.zeros(len(X_test))  # test dataset 
    
    
    # predict 
    if scaling is not None : 
      for model in models : 
          test_pred += model.predict(X_test) / cv_splits

    else : 
      for model, scaler in zip(models, scaling) : 
          X_test_ = scaler.transform(X_test)
          test_pred += model.predict(X_test_) / cv_splits


        # SHAP
        if shap : 
            shap_values = explainer.shap_values(X_test)
            explainer = shap.TreeExplainer(model)

            SHAP['shap_values'].append(shap_values)
            SHAP['expected_values'].append(explainer.expected_value)    


    # preds
    pred['log_pred'].append(test_pred)
    pred['pred'].append(np.expm1(test_pred))
            
                
    # save test preds 
    with open('testpred_' + version + '.pickle', 'wb') as f:
        pkl.dump(pred, f, pkl.HIGHEST_PROTOCOL)    
    
        
    return SHAP, pred

# Data Load 

In [11]:
cd /content/drive/My Drive/쇼핑광고등어/3. Modeling/재빈

/content/drive/.shortcut-targets-by-id/1cWc-Nky29igOCXv9H2nYRUOItUS3kV8X/쇼핑광고등어/3. Modeling/재빈


In [10]:
data_v4 = joblib.load(os.path.join('..', '..', '1. Data', '05_분석데이터', '7th_train_FE.pkl'))
locals().update(data_v4)

In [11]:
X = data_v4["X"] ; y = data_v4["y"]
X.shape, y.shape

((35379, 402), (35379,))

In [12]:
y2 = np.log1p(y)

### lgbm_params1

In [17]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [20]:
def objective_lgbm(trial, X, y):
    
    params = {
        'num_leaves': int(trial.suggest_loguniform('num_leaves', 8, 64)),  
        'max_depth': trial.suggest_int('max_depth', 8, 128), 
        'min_child_samples': trial.suggest_int('min_child_samples', 16, 64),  
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 5.0),  
        'learning_rate': trial.suggest_uniform('learning_rate', 0.001, 0.01),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.01, 1.0), 
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.01, 1.0), 
        'min_split_gain': trial.suggest_loguniform('min_split_gain', 0.01, 0.05),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0), 
        'boost_from_average': trial.suggest_categorical('boost_from_average', [True, False]), 
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),  
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 7), 
        'max_bin': trial.suggest_int('max_bin', 32, 256),  

        'seed': 77,
        'n_jobs': -1,
        'device_type' : 'gpu', 
        'objective': 'regression',
        'num_iterations': 30000,
        'metric': 'mape',
        'importance_type': 'gain'
    }
    

    models, oof_pred, mape, SHAP, pred = lgbm_oof(X, y, params, '0925_optuna1', x_test=None, shap=False, scaling='MinMax', cv_splits=5, epoch=30000)
    return mape['final_mape'][0]

In [24]:
%%time
lgbm_study1 = optuna.create_study()
lgbm_study1.optimize(lambda x : objective_lgbm(x, X, y2), timeout=1000, n_jobs=-1)
print(lgbm_study1.best_params, lgbm_study1.best_value)

[10000]	valid_0's mape: 0.0225283
Training until validation scores don't improve for 500 rounds
Training until validation scores don't improve for 500 rounds
Training until validation scores don't improve for 500 rounds
Training until validation scores don't improve for 500 rounds
[10000]	valid_0's mape: 0.023095
Training until validation scores don't improve for 500 rounds
[25000]	valid_0's mape: 0.0185404
[7500]	valid_0's mape: 0.0203225
[12500]	valid_0's mape: 0.0218751
[12500]	valid_0's mape: 0.0179172
[12500]	valid_0's mape: 0.0224011
[25000]	valid_0's mape: 0.0189385
[10000]	valid_0's mape: 0.019716
[15000]	valid_0's mape: 0.0213639
[15000]	valid_0's mape: 0.0218644
[27500]	valid_0's mape: 0.0184307
[2500]	valid_0's mape: 0.0202205
[10000]	valid_0's mape: 0.0175547
[17500]	valid_0's mape: 0.0209439
[12500]	valid_0's mape: 0.0193174
[27500]	valid_0's mape: 0.018799
[17500]	valid_0's mape: 0.0214389
[30000]	valid_0's mape: 0.0183485
Did not meet early stopping. Best iteration is:
[

In [25]:
lgbm_params1 = lgbm_study1.best_params.copy()
lgbm_params1['num_leaves'] = int(lgbm_params1['num_leaves'])
lgbm_params1['n_jobs'] = -1
lgbm_params1['device_type'] = 'gpu', 
lgbm_params1['num_iterations'] = 30000
lgbm_params1['objective'] = 'regression'
lgbm_params1['metric'] = 'mape'
lgbm_params1['is_training_metric'] = True
lgbm_params1['verbose'] = -1

In [26]:
lgbm_models1, lgbm_oof_pred1, lgbm_mape1, lgbm_SHAP1, lgbm_pred1 = lgbm_oof(X, y2, lgbm_params1, '0925_optuna1', x_test=None, shap=False, scaling='MinMax', cv_splits=5, epoch=30000)

Training until validation scores don't improve for 500 rounds
[2500]	valid_0's mape: 0.0220874
[5000]	valid_0's mape: 0.0199584
[7500]	valid_0's mape: 0.0191116
[10000]	valid_0's mape: 0.0186186
[12500]	valid_0's mape: 0.0183064
[15000]	valid_0's mape: 0.0180852
[17500]	valid_0's mape: 0.0179287
[20000]	valid_0's mape: 0.01783
[22500]	valid_0's mape: 0.017754
[25000]	valid_0's mape: 0.0176957
[27500]	valid_0's mape: 0.0176512
[30000]	valid_0's mape: 0.0176184
Did not meet early stopping. Best iteration is:
[29996]	valid_0's mape: 0.0176183
Training until validation scores don't improve for 500 rounds
[2500]	valid_0's mape: 0.0226854
[5000]	valid_0's mape: 0.0204527
[7500]	valid_0's mape: 0.0194976
[10000]	valid_0's mape: 0.0189128
[12500]	valid_0's mape: 0.0185388
[15000]	valid_0's mape: 0.0182932
[17500]	valid_0's mape: 0.0181116
[20000]	valid_0's mape: 0.0179794
[22500]	valid_0's mape: 0.0178751
[25000]	valid_0's mape: 0.0177947
[27500]	valid_0's mape: 0.0177371
[30000]	valid_0's map

In [27]:
lgbm_mape1

{'final_mape': [32.01987603976899],
 'val_mape': [32.160711058602736,
  32.75895467177284,
  31.501214419563002,
  31.676794042546696,
  32.001703438157044]}

## model load & pred

In [8]:
# load
with open('lgbm0925_optuna1.pickle', 'rb') as f:
  lgbm_model1 = pkl.load(f)

In [9]:
lgbm_model1

[<lightgbm.basic.Booster at 0x7fdb1bf95400>,
 <lightgbm.basic.Booster at 0x7fdb1bf95b00>,
 <lightgbm.basic.Booster at 0x7fdb1bf95ba8>,
 <lightgbm.basic.Booster at 0x7fdb1bf95c50>,
 <lightgbm.basic.Booster at 0x7fdb1bf95cf8>]

In [53]:
# train, test 잘되나 테스트!!
X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y, test_size=0.2, random_state=0)

In [54]:
%%time
shap1, preds1 = test_pred(X_test_, lgbm_model1, 'testtest', cv_splits=5, shap=False)

CPU times: user 2min 51s, sys: 94.5 ms, total: 2min 51s
Wall time: 43.6 s


In [55]:
preds1['pred']

[array([29608350.94458975, 33809805.27627023, 33682458.63395207, ...,
        20541857.59598601, 30435842.50219661, 20850970.7611865 ])]

In [56]:
y_test_

456      13717000.0
35457    20534000.0
23602     4388000.0
9631     63703000.0
18697     9414000.0
            ...    
666       4763000.0
32657     7866000.0
26855     8205000.0
10636    22946000.0
37795    37192000.0
Name: 취급액, Length: 7076, dtype: float64

In [57]:
MAPE(preds1['pred'], y_test_)

64.94991922503529

In [14]:
# load
with open('lgbm_pred0925_optuna1.pickle', 'rb') as f:
  lgbm_pred1 = pkl.load(f)

In [15]:
lgbm_pred1

{'test_logpred': [],
 'test_pred': [],
 'val_logpred': [array([14.8673097 , 14.99788566, 15.40589897, ..., 17.44356795,
         15.93961522, 17.17870384])],
 'val_pred': [array([ 2862796.0636533 ,  3262111.86041748,  4905652.95356748, ...,
         37639625.64281705,  8365403.27208715, 28881268.85953931])]}

In [18]:
len(lgbm_pred1['val_pred'][0])

35379

In [19]:
y

0          2099000.0
1          4371000.0
2          3262000.0
3          6955000.0
4          6672000.0
            ...     
38299     10157000.0
38300     50929000.0
38301    104392000.0
38302     13765000.0
38303     46608000.0
Name: 취급액, Length: 35379, dtype: float64

In [20]:
MAPE(y, lgbm_pred1['val_pred'][0])

32.01987603976899

### lgbm_params2

In [28]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp36-none-any.whl size=11685 sha256=d82e46cb6b6c7ed1d957f240892d41a2fbc001e1b78b9ea913b15f71b07dfc33
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [29]:
params_range = {
                'num_leaves': (8, 64),
                'max_depth': (4, 128),
                'min_child_samples': (16, 64), 
                'reg_alpha': (0.01, 0.99),  
                'reg_lambda': (0.01, 0.99), 
                'min_split_gain': (0.005, 0.035),
                'colsample_bytree': (0.3, 0.9), 
                'subsample': (0.6, 0.9), 
                'subsample_freq': (0, 7),
                'max_bin': (16, 256), 
                'learning_rate' : (.001, .01)
               }

In [30]:
def Bayes_running(num_leaves, max_depth, min_child_samples, reg_alpha, reg_lambda, min_split_gain, colsample_bytree, subsample, subsample_freq, max_bin, learning_rate):
    params = { 
                'boosting': 'gbdt', 'objective': 'regression', 'metric': 'mape', 'is_training_metric': True, 'n_estimators' : 30000, 
              
                "num_leaves": int(round(num_leaves)),
                "max_depth" : int(round(max_depth)),
                'min_child_samples': int(round(min_child_samples)), 
                'reg_lambda': max(reg_lambda, 0), 
                "min_split_gain": min_split_gain,
                'colsample_bytree': colsample_bytree, 
                'subsample': max(min(subsample,1),0), 
                'subsample_freq': int(round(subsample_freq)),
                'max_bin': int(round(max_bin)), 
              
                "learning_rate": learning_rate, 
                "n_jobs" : -1,
                'device_type' : 'gpu', 
                'verbose' : -1
              
    }
    
    print(params)

    models, oof_pred, mape, SHAP, pred = lgbm_oof(X, y2, params, '0925_optuna1', x_test=None, shap=False, scaling='MinMax', cv_splits=5, epoch=30000)
    return -(mape['final_mape'][0])

In [31]:
%%time 

Bayesian = BayesianOptimization(Bayes_running, params_range, random_state = 77)
Bayesian.maximize(init_points = 10, n_iter = 20)

NameError: ignored

In [None]:
print('Final result: ', Bayesian.max)

In [None]:
# prior은 정확한 관측치로부터 얻어진 것이 아니므로, 대략적인 정수를 사용해도 무방하다고 함!! 
lgbm_params2 = Bayesian.max["params"].copy()
lgbm_params2['num_leaves'] = int(lgbm_params2['num_leaves'])
lgbm_params2["max_depth"] = int(round(lgbm_params2["max_depth"]))
lgbm_params2["min_child_samples"] = int(round(lgbm_params2["min_child_samples"]))
lgbm_params2["subsample_freq"] = int(round(lgbm_params2["subsample_freq"]))
lgbm_params2["max_bin"] = int(round(lgbm_params2["max_bin"]))

lgbm_params2['n_jobs'] = -1
lgbm_params2['num_iterations'] = 30000
lgbm_params2['objective'] = 'regression'
lgbm_params2['metric'] = 'mape'
lgbm_params2['device_type'] = 'gpu'
lgbm_params2['verbose'] = -1
lgbm_params2['is_training_metric'] = True

In [None]:
lgbm_models2, lgbm_oof_pred2, lgbm_mape2, lgbm_SHAP2, lgbm_pred2 = lgbm_oof(X, y2, lgbm_params2, '0925_optuna1', x_test=None, shap=False, scaling='MinMax', cv_splits=5, epoch=30000)