In [12]:
import os
import sys
import joblib
import pickle as pkl

In [1]:
import pandas as pd 
import numpy as np

import joblib
import pickle as pkl

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore')
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [2]:
# MAPE 
def MAPE(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
# MAPE_exp  
def MAPE_exp(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((np.expm1(y_true) - np.expm1(y_pred)) / np.expm1(y_true))) * 100

# Modeling 

In [9]:
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [10]:
import shap

import optuna 
from bayes_opt import BayesianOptimization 

### xgboost 

In [17]:
# for hyperparameter tuning : xgboost 
def xgb_oof(X, y, params, version, x_test=None, shap=False, scaling=False, cv_splits=5, epoch=30000):

    mape = {'val_mape'   : [], 'final_mape' : []}
    SHAP = {'shap_values': [], 'expected_values' : []}
    pred = {'val_logpred': [], 'val_pred' : [], 'test_logpred': [], 'test_pred' : []}

    
    oof_pred = np.zeros(len(X))  # val을 통해 예측한 값 
    if x_test is not None:       # hyperparameter tuning된 test 값으로 예측할 때  
        test_pred = np.zeros(len(x_test))
    
    
    # models save 
    models = [] 
    
    
    # K Fold Cross Validation
    cv = KFold(n_splits=cv_splits, random_state=77, shuffle=True)
    for t,v in cv.split(X):
        X_train , X_val = X.iloc[t] , X.iloc[v]            
        y_train , y_val = y.iloc[t] , y.iloc[v]
        
        
        # scaling : MinMax or Standard 
        if scaling : 
            if scaling == 'MinMax' : 
                scaler = MinMaxScaler()
            elif scaling == 'Standard' : 
                scaler = StandardScaler() 

            X_train = scaler.fit_transform(X_train)
            X_val   = scaler.transform(X_val)
            
            if x_test is not None : 
                X_test = scaler.transform(x_test)
                
        else : 
            if x_test is not None : 
                X_test = x_test 
            
        
        # modeling 
        train_T = xgb.DMatrix(X_train, label=y_train, missing=np.nan) 
        val_T   = xgb.DMatrix(X_val,   label=y_val  , missing=np.nan)     

        watchlist = [(train_T, 'train'), (val_T, 'valid')]  
            

        model = xgb.train(params, train_T, epoch, watchlist, verbose_eval=2500, early_stopping_rounds=500)
        oof_pred[v] = model.predict(X_val)
        models.append(model) 
        
        
        mape['val_mape'].append(MAPE_exp(y_val, oof_pred[v]))

        
        if x_test is not None :
            test_pred += model.predict(X_test) / 5
        
        
        
        # SHAP
        if shap : 
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_test)
            
            SHAP['shap_values'].append(shap_values)
            SHAP['expected_values'].append(explainer.expected_value)    
        
    
    
    mape['final_mape'].append(MAPE_exp(y, oof_pred))

    # preds
    pred['val_logpred'].append(oof_pred)
    pred['val_pred'].append(np.expm1(oof_pred))
    if x_test is not None : 
        pred['test_logpred'].append(test_pred)
        pred['test_pred'].append(np.expm1(test_pred))
    
    
    # save models
    with open('xgb' + version + '.pickle', 'wb') as f:
        pkl.dump(models, f, pkl.HIGHEST_PROTOCOL)
        
        
    # save preds 
    with open('xgb_pred' + version + '.pickle', 'wb') as f:
        pkl.dump(pred, f, pkl.HIGHEST_PROTOCOL)
        
    
    # save shap values 
    if shap : 
        with open('xgb_shap' + version + '.pickle', 'wb') as f:
            pkl.dump(SHAP, f, pkl.HIGHEST_PROTOCOL)  
        
            
            
        
    if x_test is None:
        return models, oof_pred, mape, SHAP, pred 
    else:
        return models, oof_pred, mape, SHAP, pred, test_pred

### lightgbm

In [None]:
# for hyperparameter tuning : lgbm
def lgbm_oof(X, y, params, version, x_test=None, shap=False, scaling=False, cv_splits=5, epoch=30000):

    mape = {'val_mape'   : [], 'final_mape' : []}
    SHAP = {'shap_values': [], 'expected_values' : []}
    pred = {'val_logpred': [], 'val_pred' : [], 'test_logpred': [], 'test_pred' : []}

    
    oof_pred = np.zeros(len(X))  # val을 통해 예측한 값 
    if x_test is not None:       # hyperparameter tuning된 test 값으로 예측할 때  
        test_pred = np.zeros(len(x_test))
    
    
    # models save 
    models = [] 
    
    
    # K Fold Cross Validation
    cv = KFold(n_splits=cv_splits, random_state=77, shuffle=True)
    for t,v in cv.split(X):
        X_train , X_val = X.iloc[t] , X.iloc[v]            
        y_train , y_val = y.iloc[t] , y.iloc[v]
        
        
        # scaling : MinMax or Standard 
        if scaling : 
            if scaling == 'MinMax' : 
                scaler = MinMaxScaler()
            elif scaling == 'Standard' : 
                scaler = StandardScaler() 

            X_train = scaler.fit_transform(X_train)
            X_val   = scaler.transform(X_val)
            
            if x_test is not None : 
                X_test = scaler.transform(x_test)
                
        else : 
            X_train = X_train.values
            X_val = X_val.values
            if x_test is not None : 
                X_test = x_test.values 
            
        
        # modeling 
        train_T = lgb.Dataset(X_train, label=y_train.values) 
        val_T   = lgb.Dataset(X_val,   label=y_val.values)  
            
        
        model = lgb.train(params, train_T, epoch, valid_sets = val_T, verbose_eval=2500, early_stopping_rounds=500)
        oof_pred[v] = model.predict(X_val)
        models.append(model) 
        
        
        mape['val_mape'].append(MAPE_exp(y_val, oof_pred[v]))

        
        if x_test is not None :
            test_pred += model.predict(X_test) / 5
        
        
        
        # SHAP
        if shap : 
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_test)
            
            SHAP['shap_values'].append(shap_values)
            SHAP['expected_values'].append(explainer.expected_value)    
        
    
    
    mape['final_mape'].append(MAPE_exp(y, oof_pred))
       
    # preds
    pred['val_logpred'].append(oof_pred)
    pred['val_pred'].append(np.expm1(oof_pred))
    if x_test is not None : 
        pred['test_logpred'].append(test_pred)
        pred['test_pred'].append(np.expm1(test_pred))
    
    
    # save models
    with open('lgbm' + version + '.pickle', 'wb') as f:
        pkl.dump(models, f, pkl.HIGHEST_PROTOCOL)

    # save preds 
    with open('lgbm_pred' + version + '.pickle', 'wb') as f:
        pkl.dump(pred, f, pkl.HIGHEST_PROTOCOL)
    
    # save shap values 
    if shap : 
        with open('lgbm_shap' + version + '.pickle', 'wb') as f:
            pkl.dump(SHAP, f, pkl.HIGHEST_PROTOCOL)  
    
            
        
    if x_test is None:
        return models, oof_pred, mape, SHAP, pred 
    else:
        return models, oof_pred, mape, SHAP, pred, test_pred

### prediction ! 

In [6]:
# for pred  
def test_pred(X_test, models, version, cv_splits=5):

    SHAP = {'shap_values': [], 'expected_values' : []}    
    pred = {'log_pred': [], 'pred' : []}
    test_pred = np.zeros(len(X_test))  # test dataset 
    
    
    # predict 
    for model in models : 
        test_pred += model.predict(X_test) / cv_splits


        # SHAP
        if shap : 
            shap_values = explainer.shap_values(X_test)
            explainer = shap.TreeExplainer(model)

            SHAP['shap_values'].append(shap_values)
            SHAP['expected_values'].append(explainer.expected_value)    

    # preds
    pred['log_pred'].append(test_pred)
    pred['pred'].append(np.expm1(test_pred))
            
                
    # save test preds 
    with open('testpred_' + version + '.pickle', 'wb') as f:
        pkl.dump(pred, f, pkl.HIGHEST_PROTOCOL)    
    
        
    return SHAP, pred

## example 

In [13]:
data_v4 = joblib.load(os.path.join('..', '..', '7th_train_FE.pkl'))
locals().update(data_v4)

In [14]:
X = data_v4["X"] ; y = data_v4["y"]
X.shape, y.shape

((35379, 402), (35379,))

In [19]:
y2 = np.log1p(y)

### optuna parameter tuninig 

In [15]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [18]:
def objective_xgb(trial, X, y):
    params = {
                'lambda': trial.suggest_loguniform('lambda', 1e-3, 1.0),
                'alpha': trial.suggest_loguniform('alpha', 1e-3, 1.0),
                'max_depth': trial.suggest_int('max_depth', 4, 63),
                'eta': trial.suggest_uniform('eta', 0.001, 0.01),
                'gamma': trial.suggest_loguniform('gamma', 1e-3, 1.0),
                'subsample': trial.suggest_uniform('subsample', 1e-3, 1.0), 
                'colsample_bytree': trial.suggest_uniform('colsample_bytree', 1e-3, 1.0),  
                'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 1e-3, 1.0),  
                'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 15),  
                'max_bin': trial.suggest_int('max_bin', 31, 256), 
              
              
                'nthread': -1, 
                'silent' : 1 , 
                'objective': 'reg:squarederror',
                'n_estimators': 30000,
                'seed': 77
    }
    
    models, oof_pred, mape, SHAP = xgb_oof(X, y, params, '0925_optuna1', x_test=None, shap=False, scaling='MinMax', cv_splits=5, epoch=30000)
    return mape['final_mape'][0]

In [20]:
%%time
xgb_study1 = optuna.create_study()
xgb_study1.optimize(lambda x : objective_xgb(x, X, y2), timeout=1000, n_jobs=-1)
print(xgb_study1.best_params, xgb_study1.best_value)

[0]	train-rmse:15.95422	valid-rmse:15.94957
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
[0]	train-rmse:15.97882	valid-rmse:15.97415
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
[0]	train-rmse:16.01633	valid-rmse:16.01168
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
[0]	train-rmse:15.98757	valid-rmse:15.98289
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
[0]	train-rmse:15.99263	valid-rmse:15.98803
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
[0]	train-rmse:16.02331	valid-rmse:16.01868
Multip

KeyboardInterrupt: 

[5000]	train-rmse:0.14203	valid-rmse:0.44624
Stopping. Best iteration:
[4622]	train-rmse:0.14697	valid-rmse:0.44584



[W 2020-09-25 03:12:42,077] Trial 2 failed because of the following error: TypeError('Expecting data to be a DMatrix object, got: ', <class 'numpy.ndarray'>)
Traceback (most recent call last):
  File "/Users/jbeen/anaconda3/lib/python3.8/site-packages/optuna/study.py", line 778, in _run_trial
    result = func(trial)
  File "<timed exec>", line 2, in <lambda>
  File "<ipython-input-18-c704345be25b>", line 22, in objective_xgb
    models, oof_pred, mape, SHAP = xgb_oof(X, y, params, '0925_optuna1', x_test=None, shap=False, scaling='MinMax', cv_splits=5, epoch=30000)
  File "<ipython-input-17-040f7394cfde>", line 50, in xgb_oof
    oof_pred[v] = model.predict(X_val)
  File "/Users/jbeen/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 1360, in predict
    raise TypeError('Expecting data to be a DMatrix object, got: ',
TypeError: ('Expecting data to be a DMatrix object, got: ', <class 'numpy.ndarray'>)


[2500]	train-rmse:0.23485	valid-rmse:0.41432
[2500]	train-rmse:0.08711	valid-rmse:0.40992
