In [20]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna.integration import LightGBMPruningCallback

# Rendering limits for DataFrames shown in cell outputs:
# up to 1000 columns, at most 20 rows.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 20

In [21]:
# Single place to change when the competition files live somewhere else.
DATA_DIR = '../input/tabular-playground-series-mar-2021'

# Submission template; its 'target' column is overwritten with predictions later on.
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
# Both frames use the first CSV column as index. Note that X_train still
# contains the 'target' column at this point; it is split off further down.
X_train = pd.read_csv(f'{DATA_DIR}/train.csv', index_col=0)
X_test = pd.read_csv(f'{DATA_DIR}/test.csv', index_col=0)

In [22]:
# Turn every object-typed column into a pandas categorical, first in the
# training frame and then in the test frame. Each frame is converted on
# its own, so the category levels come from that frame's values only.
for frame in (X_train, X_test):
    object_columns = frame.columns[frame.dtypes == "object"].tolist()
    for col in object_columns:
        frame[col] = frame[col].astype('category')

In [23]:
# Number of stratified folds used by the Optuna objective.
K = 8

# Separate the label from the features of the training frame.
y = X_train['target']
X = X_train.drop(columns='target')

# LightGBM settings that stay the same for every trial. n_estimators is
# a deliberately large ceiling: training is cut short by early stopping
# inside fit_with_stop.
fixedparams = dict(
    random_state=42,
    n_estimators=20000,
    learning_rate=0.1,
    metric='auc',
    verbose=-1,
)

In [24]:
# Functions for KFold evaluation
def model_instance(hyperparams, fixedparams):
    """Build the (optional under-sampling -> LightGBM) pipeline for one configuration.

    Parameters
    ----------
    hyperparams : dict
        ``{'undersample': bool, 'clf': {...}}``. The ``clf`` entries are
        forwarded as keyword arguments to ``LGBMClassifier``.
    fixedparams : dict
        Keyword arguments shared by every configuration; merged with
        ``hyperparams['clf']`` (the two dicts must not share keys). Its
        ``'random_state'`` entry, when present, also seeds the
        under-sampler.

    Returns
    -------
    imblearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'undersample'`` and ``'clf'``.
    """
    clf = LGBMClassifier(**hyperparams['clf'], **fixedparams)

    if hyperparams['undersample']:
        # Seed the sampler like the rest of the pipeline; otherwise two runs
        # of the same configuration draw different majority-class subsets.
        undersample = RandomUnderSampler(
            sampling_strategy='majority',
            random_state=fixedparams.get('random_state'))
    else:
        # A None step is skipped by the pipeline.
        undersample = None

    pipe = Pipeline([('undersample', undersample),
                     ('clf', clf)])
    return pipe

In [25]:
def fit_with_stop(model, X, y, X_val, y_val, trial, early_stopping_rounds=100):
    """Fit ``model`` with early stopping monitored on a validation fold.

    Parameters
    ----------
    model : pipeline-like
        Object whose ``fit`` accepts ``clf__*`` keyword arguments (they are
        meant for the final step named ``clf``).
    X, y : training features and labels.
    X_val, y_val : validation features and labels, passed as ``eval_set``.
    trial : optuna trial, ``0`` or ``None``
        When a trial is given, a ``LightGBMPruningCallback`` watching
        ``'auc'`` is attached. ``0`` (the sentinel used by
        ``kfold_prediction``) or ``None`` disables pruning.
    early_stopping_rounds : int, default 100
        Passed through as ``clf__early_stopping_rounds``.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    if trial is None or trial == 0:
        pruning_callback = None
    else:
        pruning_callback = [LightGBMPruningCallback(trial, 'auc')]

    # NOTE(review): the `early_stopping_rounds` / `verbose` fit keywords belong
    # to the LightGBM 3.x scikit-learn API; LightGBM 4 expects callbacks
    # instead - confirm against the installed version before upgrading.
    model.fit(X, y,
              clf__eval_set=[(X_val, y_val)],
              clf__early_stopping_rounds=early_stopping_rounds,
              clf__verbose=0,
              clf__eval_metric="auc",
              clf__callbacks=pruning_callback)
    return model

In [26]:
def evaluate(model, X, y):
    """Return the ROC AUC of a fitted classifier on ``(X, y)``.

    The score is computed from the second column of
    ``model.predict_proba(X)``.
    """
    positive_proba = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, positive_proba)

In [27]:
def kfold_prediction(X, y, X_test, k, hyperparams, fixedparams):
    """Predict ``X_test`` with the average of ``k`` fold-wise fits.

    ``(X, y)`` is split into ``k`` shuffled stratified folds (seed 42).
    For each split the pipeline built from ``hyperparams`` / ``fixedparams``
    is fitted on the training part, with early stopping on the held-out
    part and no Optuna pruning. The second ``predict_proba`` column on
    ``X_test`` contributes ``1/k`` of the returned vector.

    Returns
    -------
    numpy.ndarray of length ``len(X_test)``.
    """
    averaged = np.zeros(len(X_test))

    print(f"\n------ {k}-fold evaluation -----")
    print(hyperparams)

    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    pipeline = model_instance(hyperparams, fixedparams)

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y)):
        print(f"\n----- FOLD {fold_no} -----")

        # The trailing 0 tells fit_with_stop that there is no trial to prune.
        fitted = fit_with_stop(pipeline,
                               X.iloc[fit_rows], y.iloc[fit_rows],
                               X.iloc[holdout_rows], y.iloc[holdout_rows],
                               0)
        averaged += fitted.predict_proba(X_test)[:, 1] / k

    return averaged

In [28]:
def objective(trial):
    """Optuna objective: mean validation AUC over ``K`` stratified folds.

    Samples one configuration from ``trial``, builds the pipeline with
    ``model_instance`` and fits it once per fold with ``fit_with_stop``
    (early stopping plus the Optuna pruning callback). Reads the
    module-level ``X``, ``y``, ``K`` and ``fixedparams``.

    Parameters
    ----------
    trial : optuna trial used to sample the hyper-parameters.

    Returns
    -------
    float
        ``np.nanmean`` of the per-fold AUC scores.
    """
    hyperparams = {
        'undersample': trial.suggest_categorical("undersample", [True, False]),
        'clf': {
            'boosting_type': trial.suggest_categorical("boosting_type", ['gbdt', 'goss']),
            # Candidates are 2**n - 1 for n = 4 .. 12.
            'num_leaves': trial.suggest_categorical(
                "num_leaves", [15, 31, 63, 127, 255, 511, 1023, 2047, 4095]),
            # 'max_depth': trial.suggest_int('max_depth', 1, 32), # default max_depth=-1

            'max_bin': trial.suggest_int('max_bin', 32, 255),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 256),
            'min_data_in_bin': trial.suggest_int('min_data_in_bin', 1, 256),

            'reg_alpha': trial.suggest_float('reg_alpha', 1E-16, 25),
            'reg_lambda': trial.suggest_float('reg_lambda', 1E-16, 25),
            # Same grid as the former suggest_discrete_uniform(..., 0, 5, 0.01).
            'min_split_gain': trial.suggest_float('min_split_gain', 0, 5, step=0.01),

            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
            # NOTE(review): LightGBM presumably ignores `subsample` unless
            # `subsample_freq` is also set - confirm this parameter has an effect.
            'subsample': trial.suggest_float('subsample', 0.2, 1.0),

            # Integer suggestions take integer bounds.
            'cat_smooth': trial.suggest_int('cat_smooth', 1, 50),
            'cat_l2': trial.suggest_int('cat_l2', 1, 20),
        },
    }

    kf = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)
    scores = []
    model = model_instance(hyperparams, fixedparams)

    # NOTE(review): the same trial is handed to every fold, so the pruning
    # callback starts reporting from the first boosting round again in each
    # fold - confirm how Optuna treats these repeated steps.
    for train_idx, test_idx in kf.split(X, y):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[test_idx]
        y_val = y.iloc[test_idx]

        model_fit = fit_with_stop(model, X_train, y_train, X_val, y_val, trial)
        scores.append(evaluate(model_fit, X_val, y_val))

    return np.nanmean(scores)


In [29]:
# Optimization
# In-memory study that maximises the objective (mean fold AUC) and uses
# Optuna's median pruner to decide which trials to stop early.
median_pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(pruner=median_pruner, direction='maximize')

[32m[I 2021-03-21 18:21:20,293][0m A new study created in memory with name: no-name-8dab48a6-abd7-4e47-9fe9-2df7ecdea184[0m


In [30]:
%%time
import warnings

# Blanket filter: hides every Python warning from here on, for the whole
# session and not only for this cell.
warnings.filterwarnings('ignore')

# Search budget: five hours of wall-clock time with no cap on the number of
# trials; n_jobs=-1 lets Optuna choose the number of parallel workers.
search_budget = dict(
    timeout=60 * 60 * 5,
    n_trials=None,
    n_jobs=-1,
    gc_after_trial=False,
)
study.optimize(objective, **search_budget)

[32m[I 2021-03-21 18:23:43,243][0m Trial 5 finished with value: 0.8933694355887356 and parameters: {'undersample': False, 'boosting_type': 'gbdt', 'num_leaves': 127, 'max_bin': 88, 'min_data_in_leaf': 77, 'min_data_in_bin': 50, 'reg_alpha': 21.854214272257362, 'reg_lambda': 10.988377856101659, 'min_split_gain': 2.31, 'colsample_bytree': 0.395129984214706, 'subsample': 0.7578533873463729, 'cat_smooth': 45, 'cat_l2': 19}. Best is trial 5 with value: 0.8933694355887356.[0m
[32m[I 2021-03-21 18:24:31,726][0m Trial 3 finished with value: 0.8946449330954886 and parameters: {'undersample': False, 'boosting_type': 'gbdt', 'num_leaves': 127, 'max_bin': 198, 'min_data_in_leaf': 124, 'min_data_in_bin': 67, 'reg_alpha': 15.134306531614, 'reg_lambda': 4.764365291387815, 'min_split_gain': 0.28, 'colsample_bytree': 0.7836512742049344, 'subsample': 0.5971508825176808, 'cat_smooth': 31, 'cat_l2': 3}. Best is trial 3 with value: 0.8946449330954886.[0m
[32m[I 2021-03-21 18:24:31,781][0m Trial 6 p

CPU times: user 17min 24s, sys: 1min 24s, total: 18min 48s
Wall time: 4min 54s


In [31]:
# Show the study's trials as a table (one row per trial).
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_boosting_type,params_cat_l2,params_cat_smooth,params_colsample_bytree,params_max_bin,params_min_data_in_bin,params_min_data_in_leaf,params_min_split_gain,params_num_leaves,params_reg_alpha,params_reg_lambda,params_subsample,params_undersample,state
0,0,0.892852,2021-03-21 18:21:20.464875,2021-03-21 18:25:47.276076,0 days 00:04:26.811201,gbdt,7,20,0.775305,44,158,46,0.92,63,16.35071,5.350964,0.361463,True,COMPLETE
1,1,0.89356,2021-03-21 18:21:20.471581,2021-03-21 18:26:14.475924,0 days 00:04:54.004343,gbdt,5,37,0.649467,81,90,213,2.14,31,9.490799,22.970041,0.496804,False,COMPLETE
2,2,0.892133,2021-03-21 18:21:20.476591,2021-03-21 18:25:47.776152,0 days 00:04:27.299561,gbdt,15,11,0.886365,34,132,139,4.87,1023,9.534321,20.456481,0.258022,False,COMPLETE
3,3,0.893611,2021-03-21 18:21:20.481873,2021-03-21 18:26:14.977720,0 days 00:04:54.495847,gbdt,17,45,0.337008,178,125,45,3.79,255,2.641658,22.685472,0.360109,False,COMPLETE


In [32]:
# Best objective value found by the study (the objective returns the mean fold AUC).
study.best_value

0.8936114502515469

In [33]:
# Optuna's optimisation-history plot for this study.
plot_optimization_history(study)

In [34]:
# Optuna's parallel-coordinate plot for this study.
optuna.visualization.plot_parallel_coordinate(study)

In [35]:
# Optuna's hyper-parameter importance plot for this study.
plot_param_importances(study)

In [36]:
# Hyper-parameters of the best trial: a flat dict keyed by the names used in `objective`.
study.best_params

{'undersample': False,
 'boosting_type': 'gbdt',
 'num_leaves': 255,
 'max_bin': 178,
 'min_data_in_leaf': 45,
 'min_data_in_bin': 125,
 'reg_alpha': 2.6416583444795667,
 'reg_lambda': 22.685472000474444,
 'min_split_gain': 3.79,
 'colsample_bytree': 0.3370075974014262,
 'subsample': 0.3601086972836532,
 'cat_smooth': 45,
 'cat_l2': 17}

In [37]:
# Reshape the flat best-trial parameters into the nested layout that
# model_instance expects: {'clf': {...}, 'undersample': ...}.
best_clf_params = dict(study.best_params)
use_undersampling = best_clf_params.pop('undersample')
final_params = {'clf': best_clf_params, 'undersample': use_undersampling}

# The final fit uses learning rate 0.01. This overwrites the entry in the
# shared fixedparams dict, so anything reading it afterwards sees 0.01 too.
fixedparams['learning_rate'] = 0.01

In [38]:
%%time

# Fit the bagged K-fold model with the tuned parameters and write the averaged
# test probabilities into the submission template. K is the same fold count
# the search used. Plain column assignment replaces the column, so it takes
# the dtype of the predictions whatever dtype the template column had.
submission['target'] = kfold_prediction(X, y, X_test, K, final_params, fixedparams)
submission.to_csv('submission.csv', index=False)


------ 8-fold evaluation -----
{'clf': {'boosting_type': 'gbdt', 'num_leaves': 255, 'max_bin': 178, 'min_data_in_leaf': 45, 'min_data_in_bin': 125, 'reg_alpha': 2.6416583444795667, 'reg_lambda': 22.685472000474444, 'min_split_gain': 3.79, 'colsample_bytree': 0.3370075974014262, 'subsample': 0.3601086972836532, 'cat_smooth': 45, 'cat_l2': 17}, 'undersample': False}

----- FOLD 0 -----

----- FOLD 1 -----

----- FOLD 2 -----


[32m[I 2021-03-21 18:32:14,457][0m Trial 7 finished with value: 0.894259841165521 and parameters: {'undersample': False, 'boosting_type': 'goss', 'num_leaves': 31, 'max_bin': 34, 'min_data_in_leaf': 95, 'min_data_in_bin': 39, 'reg_alpha': 22.70462473524685, 'reg_lambda': 22.6764572679783, 'min_split_gain': 1.84, 'colsample_bytree': 0.20304933341039488, 'subsample': 0.4232192817771033, 'cat_smooth': 45, 'cat_l2': 20}. Best is trial 3 with value: 0.8946449330954886.[0m



----- FOLD 3 -----

----- FOLD 4 -----

----- FOLD 5 -----

----- FOLD 6 -----

----- FOLD 7 -----
CPU times: user 36min 47s, sys: 1min 18s, total: 38min 6s
Wall time: 10min 46s


References:

- <https://www.kaggle.com/rmiperrier/tps-mar-lgbm-optuna>
- <https://towardsdatascience.com/how-to-make-your-model-awesome-with-optuna-b56d490368af>
- <https://optuna.readthedocs.io/en/v1.0.0/tutorial/pruning.html>
- <https://www.kaggle.com/kst6690/dsb2019-tuning-lightgbm-parameter-using-optuna>