# Problem definition


-  Details:

According to the description: "The dataset is used for this competition is synthetic but based on a real dataset and generated using a CTGAN. The original dataset deals with predicting the amount of an insurance claim. Although the features are anonymized, they have properties relating to real-world features."

-  Solution:

A LightGBM model will be adjusted using Bayesian optimization with lib [optuna](https://optuna.readthedocs.io/en/stable/) (optimize hyperparameters and pre processing). The goal will be to maximize area under the ROC curve

<p align="right"><span style="color:firebrick">Dont forget to upvote if the notebook was useful! <i class="fas fa-hand-peace"></i></span> </p>

# Import dependencies

In [1]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier

from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna.integration import LightGBMPruningCallback

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import OneSidedSelection, NeighbourhoodCleaningRule, TomekLinks


pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 20)

# Prepare data

In [2]:
submission = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')
X_train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv', index_col=0)
X_test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv', index_col=0)

In [3]:
for col in X_train.columns[X_train.dtypes == "object"].tolist():
    X_train[col] = X_train[col].astype('category')
    
for col in X_test.columns[X_test.dtypes == "object"].tolist():
    X_test[col] = X_test[col].astype('category')

In [4]:
X = X_train.drop('target', axis=1)
y = X_train['target']

K = 5 # cross validation

fixedparams = {'random_state': 42,
               'n_estimators': 20000, 
               'learning_rate': 0.1, 
               'metric': 'auc', 
               'verbose':-1   
}

# Custom Functions

In [5]:
def model_instance(hyperparams, fixedparams):

    clf = LGBMClassifier(**hyperparams['clf'], **fixedparams) 
    
    if hyperparams['resample'] == 'random':
        resample = RandomUnderSampler(sampling_strategy='majority')
    else:
        resample = None
        
    if hyperparams['power'] == True:
        cont = [col for col in X_train.columns if 'cont' in col]
        numeric_transformer = PowerTransformer(method='yeo-johnson',
                                               standardize=True)

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, cont)])
    else:
        preprocessor = None
    
    pipe = Pipeline([('preprocessor', preprocessor),
                     ('resample', resample),
                     ('clf', clf) ])
    return pipe

In [6]:
def fit_with_stop(pipe, X, y, X_val, y_val, trial, hyperparams):
    
    if(trial != 0):
        pruning_callback = [LightGBMPruningCallback(trial, 'auc')]
    else: 
        pruning_callback = None
    
    if hyperparams['power'] == True:
        pipe_interim = pipe.named_steps.preprocessor.fit(X)
        X_val = pipe_interim.transform(X_val)
    
    pipe.fit(X, y,
              clf__eval_set=(X_val, y_val),
              clf__early_stopping_rounds=100,
              clf__verbose=0,
              clf__eval_metric="auc",
              clf__callbacks=pruning_callback)
    return pipe

In [7]:
def evaluate(model, X, y):

    yp = model.predict_proba(X)[:, 1]
    auc_score = roc_auc_score(y, yp)
    return auc_score

In [8]:
def kfold_prediction(X, y, X_test, k, hyperparams, fixedparams):

    yp = np.zeros(len(X_test))
    
    kf = StratifiedKFold(n_splits=k,random_state=42,shuffle=True)
    model = model_instance(hyperparams, fixedparams)
    
    for i, (train_idx, test_idx) in enumerate(kf.split(X, y)):
        print(f"\n FOLD {i} ...")
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[test_idx]
        y_val = y.iloc[test_idx]
        
        model_fit = fit_with_stop(model, X_train, y_train,
                                  X_val, y_val, 0, hyperparams)
        yp += model_fit.predict_proba(X_test)[:, 1] / k
    
    return yp

In [9]:
def objective(trial):
    
    global X, y, K, fixedparams

    hyperparams = {
        'resample': trial.suggest_categorical("resample", [None, 'random']),
        'power': trial.suggest_categorical("power", [True, False]),
        'clf':{
            'boosting_type': trial.suggest_categorical("boosting_type", ['gbdt']),
            'num_leaves': trial.suggest_int('num_leaves', 2, 512),
            'max_depth': trial.suggest_int('max_depth', 1, 16),
               
            'max_delta_step': trial.suggest_int('max_delta_step', 1, 10),
            ##'max_bin': trial.suggest_int('max_bin', 32, 255),
            ##'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 256),
            ##'min_data_in_bin': trial.suggest_int('min_data_in_bin', 1, 256),

            'reg_alpha': trial.suggest_float('reg_alpha', 1E-16, 25),
            'reg_lambda': trial.suggest_float('reg_lambda', 1E-16, 30),
            #'min_split_gain' : trial.suggest_discrete_uniform('min_split_gain', 0, 5, 0.01),
            
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
            #'subsample': trial.suggest_float('subsample ', 0.1, 1.0),
            
            'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 50.0),
            'cat_l2': trial.suggest_int('cat_l2', 1, 20)
        }

    }
    
    kf = StratifiedKFold(n_splits=K,random_state=42,shuffle=True)
    scores = []
    model = model_instance(hyperparams, fixedparams)
    
    for i, (train_idx, test_idx) in enumerate(kf.split(X, y)):
        
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[test_idx]
        y_val = y.iloc[test_idx]
        
        model_fit = fit_with_stop(model, X_train, y_train, X_val, y_val,
                                  trial, hyperparams)
        val_score = evaluate(model_fit, X_val, y_val)
        scores.append(val_score)
    
    return np.nanmean(scores)


# Model Tuning

In [10]:
study = optuna.create_study(direction='maximize',
                            pruner=optuna.pruners.HyperbandPruner())

[32m[I 2021-03-27 21:16:25,482][0m A new study created in memory with name: no-name-14360787-2887-4504-8350-cef3d6f31c5b[0m


In [11]:
%%time
import warnings
warnings.filterwarnings('ignore')

study.optimize(objective, timeout=60*5, n_jobs=-1,
               n_trials=None, gc_after_trial=False)

[32m[I 2021-03-27 21:20:45,571][0m Trial 2 finished with value: 0.7724344270768049 and parameters: {'resample': None, 'power': True, 'boosting_type': 'gbdt', 'num_leaves': 119, 'max_depth': 6, 'max_delta_step': 7, 'reg_alpha': 3.5075851542790195, 'reg_lambda': 1.9024185086803547, 'colsample_bytree': 0.11826799202431157, 'cat_smooth': 26.759114389536474, 'cat_l2': 18}. Best is trial 2 with value: 0.7724344270768049.[0m
[32m[I 2021-03-27 21:21:04,635][0m Trial 4 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-03-27 21:21:16,950][0m Trial 1 finished with value: 0.893234740663124 and parameters: {'resample': 'random', 'power': False, 'boosting_type': 'gbdt', 'num_leaves': 298, 'max_depth': 10, 'max_delta_step': 3, 'reg_alpha': 14.327957292783971, 'reg_lambda': 23.200043963671302, 'colsample_bytree': 0.6790959028718407, 'cat_smooth': 18.30707793408465, 'cat_l2': 2}. Best is trial 1 with value: 0.893234740663124.[0m
[32m[I 2021-03-27 21:21:32,561][0m Trial 0 finished with 

CPU times: user 24min 2s, sys: 3min 33s, total: 27min 35s
Wall time: 8min 26s


# Evaluate optimization

In [12]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_boosting_type,params_cat_l2,params_cat_smooth,params_colsample_bytree,params_max_delta_step,params_max_depth,params_num_leaves,params_power,params_reg_alpha,params_reg_lambda,params_resample,system_attrs_completed_rung_0,system_attrs_completed_rung_1,system_attrs_completed_rung_2,system_attrs_completed_rung_3,system_attrs_completed_rung_4,system_attrs_completed_rung_5,state
0,0,0.894651,2021-03-27 21:16:25.548888,2021-03-27 21:21:32.555752,0 days 00:05:07.006864,gbdt,1,24.230711,0.224911,5,7,128,False,16.567926,11.935292,random,0.894253,0.894253,0.894253,0.894253,,,COMPLETE
1,1,0.893235,2021-03-27 21:16:25.554817,2021-03-27 21:21:16.945180,0 days 00:04:51.390363,gbdt,2,18.307078,0.679096,3,10,298,False,14.327957,23.200044,random,0.892506,0.892506,0.892506,0.892506,0.892506,,COMPLETE
2,2,0.772434,2021-03-27 21:16:25.557857,2021-03-27 21:20:45.567262,0 days 00:04:20.009405,gbdt,18,26.759114,0.118268,7,6,119,True,3.507585,1.902419,,,,,,,,COMPLETE
3,3,0.827444,2021-03-27 21:16:25.562296,2021-03-27 21:24:51.856227,0 days 00:08:26.293931,gbdt,4,20.239738,0.716473,4,16,262,True,22.985367,3.487344,,0.827484,0.827484,0.827484,0.827484,0.827484,0.827484,COMPLETE
4,4,0.747909,2021-03-27 21:20:45.576162,2021-03-27 21:21:04.635542,0 days 00:00:19.059380,gbdt,7,31.043001,0.511393,7,16,274,True,22.128896,2.545227,random,0.747909,,,,,,PRUNED
5,5,0.89385,2021-03-27 21:21:04.639069,2021-03-27 21:23:44.465130,0 days 00:02:39.826061,gbdt,3,4.324984,0.552518,3,16,77,False,1.719211,7.389299,,0.857815,0.872579,0.881229,0.886671,0.891774,0.892381,COMPLETE
6,6,0.68257,2021-03-27 21:21:16.956398,2021-03-27 21:21:37.010583,0 days 00:00:20.054185,gbdt,12,12.866295,0.43123,4,2,347,True,12.881962,28.267302,,0.68257,,,,,,PRUNED


In [13]:
study.best_value

0.8946512024364497

In [14]:
plot_optimization_history(study)

In [15]:
optuna.visualization.plot_parallel_coordinate(study)

In [16]:
plot_param_importances(study)

In [17]:
study.best_params

{'resample': 'random',
 'power': False,
 'boosting_type': 'gbdt',
 'num_leaves': 128,
 'max_depth': 7,
 'max_delta_step': 5,
 'reg_alpha': 16.56792594426734,
 'reg_lambda': 11.935291522117284,
 'colsample_bytree': 0.22491131139722578,
 'cat_smooth': 24.230710626013394,
 'cat_l2': 1}

Update the best parameters after a long training

In [18]:
best_params = {'resample': None,
 'power': False,
 'boosting_type': 'gbdt',
 'num_leaves': 153,
 'max_depth': 14,
 'max_delta_step': 9,
 'reg_alpha': 14.206069641010822,
 'reg_lambda': 4.35151505977074,
 'colsample_bytree': 0.23599717695150987,
 'cat_smooth': 49.698724437071206,
 'cat_l2': 19}

# Prepare to submit

In [19]:
final_params = dict()
final_params['clf']=dict(best_params)

final_params['resample']=final_params['clf']['resample']
del final_params['clf']['resample']

final_params['power']=final_params['clf']['power']
del final_params['clf']['power']

fixedparams['learning_rate'] = 0.01

In [20]:
%%time

submission.loc[:, 'target'] = kfold_prediction(X, y, X_test, 10, 
                                               final_params, fixedparams)
submission.to_csv('submission.csv', index = False)


 FOLD 0 ...

 FOLD 1 ...

 FOLD 2 ...

 FOLD 3 ...

 FOLD 4 ...

 FOLD 5 ...

 FOLD 6 ...

 FOLD 7 ...

 FOLD 8 ...

 FOLD 9 ...
CPU times: user 2h 39min 2s, sys: 51.6 s, total: 2h 39min 53s
Wall time: 40min 56s


# References:

- <https://optuna.readthedocs.io/>
- <https://www.kaggle.com/rmiperrier/tps-mar-lgbm-optuna>
- <https://towardsdatascience.com/how-to-make-your-model-awesome-with-optuna-b56d490368af>
- <https://optuna.readthedocs.io/en/v1.0.0/tutorial/pruning.html>
- <https://www.kaggle.com/kst6690/dsb2019-tuning-lightgbm-parameter-using-optuna>