From https://www.kaggle.com/hiroshi0530/tpg-mar2021-optuna-xgb

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import optuna

import eli5
from eli5.sklearn import PermutationImportance

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-mar-2021/train.csv
/kaggle/input/tabular-playground-series-mar-2021/test.csv


In [69]:
pip install feature-engine

Note: you may need to restart the kernel to use updated packages.


In [70]:
from feature_engine import encoding

In [71]:
# Load the training table, using the first CSV column as the row index,
# then separate the feature columns from the 'target' label column.
df = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv', index_col=[0])
X_train = df.drop(columns='target')  # columns= already selects the axis; the extra axis=1 was redundant
y_train = df['target']

In [72]:
# Load the test features; the first CSV column becomes the row index.
X_test = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/test.csv',
    index_col=[0],
)

In [73]:
# Partition the training columns by dtype in a single pass:
# object-dtype columns are categorical, everything else is numeric.
vars_num, vars_cat = [], []
for column, column_dtype in X_train.dtypes.items():
    if column_dtype == 'O':
        vars_cat.append(column)
    else:
        vars_num.append(column)

### Pipeline for preprocessing with Feature-Engine

In [74]:
# Preprocessing pipeline for the categorical columns (vars_cat): a
# rare-label step followed by an ordinal-encoding step, both from Feature-Engine.
rare_label_step = encoding.RareLabelEncoder(
    tol=0.01,
    n_categories=4,
    variables=vars_cat,
)
ordinal_step = encoding.OrdinalEncoder(
    encoding_method='ordered',
    variables=vars_cat,
)

pre_pipe = Pipeline(steps=[
    ('encoder_rare_label', rare_label_step),
    ('categorical_encoder', ordinal_step),
])

In [75]:
# Fit the preprocessing steps on the training data only (X_test is not used here).
# y_train is supplied as well — presumably the 'ordered' ordinal encoding needs
# the target; confirm against the Feature-Engine docs.
pre_pipe.fit(X_train, y_train)


The number of unique categories for variable cat0 is less than that indicated in n_categories. Thus, all categories will be considered frequent


The number of unique categories for variable cat11 is less than that indicated in n_categories. Thus, all categories will be considered frequent


The number of unique categories for variable cat12 is less than that indicated in n_categories. Thus, all categories will be considered frequent


The number of unique categories for variable cat13 is less than that indicated in n_categories. Thus, all categories will be considered frequent


The number of unique categories for variable cat14 is less than that indicated in n_categories. Thus, all categories will be considered frequent


The number of unique categories for variable cat15 is less than that indicated in n_categories. Thus, all categories will be considered frequent


The number of unique categories for variable cat16 is less than that indicated in n_categories. Thus, all categories w

Pipeline(steps=[('encoder_rare_label',
                 RareLabelEncoder(n_categories=4, tol=0.01,
                                  variables=['cat0', 'cat1', 'cat2', 'cat3',
                                             'cat4', 'cat5', 'cat6', 'cat7',
                                             'cat8', 'cat9', 'cat10', 'cat11',
                                             'cat12', 'cat13', 'cat14', 'cat15',
                                             'cat16', 'cat17', 'cat18'])),
                ('categorical_encoder',
                 OrdinalEncoder(variables=['cat0', 'cat1', 'cat2', 'cat3',
                                           'cat4', 'cat5', 'cat6', 'cat7',
                                           'cat8', 'cat9', 'cat10', 'cat11',
                                           'cat12', 'cat13', 'cat14', 'cat15',
                                           'cat16', 'cat17', 'cat18']))])

In [76]:
# Apply the fitted pipeline to both frames. The names are rebound to the
# transformed output, so re-running this cell would pass already-transformed
# data back through the pipeline; re-run from the data-loading cells instead.
X_train = pre_pipe.transform(X_train)
X_test = pre_pipe.transform(X_test)

In [77]:
# Preview the first rows of the transformed training features.
X_train.head()

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,2,3,4,2,0,11,7,7,...,0.855349,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915
1,1,1,2,1,0,2,3,12,14,1,...,0.328929,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729
2,1,4,2,1,0,2,0,9,10,2,...,0.322749,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452
3,1,4,2,4,0,2,0,0,14,1,...,0.707663,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242
4,1,1,3,3,0,2,2,14,7,7,...,0.274514,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896


In [78]:
# Preview the first rows of the transformed test features.
X_test.head()

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,1,3,2,1,2,2,0,18,13,7,...,0.73569,0.578366,0.723154,0.228037,0.356227,0.551249,0.655693,0.598331,0.359987,0.947489
6,1,10,0,1,0,3,5,2,5,7,...,0.313703,0.928885,0.516602,0.600169,0.795224,0.248987,0.654614,0.347944,0.56552,0.38858
8,1,7,0,1,2,3,0,18,2,7,...,0.448201,0.424876,0.344729,0.242073,0.270632,0.74674,0.33559,0.341238,0.252289,0.411592
9,0,11,0,1,2,2,0,9,13,7,...,0.666092,0.598943,0.561971,0.806347,0.735983,0.538724,0.381566,0.48166,0.348514,0.325723
11,1,3,2,3,2,2,0,18,0,7,...,0.772229,0.479572,0.767745,0.252454,0.35481,0.17892,0.763479,0.562491,0.466261,0.585781


### Parameter Search with Optuna

In [79]:
def objective(trial, data=X_train, target=y_train):
    """Optuna objective: hold-out ROC AUC of one LightGBM configuration.

    A single stratified 80/20 shuffle split with a fixed seed is taken from
    ``data``/``target`` (so every trial is scored on the same rows). An
    ``LGBMClassifier`` built from the trial's suggested hyperparameters is
    fit on the 80% part with early stopping monitored on the 20% part.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameters via ``suggest_*`` calls.
    data : pandas.DataFrame
        Feature matrix (indexed positionally with ``.iloc``). Defaults to the
        notebook-level ``X_train`` as bound when this cell was executed.
    target : pandas.Series
        Labels aligned with ``data``. Defaults to the notebook-level ``y_train``.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the
        hold-out part.

    Side effects
    ------------
    Stores the fitted model's ``best_iteration_`` on the trial as the user
    attribute ``'best_iteration'``.
    """
    seed = 1234
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

    # n_splits=1, so take the only split directly instead of looping over it.
    fit_index, valid_index = next(splitter.split(data, target))
    # Local names deliberately differ from the notebook-level X_train / y_train.
    X_fit, y_fit = data.iloc[fit_index], target.iloc[fit_index]
    X_valid, y_valid = data.iloc[valid_index], target.iloc[valid_index]

    lgbm_params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.0, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 170, 250),
        'min_child_samples': trial.suggest_int('min_child_samples', 40, 60),
        'max_depth': trial.suggest_int('max_depth', 15, 25),
        # Single-choice categorical: the value is fixed, but suggesting it
        # keeps 'learning_rate' in trial.params alongside the tuned values.
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 3000, 4500),
        'random_state': seed,
        'boosting_type': 'gbdt',
        'metric': 'AUC',
        # 'device': 'gpu'
    }

    model = LGBMClassifier(**lgbm_params)

    # NOTE(review): passing early_stopping_rounds / verbose directly to fit()
    # is reportedly deprecated in newer LightGBM releases in favour of
    # callbacks — confirm against the installed version before upgrading.
    model.fit(
        X_fit,
        y_fit,
        early_stopping_rounds=100,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
    )

    # With early stopping, the sampled n_estimators is only an upper bound.
    # Record the iteration the model actually settled on so it can be
    # inspected after the study (study.best_trial.user_attrs).
    trial.set_user_attr('best_iteration', model.best_iteration_)

    y_valid_pred = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, y_valid_pred)

In [80]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across Restart & Run All. TPESampler is Optuna's default sampler, so the
# search algorithm is unchanged; 1234 matches the seed used inside objective().
# The run recorded below took roughly 1.5-2 minutes per trial.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=1234),
)
study.optimize(objective, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

[32m[I 2021-03-29 06:58:30,383][0m A new study created in memory with name: no-name-6fb35f61-c786-4ffe-9a1e-c3ffddbb18d2[0m
[32m[I 2021-03-29 07:00:28,942][0m Trial 0 finished with value: 0.8953480509171011 and parameters: {'reg_alpha': 2.5558283981346093, 'reg_lambda': 5.876354308360597, 'num_leaves': 209, 'min_child_samples': 58, 'max_depth': 16, 'learning_rate': 0.01, 'colsample_bytree': 0.19945592916623206, 'n_estimators': 3111}. Best is trial 0 with value: 0.8953480509171011.[0m
[32m[I 2021-03-29 07:02:04,377][0m Trial 1 finished with value: 0.8954708916888463 and parameters: {'reg_alpha': 3.655049284172982, 'reg_lambda': 7.297303023731574, 'num_leaves': 236, 'min_child_samples': 41, 'max_depth': 23, 'learning_rate': 0.01, 'colsample_bytree': 0.1550340381097072, 'n_estimators': 3410}. Best is trial 1 with value: 0.8954708916888463.[0m
[32m[I 2021-03-29 07:03:55,220][0m Trial 2 finished with value: 0.8953729066752406 and parameters: {'reg_alpha': 1.8583870260273687, 'reg

[32m[I 2021-03-29 07:38:26,150][0m Trial 22 finished with value: 0.8954591578536457 and parameters: {'reg_alpha': 3.3605929159143946, 'reg_lambda': 7.040869237563666, 'num_leaves': 250, 'min_child_samples': 47, 'max_depth': 17, 'learning_rate': 0.01, 'colsample_bytree': 0.22432281290000247, 'n_estimators': 4289}. Best is trial 13 with value: 0.8956136607212497.[0m
[32m[I 2021-03-29 07:40:14,715][0m Trial 23 finished with value: 0.8955801611567297 and parameters: {'reg_alpha': 4.301407470517228, 'reg_lambda': 9.01643339097155, 'num_leaves': 246, 'min_child_samples': 43, 'max_depth': 20, 'learning_rate': 0.01, 'colsample_bytree': 0.2516719715803071, 'n_estimators': 4085}. Best is trial 13 with value: 0.8956136607212497.[0m
[32m[I 2021-03-29 07:41:58,125][0m Trial 24 finished with value: 0.8954803187019773 and parameters: {'reg_alpha': 4.355311511473838, 'reg_lambda': 8.89207471116117, 'num_leaves': 234, 'min_child_samples': 40, 'max_depth': 20, 'learning_rate': 0.01, 'colsample_b

[32m[I 2021-03-29 08:18:19,952][0m Trial 45 finished with value: 0.8953294294276335 and parameters: {'reg_alpha': 4.624989377687385, 'reg_lambda': 4.978724297532311, 'num_leaves': 230, 'min_child_samples': 43, 'max_depth': 20, 'learning_rate': 0.01, 'colsample_bytree': 0.2147720416212161, 'n_estimators': 3981}. Best is trial 43 with value: 0.8956179762047123.[0m
[32m[I 2021-03-29 08:19:55,725][0m Trial 46 finished with value: 0.8955697055605978 and parameters: {'reg_alpha': 4.042685176959891, 'reg_lambda': 5.476517282309119, 'num_leaves': 218, 'min_child_samples': 44, 'max_depth': 24, 'learning_rate': 0.01, 'colsample_bytree': 0.23645247919387508, 'n_estimators': 4139}. Best is trial 43 with value: 0.8956179762047123.[0m
[32m[I 2021-03-29 08:21:41,656][0m Trial 47 finished with value: 0.8955127854036784 and parameters: {'reg_alpha': 4.2911977564102335, 'reg_lambda': 6.519786884651377, 'num_leaves': 236, 'min_child_samples': 42, 'max_depth': 18, 'learning_rate': 0.01, 'colsample

[32m[I 2021-03-29 08:57:48,046][0m Trial 68 finished with value: 0.8953025336227662 and parameters: {'reg_alpha': 4.140893473318378, 'reg_lambda': 7.875835609899973, 'num_leaves': 245, 'min_child_samples': 47, 'max_depth': 16, 'learning_rate': 0.01, 'colsample_bytree': 0.20360340922558146, 'n_estimators': 4438}. Best is trial 43 with value: 0.8956179762047123.[0m
[32m[I 2021-03-29 08:59:29,076][0m Trial 69 finished with value: 0.8954874232003724 and parameters: {'reg_alpha': 4.399477169130617, 'reg_lambda': 8.667518880684884, 'num_leaves': 213, 'min_child_samples': 43, 'max_depth': 17, 'learning_rate': 0.01, 'colsample_bytree': 0.2767204576874638, 'n_estimators': 4335}. Best is trial 43 with value: 0.8956179762047123.[0m
[32m[I 2021-03-29 09:01:05,589][0m Trial 70 finished with value: 0.8955462963810472 and parameters: {'reg_alpha': 3.5071548987087406, 'reg_lambda': 8.06124414742173, 'num_leaves': 243, 'min_child_samples': 59, 'max_depth': 22, 'learning_rate': 0.01, 'colsample_

[32m[I 2021-03-29 09:38:13,681][0m Trial 91 finished with value: 0.8955869931733588 and parameters: {'reg_alpha': 3.3889938675043143, 'reg_lambda': 6.775521751424093, 'num_leaves': 248, 'min_child_samples': 46, 'max_depth': 18, 'learning_rate': 0.01, 'colsample_bytree': 0.25258715274141064, 'n_estimators': 4215}. Best is trial 43 with value: 0.8956179762047123.[0m
[32m[I 2021-03-29 09:39:50,870][0m Trial 92 finished with value: 0.8955043070836061 and parameters: {'reg_alpha': 3.257462999189488, 'reg_lambda': 6.752972518505805, 'num_leaves': 242, 'min_child_samples': 46, 'max_depth': 18, 'learning_rate': 0.01, 'colsample_bytree': 0.24350876133372204, 'n_estimators': 4163}. Best is trial 43 with value: 0.8956179762047123.[0m
[32m[I 2021-03-29 09:41:30,215][0m Trial 93 finished with value: 0.8955451593759828 and parameters: {'reg_alpha': 3.346101328258331, 'reg_lambda': 6.03904960113083, 'num_leaves': 247, 'min_child_samples': 44, 'max_depth': 23, 'learning_rate': 0.01, 'colsample

Number of finished trials: 100
Best trial: {'reg_alpha': 3.769558545590637, 'reg_lambda': 6.6901802039767615, 'num_leaves': 237, 'min_child_samples': 45, 'max_depth': 18, 'learning_rate': 0.01, 'colsample_bytree': 0.2585120134872498, 'n_estimators': 4169}
Best value: 0.8956179762047123


In [81]:
# Plot the study's optimization history (rendered figure is not captured in this export).
optuna.visualization.plot_optimization_history(study)

In [82]:
# Plot the hyperparameter importances estimated from the study (rendered figure is not captured in this export).
optuna.visualization.plot_param_importances(study)

In [83]:
seed = 1234

# Final LightGBM settings: the best trial's sampled hyperparameters followed
# by the fixed, non-searched options, assembled in a single expression.
# NOTE(review): 'n_estimators' here is the upper bound sampled for the best
# trial; the objective trained with early stopping, so the number of trees it
# actually kept may be lower — confirm this is the intended value for the refit.
paramsLGBM = {
    **study.best_trial.params,
    'boosting_type': 'gbdt',
    'random_state': seed,
    # 'device': 'gpu'
}

In [84]:
# Display the parameter dict that is passed to LGBMClassifier for the final fit.
paramsLGBM

{'reg_alpha': 3.769558545590637,
 'reg_lambda': 6.6901802039767615,
 'num_leaves': 237,
 'min_child_samples': 45,
 'max_depth': 18,
 'learning_rate': 0.01,
 'colsample_bytree': 0.2585120134872498,
 'n_estimators': 4169,
 'boosting_type': 'gbdt',
 'random_state': 1234}

In [85]:
# Refit on the full training set with the selected hyperparameters.
# No eval_set is passed here, so this fit is not early-stopped.
model = LGBMClassifier(**paramsLGBM)

# Test-set probabilities are computed once, in the next cell; the identical
# predict_proba call that previously ended this cell was redundant.
# The trailing ';' keeps the notebook from echoing the fitted estimator.
model.fit(X_train, y_train);

In [86]:
# Predicted probability of the positive class (second predict_proba column) for every test row.
y_test_pred = model.predict_proba(X_test)[:,1]

### Submission

In [87]:
# Build the submission table: the test index becomes a regular column next to
# the predicted probabilities, which are named 'target' at construction time.
predictions = pd.DataFrame({'target': y_test_pred}, index=X_test.index)
sub = predictions.reset_index()
sub.to_csv('optuna_final.csv', index=False)

In [88]:
# Display the submission frame that was written to 'optuna_final.csv'.
sub

Unnamed: 0,id,target
0,5,0.086834
1,6,0.466333
2,8,0.008820
3,9,0.310681
4,11,0.090570
...,...,...
199995,499983,0.902877
199996,499984,0.036692
199997,499987,0.742144
199998,499994,0.175666
