In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from imblearn.pipeline import Pipeline as imb_pipeline
from category_encoders.target_encoder import TargetEncoder

import lightgbm as lgb
import xgboost as xgb
import optuna
from optuna.integration import LightGBMPruningCallback

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')
# Turn off pandas' chained-assignment (SettingWithCopy) check.
pd.options.mode.chained_assignment = None

%matplotlib inline

  from pandas import MultiIndex, Int64Index


# Read Data

In [2]:
# Load the training frame from a local pickle.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
train_pickle_path = './data/train_file_eng.pkl'
train_data = pd.read_pickle(train_pickle_path)

In [3]:
# Peek at the first row to check the loaded columns and value formats.
train_data.iloc[:1]

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,y,y_encoded,quarter
0,49,blue-collar,married,basic.9y,unknown,no,no,cellular,nov,wed,227,4,0,nonexistent,no,0,q4


In [4]:
# List the column labels of the loaded frame.
train_data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'previous',
       'poutcome', 'y', 'y_encoded', 'quarter'],
      dtype='object')

In [5]:
# Model inputs, in the column order handed to the estimators.
feature_set = [
    'age', 'job', 'marital', 'education', 'default',
    'housing', 'loan', 'contact', 'month', 'day_of_week',
    'duration', 'campaign', 'previous', 'poutcome', 'quarter',
]

# Subset of feature_set cast to pandas 'category' dtype and encoded/resampled as categorical.
cat_features = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'poutcome', 'quarter',
]

# Subset of feature_set passed to the models unchanged.
num_features = [
    'age', 'duration', 'campaign', 'previous',
]

In [6]:
# Cast every categorical feature to pandas' 'category' dtype in a single astype call.
train_data = train_data.astype({cat_col: 'category' for cat_col in cat_features})

# Train Test Split

In [7]:
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_data[feature_set],
    train_data['y_encoded'],
    test_size=0.2,
    random_state=24,
)

In [8]:
# Give every split a fresh 0..n-1 index, discarding the row labels inherited from train_data.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [9]:
# (rows, columns) of the training and hold-out feature frames.
tuple(split_part.shape for split_part in (X_train, X_test))

((26328, 15), (6582, 15))

# Hyperparameter Tuning with Optuna

## LightGBM

In [10]:
def lgbm_objective(trial, X, y, oversampling=None, sampling_strategy=None):
    """Optuna objective: mean 5-fold cross-validated ROC AUC of a LightGBM classifier.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyperparameters below and handed to the pruning callback.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series or array-like
        Binary target aligned positionally with ``X``.
    oversampling : {'random', 'smote', 'class_weight', None}, default None
        Imbalance handling applied to each training fold: ``'random'`` uses
        ``RandomOverSampler``, ``'smote'`` uses ``SMOTENC``, ``'class_weight'`` sets
        ``class_weight='balanced'`` on the model. Any other value trains on the
        untouched fold.
    sampling_strategy : optional
        Forwarded to the resampler; ``None`` falls back to ``'auto'``.

    Returns
    -------
    float
        Mean of the per-fold ROC AUC scores.

    Notes
    -----
    Only the training part of each fold is resampled; the validation part is scored
    as-is. The ``'smote'`` branch reads ``cat_features`` from module scope.
    """
    # Search space for LightGBM.
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 80),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 200, step=20),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 10),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 10),
        #"min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "subsample": trial.suggest_float("subsample", 0.5, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1),
    }

    # The resamplers cannot work with sampling_strategy=None, so use imblearn's default instead.
    resampling_strategy = 'auto' if sampling_strategy is None else sampling_strategy

    # The sampler does not depend on the fold, so build it once. Both samplers are seeded
    # so that repeated trials see the same synthetic/duplicated rows.
    if oversampling == 'random':
        sampler = RandomOverSampler(sampling_strategy=resampling_strategy, random_state=24)
    elif oversampling == 'smote':
        # SMOTENC needs the positions of the categorical columns.
        categorical_features = np.array(
            [idx for idx, col_name in enumerate(X.columns) if col_name in cat_features]
        )
        sampler = SMOTENC(categorical_features=categorical_features,
                          sampling_strategy=resampling_strategy,
                          random_state=24)
    else:
        sampler = None

    # cv.split yields positions, so index y positionally as well (X already uses .iloc);
    # plain y[idx] on a Series is label-based and only works with a fresh 0..n-1 index.
    y_by_position = y.iloc if hasattr(y, "iloc") else y

    # Cross validation with StratifiedKFold.
    scores = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
    for train_idx, test_idx in cv.split(X, y):
        X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
        y_train_fold, y_test_fold = y_by_position[train_idx], y_by_position[test_idx]

        if sampler is None:
            X_train_fold2, y_train_fold2 = X_train_fold, y_train_fold
        else:
            X_train_fold2, y_train_fold2 = sampler.fit_resample(X_train_fold, y_train_fold)

        model = lgb.LGBMClassifier(objective="binary",
                                   subsample_freq=1,
                                   verbosity=-1,
                                   random_state=24,
                                   n_jobs=4, **param_grid)

        if oversampling == 'class_weight':
            model.set_params(class_weight='balanced')

        # NOTE(review): the validation fold drives early stopping and is also the fold that
        # is scored below, so the reported AUC is presumably slightly optimistic.
        # NOTE(review): every fold reports to the same trial at the same step numbers; Optuna
        # keeps only the first value reported for a step, so pruning presumably reflects the
        # first fold only -- confirm.
        # NOTE(review): `early_stopping_rounds` and `verbose` are fit() arguments of the
        # LightGBM 3.x API; they need to move into callbacks when upgrading -- confirm version.
        model.fit(
            X_train_fold2,
            y_train_fold2,
            eval_set=[(X_test_fold, y_test_fold)],
            eval_metric="auc",
            early_stopping_rounds=100,
            callbacks=[LightGBMPruningCallback(trial, "auc")],
            verbose=-1
        )

        # Column 1 of predict_proba holds the positive-class probability.
        y_preds_fold = model.predict_proba(X_test_fold)[:, 1]
        scores.append(roc_auc_score(y_test_fold, y_preds_fold))

    return np.mean(scores)

### SMOTENC

In [22]:
# LightGBM tuned on SMOTENC-resampled training folds (30 trials).
# NOTE(review): the saved log for this cell lists parameters named lambda_l1 / lambda_l2 /
# bagging_fraction / feature_fraction, which lgbm_objective does not suggest; the output
# looks like it comes from an earlier version of the objective -- re-run to refresh it.
study = optuna.create_study(study_name="LGBM", direction="maximize")


def func(trial):
    """Score one LightGBM trial with oversampling='smote'."""
    return lgbm_objective(trial, X_train, y_train, oversampling='smote', sampling_strategy='auto')


study.optimize(func, n_trials=30)

[32m[I 2022-04-05 23:02:59,091][0m A new study created in memory with name: LGBM[0m
[32m[I 2022-04-05 23:03:14,860][0m Trial 0 finished with value: 0.9128077316698056 and parameters: {'n_estimators': 800, 'learning_rate': 0.1058184329107983, 'num_leaves': 35, 'max_depth': 7, 'min_child_samples': 180, 'lambda_l1': 4, 'lambda_l2': 2, 'bagging_fraction': 0.7582908996248414, 'feature_fraction': 0.9093827675566161}. Best is trial 0 with value: 0.9128077316698056.[0m
[32m[I 2022-04-05 23:03:30,137][0m Trial 1 finished with value: 0.9126150842926286 and parameters: {'n_estimators': 850, 'learning_rate': 0.2070726960639668, 'num_leaves': 78, 'max_depth': 9, 'min_child_samples': 80, 'lambda_l1': 2, 'lambda_l2': 2, 'bagging_fraction': 0.8629888312976537, 'feature_fraction': 0.659007558688024}. Best is trial 0 with value: 0.9128077316698056.[0m
[32m[I 2022-04-05 23:03:45,208][0m Trial 2 finished with value: 0.9099178759652553 and parameters: {'n_estimators': 800, 'learning_rate': 0.125

### Random Over Sampling

In [11]:
# LightGBM tuned on randomly oversampled training folds (30 trials).
study2 = optuna.create_study(study_name="LGBM2", direction="maximize")


def func(trial):
    """Score one LightGBM trial with oversampling='random'."""
    return lgbm_objective(trial, X_train, y_train, oversampling='random', sampling_strategy='auto')


study2.optimize(func, n_trials=30)

[32m[I 2022-04-06 00:33:24,634][0m A new study created in memory with name: LGBM2[0m
[32m[I 2022-04-06 00:33:28,318][0m Trial 0 finished with value: 0.9306547093600928 and parameters: {'n_estimators': 900, 'learning_rate': 0.023392417406422665, 'num_leaves': 40, 'max_depth': 7, 'min_child_samples': 140, 'reg_alpha': 4, 'reg_lambda': 2, 'subsample': 0.9598618663408268, 'colsample_bytree': 0.7843708215804357}. Best is trial 0 with value: 0.9306547093600928.[0m
[32m[I 2022-04-06 00:33:30,811][0m Trial 1 finished with value: 0.9276731600134343 and parameters: {'n_estimators': 400, 'learning_rate': 0.4849014641515901, 'num_leaves': 68, 'max_depth': 2, 'min_child_samples': 140, 'reg_alpha': 10, 'reg_lambda': 10, 'subsample': 0.7565852858741953, 'colsample_bytree': 0.5405909863633254}. Best is trial 0 with value: 0.9306547093600928.[0m
[32m[I 2022-04-06 00:33:34,067][0m Trial 2 finished with value: 0.9302920479376608 and parameters: {'n_estimators': 600, 'learning_rate': 0.04703333

### Class Weight Hyperparameter

In [12]:
# LightGBM tuned with class_weight='balanced' instead of resampling (30 trials).
study3 = optuna.create_study(study_name="LGBM3", direction="maximize")


def func(trial):
    """Score one LightGBM trial with oversampling='class_weight'."""
    return lgbm_objective(trial, X_train, y_train, oversampling='class_weight', sampling_strategy=None)


study3.optimize(func, n_trials=30)

[32m[I 2022-04-06 00:33:53,731][0m A new study created in memory with name: LGBM3[0m
[32m[I 2022-04-06 00:33:55,445][0m Trial 0 finished with value: 0.9270814700853041 and parameters: {'n_estimators': 200, 'learning_rate': 0.010909359392112279, 'num_leaves': 35, 'max_depth': 9, 'min_child_samples': 60, 'reg_alpha': 0, 'reg_lambda': 4, 'subsample': 0.5163220427230086, 'colsample_bytree': 0.9583240118432258}. Best is trial 0 with value: 0.9270814700853041.[0m
[32m[I 2022-04-06 00:34:01,647][0m Trial 1 finished with value: 0.9306633700636073 and parameters: {'n_estimators': 800, 'learning_rate': 0.010264924032798212, 'num_leaves': 36, 'max_depth': 10, 'min_child_samples': 20, 'reg_alpha': 5, 'reg_lambda': 5, 'subsample': 0.7299734822446494, 'colsample_bytree': 0.7839975374619017}. Best is trial 1 with value: 0.9306633700636073.[0m
[32m[I 2022-04-06 00:34:03,301][0m Trial 2 finished with value: 0.9284524643675857 and parameters: {'n_estimators': 800, 'learning_rate': 0.241254625

### No Oversampling

In [13]:
# LightGBM baseline without any imbalance handling (30 trials).
# The study name was a copy-paste duplicate of study3's "LGBM3"; it now follows the
# LGBM, LGBM2, LGBM3 sequence so the study and its log lines are identifiable.
study4 = optuna.create_study(direction="maximize", study_name="LGBM4")
func = lambda trial: lgbm_objective(trial, X_train, y_train, oversampling=None, sampling_strategy=None)
study4.optimize(func, n_trials=30)

[32m[I 2022-04-06 00:34:10,786][0m A new study created in memory with name: LGBM3[0m
[32m[I 2022-04-06 00:34:11,977][0m Trial 0 finished with value: 0.9292895357302416 and parameters: {'n_estimators': 200, 'learning_rate': 0.07611033527684935, 'num_leaves': 13, 'max_depth': 4, 'min_child_samples': 200, 'reg_alpha': 9, 'reg_lambda': 2, 'subsample': 0.9193212790298293, 'colsample_bytree': 0.6384647036359363}. Best is trial 0 with value: 0.9292895357302416.[0m
[32m[I 2022-04-06 00:34:13,159][0m Trial 1 finished with value: 0.9295533584772588 and parameters: {'n_estimators': 950, 'learning_rate': 0.21585267618398365, 'num_leaves': 49, 'max_depth': 10, 'min_child_samples': 40, 'reg_alpha': 8, 'reg_lambda': 3, 'subsample': 0.7676992847745248, 'colsample_bytree': 0.6203806515495385}. Best is trial 1 with value: 0.9295533584772588.[0m
[32m[I 2022-04-06 00:34:15,602][0m Trial 2 finished with value: 0.9272168710307586 and parameters: {'n_estimators': 350, 'learning_rate': 0.0103061352

## XGBoost

In [14]:
def xgb_objective(trial, X, y, oversampling=None, sampling_strategy=None, encoding='ohe'):
    """Optuna objective: mean 5-fold cross-validated ROC AUC of an XGBoost pipeline.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyperparameters below.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series or array-like
        Binary target aligned positionally with ``X``.
    oversampling : {'random', 'smote', 'scale_pos_weight', None}, default None
        Imbalance handling applied to each training fold: ``'random'`` uses
        ``RandomOverSampler``, ``'smote'`` uses ``SMOTENC``, ``'scale_pos_weight'`` sets
        the model's ``scale_pos_weight`` to the negative/positive ratio of the fold.
        Any other value trains on the untouched fold.
    sampling_strategy : optional
        Forwarded to the resampler; ``None`` falls back to ``'auto'``.
    encoding : {'ohe', 'te'}, default 'ohe'
        Encoder for the categorical columns: one-hot or target encoding.

    Returns
    -------
    float
        Mean of the per-fold ROC AUC scores.

    Raises
    ------
    ValueError
        If ``encoding`` is neither ``'ohe'`` nor ``'te'``.

    Notes
    -----
    Reads ``cat_features`` and ``num_features`` from module scope. The encoder is fitted
    inside the pipeline on each (resampled) training fold only.
    """
    # Fail early: an unknown encoding used to leave the categorical transformer undefined.
    if encoding not in ('ohe', 'te'):
        raise ValueError(f"encoding must be 'ohe' or 'te', got {encoding!r}")

    # Search space for XGBoost.
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        #"num_leaves": trial.suggest_int("num_leaves", 2, 80),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 20, 200, step=20),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 10),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 10),
        #"min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "subsample": trial.suggest_float("subsample", 0.5, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1),
    }

    # The resamplers cannot work with sampling_strategy=None, so use imblearn's default instead.
    resampling_strategy = 'auto' if sampling_strategy is None else sampling_strategy

    # The sampler does not depend on the fold, so build it once. Both samplers are seeded
    # so that repeated trials see the same synthetic/duplicated rows.
    if oversampling == 'random':
        sampler = RandomOverSampler(sampling_strategy=resampling_strategy, random_state=24)
    elif oversampling == 'smote':
        # SMOTENC needs the positions of the categorical columns.
        categorical_features = np.array(
            [idx for idx, col_name in enumerate(X.columns) if col_name in cat_features]
        )
        sampler = SMOTENC(categorical_features=categorical_features,
                          sampling_strategy=resampling_strategy,
                          random_state=24)
    else:
        sampler = None

    # cv.split yields positions, so index y positionally as well (X already uses .iloc);
    # plain y[idx] on a Series is label-based and only works with a fresh 0..n-1 index.
    y_by_position = y.iloc if hasattr(y, "iloc") else y

    # Cross validation with StratifiedKFold.
    scores = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
    for train_idx, test_idx in cv.split(X, y):
        X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
        y_train_fold, y_test_fold = y_by_position[train_idx], y_by_position[test_idx]

        if sampler is None:
            X_train_fold2, y_train_fold2 = X_train_fold, y_train_fold
        else:
            X_train_fold2, y_train_fold2 = sampler.fit_resample(X_train_fold, y_train_fold)

        # A fresh encoder per fold keeps fitted state from leaking between folds.
        if encoding == 'ohe':
            cat_transformer = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])
        else:  # encoding == 'te'
            cat_transformer = Pipeline(steps=[('encoder', TargetEncoder(min_samples_leaf=100, smoothing=5))])

        # Numeric columns go through unchanged (FunctionTransformer with func=None is the identity).
        num_transformer = Pipeline(steps=[('identity', FunctionTransformer(func=None))])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num_trans', num_transformer, num_features),
                ('cat_trans', cat_transformer, cat_features),
            ],
            remainder='drop',
        )

        xgb_model = xgb.XGBClassifier(objective="binary:logistic",
                                      verbosity=0,
                                      random_state=24,
                                      tree_method="hist",
                                      enable_categorical=False,
                                      n_jobs=4, **param_grid)

        if oversampling == 'scale_pos_weight':
            # Weight positives by the negative/positive count ratio of this training fold.
            ratio = np.sum(y_train_fold2 == 0) / np.sum(y_train_fold2 == 1)
            xgb_model.set_params(scale_pos_weight=ratio)

        clf = Pipeline(
            steps=[("preprocessor", preprocessor), ("classifier", xgb_model)]
        )

        clf.fit(X_train_fold2, y_train_fold2)

        # Column 1 of predict_proba holds the positive-class probability.
        y_preds_fold = clf.predict_proba(X_test_fold)[:, 1]
        scores.append(roc_auc_score(y_test_fold, y_preds_fold))

    return np.mean(scores)

### SMOTENC

### Random Over Sampling

In [15]:
# XGBoost with random oversampling and one-hot encoded categoricals (20 trials).
study5 = optuna.create_study(study_name="XGB2", direction="maximize")


def func(trial):
    """Score one XGBoost trial with oversampling='random' and encoding='ohe'."""
    return xgb_objective(trial, X_train, y_train, oversampling='random', sampling_strategy='auto', encoding='ohe')


study5.optimize(func, n_trials=20)

[32m[I 2022-04-06 00:34:27,324][0m A new study created in memory with name: XGB2[0m
[32m[I 2022-04-06 00:34:31,260][0m Trial 0 finished with value: 0.9289038404714443 and parameters: {'n_estimators': 350, 'learning_rate': 0.011303177735062907, 'max_depth': 8, 'min_child_weight': 20, 'reg_alpha': 10, 'reg_lambda': 0, 'subsample': 0.5028423488175245, 'colsample_bytree': 0.5567674001059271}. Best is trial 0 with value: 0.9289038404714443.[0m
[32m[I 2022-04-06 00:34:34,987][0m Trial 1 finished with value: 0.9250272030687885 and parameters: {'n_estimators': 500, 'learning_rate': 0.013603904162922698, 'max_depth': 4, 'min_child_weight': 40, 'reg_alpha': 0, 'reg_lambda': 8, 'subsample': 0.5617899294456232, 'colsample_bytree': 0.6029584233546259}. Best is trial 0 with value: 0.9289038404714443.[0m
[32m[I 2022-04-06 00:34:38,816][0m Trial 2 finished with value: 0.9216338060882684 and parameters: {'n_estimators': 850, 'learning_rate': 0.032542110210141445, 'max_depth': 2, 'min_child_w

In [16]:
# XGBoost with random oversampling and target-encoded categoricals (20 trials).
study6 = optuna.create_study(study_name="XGB3", direction="maximize")


def func(trial):
    """Score one XGBoost trial with oversampling='random' and encoding='te'."""
    return xgb_objective(trial, X_train, y_train, oversampling='random', sampling_strategy='auto', encoding='te')


study6.optimize(func, n_trials=20)

[32m[I 2022-04-06 00:36:00,317][0m A new study created in memory with name: XGB3[0m
[32m[I 2022-04-06 00:36:06,072][0m Trial 0 finished with value: 0.927096244269487 and parameters: {'n_estimators': 700, 'learning_rate': 0.05919113726546405, 'max_depth': 9, 'min_child_weight': 160, 'reg_alpha': 7, 'reg_lambda': 2, 'subsample': 0.6680632203954948, 'colsample_bytree': 0.8207287327475029}. Best is trial 0 with value: 0.927096244269487.[0m
[32m[I 2022-04-06 00:36:13,298][0m Trial 1 finished with value: 0.9303721874731903 and parameters: {'n_estimators': 700, 'learning_rate': 0.018653841927448185, 'max_depth': 7, 'min_child_weight': 20, 'reg_alpha': 4, 'reg_lambda': 1, 'subsample': 0.523951982199114, 'colsample_bytree': 0.6908459205593644}. Best is trial 1 with value: 0.9303721874731903.[0m
[32m[I 2022-04-06 00:36:19,618][0m Trial 2 finished with value: 0.9223304597178986 and parameters: {'n_estimators': 850, 'learning_rate': 0.2785353986807618, 'max_depth': 7, 'min_child_weight'

### Scale Pos Weight Hyperparameter

In [17]:
# XGBoost with scale_pos_weight and one-hot encoded categoricals (20 trials).
study7 = optuna.create_study(study_name="XGB4", direction="maximize")


def func(trial):
    """Score one XGBoost trial with oversampling='scale_pos_weight' and encoding='ohe'."""
    return xgb_objective(trial, X_train, y_train, oversampling='scale_pos_weight', sampling_strategy='auto', encoding='ohe')


study7.optimize(func, n_trials=20)

[32m[I 2022-04-06 00:37:55,514][0m A new study created in memory with name: XGB4[0m
[32m[I 2022-04-06 00:37:59,210][0m Trial 0 finished with value: 0.9115083964495767 and parameters: {'n_estimators': 700, 'learning_rate': 0.3891090105687897, 'max_depth': 5, 'min_child_weight': 20, 'reg_alpha': 8, 'reg_lambda': 2, 'subsample': 0.7386163373160726, 'colsample_bytree': 0.6991921652858346}. Best is trial 0 with value: 0.9115083964495767.[0m
[32m[I 2022-04-06 00:38:01,365][0m Trial 1 finished with value: 0.9259937009273591 and parameters: {'n_estimators': 600, 'learning_rate': 0.19924876332861402, 'max_depth': 3, 'min_child_weight': 180, 'reg_alpha': 0, 'reg_lambda': 3, 'subsample': 0.9353376275087812, 'colsample_bytree': 0.5527333575722869}. Best is trial 1 with value: 0.9259937009273591.[0m
[32m[I 2022-04-06 00:38:05,699][0m Trial 2 finished with value: 0.9167242013084598 and parameters: {'n_estimators': 650, 'learning_rate': 0.22855225668152696, 'max_depth': 9, 'min_child_weigh

In [18]:
# XGBoost with scale_pos_weight and target-encoded categoricals (20 trials).
study8 = optuna.create_study(study_name="XGB5", direction="maximize")


def func(trial):
    """Score one XGBoost trial with oversampling='scale_pos_weight' and encoding='te'."""
    return xgb_objective(trial, X_train, y_train, oversampling='scale_pos_weight', sampling_strategy='auto', encoding='te')


study8.optimize(func, n_trials=20)

[32m[I 2022-04-06 00:39:15,068][0m A new study created in memory with name: XGB5[0m
[32m[I 2022-04-06 00:39:17,369][0m Trial 0 finished with value: 0.9234098665302138 and parameters: {'n_estimators': 300, 'learning_rate': 0.22449649696630422, 'max_depth': 10, 'min_child_weight': 140, 'reg_alpha': 4, 'reg_lambda': 9, 'subsample': 0.7711324009330565, 'colsample_bytree': 0.8538418687409699}. Best is trial 0 with value: 0.9234098665302138.[0m
[32m[I 2022-04-06 00:39:22,181][0m Trial 1 finished with value: 0.9291800978150102 and parameters: {'n_estimators': 800, 'learning_rate': 0.017238585114526733, 'max_depth': 9, 'min_child_weight': 120, 'reg_alpha': 7, 'reg_lambda': 0, 'subsample': 0.8407971711745343, 'colsample_bytree': 0.5572397143786367}. Best is trial 1 with value: 0.9291800978150102.[0m
[32m[I 2022-04-06 00:39:26,272][0m Trial 2 finished with value: 0.9264455867917667 and parameters: {'n_estimators': 550, 'learning_rate': 0.05489470969045221, 'max_depth': 8, 'min_child_w

In [19]:
# Five best study2 trials (LightGBM, random oversampling), highest objective value first.
(
    study2.trials_dataframe()
    .sort_values(by='value', ascending=False)
    .head(5)
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_samples,params_n_estimators,params_num_leaves,params_reg_alpha,params_reg_lambda,params_subsample,state
9,9,0.930788,2022-04-06 00:33:47.548073,2022-04-06 00:33:48.961537,0 days 00:00:01.413464,0.702969,0.113336,7,80,100,19,4,0,0.517086,COMPLETE
0,0,0.930655,2022-04-06 00:33:24.635896,2022-04-06 00:33:28.317679,0 days 00:00:03.681783,0.784371,0.023392,7,140,900,40,4,2,0.959862,COMPLETE
4,4,0.930526,2022-04-06 00:33:37.654765,2022-04-06 00:33:43.456320,0 days 00:00:05.801555,0.550079,0.010962,8,20,550,27,1,5,0.584831,COMPLETE
2,2,0.930292,2022-04-06 00:33:30.812216,2022-04-06 00:33:34.065994,0 days 00:00:03.253778,0.75923,0.047033,4,120,600,78,2,10,0.646684,COMPLETE
12,12,0.929875,2022-04-06 00:33:49.230851,2022-04-06 00:33:51.485716,0 days 00:00:02.254865,0.83558,0.024971,7,140,150,43,0,4,0.835491,COMPLETE


In [20]:
# Five best study3 trials (LightGBM, class_weight), highest objective value first.
(
    study3.trials_dataframe()
    .sort_values(by='value', ascending=False)
    .head(5)
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_samples,params_n_estimators,params_num_leaves,params_reg_alpha,params_reg_lambda,params_subsample,state
1,1,0.930663,2022-04-06 00:33:55.446239,2022-04-06 00:34:01.646298,0 days 00:00:06.200059,0.783998,0.010265,10,20,800,36,5,5,0.729973,COMPLETE
3,3,0.930184,2022-04-06 00:34:03.302571,2022-04-06 00:34:04.888609,0 days 00:00:01.586038,0.802013,0.060831,7,100,200,46,3,6,0.657514,COMPLETE
2,2,0.928452,2022-04-06 00:34:01.648541,2022-04-06 00:34:03.300272,0 days 00:00:01.651731,0.84203,0.241255,2,200,800,64,9,3,0.813289,COMPLETE
9,9,0.928345,2022-04-06 00:34:07.142702,2022-04-06 00:34:08.348934,0 days 00:00:01.206232,0.784923,0.174732,10,100,650,34,6,0,0.589694,COMPLETE
4,4,0.928149,2022-04-06 00:34:04.890898,2022-04-06 00:34:06.977506,0 days 00:00:02.086608,0.821158,0.09338,3,120,700,59,5,5,0.516294,COMPLETE


In [21]:
# Five best study4 trials (LightGBM, no imbalance handling), highest objective value first.
(
    study4.trials_dataframe()
    .sort_values(by='value', ascending=False)
    .head(5)
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_samples,params_n_estimators,params_num_leaves,params_reg_alpha,params_reg_lambda,params_subsample,state
3,3,0.930381,2022-04-06 00:34:15.602871,2022-04-06 00:34:17.606085,0 days 00:00:02.003214,0.566652,0.049575,9,140,600,46,0,0,0.965714,COMPLETE
21,21,0.930065,2022-04-06 00:34:23.646507,2022-04-06 00:34:25.444241,0 days 00:00:01.797734,0.507133,0.097952,9,20,950,49,0,0,0.963212,COMPLETE
1,1,0.929553,2022-04-06 00:34:11.978375,2022-04-06 00:34:13.158639,0 days 00:00:01.180264,0.620381,0.215853,10,40,950,49,8,3,0.767699,COMPLETE
10,10,0.929328,2022-04-06 00:34:21.498321,2022-04-06 00:34:22.967818,0 days 00:00:01.469497,0.505209,0.119662,8,20,900,53,0,0,0.998759,COMPLETE
0,0,0.92929,2022-04-06 00:34:10.787671,2022-04-06 00:34:11.977167,0 days 00:00:01.189496,0.638465,0.07611,4,200,200,13,9,2,0.919321,COMPLETE


In [22]:
# Five best study5 trials (XGBoost, random oversampling, one-hot), highest objective value first.
(
    study5.trials_dataframe()
    .sort_values(by='value', ascending=False)
    .head(5)
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_weight,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample,state
14,14,0.930252,2022-04-06 00:35:18.878931,2022-04-06 00:35:25.901497,0 days 00:00:07.022566,0.501875,0.010103,8,20,700,9,1,0.899265,COMPLETE
15,15,0.929535,2022-04-06 00:35:25.902373,2022-04-06 00:35:33.230988,0 days 00:00:07.328615,0.678852,0.01026,9,40,700,8,2,0.920187,COMPLETE
13,13,0.928979,2022-04-06 00:35:10.859412,2022-04-06 00:35:18.877498,0 days 00:00:08.018086,0.984161,0.011117,9,20,650,8,0,0.824139,COMPLETE
0,0,0.928904,2022-04-06 00:34:27.326309,2022-04-06 00:34:31.260466,0 days 00:00:03.934157,0.556767,0.011303,8,20,350,10,0,0.502842,COMPLETE
17,17,0.928638,2022-04-06 00:35:39.064149,2022-04-06 00:35:46.690530,0 days 00:00:07.626381,0.665177,0.023891,9,40,750,9,5,0.914802,COMPLETE


In [23]:
# Five best study6 trials (XGBoost, random oversampling, target encoding), highest objective value first.
(
    study6.trials_dataframe()
    .sort_values(by='value', ascending=False)
    .head(5)
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_weight,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample,state
6,6,0.931002,2022-04-06 00:36:35.079078,2022-04-06 00:36:43.888369,0 days 00:00:08.809291,0.731002,0.010857,7,20,1000,3,1,0.500919,COMPLETE
1,1,0.930372,2022-04-06 00:36:06.073419,2022-04-06 00:36:13.298210,0 days 00:00:07.224791,0.690846,0.018654,7,20,700,4,1,0.523952,COMPLETE
7,7,0.930222,2022-04-06 00:36:43.889272,2022-04-06 00:36:49.422064,0 days 00:00:05.532792,0.92519,0.023678,8,40,500,9,6,0.636781,COMPLETE
13,13,0.930156,2022-04-06 00:37:16.639729,2022-04-06 00:37:23.203474,0 days 00:00:06.563745,0.604737,0.010249,10,60,600,3,2,0.57781,COMPLETE
11,11,0.930138,2022-04-06 00:37:02.855776,2022-04-06 00:37:08.886219,0 days 00:00:06.030443,0.67126,0.010222,5,20,750,4,3,0.530691,COMPLETE


In [24]:
# Five best study7 trials (XGBoost, scale_pos_weight, one-hot), highest objective value first.
(
    study7.trials_dataframe()
    .sort_values(by='value', ascending=False)
    .head(5)
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_weight,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample,state
12,12,0.92872,2022-04-06 00:38:40.775707,2022-04-06 00:38:47.152131,0 days 00:00:06.376424,0.882569,0.012053,7,60,1000,4,5,0.821966,COMPLETE
11,11,0.928608,2022-04-06 00:38:35.257968,2022-04-06 00:38:40.774401,0 days 00:00:05.516433,0.851956,0.010664,6,60,950,6,6,0.846789,COMPLETE
10,10,0.928411,2022-04-06 00:38:30.117211,2022-04-06 00:38:35.256563,0 days 00:00:05.139352,0.844577,0.016045,6,60,900,6,6,0.834185,COMPLETE
15,15,0.928056,2022-04-06 00:38:54.230882,2022-04-06 00:38:59.585874,0 days 00:00:05.354992,0.859553,0.010455,6,80,1000,8,6,0.792128,COMPLETE
14,14,0.927932,2022-04-06 00:38:48.331678,2022-04-06 00:38:54.229591,0 days 00:00:05.897913,0.837704,0.020804,7,60,900,4,5,0.802043,COMPLETE


In [25]:
# Five best study8 trials (XGBoost, scale_pos_weight, target encoding), highest objective value first.
(
    study8.trials_dataframe()
    .sort_values(by='value', ascending=False)
    .head(5)
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_weight,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample,state
6,6,0.930399,2022-04-06 00:39:35.894099,2022-04-06 00:39:39.821185,0 days 00:00:03.927086,0.898975,0.022336,9,20,350,4,1,0.793348,COMPLETE
13,13,0.930021,2022-04-06 00:39:56.993204,2022-04-06 00:39:59.879292,0 days 00:00:02.886088,0.912031,0.020722,6,20,300,2,3,0.510758,COMPLETE
19,19,0.929993,2022-04-06 00:40:16.091291,2022-04-06 00:40:20.489433,0 days 00:00:04.398142,0.94407,0.016251,9,40,500,5,4,0.670752,COMPLETE
11,11,0.929525,2022-04-06 00:39:50.608895,2022-04-06 00:39:54.032009,0 days 00:00:03.423114,0.984591,0.01089,7,20,400,10,3,0.644427,COMPLETE
10,10,0.929268,2022-04-06 00:39:47.088138,2022-04-06 00:39:50.608011,0 days 00:00:03.519873,0.996551,0.01029,7,20,400,10,3,0.639848,COMPLETE


# Remove Unimportant Features

In [73]:
# Reduced model inputs: 'marital', 'housing' and 'loan' are no longer included.
# These assignments rebind the module-level lists that the objective functions read.
feature_set = [
    'age', 'job', 'education', 'default',
    'contact', 'month', 'day_of_week',
    'duration', 'campaign', 'previous',
    'poutcome', 'quarter',
]

# Categorical subset of the reduced feature_set.
cat_features = [
    'job', 'education', 'default', 'contact',
    'month', 'day_of_week', 'poutcome', 'quarter',
]

# Numeric subset of the reduced feature_set.
num_features = [
    'age', 'duration', 'campaign', 'previous',
]

In [74]:
# Redo the 80/20 split with the current feature_set; same seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    train_data[feature_set],
    train_data['y_encoded'],
    test_size=0.2,
    random_state=24,
)

In [75]:
# Give every split a fresh 0..n-1 index, discarding the row labels inherited from train_data.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [76]:
# (rows, columns) of the training and hold-out feature frames after the re-split.
tuple(split_part.shape for split_part in (X_train, X_test))

((26328, 12), (6582, 12))

### Re-tuning on the Reduced Feature Set (LightGBM: Random Over Sampling, XGBoost: Scale Pos Weight)

In [77]:
# LightGBM with random oversampling on the current X_train / y_train (30 trials).
# NOTE(review): the study name "LGBM2" is also used by study2; confirm the reuse is intended.
study9 = optuna.create_study(study_name="LGBM2", direction="maximize")


def func(trial):
    """Score one LightGBM trial with oversampling='random'."""
    return lgbm_objective(trial, X_train, y_train, oversampling='random', sampling_strategy='auto')


study9.optimize(func, n_trials=30)

[32m[I 2022-04-06 00:10:17,509][0m A new study created in memory with name: LGBM2[0m
[32m[I 2022-04-06 00:10:19,705][0m Trial 0 finished with value: 0.931183266152131 and parameters: {'n_estimators': 650, 'learning_rate': 0.04931774162043972, 'num_leaves': 33, 'max_depth': 9, 'min_child_samples': 180, 'lambda_l1': 1, 'lambda_l2': 7, 'bagging_fraction': 0.8485786057147204, 'feature_fraction': 0.7874396169748645}. Best is trial 0 with value: 0.931183266152131.[0m
[32m[I 2022-04-06 00:10:21,136][0m Trial 1 finished with value: 0.9287072657687162 and parameters: {'n_estimators': 200, 'learning_rate': 0.3709933931371577, 'num_leaves': 71, 'max_depth': 5, 'min_child_samples': 180, 'lambda_l1': 6, 'lambda_l2': 6, 'bagging_fraction': 0.6919072490940053, 'feature_fraction': 0.8776327511292803}. Best is trial 0 with value: 0.931183266152131.[0m
[32m[I 2022-04-06 00:10:23,026][0m Trial 2 finished with value: 0.9298785495522109 and parameters: {'n_estimators': 350, 'learning_rate': 0.10

In [78]:
# XGBoost with scale_pos_weight and target encoding on the current X_train / y_train (20 trials).
# NOTE(review): the study name "XGB5" is also used by study8; confirm the reuse is intended.
study10 = optuna.create_study(study_name="XGB5", direction="maximize")


def func(trial):
    """Score one XGBoost trial with oversampling='scale_pos_weight' and encoding='te'."""
    return xgb_objective(trial, X_train, y_train, oversampling='scale_pos_weight', sampling_strategy='auto', encoding='te')


study10.optimize(func, n_trials=20)

[32m[I 2022-04-06 00:10:32,222][0m A new study created in memory with name: XGB5[0m
[32m[I 2022-04-06 00:10:35,914][0m Trial 0 finished with value: 0.9270831346306279 and parameters: {'n_estimators': 850, 'learning_rate': 0.020232226927244988, 'max_depth': 9, 'min_child_weight': 200, 'lambda_l1': 2, 'lambda_l2': 6, 'bagging_fraction': 0.7803286942701633, 'feature_fraction': 0.6335309117627896}. Best is trial 0 with value: 0.9270831346306279.[0m
[32m[I 2022-04-06 00:10:38,690][0m Trial 1 finished with value: 0.9244866813356174 and parameters: {'n_estimators': 900, 'learning_rate': 0.03077675866683733, 'max_depth': 3, 'min_child_weight': 200, 'lambda_l1': 1, 'lambda_l2': 2, 'bagging_fraction': 0.5565174947607294, 'feature_fraction': 0.8826024552349538}. Best is trial 0 with value: 0.9270831346306279.[0m
[32m[I 2022-04-06 00:10:40,681][0m Trial 2 finished with value: 0.9271019274099228 and parameters: {'n_estimators': 600, 'learning_rate': 0.1158026367351804, 'max_depth': 2, 'm

In [28]:
# Highest-scoring study6 trial, shown as a plain dict of its value, timings and parameters.
(
    study6.trials_dataframe()
    .sort_values(by='value', ascending=False)
    .iloc[0]
    .to_dict()
)

{'number': 6,
 'value': 0.9310015064071377,
 'datetime_start': Timestamp('2022-04-06 00:36:35.079078'),
 'datetime_complete': Timestamp('2022-04-06 00:36:43.888369'),
 'duration': Timedelta('0 days 00:00:08.809291'),
 'params_colsample_bytree': 0.7310016759193707,
 'params_learning_rate': 0.01085683932685544,
 'params_max_depth': 7,
 'params_min_child_weight': 20,
 'params_n_estimators': 1000,
 'params_reg_alpha': 3,
 'params_reg_lambda': 1,
 'params_subsample': 0.5009186963118293,
 'state': 'COMPLETE'}