# LGBM+XGB+CATBOOST Ensemble with HyperOpt Tuning

In [11]:
import pandas as pd
import numpy as np

import time
import datetime as dt
from typing import Tuple, List, Dict

from sklearn.metrics import auc, roc_curve


from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder


from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope

from functools import partial

import warnings
warnings.filterwarnings('ignore')

In [12]:
# main flow
start_time = dt.datetime.now()
print("Started at ", start_time)

Started at  2021-03-23 17:40:21.465012


In [13]:
# read data
in_kaggle = False


def get_data_file_path(is_in_kaggle: bool) -> Tuple[str, str, str]:
    train_path = ''
    test_path = ''
    sample_submission_path = ''

    if is_in_kaggle:
        # running in Kaggle, inside the competition
        train_path = '../input/tabular-playground-series-mar-2021/train.csv'
        test_path = '../input/tabular-playground-series-mar-2021/test.csv'
        sample_submission_path = '../input/tabular-playground-series-mar-2021/sample_submission.csv'
    else:
        # running locally
        train_path = 'data/train.csv'
        test_path = 'data/test.csv'
        sample_submission_path = 'data/sample_submission.csv'

    return train_path, test_path, sample_submission_path


In [14]:
%%time
# get the training set and labels
train_set_path, test_set_path, sample_subm_path = get_data_file_path(in_kaggle)

train = pd.read_csv(train_set_path)
test = pd.read_csv(test_set_path)
target = train.target

test_id = test.id

subm = pd.read_csv(sample_subm_path)

Wall time: 5.34 s


In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 32 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      300000 non-null  int64  
 1   cat0    300000 non-null  object 
 2   cat1    300000 non-null  object 
 3   cat2    300000 non-null  object 
 4   cat3    300000 non-null  object 
 5   cat4    300000 non-null  object 
 6   cat5    300000 non-null  object 
 7   cat6    300000 non-null  object 
 8   cat7    300000 non-null  object 
 9   cat8    300000 non-null  object 
 10  cat9    300000 non-null  object 
 11  cat10   300000 non-null  object 
 12  cat11   300000 non-null  object 
 13  cat12   300000 non-null  object 
 14  cat13   300000 non-null  object 
 15  cat14   300000 non-null  object 
 16  cat15   300000 non-null  object 
 17  cat16   300000 non-null  object 
 18  cat17   300000 non-null  object 
 19  cat18   300000 non-null  object 
 20  cont0   300000 non-null  float64
 21  cont1   30

In [16]:
cols_to_drop = ['id', 'cat5', 'cat7', 'cat8', 'cat10']

cat_cols = [feature for feature in train.columns if 'cat' in feature and feature not in cols_to_drop ]
cont_cols = [feature for feature in train.columns if 'cont' in feature and feature not in cols_to_drop]

all_features = cat_cols + cont_cols

def preprocess(df, encoder=None,
               scaler=None, cols_to_drop=None,
               cols_to_encode=None, cols_to_scale=None):
    """
    Preprocess input data
    :param df: DataFrame with data
    :param encoder: encoder object with fit_transform method
    :param scaler: scaler object with fit_transform method
    :param cols_to_drop: columns to be removed
    :param cols_to_encode: columns to be encoded
    :param cols_to_scale: columns to be scaled
    :return: DataFrame
    """

    if encoder:
        for col in cols_to_encode:
            df[col] = encoder.fit_transform(df[col])

    if scaler:
        for col in cols_to_scale:
            df[col] = scaler.fit_transform(df[col].values.reshape(-1, 1))

    if cols_to_drop:
        df = df.drop(cols_to_drop, axis=1)

    return df

In [17]:
train = preprocess(train, encoder=LabelEncoder(), scaler=StandardScaler(),
                  cols_to_drop=cols_to_drop, cols_to_encode=cat_cols,
                  cols_to_scale=cont_cols)

# encoder=LabelEncoder()
test = preprocess(test, encoder=LabelEncoder(), scaler=StandardScaler(),
                 cols_to_drop=cols_to_drop, cols_to_encode=cat_cols,
                 cols_to_scale=cont_cols)

In [18]:
class EnsembleModel:
    def __init__(self, params):
        """
        LGB + XGB + CatBoost model
        """
        self.lgb_params = params['lgb']
        self.xgb_params = params['xgb']
        self.cat_params = params['cat']

        self.lgb_model = LGBMClassifier(**self.lgb_params)
        self.xgb_model = XGBClassifier(**self.xgb_params)
        self.cat_model = CatBoostClassifier(**self.cat_params)

    def fit(self, x, y, *args, **kwargs):
        return (self.lgb_model.fit(x, y, *args, **kwargs),
                self.xgb_model.fit(x, y, *args, **kwargs),
               self.cat_model.fit(x, y, *args, **kwargs))

    def predict(self, x, weights=[1.0, 1.0, 1.0]):
        """
        Generate model predictions
        :param x: data
        :param weights: weights on model prediction, first one is the weight on lgb model
        :return: array with predictions
        """
        return np.rint((weights[0] * self.lgb_model.predict(x) +
                weights[1] * self.xgb_model.predict(x) +
                weights[2] * self.cat_model.predict(x)) / 3)
    
    def predict_proba(self, x, weights=[1.0, 1.0, 1.0]):
        """
        Generate model class label probability predictions
        :param x: data
        :param weights: weights on model prediction, first one is the weight on lgb model
        :return: array with predictions
        """
        return np.rint((weights[0] * self.lgb_model.predict_proba(x) +
                weights[1] * self.xgb_model.predict_proba(x) +
                weights[2] * self.cat_model.predict_proba(x)) / 3)

In [19]:
#integer and string parameters, used with hp.choice()
bootstrap_type = [{'bootstrap_type':'Poisson'}, 
                           {'bootstrap_type':'Bayesian',
                            'bagging_temperature' : hp.loguniform('bagging_temperature', np.log(1), np.log(50))},
                          {'bootstrap_type':'Bernoulli'}] 
LEB = ['No', 'AnyImprovement'] #remove 'Armijo' if not using GPU
grow_policy = [
    {'grow_policy':'SymmetricTree'},
    # {'grow_policy':'Depthwise'},
    {'grow_policy':'Lossguide',
        'max_leaves': hp.quniform('max_leaves', 2, 32, 1)}]

ensemble_params = {
    "lgb" : {
        "num_leaves": scope.int(hp.quniform("num_leaves", 31, 200, 1)),
        "max_depth": scope.int(hp.quniform("max_depth", 10, 24, 1)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'min_split_gain': hp.uniform('min_split_gain', 0, 1.0),
        'min_child_samples': scope.int(hp.quniform("min_child_samples", 2, 700, 1)),
        "subsample": hp.uniform("subsample", 0.2, 1.0),
        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
        'reg_alpha': hp.uniform('reg_alpha', 1e-5, 1.0),
        'reg_lambda': hp.uniform('reg_lambda', 0, 50),
        'metric': 'auc',
        'n_jobs': -1,
        'n_estimators': 2000},
    'xgb': {
        'max_depth': scope.int(hp.quniform('xgb.max_depth', 10, 24, 1)),
        'learning_rate': hp.uniform('xgb.learning_rate', 0.01, 0.3),
        'gamma': hp.uniform('xgb.gamma', 1, 10),
        'min_child_weight': scope.int(hp.quniform('xgb.min_child_weight', 2, 700, 1)),
        'n_estimators': 2000,
        'colsample_bytree': hp.uniform('xgb.colsample_bytree', 0.5, 0.9),
        'subsample': hp.uniform('xgb.subsample', 0.5, 1.0),
        'reg_lambda': hp.uniform('xgb.reg_lambda', 0, 100),
        'reg_alpha': hp.uniform('xgb.reg_alpha', 1e-5, 0.5),
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'eval_metric': 'auc',
        'n_jobs': -1},
    'cat': {
        'depth': hp.quniform("cat.depth", 2, 16, 1),
        'learning_rate': hp.uniform('cat.learning_rate', 0.01, 0.3),
        'l2_leaf_reg': hp.uniform('cat.l2_leaf_reg', 3, 8),
        'max_bin' : hp.quniform('cat.max_bin', 1, 254, 1),
        'min_data_in_leaf' : hp.quniform('cat.min_data_in_leaf', 2, 700, 1),
        'random_strength' : hp.loguniform('cat.random_strength', np.log(0.005), np.log(5)),
        'leaf_estimation_backtracking' : hp.choice('cat.leaf_estimation_backtracking', LEB),
        # 'grow_policy': hp.choice('cat.grow_policy', grow_policy),
        'fold_len_multiplier' : hp.loguniform('cat.fold_len_multiplier', np.log(1.01), np.log(2.5)),
        'eval_metric': 'AUC',
        'n_estimators': 2000
        #'od_type' : 'Iter',
        #'od_wait' : 25,
       }
}

def ensemble_search(params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

    model = EnsembleModel(params)

    evaluation = [(X_test, y_test)]

    model.fit(X_train, y_train,
              eval_set=evaluation,
              early_stopping_rounds=100, verbose=False)

    val_preds = model.predict(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, val_preds, pos_label = 1)
    auc_score = auc(fpr, tpr)

    neg_auc_score = -1 * auc_score

    return {"loss": neg_auc_score, "status": STATUS_OK}


In [20]:
# search for model

X = train[all_features]
y = target

trials = Trials()

best_hyperparams = fmin(fn=ensemble_search,
                       space=ensemble_params,
                       algo=tpe.suggest,
                       max_evals=100,
                       trials=trials)

100%|██████████████████████████████████████████| 100/100 [6:20:42<00:00, 228.43s/trial, best loss: -0.7757663279612521]


In [21]:
best_hyperparams

{'cat.depth': 10.0,
 'cat.fold_len_multiplier': 1.7422457760587946,
 'cat.l2_leaf_reg': 3.2623366275911745,
 'cat.leaf_estimation_backtracking': 1,
 'cat.learning_rate': 0.15136133382564895,
 'cat.max_bin': 202.0,
 'cat.min_data_in_leaf': 574.0,
 'cat.random_strength': 0.20562758847364615,
 'colsample_bytree': 0.6425537568096009,
 'learning_rate': 0.10962315502263141,
 'max_depth': 14.0,
 'min_child_samples': 323.0,
 'min_split_gain': 0.2933380974365634,
 'num_leaves': 148.0,
 'reg_alpha': 0.5170282083590011,
 'reg_lambda': 43.80139979272411,
 'subsample': 0.6572327355494463,
 'xgb.colsample_bytree': 0.790674241521834,
 'xgb.gamma': 6.06972832524672,
 'xgb.learning_rate': 0.15292477100346652,
 'xgb.max_depth': 16.0,
 'xgb.min_child_weight': 16.0,
 'xgb.reg_alpha': 0.3670013738813803,
 'xgb.reg_lambda': 24.921997449057116,
 'xgb.subsample': 0.8845306281663261}

In [22]:
# ------------------------------------------------------------------------------
# Parameters
# ------------------------------------------------------------------------------
N_FOLDS = 10
N_ESTIMATORS = 30000
SEED = 2021
BAGGING_SEED = 48

since = time.time()
columns = train.columns

ensemble_params = {
    "lgb" : {
        'colsample_bytree': 0.6425537568096009,
        'learning_rate': 0.00730821, # 0.10962315502263141,
        'max_depth': 14,
        'min_child_samples': 323,
        'min_split_gain': 0.2933380974365634,
        'num_leaves': 148,
        'reg_alpha': 0.5170282083590011,
        'reg_lambda': 43.80139979272411,
        'subsample': 0.6572327355494463,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'feature_fraction_seed': SEED,
        'bagging_seed': SEED,
        'random_state': SEED,
        'n_jobs': -1,
        'n_estimators': 30000
    },
    'xgb': {
        
        'colsample_bytree': 0.790674241521834,
        'gamma': 6.06972832524672,
        'learning_rate': 0.035823, # 0.15292477100346652,
        'max_depth': 16,
        'min_child_weight': 16,
        'reg_alpha': 0.3670013738813803,
        'reg_lambda': 24.921997449057116,
        'subsample': 0.8845306281663261,
        'n_estimators': 10000,
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'random_state': SEED,
        'eval_metric': 'auc',
        'n_jobs': -1
    },
    'cat': {
        'depth': 10,
        'fold_len_multiplier': 1.7422457760587946,
        'l2_leaf_reg': 3.2623366275911745,
        'leaf_estimation_backtracking': 'AnyImprovement',
        'learning_rate': 0.03026133382564895, # 0.15136133382564895,
        'max_bin': 202,
        'min_data_in_leaf': 574,
        'random_strength': 0.20562758847364615,
        'n_estimators': 10000,
        'random_state': SEED,
        'eval_metric': 'AUC',
    }
}

In [23]:
#X = X.abs()
X = train[all_features]
y = target

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=1)
oof_2 = np.zeros(len(train))
score_list = []
fold = 1
test_preds_2 = []

# view accuracy
from sklearn.metrics import accuracy_score


for train_index, test_index in kf.split(train):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    
    dtrain = X_train[all_features]
    dvalid = X_val[all_features]


    evaluation = [(dvalid, y_val)]

    model = EnsembleModel(ensemble_params)
    model.fit(dtrain, y_train,
              eval_set=evaluation,
              early_stopping_rounds=100, verbose=False)

    val_preds = model.predict(dvalid)
    
    fpr, tpr, thresholds = roc_curve(y_val, val_preds, pos_label = 1)
    auc_score = auc(fpr, tpr)


    print(f"AUC Score Fold-{fold} : {auc_score}")

    oof_2[test_index] = val_preds

    
    score_list.append(auc_score)
    test_preds_2.append(model.predict_proba(test[all_features])[:, 1])
    fold+=1
    

np.mean(score_list)

AUC Score Fold-1 : 0.7847205155691993
AUC Score Fold-2 : 0.771619761247742
AUC Score Fold-3 : 0.7801924563573394
AUC Score Fold-4 : 0.7769481988759315
AUC Score Fold-5 : 0.7741553655564881
AUC Score Fold-6 : 0.7772232912253242
AUC Score Fold-7 : 0.7748356366803452
AUC Score Fold-8 : 0.7792835047603486
AUC Score Fold-9 : 0.7728020250495855
AUC Score Fold-10 : 0.7734896519543258


0.776527040727663

In [24]:
print(np.mean(score_list))
score_list

0.776527040727663


[0.7847205155691993,
 0.771619761247742,
 0.7801924563573394,
 0.7769481988759315,
 0.7741553655564881,
 0.7772232912253242,
 0.7748356366803452,
 0.7792835047603486,
 0.7728020250495855,
 0.7734896519543258]

In [25]:
train["2_preds"] = oof_2
test["2_preds"] = np.mean(test_preds_2,axis=0)

fpr, tpr, thresholds = roc_curve(train["target"], train["2_preds"], pos_label = 1)
auc_score = auc(fpr, tpr)
    
print(f"AUC Score Final : {auc_score}")

AUC Score Final : 0.7765196532271988


In [26]:
# save prediction
subm_df = test[["2_preds"]]
subm_df['id'] = test_id
subm_df = subm_df.rename(columns={"2_preds": "target"})
subm_df.to_csv("hyperopt_lgb_xgb_cat_ensemble_model.csv",index=False)

In [27]:
print('We are done. That is all, folks!')
finish_time = dt.datetime.now()
print("Finished at ", finish_time)
elapsed = finish_time - start_time
print("Elapsed time: ", elapsed)

We are done. That is all, folks!
Finished at  2021-03-24 01:21:16.236500
Elapsed time:  7:40:54.771488


# Historical Record Tracking

## Baseline, with all features

- Final validation AUC locally - 0.7776592687653098
- public LB 0.882171

```
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'random_state': SEED,
    'n_estimators': N_ESTIMATORS,
    'n_jobs': -1,
    'cat_feature': [x for x in range(len(cat_cols))],
    'bagging_seed': SEED,
    'feature_fraction_seed': SEED,
    
    
    'colsample_bytree': 0.8785258997000931,
    'learning_rate': 0.014051289,
    'max_depth': 11,
    'min_child_weight': 5.5803474120805336e-06,
    'min_data_in_leaf': 85,
    'min_split_gain': 0.23035992374930991,
    'num_leaves': 248,
    'reg_alpha': 0.5030438494317617,
    'reg_lambda': 27.412351627541167,
    'subsample': 0.5725588226322971,
    'subsample_for_bin': 160000
}
```

## Single lgbm without non-important cat features

- AUC Score Final (locally) : 0.777074298578011
- public lb 0.88821

cat cols dropped: ['cat5', 'cat7', 'cat8', 'cat10']

(jump from top 66% to top 47% in public lb as of Mar 6, 2021)

```
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'random_state': SEED,
    'n_estimators': N_ESTIMATORS,
    'n_jobs': -1,
    'cat_feature': [x for x in range(len(cat_cols))],
    'bagging_seed': SEED,
    'feature_fraction_seed': SEED,
    
    
    'colsample_bytree': 0.7671990402625448,
     'learning_rate': 0.019767284,
     'max_depth': 15,
     'min_child_weight': 0.0002777396522195953,
     'min_data_in_leaf': 57,
     'min_split_gain': 0.21136452616481655,
     'num_leaves': 175,
     'reg_alpha': 0.11063887315135824,
     'reg_lambda': 18.001114258982668,
     'subsample': 0.9818248529067929,
     'subsample_for_bin': 180000
}


```

# Ensemble optimized on AUC min

- Final validation AUC locally - 0.7752592332573469
- public LB 0.7781...

```
ensemble_params = {
    "lgb" : {
        'colsample_bytree': 0.7669988155091811,
        'learning_rate': 0.018785504391188274,
        'max_depth': 15, #16
        'min_child_samples': 432,
        'min_split_gain': 0.5196682075422662,
        'num_leaves': 33,
        'reg_alpha': 0.7190968839100209,
        'reg_lambda': 45.09680841620885,
        'subsample': 0.5200921340590307,
        
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'feature_fraction_seed': SEED,
        'bagging_seed': SEED,
        'random_state': SEED,
        'n_jobs': -1,
        'n_estimators': 20000
    },
    'xgb': {
        'colsample_bytree': 0.567401438889052,
        'gamma': 5.1311670760999615,
        'learning_rate': 0.024102, #0.06821522938543251,
        'max_depth': 10,
        'min_child_weight': 449,
        'reg_alpha': 0.20781010566037308,
        'reg_lambda': 97.95123800455754,
        'subsample': 0.6839072387257561,
        'n_estimators': 6000,
        'objective': 'binary:logistic',
        #'tree_method': 'gpu_hist',
        'random_state': SEED,
        'eval_metric': 'auc',
        'n_jobs': -1
    },
    'cat': {
        'depth': 16,
        'fold_len_multiplier': 1.6898925297449634,
        'l2_leaf_reg': 6.764140085134971,
        'leaf_estimation_backtracking': 'AnyImprovement',
        'learning_rate': 0.05112, #0.24599873470208633,
        'max_bin': 50,
        'min_data_in_leaf': 362,
        'random_strength': 0.006749481608797086,
        'n_estimators': 10000,
        'random_state': SEED,
        'eval_metric': 'AUC',
    }
}
```