# Overview

The purpose of this notebook is to predict the target by an Ensemble model composed of tree individual models

- lightgbm
- xgboost
- catboost

Feature Engineering followed basic practices that proved to work for GBM-style models for this competition

- label encoding the cat variables
- standard scaling to numeric variables

Params for *xgboost* and *catboost* have been discovered via hyperparam search, using *hyperopt*. Params for *lightgbm* have been reused from https://www.kaggle.com/hiro5299834/tps-feb-2021-with-single-lgbm-tuned (they appeared to work better vs. the set of parameters I discovered in *hyperopt*-based search).

Weight of lightgbm prediction was set to be a little higher then catboost and xgboost.

The well-thought software design of the Ensembling class was inspired by https://www.kaggle.com/kenkpixdev/ensemble-lgb-xgb-with-hyperopt

In [1]:
import pandas as pd
import numpy as np
import time
import datetime as dt
from typing import Tuple, List, Dict

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# main flow
start_time = dt.datetime.now()
print("Started at ", start_time)

Started at  2021-02-20 11:10:17.277475


In [3]:
# read data
in_kaggle = True


def get_data_file_path(is_in_kaggle: bool) -> Tuple[str, str, str]:
    train_path = ''
    test_path = ''
    sample_submission_path = ''

    if is_in_kaggle:
        # running in Kaggle, inside the competition
        train_path = '../input/tabular-playground-series-feb-2021/train.csv'
        test_path = '../input/tabular-playground-series-feb-2021/test.csv'
        sample_submission_path = '../input/tabular-playground-series-feb-2021/sample_submission.csv'
    else:
        # running locally
        train_path = 'data/train.csv'
        test_path = 'data/test.csv'
        sample_submission_path = 'data/sample_submission.csv'

    return train_path, test_path, sample_submission_path


In [4]:
%%time
# get the training set and labels
train_set_path, test_set_path, sample_subm_path = get_data_file_path(in_kaggle)

train = pd.read_csv(train_set_path)
test = pd.read_csv(test_set_path)
target = train.target

subm = pd.read_csv(sample_subm_path)

CPU times: user 1.86 s, sys: 140 ms, total: 2 s
Wall time: 3.64 s


In [5]:
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,A,B,A,A,B,D,A,E,C,...,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
1,2,B,A,A,A,B,B,A,E,A,...,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
2,3,A,A,A,C,B,D,A,B,C,...,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
3,4,A,A,A,C,B,D,A,E,G,...,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
4,6,A,B,A,A,B,B,A,E,C,...,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


In [6]:
def preprocess(df, encoder=None,
               scaler=None, cols_to_drop=None,
               cols_to_encode=None, cols_to_scale=None):
    """
    Preprocess input data
    :param df: DataFrame with data
    :param encoder: encoder object with fit_transform method
    :param scaler: scaler object with fit_transform method
    :param cols_to_drop: columns to be removed
    :param cols_to_encode: columns to be encoded
    :param cols_to_scale: columns to be scaled
    :return: DataFrame
    """

    if encoder:
        for col in cols_to_encode:
            df[col] = encoder.fit_transform(df[col])

    if scaler:
        for col in cols_to_scale:
            df[col] = scaler.fit_transform(df[col].values.reshape(-1, 1))

    if cols_to_drop:
        df = df.drop(cols_to_drop, axis=1)

    return df

In [7]:
cat_cols = ['cat' + str(i) for i in range(10)]
cont_cols = ['cont' + str(i) for i in range(14)]

train = preprocess(train, encoder=LabelEncoder(), scaler=StandardScaler(),
                  cols_to_drop=['id', 'target'], cols_to_encode=cat_cols,
                  cols_to_scale=cont_cols)

# encoder=LabelEncoder()
test = preprocess(test, encoder=LabelEncoder(), scaler=StandardScaler(),
                 cols_to_drop=['id'], cols_to_encode=cat_cols,
                 cols_to_scale=cont_cols)

In [8]:
class EnsembleModel:
    def __init__(self, params):
        """
        LGB + XGB + CatBoost model
        """
        self.lgb_params = params['lgb']
        self.xgb_params = params['xgb']
        self.cat_params = params['cat']

        self.lgb_model = LGBMRegressor(**self.lgb_params)
        self.xgb_model = XGBRegressor(**self.xgb_params)
        self.cat_model = CatBoostRegressor(**self.cat_params)

    def fit(self, x, y, *args, **kwargs):
        return (self.lgb_model.fit(x, y, *args, **kwargs),
                self.xgb_model.fit(x, y, *args, **kwargs),
               self.cat_model.fit(x, y, *args, **kwargs))

    def predict(self, x, weights=[1.0, 1.0, 1.0]):
        """
        Generate model predictions
        :param x: data
        :param weights: weights on model prediction, first one is the weight on lgb model
        :return: array with predictions
        """
        return (weights[0] * self.lgb_model.predict(x) +
                weights[1] * self.xgb_model.predict(x) +
                weights[2] * self.cat_model.predict(x)) / 3

In [9]:
since = time.time()
columns = train.columns

# ------------------------------------------------------------------------------
# Parameters
# ------------------------------------------------------------------------------
N_FOLDS = 10
N_ESTIMATORS = 30000
SEED = 2021
BAGGING_SEED = 48

# ------------------------------------------------------------------------------
# LightGBM: training and inference
# ------------------------------------------------------------------------------
lgb_params = {'random_state': SEED,
          'metric': 'rmse',
          'n_estimators': N_ESTIMATORS,
          'n_jobs': -1,
          'cat_feature': [x for x in range(len(cat_cols))],
          'bagging_seed': SEED,
          'feature_fraction_seed': SEED,
          'learning_rate': 0.003899156646724397,
          'max_depth': 99,
          'num_leaves': 63,
          'reg_alpha': 9.562925363678952,
          'reg_lambda': 9.355810045480153,
          'colsample_bytree': 0.2256038826485174,
          'min_child_samples': 290,
          'subsample_freq': 1,
          'subsample': 0.8805303688019942,
          'max_bin': 882,
          'min_data_per_group': 127,
          'cat_smooth': 96,
          'cat_l2': 19
          }

ensemble_params = {
    "lgb" : lgb_params,
    'xgb': {
        'random_state': SEED,
        'max_depth': 13,
        'learning_rate': 0.020206705089028228,
        'gamma': 3.5746731812451156,
        'min_child_weight': 564,
        'n_estimators': 8000,
        'colsample_bytree': 0.5015940592112956,
        'subsample': 0.6839489639112909,
        'reg_lambda': 18.085502002853246,
        'reg_alpha': 0.17532087359570606,
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'eval_metric': 'rmse',
        'n_jobs': -1
    },
    'cat': {
        'random_state': SEED,
        'depth': 3.0,
        'fold_len_multiplier': 1.1425259013471902,
        'l2_leaf_reg': 7.567589781752637,
        'leaf_estimation_backtracking': 'AnyImprovement',
        'learning_rate': 0.25121635918496565,
        'max_bin': 107.0,
        'min_data_in_leaf': 220.0,
        'random_strength': 3.2658690042589726,
        'n_estimators': 8000,
        'eval_metric': 'RMSE',
    }
}
    
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=N_FOLDS, random_state=22, shuffle=True)
rmse = []
n = 0

for trn_idx, test_idx in kf.split(train[columns], target):

    X_tr, X_val=train[columns].iloc[trn_idx], train[columns].iloc[test_idx]
    y_tr, y_val=target.iloc[trn_idx], target.iloc[test_idx]

    model = EnsembleModel(ensemble_params)

    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)

    preds += model.predict(test[columns], weights=[1.1, 1.0, 0.9]) / kf.n_splits
    rmse.append(mean_squared_error(y_val, model.predict(X_val), squared=False))
    
    print(f"Fold {n+1}, RMSE: {rmse[n]}")
    n += 1


print("Mean RMSE: ", np.mean(rmse))
end_time = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
        end_time // 60, end_time % 60))

Fold 1, RMSE: 0.8452680427304071
Fold 2, RMSE: 0.838145303777536
Fold 3, RMSE: 0.8375768531571243
Fold 4, RMSE: 0.844915135015359
Fold 5, RMSE: 0.8414464913675375
Fold 6, RMSE: 0.8454354409888862
Fold 7, RMSE: 0.8423574977211604
Fold 8, RMSE: 0.8432141136989376
Fold 9, RMSE: 0.8388999483442439
Fold 10, RMSE: 0.8442114314166476
Mean RMSE:  0.8421470258217839
Training complete in 61m 52s


In [10]:
# submit prediction
subm['target'] = preds
subm.to_csv("ensemble_model_lgb_xgb_cat_other_lgb_params.csv", index=False)

In [11]:
print('We are done. That is all, folks!')
finish_time = dt.datetime.now()
print("Finished at ", finish_time)
elapsed = finish_time - start_time
print("Elapsed time: ", elapsed)

We are done. That is all, folks!
Finished at  2021-02-20 12:12:16.162011
Elapsed time:  1:01:58.884536
