In [1]:
# inherited from https://www.kaggle.com/kenkpixdev/ensemble-lgb-xgb-with-hyperopt

In [2]:
import pandas as pd
import numpy as np
import time
import datetime as dt
from typing import Tuple, List, Dict

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.IPBC74C7KURV7CB2PKT5Z5FNR3SIBV4J.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


In [3]:
# main flow
# Record when the notebook started; the final cell subtracts this from the
# finish time to report the total elapsed time.
start_time = dt.datetime.now()
print("Started at ", start_time)

Started at  2021-02-19 20:49:08.728342


In [4]:
# read data
# Environment switch handed to get_data_file_path: True selects the Kaggle
# input paths, False the local data/ paths.
in_kaggle = False


def get_data_file_path(is_in_kaggle: bool) -> Tuple[str, str, str]:
    """
    Resolve the locations of the competition CSV files
    :param is_in_kaggle: True when running inside the Kaggle competition
                         environment, False when running locally
    :return: tuple (train_path, test_path, sample_submission_path)
    """
    if is_in_kaggle:
        # running in Kaggle, inside the competition; all three files come from
        # the same (Feb 2021) competition input directory
        data_dir = '../input/tabular-playground-series-feb-2021'
    else:
        # running locally
        data_dir = 'data'

    return (f'{data_dir}/train.csv',
            f'{data_dir}/test.csv',
            f'{data_dir}/sample_submission.csv')


In [5]:
%%time
# get the training set and labels
train_set_path, test_set_path, sample_subm_path = get_data_file_path(in_kaggle)

train = pd.read_csv(train_set_path)
test = pd.read_csv(test_set_path)
# Keep the label column in its own variable; it is taken here, before any
# preprocessing of the training frame.
target = train.target

# Sample submission frame; its 'target' column is overwritten with the model
# predictions before being written out.
subm = pd.read_csv(sample_subm_path)

Wall time: 2.85 s


In [6]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,A,B,A,A,B,D,A,E,C,...,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
1,2,B,A,A,A,B,B,A,E,A,...,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
2,3,A,A,A,C,B,D,A,B,C,...,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
3,4,A,A,A,C,B,D,A,E,G,...,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
4,6,A,B,A,A,B,B,A,E,C,...,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


In [7]:
def preprocess(df, encoder=None,
               scaler=None, cols_to_drop=None,
               cols_to_encode=None, cols_to_scale=None):
    """
    Preprocess input data

    The input frame is left untouched; all changes are made on a copy.
    The encoder / scaler is re-fitted on every column it is applied to.

    :param df: DataFrame with data
    :param encoder: encoder object with fit_transform method
    :param scaler: scaler object with fit_transform method
    :param cols_to_drop: columns to be removed
    :param cols_to_encode: columns to be encoded (ignored when encoder is None)
    :param cols_to_scale: columns to be scaled (ignored when scaler is None)
    :return: new DataFrame with the requested transformations applied
    """
    # Work on a copy so re-running the calling cell never sees a frame that was
    # half-transformed by a previous call.
    df = df.copy()

    if encoder is not None:
        # `or []` lets callers pass an encoder without a column list
        for col in cols_to_encode or []:
            df[col] = encoder.fit_transform(df[col])

    if scaler is not None:
        for col in cols_to_scale or []:
            # The scaler is fed a single-column 2-D array; flatten its result
            # back to 1-D before storing it in the column.
            scaled = scaler.fit_transform(df[col].values.reshape(-1, 1))
            df[col] = np.asarray(scaled).reshape(-1)

    if cols_to_drop:
        df = df.drop(cols_to_drop, axis=1)

    return df

In [8]:
cat_cols = ['cat' + str(i) for i in range(10)]
cont_cols = ['cont' + str(i) for i in range(14)]

# Encode every categorical column with ONE encoder fitted on the values seen in
# train and test together, so a category always maps to the same integer code
# in both frames (independently fitted encoders give no such guarantee).
for col in cat_cols:
    col_encoder = LabelEncoder().fit(pd.concat([train[col], test[col]]))
    train[col] = col_encoder.transform(train[col])
    test[col] = col_encoder.transform(test[col])

# Standardise the continuous columns with statistics learned from the training
# data only, and apply that same transformation to the test data.
cont_scaler = StandardScaler().fit(train[cont_cols])
train[cont_cols] = cont_scaler.transform(train[cont_cols])
test[cont_cols] = cont_scaler.transform(test[cont_cols])

# Encoding and scaling are already done above; preprocess only drops the
# non-feature columns here.
train = preprocess(train, cols_to_drop=['id', 'target'])
test = preprocess(test, cols_to_drop=['id'])

In [9]:
class EnsembleModel:
    """
    LGB + XGB model: two regressors fitted on the same data whose predictions
    are blended with a weighted average.
    """

    def __init__(self, params):
        """
        Build both underlying regressors
        :param params: dict with keys 'lgb' and 'xgb'; each value is the dict of
                       keyword arguments for LGBMRegressor / XGBRegressor
        """
        self.lgb_params = params['lgb']
        self.xgb_params = params['xgb']

        self.lgb_model = LGBMRegressor(**self.lgb_params)
        self.xgb_model = XGBRegressor(**self.xgb_params)

    def fit(self, x, y, *args, **kwargs):
        """
        Fit both models on the same data
        :param x: training features
        :param y: training labels
        :param args: extra positional arguments forwarded unchanged to both fit calls
        :param kwargs: extra keyword arguments forwarded unchanged to both fit calls
        :return: tuple (result of lgb fit, result of xgb fit)
        """
        return (self.lgb_model.fit(x, y, *args, **kwargs),
                self.xgb_model.fit(x, y, *args, **kwargs))

    def predict(self, x, weights=(1.0, 1.0)):
        """
        Generate model predictions as a weighted average of both models
        :param x: data
        :param weights: weights on model prediction, first one is the weight on lgb model
        :return: array with predictions
        :raises ValueError: if the two weights sum to zero
        """
        lgb_weight, xgb_weight = weights[0], weights[1]

        # Normalise by the actual weight total so the result is a true weighted
        # average; with the default (1.0, 1.0) this is the plain mean.
        total_weight = lgb_weight + xgb_weight
        if total_weight == 0:
            raise ValueError("weights must not sum to zero")

        return (lgb_weight * self.lgb_model.predict(x) +
                xgb_weight * self.xgb_model.predict(x)) / total_weight

In [10]:
# Hyperopt search space for the LGB + XGB ensemble.
# The top-level keys match what EnsembleModel reads ('lgb' and 'xgb').
# XGBoost labels carry an 'xgb.' prefix so they stay distinct from the
# LightGBM labels of the same name. Entries without an hp.* expression are
# fixed values, not tuned.
ensemble_params = {
    'lgb': {
        'num_leaves': scope.int(hp.quniform('num_leaves', 31, 200, 1)),
        'max_depth': scope.int(hp.quniform('max_depth', 10, 24, 1)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'min_split_gain': hp.uniform('min_split_gain', 0, 1.0),
        'min_child_samples': scope.int(hp.quniform('min_child_samples', 2, 700, 1)),
        'subsample': hp.uniform('subsample', 0.2, 1.0),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
        'reg_alpha': hp.uniform('reg_alpha', 1e-5, 1.0),
        'reg_lambda': hp.uniform('reg_lambda', 0, 50),
        'n_jobs': -1,
        'n_estimators': 2000,
    },
    'xgb': {
        'max_depth': scope.int(hp.quniform('xgb.max_depth', 10, 24, 1)),
        'learning_rate': hp.uniform('xgb.learning_rate', 0.01, 0.3),
        'gamma': hp.uniform('xgb.gamma', 1, 10),
        'min_child_weight': scope.int(hp.quniform('xgb.min_child_weight', 2, 700, 1)),
        'n_estimators': 2000,
        'colsample_bytree': hp.uniform('xgb.colsample_bytree', 0.5, 0.9),
        'subsample': hp.uniform('xgb.subsample', 0.5, 1.0),
        'reg_lambda': hp.uniform('xgb.reg_lambda', 0, 100),
        'reg_alpha': hp.uniform('xgb.reg_alpha', 1e-5, 0.5),
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'n_jobs': -1,
    },
}

def ensemble_search(params):
    """
    Hyperopt objective: fit the ensemble on a hold-out split and score it

    Reads the feature frame ``X`` and the labels ``y`` from module scope, so
    both must be defined before this function is called.

    :param params: dict with 'lgb' and 'xgb' keyword-argument dicts for EnsembleModel
    :return: dict with the validation RMSE under "loss" and STATUS_OK under "status"
    """
    # random_state is fixed, so every call is scored on the same 80/20 split
    X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.2, random_state=22)

    model = EnsembleModel(params)

    # The validation part doubles as the early-stopping evaluation set
    model.fit(X_fit, y_fit,
              eval_set=[(X_val, y_val)], eval_metric='rmse',
              early_stopping_rounds=100, verbose=False)

    val_preds = model.predict(X_val)
    # Take the square root explicitly instead of passing squared=False, which
    # newer scikit-learn releases no longer accept.
    rmse = float(np.sqrt(mean_squared_error(y_val, val_preds)))

    return {"loss": rmse, "status": STATUS_OK}

In [11]:
# ensemble_search reads X and y as globals, so they are defined here before
# fmin starts calling it.
X = train.copy()
y = target

# Trials object handed to fmin below.
trials = Trials()

# Run 100 evaluations of ensemble_search over the ensemble_params space using
# the TPE suggestion algorithm.
# NOTE(review): no random state is passed to fmin, so repeated runs presumably
# explore different points — confirm, and seed it if reproducibility matters.
best_hyperparams = fmin(fn=ensemble_search,
                       space=ensemble_params,
                       algo=tpe.suggest,
                       max_evals=100,
                       trials=trials)

100%|██████████████████████████████████████████████| 100/100 [49:21<00:00, 29.62s/trial, best loss: 0.8424887913360436]


In [12]:
# Display the result returned by fmin.
best_hyperparams

{'colsample_bytree': 0.5866279475208961,
 'learning_rate': 0.03031232932494199,
 'max_depth': 17.0,
 'min_child_samples': 455.0,
 'min_split_gain': 0.6860513483791865,
 'num_leaves': 106.0,
 'reg_alpha': 0.039739131441590844,
 'reg_lambda': 36.97850445493069,
 'subsample': 0.28472607257700644,
 'xgb.colsample_bytree': 0.5039031209135206,
 'xgb.gamma': 4.465092951581267,
 'xgb.learning_rate': 0.05292640058060351,
 'xgb.max_depth': 19.0,
 'xgb.min_child_weight': 475.0,
 'xgb.reg_alpha': 0.4070590572766408,
 'xgb.reg_lambda': 60.00604592381073,
 'xgb.subsample': 0.5561407886208866}

In [13]:
since = time.time()
columns = train.columns

# Fixed hyperparameters for the final cross-validated ensemble.
ensemble_params = {
    "lgb" : {
        "num_leaves": 36,
        "max_depth": 21,
        'learning_rate': 0.049019854828962754,
        'min_split_gain': 0.2579555416739361,
        'min_child_samples': 500,
        "subsample": 0.2595537456780356,
        "colsample_bytree": 0.6203517996970486,
        'reg_alpha': 0.33867231210286647,
        'reg_lambda': 42.071411120949854,
        'n_jobs': -1,
        'n_estimators': 5000},
    'xgb': {
        'max_depth': 13,
        'learning_rate': 0.020206705089028228,
        'gamma': 3.5746731812451156,
        'min_child_weight': 564,
        'n_estimators': 5000,
        'colsample_bytree': 0.5015940592112956,
        'subsample': 0.6839489639112909,
        'reg_lambda': 18.085502002853246,
        'reg_alpha': 0.17532087359570606,
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'n_jobs': -1}
}

# Accumulator for the test predictions, averaged over all folds
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=10, random_state=22, shuffle=True)
rmse = []

# enumerate numbers the folds from 1, replacing a hand-maintained counter
for fold, (trn_idx, val_idx) in enumerate(kf.split(train[columns], target), start=1):

    X_tr, X_val = train[columns].iloc[trn_idx], train[columns].iloc[val_idx]
    y_tr, y_val = target.iloc[trn_idx], target.iloc[val_idx]

    # A fresh ensemble per fold; the held-out part is the early-stopping set
    model = EnsembleModel(ensemble_params)

    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)

    # Every fold contributes an equal 1/n_splits share of the test prediction
    preds += model.predict(test[columns]) / kf.n_splits

    # Explicit square root instead of squared=False, which newer scikit-learn
    # releases no longer accept
    fold_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    rmse.append(fold_rmse)

    print(f"Fold {fold}, RMSE: {fold_rmse}")


print("Mean RMSE: ", np.mean(rmse))
end_time = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
        end_time // 60, end_time % 60))

Fold 1, RMSE: 0.8459683968064248
Fold 2, RMSE: 0.8390819506770154
Fold 3, RMSE: 0.837938733559432
Fold 4, RMSE: 0.8453188622072112
Fold 5, RMSE: 0.8426249571150686
Fold 6, RMSE: 0.8459673623179603
Fold 7, RMSE: 0.8426650796838433
Fold 8, RMSE: 0.84368738733979
Fold 9, RMSE: 0.8393808081654686
Fold 10, RMSE: 0.8446753957022116
Mean RMSE:  0.8427308933574424
Training complete in 9m 19s


In [14]:
# submit prediction
# public LB 0.84283
# Put the current contents of preds into the submission frame and save it as
# CSV without the index column.
subm['target'] = preds
subm.to_csv("ensemble_model_2.csv", index=False)

In [15]:
# A second local optimization run yielded another set of params and a slightly
# better local score - best loss: 0.8424602223218137
# The raw result of that run is kept under a name for reference; the values are
# copied (with integer parameters cast to int) into ensemble_params below.
best_hyperparams_second_run = {
    'colsample_bytree': 0.5339822834488268,
    'learning_rate': 0.0771637203923797,
    'max_depth': 11.0,
    'min_child_samples': 220.0,
    'min_split_gain': 0.3863023853133569,
    'num_leaves': 31.0,
    'reg_alpha': 0.09731772459111021,
    'reg_lambda': 32.70688372281866,
    'subsample': 0.28103089889544786,
    'xgb.colsample_bytree': 0.759562316680131,
    'xgb.gamma': 3.5718242850438195,
    'xgb.learning_rate': 0.02500010390975047,
    'xgb.max_depth': 11.0,
    'xgb.min_child_weight': 220.0,
    'xgb.reg_alpha': 0.411312935999471,
    'xgb.reg_lambda': 23.416049187691378,
    'xgb.subsample': 0.570420767510636}

since = time.time()
columns = train.columns

# Fixed hyperparameters for this cross-validated ensemble.
ensemble_params = {
    "lgb" : {
        "num_leaves": 31,
        "max_depth": 11,
        'learning_rate': 0.0771637203923797,
        'min_split_gain': 0.3863023853133569,
        'min_child_samples': 220,
        "subsample": 0.28103089889544786,
        "colsample_bytree": 0.5339822834488268,
        'reg_alpha': 0.09731772459111021,
        'reg_lambda': 32.70688372281866,
        'n_jobs': -1,
        'n_estimators': 5000},
    'xgb': {
        'max_depth': 11,
        'learning_rate': 0.02500010390975047,
        'gamma': 3.5718242850438195,
        'min_child_weight': 220,
        'n_estimators': 5000,
        'colsample_bytree': 0.759562316680131,
        'subsample': 0.570420767510636,
        'reg_lambda': 23.416049187691378,
        'reg_alpha': 0.411312935999471,
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'n_jobs': -1}
}

# Accumulator for the test predictions, averaged over all folds
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=10, random_state=22, shuffle=True)
rmse = []

# enumerate numbers the folds from 1, replacing a hand-maintained counter
for fold, (trn_idx, val_idx) in enumerate(kf.split(train[columns], target), start=1):

    X_tr, X_val = train[columns].iloc[trn_idx], train[columns].iloc[val_idx]
    y_tr, y_val = target.iloc[trn_idx], target.iloc[val_idx]

    # A fresh ensemble per fold; the held-out part is the early-stopping set
    model = EnsembleModel(ensemble_params)

    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)

    # Every fold contributes an equal 1/n_splits share of the test prediction
    preds += model.predict(test[columns]) / kf.n_splits

    # Explicit square root instead of squared=False, which newer scikit-learn
    # releases no longer accept
    fold_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    rmse.append(fold_rmse)

    print(f"Fold {fold}, RMSE: {fold_rmse}")


print("Mean RMSE: ", np.mean(rmse))
end_time = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
        end_time // 60, end_time % 60))

Fold 1, RMSE: 0.846022716892975
Fold 2, RMSE: 0.8389665485942602
Fold 3, RMSE: 0.8379310034116687
Fold 4, RMSE: 0.8458756233064536
Fold 5, RMSE: 0.8425900432951002
Fold 6, RMSE: 0.846136452488498
Fold 7, RMSE: 0.8428657676629502
Fold 8, RMSE: 0.8440328349496541
Fold 9, RMSE: 0.8395218028480587
Fold 10, RMSE: 0.8443435252134142
Mean RMSE:  0.8428286318663032
Training complete in 7m 5s


In [16]:
# submit prediction
# public LB 0.84282
# Put the current contents of preds into the submission frame and save it as
# CSV without the index column.
subm['target'] = preds
subm.to_csv("ensemble_model_3.csv", index=False)

In [17]:
print('We are done. That is all, folks!')
# Total run time is the difference between now and start_time, which an
# earlier cell must have set.
finish_time = dt.datetime.now()
print("Finished at ", finish_time)
elapsed = finish_time - start_time
print("Elapsed time: ", elapsed)

We are done. That is all, folks!
Finished at  2021-02-19 21:55:00.454026
Elapsed time:  1:05:51.725684
