# LGBM: Resampling, HyperOpt of The Parameters

Various re-sampling techniques by https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/

In [32]:
import pandas as pd
import numpy as np

import datetime as dt
from typing import Tuple, List, Dict
from collections import Counter

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.metrics import auc, roc_curve


from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder

import lightgbm as lgbm
from lightgbm import LGBMClassifier

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope

from functools import partial

import warnings
warnings.filterwarnings('ignore')

In [33]:
# main flow
start_time = dt.datetime.now()
print("Started at ", start_time)

Started at  2021-03-13 00:28:57.515327


In [34]:
# read data
in_kaggle = False


def get_data_file_path(is_in_kaggle: bool) -> Tuple[str, str, str]:
    train_path = ''
    test_path = ''
    sample_submission_path = ''

    if is_in_kaggle:
        # running in Kaggle, inside the competition
        train_path = '../input/tabular-playground-series-mar-2021/train.csv'
        test_path = '../input/tabular-playground-series-mar-2021/test.csv'
        sample_submission_path = '../input/tabular-playground-series-mar-2021/sample_submission.csv'
    else:
        # running locally
        train_path = 'data/train.csv'
        test_path = 'data/test.csv'
        sample_submission_path = 'data/sample_submission.csv'

    return train_path, test_path, sample_submission_path


In [35]:
%%time
# get the training set and labels
train_set_path, test_set_path, sample_subm_path = get_data_file_path(in_kaggle)

train = pd.read_csv(train_set_path)
test = pd.read_csv(test_set_path)
target = train.target

test_id = test.id

subm = pd.read_csv(sample_subm_path)

Wall time: 1.82 s


In [36]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 32 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      300000 non-null  int64  
 1   cat0    300000 non-null  object 
 2   cat1    300000 non-null  object 
 3   cat2    300000 non-null  object 
 4   cat3    300000 non-null  object 
 5   cat4    300000 non-null  object 
 6   cat5    300000 non-null  object 
 7   cat6    300000 non-null  object 
 8   cat7    300000 non-null  object 
 9   cat8    300000 non-null  object 
 10  cat9    300000 non-null  object 
 11  cat10   300000 non-null  object 
 12  cat11   300000 non-null  object 
 13  cat12   300000 non-null  object 
 14  cat13   300000 non-null  object 
 15  cat14   300000 non-null  object 
 16  cat15   300000 non-null  object 
 17  cat16   300000 non-null  object 
 18  cat17   300000 non-null  object 
 19  cat18   300000 non-null  object 
 20  cont0   300000 non-null  float64
 21  cont1   30

In [37]:
cols_to_drop = ['id', 'cat5', 'cat7', 'cat8', 'cat10']

cat_cols = [feature for feature in train.columns if 'cat' in feature and feature not in cols_to_drop ]
cont_cols = [feature for feature in train.columns if 'cont' in feature and feature not in cols_to_drop]

all_features = cat_cols + cont_cols

def preprocess(df, encoder=None,
               scaler=None, cols_to_drop=None,
               cols_to_encode=None, cols_to_scale=None):
    """
    Preprocess input data
    :param df: DataFrame with data
    :param encoder: encoder object with fit_transform method
    :param scaler: scaler object with fit_transform method
    :param cols_to_drop: columns to be removed
    :param cols_to_encode: columns to be encoded
    :param cols_to_scale: columns to be scaled
    :return: DataFrame
    """

    if encoder:
        for col in cols_to_encode:
            df[col] = encoder.fit_transform(df[col])

    if scaler:
        for col in cols_to_scale:
            df[col] = scaler.fit_transform(df[col].values.reshape(-1, 1))

    if cols_to_drop:
        df = df.drop(cols_to_drop, axis=1)

    return df

In [38]:
train = preprocess(train, encoder=LabelEncoder(), scaler=StandardScaler(),
                  cols_to_drop=cols_to_drop, cols_to_encode=cat_cols,
                  cols_to_scale=cont_cols)

# encoder=LabelEncoder()
test = preprocess(test, encoder=LabelEncoder(), scaler=StandardScaler(),
                 cols_to_drop=cols_to_drop, cols_to_encode=cat_cols,
                 cols_to_scale=cont_cols)


In [39]:
# apply re-sampling to training


over = SMOTE(sampling_strategy=0.6)
under = RandomUnderSampler(sampling_strategy=0.99)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

X_train = train[all_features]
y = train['target']

# transform the dataset
X_train, y = pipeline.fit_resample(X_train, y)

# summarize class distribution
counter = Counter(y)
print(counter)

Counter({0: 133659, 1: 132323})


In [40]:
# re-create the training set
X_train['target'] = y
train = X_train.copy()
target = train.target

In [41]:
# ------------------------------------------------------------------------------
# Parameters
# ------------------------------------------------------------------------------
N_FOLDS = 10
N_ESTIMATORS = 30000
SEED = 2021
BAGGING_SEED = 48

# ------------------------------------------------------------------------------
# LightGBM: example params
# ------------------------------------------------------------------------------
lgbm_params_et = {'random_state': SEED,
          'metric': 'auc',
          'n_estimators': N_ESTIMATORS,
          'n_jobs': -1,
          'cat_feature': [x for x in range(len(cat_cols))],
          'bagging_seed': SEED,
          'feature_fraction_seed': SEED,
          'learning_rate': 0.003899156646724397,
          'max_depth': 99,
          'num_leaves': 63,
          'reg_alpha': 9.562925363678952,
          'reg_lambda': 9.355810045480153,
          'colsample_bytree': 0.2256038826485174,
          'min_child_samples': 290,
          'subsample_freq': 1,
          'subsample': 0.8805303688019942,
          'max_bin': 882,
          'min_data_per_group': 127,
          'cat_smooth': 96,
          'cat_l2': 19
          }

In [42]:
lgb_search_params = {
        "num_leaves": scope.int(hp.quniform("num_leaves", 31, 250, 1)),
        "max_depth": scope.int(hp.quniform("max_depth", 10, 24, 1)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'subsample_for_bin': scope.int(hp.quniform('subsample_for_bin', 10000, 300000, 10000)),
        'min_data_in_leaf': scope.int(hp.qloguniform('min_data_in_leaf', 0, 6, 1)),
    
    
    
        'min_split_gain': hp.uniform('min_split_gain', 0, 1.0),
        'min_child_samples': scope.int(hp.quniform("min_child_samples", 2, 700, 1)),
        "subsample": hp.uniform("subsample", 0.2, 1.0), #alias: bagging_fraction
        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0), # alias: feature_fraction
        'reg_alpha': hp.uniform('reg_alpha', 1e-5, 1.0),
        'reg_lambda': hp.uniform('reg_lambda', 0, 50),
    
        'min_sum_hessian_in_leaf': None, #overrides min_child_weight
        'min_child_weight': hp.loguniform('min_child_weight', -16, 5), #also aliases to min_sum_hessian
        'min_child_samples': None, #overridden by min_data_in_leaf
        'n_estimators': 2000
}

#         'boosting_type': hp.choice('boosting_type', [{'boosting_type': 'gbdt'}, 
#                                                 {'boosting_type': 'dart'},
#                                                 {'boosting_type': 'goss'}]),
#
#    'boosting_type_choice': hp.choice('boosting_type', 
#                                      [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gbdt_subsample', 0.5, 1)}, 
#                                       {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
#                                       {'boosting_type': 'goss', 'subsample': 1.0}]),
#        'n_jobs': -1,

def lgbm_search(params):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

    model = LGBMClassifier(boosting_type='gbdt', **params) # boosting_type='gbdt',

    evaluation = [(X_test, y_test)]

    model.fit(X_train, y_train,
              eval_set=evaluation, eval_metric='auc', categorical_feature=cat_cols,
              early_stopping_rounds=100, verbose=False)

    val_preds = model.predict(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, val_preds, pos_label = 1)
    auc_score = auc(fpr, tpr)

    return {"loss": auc_score, "status": STATUS_OK}

In [43]:
# search for model

X = train[all_features]
y = target


trials = Trials()

best_hyperparams = fmin(fn=lgbm_search,
                       space=lgb_search_params,
                       algo=tpe.suggest,
                       max_evals=100,
                       trials=trials)

Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).           
Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).           
100%|██████████████████████████████████████████████| 100/100 [15:54<00:00,  9.55s/trial, best loss: 0.8285258607956092]


In [44]:
best_hyperparams

{'colsample_bytree': 0.8018576944727711,
 'learning_rate': 0.17077078169809012,
 'max_depth': 12.0,
 'min_child_weight': 1.6579901067817857,
 'min_data_in_leaf': 13.0,
 'min_split_gain': 0.8597953783167228,
 'num_leaves': 35.0,
 'reg_alpha': 0.6315483882596105,
 'reg_lambda': 49.829447554660874,
 'subsample': 0.6251371577974962,
 'subsample_for_bin': 80000.0}

In [46]:
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'random_state': SEED,
    'n_estimators': N_ESTIMATORS,
    'n_jobs': -1,
    'cat_feature': [x for x in range(len(cat_cols))],
    'bagging_seed': SEED,
    'feature_fraction_seed': SEED,
    
    
    'colsample_bytree': 0.8018576944727711,
     'learning_rate': 0.011384719,
     'max_depth': 12,
     'min_child_weight': 1.6579901067817857,
     'min_data_in_leaf': 13,
     'min_split_gain': 0.8597953783167228,
     'num_leaves': 35,
     'reg_alpha': 0.6315483882596105,
     'reg_lambda': 49.829447554660874,
     'subsample': 0.6251371577974962,
     'subsample_for_bin': 80000
}


In [47]:
#X = X.abs()
y = train["target"]
kf = KFold(n_splits=8, shuffle=True, random_state=1)
oof_2 = np.zeros(len(train))
score_list = []
fold = 1
test_preds_2 = []

# view accuracy
from sklearn.metrics import accuracy_score


for train_index, test_index in kf.split(train):
    X_train, X_val = train.iloc[train_index], train.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    
    dtrain = X_train[all_features]
    dvalid = X_val[all_features]
    
    model = LGBMClassifier(**lgbm_params) 

    evaluation = [(dvalid, y_val)]

    model.fit(dtrain, y_train,
              eval_set=evaluation, eval_metric='auc', categorical_feature=cat_cols,
              early_stopping_rounds=100, verbose=False)

    val_preds = model.predict(dvalid)
    
    fpr, tpr, thresholds = roc_curve(y_val, val_preds, pos_label = 1)
    auc_score = auc(fpr, tpr)


    print(f"AUC Score Fold-{fold} : {auc_score}")

    oof_2[test_index] = val_preds

    
    score_list.append(auc_score)
    test_preds_2.append(model.predict_proba(test[all_features])[:, 1])
    fold+=1
    

np.mean(score_list)

AUC Score Fold-1 : 0.8312917109957951
AUC Score Fold-2 : 0.8318411094267925
AUC Score Fold-3 : 0.8344440926191992
AUC Score Fold-4 : 0.831698298849326
AUC Score Fold-5 : 0.8292316835710815
AUC Score Fold-6 : 0.8279506300189392
AUC Score Fold-7 : 0.8307629955369671
AUC Score Fold-8 : 0.8301751423706627


0.8309244579235954

In [48]:
print(np.mean(score_list))
score_list

0.8309244579235954


[0.8312917109957951,
 0.8318411094267925,
 0.8344440926191992,
 0.831698298849326,
 0.8292316835710815,
 0.8279506300189392,
 0.8307629955369671,
 0.8301751423706627]

In [49]:
train["2_preds"] = oof_2
test["2_preds"] = np.mean(test_preds_2,axis=0)

fpr, tpr, thresholds = roc_curve(train["target"], train["2_preds"], pos_label = 1)
auc_score = auc(fpr, tpr)
    
print(f"AUC Score Final : {auc_score}")

AUC Score Final : 0.8309213813977572


In [50]:
# save prediction
subm_df = test[["2_preds"]]
subm_df['id'] = test_id
subm_df = subm_df.rename(columns={"2_preds": "target"})
subm_df.to_csv("undersample_099_hyperopt_single_lgb_model.csv",index=False)

In [51]:
print('We are done. That is all, folks!')
finish_time = dt.datetime.now()
print("Finished at ", finish_time)
elapsed = finish_time - start_time
print("Elapsed time: ", elapsed)

We are done. That is all, folks!
Finished at  2021-03-13 01:06:37.253727
Elapsed time:  0:37:39.738400


# Historical Record Tracking

## Baseline, with all features

- Final validation AUC locally - 0.7776592687653098
- public LB 0.882171

```
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'random_state': SEED,
    'n_estimators': N_ESTIMATORS,
    'n_jobs': -1,
    'cat_feature': [x for x in range(len(cat_cols))],
    'bagging_seed': SEED,
    'feature_fraction_seed': SEED,
    
    
    'colsample_bytree': 0.8785258997000931,
    'learning_rate': 0.014051289,
    'max_depth': 11,
    'min_child_weight': 5.5803474120805336e-06,
    'min_data_in_leaf': 85,
    'min_split_gain': 0.23035992374930991,
    'num_leaves': 248,
    'reg_alpha': 0.5030438494317617,
    'reg_lambda': 27.412351627541167,
    'subsample': 0.5725588226322971,
    'subsample_for_bin': 160000
}
```

## Single lgbm without non-important cat features

- AUC Score Final (locally) : 0.777074298578011
- public lb 0.88821

cat cols dropped: ['cat5', 'cat7', 'cat8', 'cat10']

(jump from top 66% to top 47% in public lb as of Mar 6, 2021)

```
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'random_state': SEED,
    'n_estimators': N_ESTIMATORS,
    'n_jobs': -1,
    'cat_feature': [x for x in range(len(cat_cols))],
    'bagging_seed': SEED,
    'feature_fraction_seed': SEED,
    
    
    'colsample_bytree': 0.7671990402625448,
     'learning_rate': 0.019767284,
     'max_depth': 15,
     'min_child_weight': 0.0002777396522195953,
     'min_data_in_leaf': 57,
     'min_split_gain': 0.21136452616481655,
     'num_leaves': 175,
     'reg_alpha': 0.11063887315135824,
     'reg_lambda': 18.001114258982668,
     'subsample': 0.9818248529067929,
     'subsample_for_bin': 180000
}


```

## undersampling 0.99


- AUC Score Final (locally) : 0.8085022125313804
- public lb 0.88672

cat cols dropped: ['cat5', 'cat7', 'cat8', 'cat10']

```
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'random_state': SEED,
    'n_estimators': N_ESTIMATORS,
    'n_jobs': -1,
    'cat_feature': [x for x in range(len(cat_cols))],
    'bagging_seed': SEED,
    'feature_fraction_seed': SEED,
    
    
    'colsample_bytree': 0.9460530130541709,
    'learning_rate': 0.019573449,
    'max_depth': 24,
    'min_child_weight': 0.4998754884181588,
    'min_data_in_leaf': 106,
    'min_split_gain': 0.9813419298112451,
    'num_leaves': 158,
    'reg_alpha': 0.795664780159067,
    'reg_lambda': 29.908892776777908,
    'subsample': 0.5916153378040604,
    'subsample_for_bin': 200000
}

```

## undersampling 0.99, smaller n_rounds

- local cv score: 0.8080759644091626
- public lb 0.88679

## over+undersampling pipeline
