# Hyperparameter tuning with XGBoost, Ray Tune, Hyperopt and Optuna

## Introduction


In this post we are going to demonstrate how we can speed up hyperparameter tuning with:

1) Bayesian optimization tuning algos like HyperOpt and Optuna, running on…

2) the [Ray](https://ray.io/) distributed ML framework, with a [unified API to many hyperparameter search algos](https://medium.com/riselab/cutting-edge-hyperparameter-tuning-with-ray-tune-be6c0447afdf) and…

3) a distributed cluster of cloud instances for even more speedup.



In [1]:
from itertools import product
from datetime import datetime, timedelta
import os
import random
import string

import numpy as np
import pandas as pd

import sklearn
from sklearn.linear_model import LinearRegression, ElasticNet, ElasticNetCV, Ridge, RidgeCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline

#!conda install -y -c conda-forge  xgboost 
import xgboost
from xgboost import XGBRegressor
from xgboost import plot_importance

import lightgbm
from lightgbm import LGBMRegressor

import ray
from ray import tune
from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest.bayesopt import BayesOptSearch
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.suggest.optuna import OptunaSearch
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.integration.wandb import WandbLogger

# import wandb
# os.environ['WANDB_NOTEBOOK_NAME']='hyperparameter_optimization.ipynb'

# Stamp the run time and the library versions that produced the outputs below.
print(datetime.now())

for label, module in (
    ("numpy", np),
    ("pandas", pd),
    ("sklearn", sklearn),
    ("xgboost", xgboost),
    ("lightgbm", lightgbm),
    ("ray", ray),
):
    # left-justify the package name in a 20-character column
    print("%-20s %s" % (label, module.__version__))


2020-10-24 20:17:46.299533
numpy                1.19.1
pandas               1.1.3
sklearn              0.23.2
xgboost              1.2.0
lightgbm             2.3.0
ray                  1.1.0.dev0


In [2]:
# Single seed shared by the notebook; it also seeds NumPy's global RNG here.
RANDOMSTATE = 42
np.random.seed(seed=RANDOMSTATE)


In [3]:
# import train data
# NOTE: read_pickle unpickles the file, which can execute arbitrary code;
# only load a df_train.pickle that comes from a trusted source.
df = pd.read_pickle('df_train.pickle')

# Name of the target column in df.
response = 'SalePrice'
# Columns of df used as model inputs by every cross-validation below.
# NOTE(review): the names suggest engineered features (log_/sq_ transforms,
# "A x B" interactions, one-hot style suffixes) built before this notebook —
# confirm against whatever produced df_train.pickle.
predictors = ['YearBuilt',
              'BsmtFullBath',
              'FullBath',
              'KitchenAbvGr',
              'GarageYrBlt',
              'LotFrontage',
              'MasVnrArea',
              '1stFlrSF',
              'GrLivArea',
              'GarageArea',
              'WoodDeckSF',
              'PorchSF',
              'AvgBltRemod',
              'FireBathRatio',
              'TotalSF x OverallQual x OverallCond',
              'AvgBltRemod x Functional x TotalFinSF',
              'Functional x OverallQual',
              'KitchenAbvGr x KitchenQual',
              'GarageCars x GarageYrBlt',
              'GarageQual x GarageCond x GarageCars',
              'HeatingQC x Heating',
              'monthnum',
              'log_YearBuilt',
              'log_LotArea',
              'log_TotalFinSF',
              'log_GarageRatio',
              'log_TotalSF x OverallQual x OverallCond',
              'log_TotalSF x OverallCond',
              'log_AvgBltRemod x TotalFinSF',
              'sq_2ndFlrSF',
              'sq_BsmtFinSF',
              'sq_BsmtFinSF x BsmtQual',
              'sq_BsmtFinSF x BsmtBath',
              'BldgType_4',
              'BsmtExposure_1',
              'BsmtExposure_4',
              'BsmtFinType1_1',
              'BsmtFinType1_2',
              'BsmtFinType1_4',
              'BsmtFinType1_5',
              'BsmtFinType1_6',
              'CentralAir_0',
              'CentralAir_1',
              'Condition1_1',
              'Condition1_3',
              'ExterCond_2',
              'ExterQual_2',
              'Exterior1st_4',
              'Exterior1st_5',
              'Exterior1st_10',
              'Fence_0',
              'Fence_2',
              'Foundation_1',
              'Foundation_5',
              'GarageCars_1',
              'GarageFinish_2',
              'GarageFinish_3',
              'GarageType_2',
              'HouseStyle_2',
              'KitchenQual_4',
              'LotConfig_0',
              'LotConfig_4',
              'MSSubClass_30',
              'MSSubClass_70',
              'MSZoning_0',
              'MSZoning_1',
              'MSZoning_4',
              'MasVnrType_2',
              'MasVnrType_3',
              'MoSold_1',
              'MoSold_5',
              'MoSold_6',
              'MoSold_11',
              'Neighborhood_3',
              'Neighborhood_4',
              'Neighborhood_5',
              'Neighborhood_10',
              'Neighborhood_11',
              'Neighborhood_16',
              'Neighborhood_17',
              'Neighborhood_19',
              'Neighborhood_22',
              'Neighborhood_24',
              'OverallCond_7',
              'OverallQual_5',
              'OverallQual_6',
              'OverallQual_7',
              'OverallQual_9',
              'PavedDrive_0',
              'PavedDrive_2',
              'SaleCondition_1',
              'SaleCondition_2',
              'SaleCondition_5',
              'SaleType_4',
              'BedroomAbvGr_1',
              'BedroomAbvGr_4',
              'BedroomAbvGr_5',
              'HalfBath_1',
              'TotalBath_1.0',
              'TotalBath_2.5']

# Hold out 25% of the rows. Passing random_state pins the partition to
# RANDOMSTATE instead of whatever state NumPy's global RNG is in when the cell
# runs, so re-running this cell on its own reproduces the same split.
# NOTE(review): the feature side of the split is the whole frame, so
# X_train / X_test still contain the response column; select the predictor
# columns (e.g. X_train[predictors]) before fitting on them.
X_train, X_test, y_train, y_test = train_test_split(
    df, df[response], test_size=.25, random_state=RANDOMSTATE
)

# Preview the model inputs and the target.
display(df[predictors].head())
display(df[[response]].head())


Unnamed: 0_level_0,YearBuilt,BsmtFullBath,FullBath,KitchenAbvGr,GarageYrBlt,LotFrontage,MasVnrArea,1stFlrSF,GrLivArea,GarageArea,...,SaleCondition_1,SaleCondition_2,SaleCondition_5,SaleType_4,BedroomAbvGr_1,BedroomAbvGr_4,BedroomAbvGr_5,HalfBath_1,TotalBath_1.0,TotalBath_2.5
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7,1,2,1,7,65.0,196.0,856,1710,548.0,...,0,0,0,1,0,0,0,1,0,0
2,34,0,2,1,34,80.0,0.0,1262,1262,460.0,...,0,0,0,1,0,0,0,0,0,1
3,9,1,2,1,9,68.0,162.0,920,1786,608.0,...,0,0,0,1,0,0,0,1,0,0
4,95,1,1,1,12,60.0,0.0,961,1717,642.0,...,1,0,0,1,0,0,0,0,0,0
5,10,1,2,1,10,84.0,350.0,1145,2198,836.0,...,0,0,0,1,0,1,0,1,0,0


Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1,12.247699
2,12.109016
3,12.317171
4,11.849405
5,12.42922


In [4]:
# The response is log(1 + SalePrice); errors are mapped back to the original
# price scale with expm1 so they can be read against the untransformed target.

MEAN_RESPONSE = df[response].mean()


def cv_to_raw(cv_val, mean_response=MEAN_RESPONSE):
    """Translate a log1p-scale RMSE into an error on the underlying SalePrice.

    The conversion measures how far the back-transformed price moves when the
    log-scale value is shifted by ``cv_val`` away from ``mean_response``.

    Args:
        cv_val: error on the log1p scale (e.g. one fold's RMSE).
        mean_response: log1p-scale anchor point. Defaults to the mean response
            over the whole frame, which matches each fold's mean only in
            expectation; a fold's actual mean response can be passed instead.

    Returns:
        ``expm1(mean_response + cv_val) - expm1(mean_response)``.
    """
    baseline = np.expm1(mean_response)
    shifted = np.expm1(mean_response + cv_val)
    return shifted - baseline

In [5]:
# One shared, seeded splitter so every cross-validation below uses the same 10 folds.
kfolds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=RANDOMSTATE,
)


# Ray Cluster

- Cluster config is in `ray1.1.yaml`
- Edit `ray1.1.yaml` file with your region, availability zone, subnet, imageid information
    - to get those variables launch the latest Deep Learning AMI (Ubuntu 18.04) Version 35.0 into a small instance in your favorite region/zone
    - test that it works
    - note those 4 variables: region, availability zone, subnet, AMI imageid
    - terminate the instance and edit `ray1.1.yaml` accordingly
    - in future you can create your own image with everything pre-installed and specify its AMI imageid, instead of using the generic image and installing everything at launch.
- To run the cluster: 
`ray up ray1.1.yaml`
    - Creates head instance using image specified.
    - Installs ray and related requirements
    - Clones this Iowa repo
    - Launches worker nodes per auto-scaling parameters (currently we fix the number of nodes because we're not benching the time the cluster will take to auto-scale)
- After cluster starts you can check AWS console and note that several instances launched.
- Check `ray monitor ray1.1.yaml` for any error messages
- Run Jupyter on the cluster with port forwarding
 `ray exec ray1.1.yaml --port-forward=8899 'jupyter notebook --port=8899'`
- Open the notebook on the generated URL e.g. http://localhost:8899/?token=5f46d4355ae7174524ba71f30ef3f0633a20b19a204b93b4
- Make sure to choose the default kernel so that the notebook runs in the conda environment with all installs
- Make sure to use the ray.init() command given in the startup messages.
- You can also run a terminal on the head node of the cluster with
 `ray attach ray1.1.yaml`
- You can also ssh explicitly with the IP address and the generated private key
 `ssh -o IdentitiesOnly=yes -i ~/.ssh/ray-autoscaler_1_us-east-1.pem ubuntu@54.161.200.54`
- run port forwarding to the Ray dashboard with   
`ray dashboard ray1.1.yaml`
and then open
 http://localhost:8265/

see https://docs.ray.io/en/latest/cluster/launcher.html for additional info

In [6]:
# make sure local ray service is shutdown
# (so the ray.init() call in the next cell does not run while this kernel is
#  still connected to a previously initialised Ray runtime)
ray.shutdown()


In [7]:
# launch cluster in terminal with ray up ray1.1.yaml
# initialize ray on cluster
# The address and Redis password default to the values used for this run, but
# can be overridden with the RAY_ADDRESS / RAY_REDIS_PASSWORD environment
# variables so a different cluster's credentials need not be edited into
# (and committed with) the notebook.
ray.init(
    address=os.environ.get('RAY_ADDRESS', 'localhost:6379'),
    _redis_password=os.environ.get('RAY_REDIS_PASSWORD', '5241590000000000'),
)


2020-10-24 20:17:47,266	INFO worker.py:674 -- Connecting to existing Ray cluster at address: 172.30.5.248:6379


{'node_ip_address': '172.30.5.248',
 'raylet_ip_address': '172.30.5.248',
 'redis_address': '172.30.5.248:6379',
 'object_store_address': '/tmp/ray/session_2020-10-24_20-15-23_287874_28682/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-10-24_20-15-23_287874_28682/sockets/raylet',
 'webui_url': 'localhost:8265',
 'session_dir': '/tmp/ray/session_2020-10-24_20-15-23_287874_28682',
 'metrics_export_port': 62581,
 'node_id': '8a79d1850f23c75facc3790435656ef6f3d67e2b'}

In [8]:
# refactor to give ray.tune a single function of hyperparameters to optimize

# @wandb_mixin
def my_xgb(config):
    """Ray Tune trainable: cross-validate one XGBoost configuration.

    Args:
        config: dict of sampled hyperparameters. ``n_estimators``,
            ``max_depth`` and ``learning_rate`` are converted from their
            search-space encoding and written back into ``config`` in place.
            The converted values are what show up in the trial results
            (``config.*`` columns), and the evaluation cells pass those columns
            straight to ``XGBRegressor`` — keep the in-place rewrite.

    Returns:
        ``{"rmse": <mean RMSE over the kfolds splits>}``; the same value is
        also sent to Tune through ``tune.report``.
    """
    # the searchers hand over floats, XGBoost wants an integer tree count
    config['n_estimators'] = int(config['n_estimators'])
    # depth is sampled from a range starting at 0; shift it so the minimum is 2
    config['max_depth'] = 2 + int(config['max_depth'])
    # learning rate is sampled as a base-10 exponent
    config['learning_rate'] = 10 ** config['learning_rate']

    fixed_params = dict(
        objective='reg:squarederror',
        n_jobs=1,
        random_state=RANDOMSTATE,
        booster='gbtree',
        scale_pos_weight=1,
    )
    model = XGBRegressor(**fixed_params, **config)

    # the scorer returns negated RMSE per fold; flip the sign back
    neg_fold_scores = cross_val_score(
        model,
        df[predictors],
        df[response],
        scoring="neg_root_mean_squared_error",
        cv=kfolds,
    )
    rmse = np.mean(-neg_fold_scores)

    tune.report(rmse=rmse)
    return {"rmse": rmse}

In [9]:
# Search space handed to tune.run(). Three entries are sampled in an encoded
# form and converted inside my_xgb: n_estimators (float -> int),
# max_depth (shifted by +2) and learning_rate (base-10 exponent).
xgb_tune_kwargs = {
    "n_estimators": tune.loguniform(100, 10000),
    "max_depth": tune.randint(0, 5),
    # max_leaves doesn't seem to have any impact on XGBoost but num_leaves does help LGBM, oddly.
    # 'max_leaves': tune.loguniform(1, 1000),    
    "subsample": tune.quniform(0.25, 0.75, 0.01),
    "colsample_bytree": tune.quniform(0.05, 0.5, 0.01),
    "colsample_bylevel": tune.quniform(0.05, 0.5, 0.01),    
    "learning_rate": tune.quniform(-3.0, -1.0, 0.5),
#     "wandb": {
#         "project": "iowa_xgb",
#         "api_key_file": "~/secrets/wandb.txt",
#    }    
}

# Names of the tuned hyperparameters; the optional 'wandb' logging entry is
# not a model parameter, so it is filtered out.
# (fixed: the comparison operator was garbled as `!kkk=`, a syntax error)
xgb_tune_params = [k for k in xgb_tune_kwargs.keys() if k != 'wandb']
xgb_tune_params

['n_estimators',
 'max_depth',
 'subsample',
 'colsample_bytree',
 'colsample_bylevel',
 'learning_rate']

In [10]:
# Tune XGBoost with HyperOpt suggestions and ASHA-style early stopping.
NUM_SAMPLES=2048

print("XGBoost HyperOpt")

start_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))

algo = HyperOptSearch(random_state_seed=RANDOMSTATE)
# to limit number of cores, uncomment and set max_concurrent 
# algo = ConcurrencyLimiter(algo, max_concurrent=10)
scheduler = AsyncHyperBandScheduler()

analysis = tune.run(my_xgb,
                    num_samples=NUM_SAMPLES,
                    config=xgb_tune_kwargs,                    
                    name="hyperopt_xgb",
                    metric="rmse",
                    mode="min",
                    search_alg=algo,
                    scheduler=scheduler,
                    verbose=1,
                   )

end_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))
print("%-20s %s" % ("End Time", end_time))
# timedelta.seconds holds only the sub-day remainder, so a run longer than
# 24h would be under-reported; total_seconds() covers the whole duration.
# int() drops the fractional part so the output stays H:MM:SS.
elapsed = end_time - start_time
print(str(timedelta(seconds=int(elapsed.total_seconds()))))

Trial name,status,loc,colsample_bylevel,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,iter,total time (s),rmse
my_xgb_fe548e2c,TERMINATED,,0.36,0.16,-3.0,4,4626.62,0.48,1,65.5733,0.177395
my_xgb_fe60013a,TERMINATED,,0.07,0.47,-2.5,3,2853.98,0.46,2,45.0943,0.111589
my_xgb_fe66937e,TERMINATED,,0.08,0.44,-1.0,1,5842.93,0.67,1,81.7294,0.11164
my_xgb_fe6d7568,TERMINATED,,0.09,0.11,-3.0,2,6877.73,0.47,1,81.8715,0.158541
my_xgb_fe730442,TERMINATED,,0.09,0.29,-1.5,3,220.67,0.66,2,7.15287,0.123829
my_xgb_fe78808e,TERMINATED,,0.37,0.46,-1.5,1,751.724,0.65,2,16.8511,0.10924
my_xgb_fe807d2a,TERMINATED,,0.49,0.3,-1.5,2,7563.79,0.62,1,197.148,0.108419
my_xgb_fe8645e8,TERMINATED,,0.22,0.35,-2.5,0,3554.09,0.56,1,49.5505,0.113938
my_xgb_fe8c00aa,TERMINATED,,0.07,0.2,-1.5,4,6918.6,0.52,2,88.5408,0.107773
my_xgb_fe949b02,TERMINATED,,0.15,0.22,-2.0,1,2519.09,0.73,2,30.9389,0.10597


2020-10-24 17:02:24,050	INFO tune.py:439 -- Total run time: 5451.91 seconds (5442.97 seconds for the tuning loop).


Start Time           2020-10-24 15:31:32.142080
End Time             2020-10-24 17:02:30.554736
1:30:58


In [11]:
# One row per trial: score, timing and the tuned hyperparameters, best RMSE first.
param_cols = ['config.' + name for name in xgb_tune_params]
report_cols = ['rmse', 'date', 'time_this_iter_s'] + param_cols
analysis_results_df = analysis.results_df[report_cols].sort_values(by='rmse')
analysis_results_df


Unnamed: 0_level_0,rmse,date,time_this_iter_s,config.n_estimators,config.max_depth,config.subsample,config.colsample_bytree,config.colsample_bylevel,config.learning_rate
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
8845fdbe,0.102816,2020-10-24_16-13-47,100.439363,7978,3,0.32,0.20,0.10,0.010000
bfcbbe22,0.102918,2020-10-24_16-51-01,59.348856,8662,3,0.32,0.20,0.13,0.010000
83cc89ac,0.103059,2020-10-24_16-42-19,78.354994,7928,3,0.34,0.20,0.12,0.010000
28ae42b8,0.103063,2020-10-24_16-42-12,224.790460,7004,3,0.34,0.20,0.10,0.010000
84c9092c,0.103065,2020-10-24_16-27-09,17.007228,7842,3,0.34,0.20,0.10,0.010000
...,...,...,...,...,...,...,...,...,...
0021e3f8,7.363346,2020-10-24_15-31-38,1.764739,142,3,0.61,0.14,0.37,0.003162
f8a27f62,7.650186,2020-10-24_16-00-04,0.938665,130,3,0.35,0.23,0.09,0.003162
022c19ca,7.720515,2020-10-24_15-31-42,1.737596,127,6,0.56,0.07,0.23,0.003162
afa3d650,8.357422,2020-10-24_15-43-43,0.853009,102,5,0.29,0.21,0.41,0.003162


In [12]:
# Re-score the best trial: the first row of the sorted results holds the
# winning hyperparameters, already in the form XGBRegressor accepts.
best_row = analysis_results_df.iloc[0]
best_config = {name: best_row['config.' + name] for name in xgb_tune_params}

xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=RANDOMSTATE,
    verbosity=1,
    n_jobs=-1,
    **best_config
)
print(xgb)

# the scorer returns negated RMSE per fold; flip the sign back
neg_fold_scores = cross_val_score(
    xgb,
    df[predictors],
    df[response],
    scoring="neg_root_mean_squared_error",
    cv=kfolds,
)
scores = -neg_fold_scores

# same per-fold errors expressed on the untransformed SalePrice scale
raw_scores = list(map(cv_to_raw, scores))
print()
print("Log1p CV RMSE %.06f (STD %.04f)" % (np.mean(scores), np.std(scores)))
print("Raw CV RMSE %.0f (STD %.0f)" % (np.mean(raw_scores), np.std(raw_scores)))


XGBRegressor(base_score=None, booster=None, colsample_bylevel=0.1,
             colsample_bynode=None, colsample_bytree=0.2, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=0.01, max_delta_step=None, max_depth=3,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=7978, n_jobs=-1, num_parallel_tree=None,
             random_state=42, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=0.32, tree_method=None,
             validate_parameters=None, verbosity=1)

Log1p CV RMSE 0.102816 (STD 0.0128)
Raw CV RMSE 18030 (STD 2356)


In [13]:
# Tune XGBoost with Optuna suggestions and ASHA-style early stopping.
NUM_SAMPLES=2048

print("XGBoost Optuna")

start_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))

# NOTE(review): unlike the HyperOpt cell, no seed is passed to the searcher
# here — confirm whether run-to-run reproducibility is wanted for Optuna.
algo = OptunaSearch()
# to limit number of cores, uncomment and set max_concurrent 
# algo = ConcurrencyLimiter(algo, max_concurrent=10)
scheduler = AsyncHyperBandScheduler()

analysis = tune.run(my_xgb,
                    num_samples=NUM_SAMPLES,
                    config=xgb_tune_kwargs,                    
                    # was "hyperopt_xgb" (copy-paste): give the Optuna run its
                    # own experiment name so its results are not filed under
                    # the HyperOpt experiment
                    name="optuna_xgb",
                    metric="rmse",
                    mode="min",
                    search_alg=algo,
                    scheduler=scheduler,
                    verbose=1,
                   )

end_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))
print("%-20s %s" % ("End Time", end_time))
# timedelta.seconds holds only the sub-day remainder, so a run longer than
# 24h would be under-reported; total_seconds() covers the whole duration.
# int() drops the fractional part so the output stays H:MM:SS.
elapsed = end_time - start_time
print(str(timedelta(seconds=int(elapsed.total_seconds()))))

Trial name,status,loc,colsample_bylevel,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,iter,total time (s),rmse
my_xgb_dbc66d96,TERMINATED,,0.08,0.05,-2.0,0,655.627,0.75,1,4.62562,0.188589
my_xgb_dbcf06cc,TERMINATED,,0.26,0.33,-1.0,5,5307.46,0.6,1,72.4094,0.113811
my_xgb_dbd597c6,TERMINATED,,0.3,0.13,-2.0,3,269.863,0.73,2,9.5849,0.80787
my_xgb_dbddc518,TERMINATED,,0.14,0.13,-2.0,0,675.783,0.36,2,8.10665,0.185686
my_xgb_dbe2a39e,TERMINATED,,0.44,0.18,-2.0,4,132.564,0.42,1,1.16588,3.08635
my_xgb_dbe82cc4,TERMINATED,,0.1,0.47,-1.5,4,2820.8,0.34,2,37.4432,0.106072
my_xgb_dbf0d5fe,TERMINATED,,0.31,0.18,-1.5,5,1389.86,0.42,2,28.5131,0.108951
my_xgb_dbf75e10,TERMINATED,,0.22,0.36,-3.0,3,993.383,0.69,1,10.6419,4.28678
my_xgb_dbfd0b30,TERMINATED,,0.33,0.24,-3.0,2,6220.67,0.54,1,107.459,0.120182
my_xgb_dc048798,TERMINATED,,0.27,0.4,-1.5,3,257.896,0.35,2,18.5171,0.113439


2020-10-24 18:33:28,303	INFO tune.py:439 -- Total run time: 5390.66 seconds (5383.36 seconds for the tuning loop).


Start Time           2020-10-24 17:03:37.617599
End Time             2020-10-24 18:33:35.341744
1:29:57


In [14]:
# One row per trial: score, timing and the tuned hyperparameters, best RMSE first.
param_cols = ['config.' + name for name in xgb_tune_params]
report_cols = ['rmse', 'date', 'time_this_iter_s'] + param_cols
analysis_results_df = analysis.results_df[report_cols].sort_values(by='rmse')
analysis_results_df


Unnamed: 0_level_0,rmse,date,time_this_iter_s,config.n_estimators,config.max_depth,config.subsample,config.colsample_bytree,config.colsample_bylevel,config.learning_rate
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2552adb4,0.102802,2020-10-24_18-05-54,94.305483,7658,3,0.32,0.20,0.12,0.010000
1f7ddac6,0.102806,2020-10-24_18-13-40,551.719544,7729,3,0.32,0.20,0.12,0.010000
de19bc48,0.102813,2020-10-24_17-34-53,61.880440,6830,3,0.32,0.20,0.11,0.010000
24179b12,0.102814,2020-10-24_18-05-54,77.514643,7859,3,0.32,0.20,0.12,0.010000
2a41e84e,0.102814,2020-10-24_18-06-58,178.048737,7792,3,0.32,0.20,0.12,0.010000
...,...,...,...,...,...,...,...,...,...
ddc816e4,7.917921,2020-10-24_17-03-43,1.154885,119,6,0.54,0.18,0.36,0.003162
dc5b8962,8.684946,2020-10-24_17-03-44,4.152945,284,7,0.49,0.45,0.37,0.001000
e3399fee,9.633090,2020-10-24_17-03-56,2.725270,180,5,0.70,0.05,0.12,0.001000
dcba2774,10.248390,2020-10-24_17-03-42,1.434465,118,6,0.49,0.14,0.06,0.001000


In [15]:
# Re-score the best trial: the first row of the sorted results holds the
# winning hyperparameters, already in the form XGBRegressor accepts.
best_row = analysis_results_df.iloc[0]
best_config = {name: best_row['config.' + name] for name in xgb_tune_params}

xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=RANDOMSTATE,
    verbosity=1,
    n_jobs=-1,
    **best_config
)
print(xgb)

# the scorer returns negated RMSE per fold; flip the sign back
neg_fold_scores = cross_val_score(
    xgb,
    df[predictors],
    df[response],
    scoring="neg_root_mean_squared_error",
    cv=kfolds,
)
scores = -neg_fold_scores

# same per-fold errors expressed on the untransformed SalePrice scale
raw_scores = list(map(cv_to_raw, scores))
print()
print("Log1p CV RMSE %.06f (STD %.04f)" % (np.mean(scores), np.std(scores)))
print("Raw CV RMSE %.0f (STD %.0f)" % (np.mean(raw_scores), np.std(raw_scores)))


XGBRegressor(base_score=None, booster=None,
             colsample_bylevel=0.12000000000000001, colsample_bynode=None,
             colsample_bytree=0.2, gamma=None, gpu_id=None,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.01, max_delta_step=None, max_depth=3,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=7658, n_jobs=-1, num_parallel_tree=None,
             random_state=42, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=0.32, tree_method=None,
             validate_parameters=None, verbosity=1)

Log1p CV RMSE 0.102802 (STD 0.0128)
Raw CV RMSE 18028 (STD 2353)


#### LGBM

In [8]:
# LightGBM search space handed to tune.run(). n_estimators, num_leaves and
# learning_rate are sampled in an encoded form and converted inside my_lgbm.
lgbm_tune_kwargs = {
    "n_estimators": tune.loguniform(100, 10000),
    "max_depth": tune.randint(0, 5),
    # counterpart of xgb max_leaves
    "num_leaves": tune.quniform(1, 10, 1.0),
    # counterpart of xgb subsample
    "bagging_fraction": tune.quniform(0.5, 0.8, 0.01),
    # counterpart of xgb colsample_bytree
    "feature_fraction": tune.quniform(0.05, 0.5, 0.01),
    "learning_rate": tune.quniform(-3.0, -1.0, 0.5),
}

# Names of the tuned hyperparameters (an optional 'wandb' entry is not one of them).
lgbm_tune_params = [name for name in lgbm_tune_kwargs if name != 'wandb']
print(lgbm_tune_params)


['n_estimators', 'max_depth', 'num_leaves', 'bagging_fraction', 'feature_fraction', 'learning_rate']


In [9]:
def my_lgbm(config):
    """Ray Tune trainable: cross-validate one LightGBM configuration.

    Args:
        config: dict of sampled hyperparameters. ``n_estimators``,
            ``num_leaves`` and ``learning_rate`` are converted from their
            search-space encoding and written back into ``config`` in place.
            The converted values are what show up in the trial results
            (``config.*`` columns), and the evaluation cells pass those columns
            straight to ``LGBMRegressor`` — keep the in-place rewrite.

    Returns:
        ``{'rmse': <mean RMSE over the kfolds splits>}``; the same value is
        also sent to Tune through ``tune.report``.
    """
    # the searchers hand over floats, LightGBM wants an integer tree count
    config['n_estimators'] = int(config['n_estimators'])
    # num_leaves is sampled as a base-2 exponent
    config['num_leaves'] = int(2 ** config['num_leaves'])
    # learning rate is sampled as a base-10 exponent
    config['learning_rate'] = 10 ** config['learning_rate']

    # NOTE(review): bagging_fraction is tuned but no bagging_freq is set;
    # LightGBM's docs say bagging needs a non-zero bagging_freq — confirm the
    # bagging_fraction search dimension actually has an effect.
    fixed_params = dict(
        objective='regression',
        max_bin=200,
        feature_fraction_seed=7,
        min_data_in_leaf=2,
        verbose=-1,
        n_jobs=1,
        # these are specified to suppress warnings
        colsample_bytree=None,
        min_child_samples=None,
        subsample=None,
    )
    model = LGBMRegressor(**fixed_params, **config)

    # the scorer returns negated RMSE per fold; flip the sign back
    neg_fold_scores = cross_val_score(
        model,
        df[predictors],
        df[response],
        scoring="neg_root_mean_squared_error",
        cv=kfolds,
    )
    rmse = np.mean(-neg_fold_scores)

    tune.report(rmse=rmse)
    return {'rmse': rmse}

In [10]:
# tune LightGBM with HyperOpt suggestions and ASHA-style early stopping
print("LightGBM HyperOpt")

NUM_SAMPLES=2048

start_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))

algo = HyperOptSearch(random_state_seed=RANDOMSTATE)
# uncomment and set max_concurrent to limit number of cores
# algo = ConcurrencyLimiter(algo, max_concurrent=10)
scheduler = AsyncHyperBandScheduler()

analysis = tune.run(my_lgbm,
                    num_samples=NUM_SAMPLES,
                    config = lgbm_tune_kwargs,
                    name="hyperopt_lgbm",
                    metric="rmse",
                    mode="min",
                    search_alg=algo,
                    scheduler=scheduler,
                    verbose=1,
                   )

end_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))
print("%-20s %s" % ("End Time", end_time))
# timedelta.seconds holds only the sub-day remainder, so a run longer than
# 24h would be under-reported; total_seconds() covers the whole duration.
# int() drops the fractional part so the output stays H:MM:SS.
elapsed = end_time - start_time
print(str(timedelta(seconds=int(elapsed.total_seconds()))))


Trial name,status,loc,bagging_fraction,feature_fraction,learning_rate,max_depth,n_estimators,num_leaves,iter,total time (s),rmse
my_lgbm_fd709bcc,TERMINATED,,0.71,0.16,-3.0,4,4626.62,5,1,129.034,0.114896
my_lgbm_fd89b4e0,TERMINATED,,0.51,0.47,-2.5,3,2853.98,5,2,70.8371,0.115249
my_lgbm_fd976478,TERMINATED,,0.52,0.44,-1.0,1,5842.93,9,1,14.6198,0.114466
my_lgbm_2b16867c,TERMINATED,,0.52,0.11,-3.0,2,6877.73,5,1,17.8726,0.119708
my_lgbm_2b374c18,TERMINATED,,0.53,0.29,-1.5,3,220.67,8,1,1.33999,0.115158
my_lgbm_2b5718c2,TERMINATED,,0.72,0.46,-1.5,1,751.724,8,1,1.6665,0.118629
my_lgbm_2b6b21f0,TERMINATED,,0.79,0.3,-1.5,2,7563.79,8,2,41.979,0.111968
my_lgbm_2b8541ca,TERMINATED,,0.62,0.35,-2.5,0,3554.09,7,1,526.562,0.117366
my_lgbm_2b9d768c,TERMINATED,,0.52,0.2,-1.5,4,6918.6,6,2,96.5498,0.108537
my_lgbm_2bbfa8e2,TERMINATED,,0.56,0.22,-2.0,1,2519.09,10,1,6.28772,0.117232


2020-10-24 21:23:03,306	INFO tune.py:439 -- Total run time: 3912.79 seconds (3904.04 seconds for the tuning loop).


Start Time           2020-10-24 20:17:50.507207
End Time             2020-10-24 21:23:10.116729
1:05:19


In [11]:
# One row per trial: score, timing and the tuned hyperparameters, best RMSE first.
param_cols = ['config.' + name for name in lgbm_tune_params]
report_cols = ['rmse', 'date', 'time_this_iter_s'] + param_cols
analysis_results_df = analysis.results_df[report_cols].sort_values(by='rmse')
analysis_results_df


Unnamed: 0_level_0,rmse,date,time_this_iter_s,config.n_estimators,config.max_depth,config.num_leaves,config.bagging_fraction,config.feature_fraction,config.learning_rate
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
71f3a458,0.105126,2020-10-24_20-58-34,91.843580,3126,3,512,0.56,0.05,0.010000
0ae3b42e,0.105129,2020-10-24_20-54-28,15.319118,3121,3,256,0.53,0.05,0.010000
2df8aa4a,0.105132,2020-10-24_21-03-07,43.234116,3114,3,128,0.54,0.05,0.010000
12d3fb06,0.105132,2020-10-24_21-12-49,234.416534,3676,3,128,0.63,0.05,0.010000
1e82e9ec,0.105134,2020-10-24_20-48-05,25.408284,3107,3,128,0.55,0.05,0.010000
...,...,...,...,...,...,...,...,...,...
300aa528,0.277021,2020-10-24_20-19-19,2.223157,1041,1,4,0.63,0.38,0.001000
8c5a822e,0.280971,2020-10-24_21-11-59,0.738359,252,3,256,0.66,0.05,0.003162
69bb65a0,0.296315,2020-10-24_20-21-00,2.676574,480,3,32,0.56,0.14,0.001000
49a89688,0.300689,2020-10-24_20-27-11,1.127454,199,3,1024,0.56,0.05,0.003162


In [12]:
# Re-score the best trial: the first row of the sorted results holds the
# winning hyperparameters, already in the form LGBMRegressor accepts.
best_row = analysis_results_df.iloc[0]
best_config = {name: best_row['config.' + name] for name in lgbm_tune_params}

lgbm = LGBMRegressor(
    objective='regression',
    max_bin=200,
    feature_fraction_seed=7,
    min_data_in_leaf=2,
    verbose=-1,
    **best_config,
)

print(lgbm)

# the scorer returns negated RMSE per fold; flip the sign back
neg_fold_scores = cross_val_score(
    lgbm,
    df[predictors],
    df[response],
    scoring="neg_root_mean_squared_error",
    cv=kfolds,
)
scores = -neg_fold_scores

# same per-fold errors expressed on the untransformed SalePrice scale
raw_scores = list(map(cv_to_raw, scores))
print()
print("Log1p CV RMSE %.06f (STD %.04f)" % (np.mean(scores), np.std(scores)))
print("Raw CV RMSE %.0f (STD %.0f)" % (np.mean(raw_scores), np.std(raw_scores)))


LGBMRegressor(bagging_fraction=0.56, feature_fraction=0.05,
              feature_fraction_seed=7, learning_rate=0.01, max_bin=200,
              max_depth=3, min_data_in_leaf=2, n_estimators=3126,
              num_leaves=512, objective='regression', verbose=-1)

Log1p CV RMSE 0.105126 (STD 0.0136)
Raw CV RMSE 18459 (STD 2511)


In [13]:
# tune LightGBM with Optuna suggestions and ASHA-style early stopping
print("LightGBM Optuna")

NUM_SAMPLES=2048

start_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))

# NOTE(review): unlike the HyperOpt cell, no seed is passed to the searcher
# here — confirm whether run-to-run reproducibility is wanted for Optuna.
algo = OptunaSearch()
# uncomment and set max_concurrent to limit number of cores
# algo = ConcurrencyLimiter(algo, max_concurrent=10)
scheduler = AsyncHyperBandScheduler()

analysis = tune.run(my_lgbm,
                    num_samples=NUM_SAMPLES,
                    config = lgbm_tune_kwargs,
                    # was "hyperopt_lgbm" (copy-paste): give the Optuna run its
                    # own experiment name so its results are not filed under
                    # the HyperOpt experiment
                    name="optuna_lgbm",
                    metric="rmse",
                    mode="min",
                    search_alg=algo,
                    scheduler=scheduler,
                    verbose=1,
                   )

end_time = datetime.now()
print("%-20s %s" % ("Start Time", start_time))
print("%-20s %s" % ("End Time", end_time))
# timedelta.seconds holds only the sub-day remainder, so a run longer than
# 24h would be under-reported; total_seconds() covers the whole duration.
# int() drops the fractional part so the output stays H:MM:SS.
elapsed = end_time - start_time
print(str(timedelta(seconds=int(elapsed.total_seconds()))))


Trial name,status,loc,bagging_fraction,feature_fraction,learning_rate,max_depth,n_estimators,num_leaves,iter,total time (s),rmse
my_lgbm_797de638,TERMINATED,,0.67,0.18,-1.0,3,3563.95,8,2,25.1464,0.110319
my_lgbm_7987ca2c,TERMINATED,,0.61,0.21,-1.0,0,1552.24,4,2,28.0269,0.111001
my_lgbm_798d40c4,TERMINATED,,0.7,0.11,-2.5,3,2700.88,8,2,11.7271,0.11092
my_lgbm_7994475c,TERMINATED,,0.57,0.39,-3.0,3,3613.64,4,1,34.4769,0.126178
my_lgbm_799a3acc,TERMINATED,,0.62,0.29,-3.0,4,2231.87,6,1,21.8105,0.137868
my_lgbm_799f4a62,TERMINATED,,0.8,0.5,-3.0,4,173.796,8,1,2.32393,0.345365
my_lgbm_79a51bcc,TERMINATED,,0.69,0.24,-2.5,5,8913.35,10,1,132.402,0.109164
my_lgbm_79ab87fa,TERMINATED,,0.59,0.37,-2.5,0,2331.92,7,1,247.747,0.117625
my_lgbm_79b1a5f4,TERMINATED,,0.72,0.25,-2.5,0,675.74,2,1,3.8897,0.166525
my_lgbm_79b6d15a,TERMINATED,,0.61,0.07,-2.5,1,448.799,5,1,0.728974,0.271536


2020-10-24 22:28:12,663	INFO tune.py:439 -- Total run time: 2889.54 seconds (2880.76 seconds for the tuning loop).


Start Time           2020-10-24 21:40:03.109168
End Time             2020-10-24 22:28:19.354769
0:48:16


In [14]:
# One row per trial: score, timing and the tuned hyperparameters, best RMSE first.
param_cols = ['config.' + name for name in lgbm_tune_params]
report_cols = ['rmse', 'date', 'time_this_iter_s'] + param_cols
analysis_results_df = analysis.results_df[report_cols].sort_values(by='rmse')
analysis_results_df


Unnamed: 0_level_0,rmse,date,time_this_iter_s,config.n_estimators,config.max_depth,config.num_leaves,config.bagging_fraction,config.feature_fraction,config.learning_rate
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
c5d6f676,0.105119,2020-10-24_21-56-43,1.938774,3100,3,32,0.69,0.05,0.010
df680616,0.105126,2020-10-24_21-57-52,30.241344,3126,3,16,0.76,0.05,0.010
df3d57e0,0.105127,2020-10-24_21-57-51,29.300410,3128,3,16,0.76,0.05,0.010
cb8625c4,0.105128,2020-10-24_21-56-52,3.771016,3124,3,32,0.75,0.05,0.010
dcef8cdc,0.105129,2020-10-24_22-05-23,55.165939,3116,3,32,0.75,0.05,0.010
...,...,...,...,...,...,...,...,...,...
7aba6f4e,0.324206,2020-10-24_21-40-09,1.873255,399,4,4,0.73,0.11,0.001
7b3c525c,0.340894,2020-10-24_21-40-10,3.444390,192,5,64,0.53,0.23,0.001
7f4449b8,0.343825,2020-10-24_21-40-16,1.564945,229,2,512,0.54,0.40,0.001
799f4a62,0.345365,2020-10-24_21-40-06,2.323931,173,4,256,0.80,0.50,0.001


In [15]:
# Re-score the best trial: the first row of the sorted results holds the
# winning hyperparameters, already in the form LGBMRegressor accepts.
best_row = analysis_results_df.iloc[0]
best_config = {name: best_row['config.' + name] for name in lgbm_tune_params}

lgbm = LGBMRegressor(
    objective='regression',
    max_bin=200,
    feature_fraction_seed=7,
    min_data_in_leaf=2,
    verbose=-1,
    **best_config,
)

print(lgbm)

# the scorer returns negated RMSE per fold; flip the sign back
neg_fold_scores = cross_val_score(
    lgbm,
    df[predictors],
    df[response],
    scoring="neg_root_mean_squared_error",
    cv=kfolds,
)
scores = -neg_fold_scores

# same per-fold errors expressed on the untransformed SalePrice scale
raw_scores = list(map(cv_to_raw, scores))
print()
print("Log1p CV RMSE %.06f (STD %.04f)" % (np.mean(scores), np.std(scores)))
print("Raw CV RMSE %.0f (STD %.0f)" % (np.mean(raw_scores), np.std(raw_scores)))


LGBMRegressor(bagging_fraction=0.69, feature_fraction=0.05,
              feature_fraction_seed=7, learning_rate=0.01, max_bin=200,
              max_depth=3, min_data_in_leaf=2, n_estimators=3100, num_leaves=32,
              objective='regression', verbose=-1)

Log1p CV RMSE 0.105119 (STD 0.0136)
Raw CV RMSE 18458 (STD 2511)
