# Parallelize HPO of XGBoost with Optuna and Dask

In [1]:
FILEPATH="s3://prefect-dask-examples/nyc-uber-lyft/feature_table_fixed_upper_bound.parquet"
WORKER_INSTANCE_TYPE = "m6i.4xlarge"
# CLUSTERS = {}    # Dict for shutting down clusters when we're done.
# NUM_THREADS = 4  # Number of threads to use
# NUM_FUTURES = 4  # Number of Futures to create

In [2]:
import datetime
import joblib

from distributed import Client, wait
import dask.dataframe as dd
import coiled

import dask
import numpy as np
import pandas as pd
import optuna
from dask_ml.metrics import mean_squared_error as lazy_mse
import xgboost as xgb
from xgboost.dask import DaskDMatrix

import dask.dataframe as dd
from xgboost.core import XGBoostError

In [3]:
print("coiled:", coiled.__version__)
print("dask:", dask.__version__)
print("dask.distributed:", dask.distributed.__version__)
print("optuna:", optuna.__version__)
print("xgboost:", xgb.__version__)
print("coiled:", coiled.__version__)

coiled: 0.2.59
dask: 2022.12.1
dask.distributed: 2022.12.1
optuna: 3.1.0b0
xgboost: 1.7.3
coiled: 0.2.59


### Train Model

In [4]:
# Here we subset data for cross-validation
def _make_cv(df, num_folds):
    frac = [1 / num_folds]*num_folds
    splits = df.random_split(frac, shuffle=True)
    for i in range(num_folds):
        train = [splits[j] for j in range(num_folds) if j != i]
        test = splits[i]
        yield train, test

In [5]:
def train_model(trial_number, study_params, n_splits=5):

    cluster = coiled.Cluster(
        worker_vm_types=[WORKER_INSTANCE_TYPE],
        scheduler_vm_types=["m6i.2xlarge"],
        package_sync=True, # copy local packages
        shutdown_on_close=True,
        show_widget=False,
        n_workers=10,
        use_best_zone=True,
        account="dask-engineering",
        backend_options={"region": "us-east-2", "spot": True},
        scheduler_options={"idle_timeout": "10 minutes"},
        )

    print("starting run")
    with Client(cluster) as client:
        # Load and pre-process the DataFrame
        ddf = dd.read_parquet(FILEPATH)
        categorical_vars = ddf.select_dtypes(include="category").columns.tolist()
        ddf = ddf.categorize(columns=categorical_vars)
        float_cols = ddf.select_dtypes(include="float").columns.tolist()
        ddf[float_cols] = ddf[float_cols].astype(np.float32).persist()  # Under the hood, XGBoost converts floats to `float32`

        val_scores = []

        for i, (train, test) in enumerate(_make_cv(ddf, n_splits)):
            print(f"Starting training run {i}")
            start = datetime.datetime.now()
            train = dd.concat(train)

            try:
                assert all(train[c].cat.known for c in categorical_vars)
                assert all(test[c].cat.known for c in categorical_vars)
            except Exception as e:
                cluster.shutdown()
                raise RuntimeError(f"Categorical_vars are not known")

            y_train = train['trip_time'].to_frame().persist()
            X_train = train.drop(columns=['trip_time']).persist()

            # Make the training data
            y_test = test['trip_time'].to_frame().persist()
            X_test = test.drop(columns='trip_time').persist()

            try:
                print("Make dtrain")
                dtrain = DaskDMatrix(client, X_train, y_train, enable_categorical=True)

                print("Training model")
                model = xgb.dask.train(
                    client,
                    {
                        'verbosity': 1,
                        'tree_method': 'hist', 
                        "objective": "reg:squarederror",
                        **study_params
                    },
                    dtrain,
                    num_boost_round=4,
                    evals=[(dtrain, "train")],
                )

                print("Make predictions")
                # It's faster to run the prediction directly on X_test DataFrame
                # We also need to confirm that predictions on dtest when it
                # contains categoricals performs as expected
                predictions = xgb.dask.predict(client, model, X_test)

                print("Score the model")
                score = lazy_mse(y_test.to_dask_array(lengths=True).reshape(-1,), 
                                 predictions.to_dask_array(lengths=True), squared=False,
                                )
                wait(score)  # Explicitly waiting avoids https://github.com/dask/distributed/issues/4612
                print(f"rmse_score:  {score}")
                val_scores.append(score)
                print(f"val_scores:  {val_scores}")
                print(f"Finished training run in:  {datetime.datetime.now() - start} seconds")

            except XGBoostError as e:
                print(f"Trial {i} failed with {e}")

        return np.mean(val_scores)

In [6]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 125),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.9),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 3),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0, 1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
    }
    rmse = train_model(
        trial_number=trial.number,
        study_params=params, 
        n_splits=5,
    )
    print(f"final mse:  {rmse}")
    return rmse

In [None]:
# Create a single study and run 5 trials

study = optuna.create_study(study_name="nyc-travel-time-model-m6i4xlarge")    
study.optimize(objective, n_trials=5)

[32m[I 2023-01-09 15:06:15,846][0m A new study created in memory with name: nyc-travel-time-model-m6i4xlarge[0m


starting run
Starting training run 0
Make dtrain
Training model
Make predictions
Score the model
rmse_score:  444.9999263332771
val_scores:  [444.9999263332771]
Finished training run in:  0:01:32.673216 seconds
Starting training run 1
Make dtrain
Training model
Make predictions
Score the model
rmse_score:  493.27415542637414
val_scores:  [444.9999263332771, 493.27415542637414]
Finished training run in:  0:01:25.247185 seconds
Starting training run 2
Make dtrain
Training model
Make predictions
Score the model
rmse_score:  478.4477107284105
val_scores:  [444.9999263332771, 493.27415542637414, 478.4477107284105]
Finished training run in:  0:01:30.373519 seconds
Starting training run 3
Make dtrain
Training model
Make predictions
Score the model
rmse_score:  445.3277982614692
val_scores:  [444.9999263332771, 493.27415542637414, 478.4477107284105, 445.3277982614692]
Finished training run in:  0:01:28.547654 seconds
Starting training run 4
Make dtrain
Training model
Make predictions
Score the

[32m[I 2023-01-09 15:15:08,397][0m Trial 0 finished with value: 457.1135157717722 and parameters: {'n_estimators': 63, 'learning_rate': 0.8258905741822139, 'subsample': 0.10166860006551871, 'max_depth': 7, 'colsample_bytree': 0.80904052803992, 'min_child_weight': 3, 'colsample_bynode': 0.1942027179183018, 'colsample_bylevel': 0.48159179640514105, 'reg_alpha': 0.023606899234248635, 'reg_lambda': 0.17415625715242555}. Best is trial 0 with value: 457.1135157717722.[0m


final mse:  457.1135157717722
starting run
Starting training run 0
Make dtrain
Training model
Make predictions
Score the model
rmse_score:  631.8728207323597
val_scores:  [631.8728207323597]
Finished training run in:  0:01:21.469385 seconds
Starting training run 1
Make dtrain
Training model
Make predictions
Score the model
rmse_score:  631.791727744905
val_scores:  [631.8728207323597, 631.791727744905]
Finished training run in:  0:01:24.012921 seconds
Starting training run 2
Make dtrain
Training model
Make predictions
Score the model
rmse_score:  631.7969643619515
val_scores:  [631.8728207323597, 631.791727744905, 631.7969643619515]
Finished training run in:  0:01:25.089265 seconds
Starting training run 3
Make dtrain
Training model
Make predictions
Score the model
rmse_score:  631.7980520375294
val_scores:  [631.8728207323597, 631.791727744905, 631.7969643619515, 631.7980520375294]
Finished training run in:  0:01:22.687379 seconds
Starting training run 4
Make dtrain
Training model
Make

In [9]:
len(study.trials)

4

In [10]:
study.best_params

{'n_estimators': 67,
 'learning_rate': 0.31678535703204,
 'subsample': 0.30019617918399855,
 'max_depth': 5,
 'colsample_bytree': 0.8257536161869092,
 'min_child_weight': 1,
 'colsample_bynode': 0.4380011259355694,
 'colsample_bylevel': 0.09039924240088493,
 'reg_alpha': 0.2606665648782066,
 'reg_lambda': 0.33684573169250664}

In [11]:
study.best_value

510.8644979342265

In [12]:
study.best_trial

FrozenTrial(number=2, state=TrialState.COMPLETE, values=[510.8644979342265], datetime_start=datetime.datetime(2023, 1, 9, 14, 43, 41, 27093), datetime_complete=datetime.datetime(2023, 1, 9, 14, 53, 57, 916054), params={'n_estimators': 67, 'learning_rate': 0.31678535703204, 'subsample': 0.30019617918399855, 'max_depth': 5, 'colsample_bytree': 0.8257536161869092, 'min_child_weight': 1, 'colsample_bynode': 0.4380011259355694, 'colsample_bylevel': 0.09039924240088493, 'reg_alpha': 0.2606665648782066, 'reg_lambda': 0.33684573169250664}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=125, log=False, low=50, step=1), 'learning_rate': FloatDistribution(high=0.9, log=False, low=0.1, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'max_depth': IntDistribution(high=9, log=False, low=3, step=1), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'min_child_weight': IntDist

In [None]:
# Uncomment this if you want to save the results of your study to examine later.

#joblib.dump(study, "data/study_m6i4xlarge.pkl")