# HPO of XGBoost with Optuna and Dask

In [None]:
from __future__ import annotations

from collections.abc import Iterator
from functools import partial
from datetime import datetime

import dask.array as da
import dask.dataframe as dd
import coiled
import distributed
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import optuna
import xgboost
from dask_ml.metrics import mean_squared_error

In [None]:
# Coiled account under which the cluster is created
ACCOUNT = "dask-engineering"
# Location of the feature table (parquet dataset on S3)
FILEPATH = "s3://coiled-datasets/prefect-dask/nyc-uber-lyft/feature_table.parquet"

# Number of Optuna jobs to run in parallel
N_JOBS = 10
# Number of converging trials that each job runs one after the other
N_TRIALS = 5
# Number of cross-validation folds in each trial. This also determines the
# train/test split (e.g. N_FOLDS=5 -> train=4/5 of the total data, test=1/5)
N_FOLDS = 5

# The total number of training exercises that will be run is
# N_JOBS * N_TRIALS * N_FOLDS (10 * 5 * 5 = 250 with the values above)

# Keep the number of training exercises that run at the same time to the
# bare minimum needed to ensure pipelining. More than this would just
# overwhelm the scheduler.
N_PARALLEL = 2

# Dask worker instance type and number of workers
WORKER_INSTANCE_TYPE = "r6i.large"
N_WORKERS = 100

### Start Coiled cluster

In [None]:
# Start a Coiled cluster sized by the constants defined above and connect
# a Dask client to it. `cluster` and `client` are used by the cells below.
cluster = coiled.Cluster(
    worker_vm_types=[WORKER_INSTANCE_TYPE],
    scheduler_vm_types=["m6i.large"],
    package_sync=True,  # align remote packages to local ones
    n_workers=N_WORKERS,
    account=ACCOUNT,
    backend_options={
        "region": "us-east-2",
        "multizone": True,
        # Request spot instances, falling back to on-demand ones
        "spot": True,
        "spot_on_demand_fallback": True,
    },
    scheduler_options={"idle_timeout": "30 minutes"},
)
client = distributed.Client(cluster)

In [None]:
# Load the feature table generated by Feature Engineering.ipynb
ddf = dd.read_parquet(FILEPATH)

# Reduce dataset size. Uncomment to speed up the exercise.
# ddf = ddf.partitions[:20]

# Under the hood, XGBoost converts floats to `float32`.
# Let's do the conversion only once, here.
float_cols = ddf.select_dtypes(include="float").columns.tolist()
ddf = ddf.astype({c: np.float32 for c in float_cols})

# We need the categories of the categorical columns to be known
categorical_vars = ddf.select_dtypes(include="category").columns.tolist()

# categorize() reads the whole input and then discards it.
# Persist first, so that the data is read from disk only once.
ddf = ddf.persist()
ddf = ddf.categorize(columns=categorical_vars)

# We will need to access this multiple times.
# We'll also need to retrieve it by name from tasks running on the workers.
# Drop any dataset published under the same name by a previous run first.
if "ddf" in client.datasets:
    client.unpublish_dataset("ddf")
client.publish_dataset(ddf=ddf)

# Display the first rows as a sanity check
ddf.head()

### Train Model

In [None]:
# Here we subset data for cross-validation
def make_cv_splits(
    ddf: dd.DataFrame, n_folds: int = 5
) -> Iterator[tuple[dd.DataFrame, dd.DataFrame]]:
    """Yield ``(train, test)`` pairs for k-fold cross-validation.

    The input is randomly split into ``n_folds`` parts of equal expected
    size. Each part is used once as the test set, while the remaining
    ``n_folds - 1`` parts are concatenated into the training set.

    Parameters
    ----------
    ddf
        Dask DataFrame to split.
    n_folds
        Number of folds; must be at least 2 so that the training set of
        every fold is non-empty.

    Yields
    ------
    tuple of (train, test) Dask DataFrames, one per fold.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 2. As this is a generator, the error
        is raised when iteration starts.
    """
    if n_folds < 2:
        raise ValueError(f"n_folds must be at least 2, got {n_folds}")

    frac = [1 / n_folds] * n_folds
    splits = ddf.random_split(frac, shuffle=True)
    for i, test in enumerate(splits):
        # Everything but the i-th part goes into the training set
        train = [part for j, part in enumerate(splits) if j != i]
        yield dd.concat(train), test

In [None]:
def train_once(
    train: dd.DataFrame,
    test: dd.DataFrame,
    sem: distributed.Semaphore,
    study_params: dict[str, float],
):
    """Train XGBoost on one fold and score it on the held-out fold.

    Parameters
    ----------
    train, test
        Training and test sets for this fold. Both must contain the
        ``trip_time`` column, which is used as the target.
    sem
        Semaphore that caps how many training exercises are running
        at the same time across the cluster.
    study_params
        Hyperparameters picked by the Optuna trial; they are merged into
        the XGBoost parameters on top of ``tree_method="hist"``.

    Returns
    -------
    The root mean squared error of the predictions on the test set.
    """
    # This task only orchestrates other Dask work; leave the worker's
    # thread pool so that it doesn't hold on to a thread while waiting.
    distributed.secede()

    target = "trip_time"
    # NOTE(review): the search space includes "n_estimators", which is
    # presumably ignored by the native training API used here; the number
    # of boosting rounds is fixed by num_boost_round below - confirm.
    booster_params = {"tree_method": "hist", **study_params}

    # Block until there are fewer than N_PARALLEL train_once
    # critical sections running
    with sem:
        train_target = train[target]
        train_features = train.drop(columns=[target])
        test_target = test[target]
        test_features = test.drop(columns=[target])

        # client=None: use the default client
        dtrain = xgboost.dask.DaskDMatrix(
            None, train_features, train_target, enable_categorical=True
        )
        # This has its own internal semaphore with a limit of 1
        booster = xgboost.dask.train(
            None,
            booster_params,
            dtrain,
            num_boost_round=4,
            evals=[(dtrain, "train")],
        )
        predicted = xgboost.dask.predict(None, booster, test_features)
        # squared=False -> root mean squared error
        rmse = mean_squared_error(
            test_target.to_dask_array(),
            predicted.to_dask_array(),
            squared=False,
        )

    return rmse


def train_model(
    ddf_name: str,
    n_folds: int,
    sem: distributed.Semaphore,
    study_params: dict[str, float],
):
    """Cross-validate one set of hyperparameters and return the mean score.

    Must be called from within a Dask task, as it retrieves the client and
    the published dataset from the worker it is running on.

    Parameters
    ----------
    ddf_name
        Name under which the feature table was published on the cluster.
    n_folds
        Number of cross-validation folds.
    sem
        Semaphore forwarded to every ``train_once`` task.
    study_params
        Hyperparameters forwarded to every ``train_once`` task.

    Returns
    -------
    The arithmetic mean of the scores returned by the per-fold tasks.
    """
    worker_client = distributed.get_client()
    feature_table = worker_client.get_dataset(ddf_name)

    # One task per fold; pure=False so that identical inputs are never
    # deduplicated into a single task.
    fold_futures = []
    for fold_train, fold_test in make_cv_splits(feature_table, n_folds):
        fold_futures.append(
            worker_client.submit(
                train_once, fold_train, fold_test, sem, study_params, pure=False
            )
        )

    # Leave the worker's thread pool while waiting for the folds
    try:
        distributed.secede()
    except KeyError:
        # Already seceded in a previous iteration of study.optimize()
        pass

    fold_scores = worker_client.gather(fold_futures)
    total = sum(fold_scores)
    return total / len(fold_scores)


def objective(trial, ddf_name: str, n_folds: int, sem: distributed.Semaphore) -> float:
    """Optuna objective: sample hyperparameters and cross-validate them.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyperparameters.
    ddf_name, n_folds, sem
        Forwarded unchanged to ``train_model``.

    Returns
    -------
    The value returned by ``train_model`` for the sampled hyperparameters.
    """
    sample_int = trial.suggest_int
    sample_float = trial.suggest_float

    # (name, sampler, low, high); sampled in this order
    search_space = (
        ("n_estimators", sample_int, 75, 125),
        ("learning_rate", sample_float, 0.5, 0.7),
        ("colsample_bytree", sample_float, 0.5, 1),
        ("colsample_bynode", sample_float, 0.5, 1),
        ("colsample_bylevel", sample_float, 0.5, 1),
        ("reg_lambda", sample_float, 0, 1),
        ("max_depth", sample_int, 1, 6),
        ("max_leaves", sample_int, 0, 2),
        ("max_cat_to_onehot", sample_int, 1, 10),
    )
    study_params = {
        name: sampler(name, low, high) for name, sampler, low, high in search_space
    }
    return train_model(ddf_name, n_folds, sem, study_params)

In [None]:
# Create a single study, shared by all jobs, and run some trials
start = datetime.now()
storage = optuna.integration.DaskStorage()
study = optuna.create_study(study_name="nyc-travel-time-model", storage=storage)

# Execution layout:
# - N_JOBS jobs run in parallel;
# - each job runs N_TRIALS converging trials in series;
# - each trial starts N_FOLDS training exercises in parallel;
# - but only up to N_PARALLEL training exercises will actually submit
#   tasks to the scheduler at the same time.
#
# In pure optuna, we would have used
# study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS)
sem = distributed.Semaphore(N_PARALLEL)
job_objective = partial(objective, ddf_name="ddf", n_folds=N_FOLDS, sem=sem)
futures = [
    client.submit(study.optimize, job_objective, n_trials=N_TRIALS, pure=False)
    for _ in range(N_JOBS)
]
# Block until every job has run all of its trials
client.gather(futures)

elapsed = datetime.now() - start
print(f"Total time:  {elapsed}")

In [None]:
# Shut down the Coiled cluster.
# NOTE(review): the study uses DaskStorage, which presumably keeps the trial
# data on the scheduler; the cells below that query `study` may need to run
# before this one - confirm.
cluster.shutdown()

In [None]:
# Hyperparameters of the best trial in the study
study.best_params

In [None]:
# Objective value of the best trial in the study
study.best_value

In [None]:
# Full record of the best trial in the study
study.best_trial

In [None]:
# Save the results of your study to examine later.
# NOTE(review): the study was created with DaskStorage; presumably the pickle
# only carries a handle to the scheduler-side storage rather than the trial
# data itself - verify that it can be loaded without the cluster.
joblib.dump(study, "study_single_cluster.pickle")

In [None]:
# Plot the optimization history of the study and save it as a PNG file.
# NOTE(review): despite its name, `fig` is presumably the matplotlib Axes
# returned by optuna rather than a Figure - confirm.
fig = optuna.visualization.matplotlib.plot_optimization_history(study)
fig.legend(loc="upper right")
plt.savefig("optimization_history_study_1.png")

In [None]:
# Restart the workers of the cluster the client is connected to.
# NOTE(review): in notebook order this comes after cluster.shutdown();
# presumably it is meant to be run by hand between experiments - confirm.
client.restart()