# Parallel HPO with XGBoost, Dask, and Optuna on multiple clusters

Training one model can be slow.  Parallelism can help!

Training many models during hyper-parameter optimization can be even slower.  Even more parallelism can help!

In this example we extend our previous notebook to run many model trainings in parallel, each model running in a separate Dask cluster.  This allows us to accelerate our search for a good model by using more hardware.  It's important to note here that there are two levels of parallelism:

1.  Each model is trained in parallel using Dask
2.  Many such training runs are triggered concurrently from different local threads

Each local thread does very little work; it just asks Dask to manage a large remote job.

![high level diagram](Modeling_3.svg)

In [None]:
from __future__ import annotations

import joblib
import threading
from datetime import datetime
from collections.abc import Iterator

import coiled
import dask
import dask.array as da
import dask.dataframe as dd
import distributed
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import xgboost
from dask_ml.metrics import mean_squared_error

In [None]:
# Ignore every RuntimeWarning raised from here on.
# Note that no module or message filter is given, so this hides all
# RuntimeWarnings, not only those coming from xgboost.
# This is only for xgboost 1.7.1.
# Not necessary with xgboost 1.7.3.
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
# Location of feature table
FILEPATH = "s3://coiled-datasets/prefect-dask/nyc-uber-lyft/feature_table.parquet"

# Number of parallel optuna jobs to run (passed as n_jobs to study.optimize).
# Each thread that runs a trial starts its own Dask cluster, so this is also
# the upper bound on the number of clusters started.
N_JOBS = 10
# Total number of trials to run, shared across all the parallel jobs
N_TRIALS = 50
# Number of folds in each trial. This also determines the train/test split
# (e.g. N_FOLDS=5 -> train=4/5 of the total data, test=1/5)
N_FOLDS = 5

# Dask worker instance type and number (per cluster)
# Total number of worker EC2 instances spun up = N_JOBS * N_WORKERS,
# plus one scheduler instance per cluster
WORKER_INSTANCE_TYPE = "r6i.large"
N_WORKERS = 50

In [None]:
# Per-thread cache: thread identifier -> (client, persisted feature table).
clusters: dict[int, tuple[distributed.Client, dd.DataFrame]] = {}


def get_ddf() -> tuple[distributed.Client, dd.DataFrame]:
    """Return the calling thread's Dask client and persisted feature table.

    The first call made from a given thread starts a dedicated Coiled
    cluster, loads the feature table from ``FILEPATH`` onto it and caches the
    result in ``clusters`` under the thread identifier. Later calls from the
    same thread return the cached pair without touching the cluster.

    Returns:
        A ``(client, ddf)`` tuple; ``ddf`` has been persisted through
        ``client``.

    Raises:
        Any exception raised while connecting to the cluster or loading the
        data. The client and cluster are closed before the exception
        propagates, because nothing else holds a reference to them yet.
    """
    thread_id = threading.get_ident()
    cached = clusters.get(thread_id)
    if cached is not None:
        return cached

    cluster = coiled.Cluster(
        name=f"xgb-nyc-taxi-gbh-{thread_id}",
        worker_vm_types=[WORKER_INSTANCE_TYPE],
        scheduler_vm_types=["m6i.large"],
        package_sync=True,  # align remote packages to local ones
        n_workers=N_WORKERS,
        backend_options={
            "region": "us-east-2",
            "multizone": True,
            "spot": True,
            "spot_on_demand_fallback": True,
        },
        scheduler_options={"idle_timeout": "15 minutes"},
    )

    client = None
    try:
        # set_as_default=False: several clients coexist in this process (one
        # per thread), so none of them may become the global default.
        client = distributed.Client(cluster, set_as_default=False)
        print("Started cluster at", client.dashboard_link)

        with client.as_current():
            # Load feature table generated by Feature Engineering.ipynb
            ddf = dd.read_parquet(FILEPATH)

            # Reduce dataset size. Uncomment to speed up the exercise.
            # ddf = ddf.partitions[:20]

            # Under the hood, XGBoost converts floats to `float32`.
            # Let's do it only once here.
            float_cols = ddf.select_dtypes(include="float").columns.tolist()
            ddf = ddf.astype({c: np.float32 for c in float_cols})

            # We need the categories to be known
            categorical_vars = ddf.select_dtypes(include="category").columns.tolist()

            # categorize() reads the whole input and then discards it.
            # Let's read from disk only once.
            ddf = ddf.persist()
            # FIXME https://github.com/dask/dask/issues/9901
            ddf = ddf.categorize(columns=categorical_vars, scheduler=client)

            # We will need to access this multiple times. Let's persist it.
            ddf = ddf.persist()
    except BaseException:
        # The cluster is not registered in `clusters` yet, so code that
        # iterates over `clusters` to shut things down would never find it.
        # Release it here instead of leaving it running, then re-raise.
        if client is not None:
            client.close()
        cluster.close()
        raise

    # Only fully initialised clusters are cached.
    clusters[thread_id] = client, ddf
    return client, ddf

### Train Model

In [None]:
# Here we subset data for cross-validation
def make_cv_splits(
    ddf: dd.DataFrame,
    n_folds: int = N_FOLDS,
    *,
    random_state: int | None = None,
) -> Iterator[tuple[dd.DataFrame, dd.DataFrame]]:
    """Yield ``(train, test)`` pairs for k-fold cross-validation.

    ``ddf`` is randomly split into ``n_folds`` parts of equal nominal size.
    Each part is used once as the test set, while the remaining parts are
    concatenated into the matching training set.

    Args:
        ddf: Data to split.
        n_folds: Number of folds; must be at least 2 so that every training
            set is non-empty.
        random_state: Optional seed forwarded to ``ddf.random_split``. The
            default ``None`` leaves the split unseeded.

    Yields:
        ``n_folds`` tuples ``(train, test)``.

    Raises:
        ValueError: If ``n_folds`` is smaller than 2. As this is a generator,
            the error surfaces when iteration starts.
    """
    if n_folds < 2:
        raise ValueError(f"n_folds must be at least 2, got {n_folds}")

    frac = [1 / n_folds] * n_folds
    splits = ddf.random_split(frac, random_state=random_state, shuffle=True)
    for i, test in enumerate(splits):
        # Everything except the i-th part forms the training set.
        train = [part for j, part in enumerate(splits) if j != i]
        yield dd.concat(train), test

In [None]:
def train_model(study_params: dict[str, float]) -> float:
    """Cross-validate one XGBoost configuration on this thread's cluster.

    For every fold produced by ``make_cv_splits`` a model is trained on the
    training part to predict the ``trip_time`` column, and scored on the test
    part with ``mean_squared_error(..., squared=False)``.

    Args:
        study_params: XGBoost parameters for this trial. They are merged on
            top of ``{"tree_method": "hist"}``.

    Returns:
        The mean of the per-fold scores, as a plain Python ``float``.
    """
    client, ddf = get_ddf()
    fold_scores = []

    with client.as_current():
        for train, test in make_cv_splits(ddf):
            y_train = train["trip_time"]
            X_train = train.drop(columns=["trip_time"])
            y_test = test["trip_time"]
            X_test = test.drop(columns=["trip_time"])

            d_train = xgboost.dask.DaskDMatrix(
                client, X_train, y_train, enable_categorical=True
            )
            # NOTE(review): the number of boosting rounds is fixed to 4 here.
            # If study_params carries "n_estimators" (a scikit-learn wrapper
            # name), it is presumably ignored by this native train() call —
            # confirm against the XGBoost documentation.
            model = xgboost.dask.train(
                client,
                {"tree_method": "hist", **study_params},
                d_train,
                num_boost_round=4,
                evals=[(d_train, "train")],
            )
            # Pass the client explicitly, like the DaskDMatrix and train
            # calls above, instead of relying on the ambient current client.
            predictions = xgboost.dask.predict(client, model, X_test)
            score = mean_squared_error(
                y_test.to_dask_array(),
                predictions.to_dask_array(),
                squared=False,
                compute=False,
            )
            # Compute predictions and mean squared error for this iteration
            # while we start the next one
            fold_scores.append(score.reshape(1).persist())
            # Drop the local references to this fold's objects before the
            # next fold starts.
            del d_train, model, predictions, score
            print("-" * 80)

        # Gather all per-fold scores in a single compute call.
        computed_scores = da.concatenate(fold_scores).compute()
        # Convert the NumPy scalar so the result matches the annotation.
        return float(computed_scores.mean())

In [None]:
def objective(trial):
    """Optuna objective: sample one set of hyper-parameters and score it.

    Every hyper-parameter is drawn from ``trial`` within a fixed range, and
    the resulting dictionary is handed to ``train_model``, whose return value
    becomes the value of the trial.
    """
    suggest_int = trial.suggest_int
    suggest_float = trial.suggest_float

    # Parameters are sampled one by one, in the same order as the keys below.
    params = {}
    params["n_estimators"] = suggest_int("n_estimators", 75, 125)
    params["learning_rate"] = suggest_float("learning_rate", 0.5, 0.7)
    params["colsample_bytree"] = suggest_float("colsample_bytree", 0.5, 1)
    params["colsample_bynode"] = suggest_float("colsample_bynode", 0.5, 1)
    params["colsample_bylevel"] = suggest_float("colsample_bylevel", 0.5, 1)
    params["reg_lambda"] = suggest_float("reg_lambda", 0, 1)
    params["max_depth"] = suggest_int("max_depth", 1, 6)
    params["max_leaves"] = suggest_int("max_leaves", 0, 2)
    params["max_cat_to_onehot"] = suggest_int("max_cat_to_onehot", 1, 10)

    trial_score = train_model(params)
    return trial_score

In [None]:
# Run one single study; its N_TRIALS trials are spread over N_JOBS parallel
# jobs, and the wall-clock time of the whole search is reported at the end.
start = datetime.now()
study = optuna.create_study(study_name="parallel-nyc-travel-time-model")
study.optimize(
    objective,
    n_jobs=N_JOBS,
    n_trials=N_TRIALS,
)
elapsed = datetime.now() - start
print(f"Total time:  {elapsed}")

In [None]:
# Tear down running clusters.
# Each entry is removed from the cache once its cluster has been shut down,
# so a later get_ddf() call can never return a client whose cluster is gone
# (thread identifiers may be reused by threads created afterwards).
# Iterate over a snapshot because the dict is modified inside the loop; if a
# shutdown fails, the remaining entries stay cached and the cell can be re-run.
for thread_id, (client, _) in list(clusters.items()):
    client.shutdown()
    del clusters[thread_id]

In [None]:
# Number of trials recorded in the study
len(study.trials)

In [None]:
# Hyper-parameters of the best trial found by the study
study.best_params

In [None]:
# Objective value of the best trial (the mean cross-validation score
# returned by train_model)
study.best_value

In [None]:
# Save the results of the study to a pickle file so you can examine them later.
# Comment this out if you do not want to write the file.

joblib.dump(study, "study_many_threads.pickle")

In [None]:
# Plot the optimization history of the study with optuna's matplotlib backend,
# add a legend, and save the current matplotlib figure as a PNG file in the
# working directory.
# NOTE(review): despite its name, `fig` is presumably a matplotlib Axes rather
# than a Figure (see the optuna docs for this function) — confirm.
fig = optuna.visualization.matplotlib.plot_optimization_history(study)
fig.legend(loc="upper right")
plt.savefig("optimization_history_study_2.png")