# HPO of XGBoost with Optuna and Dask

In the last notebook we trained a single XGBoost model with fixed hyper-parameters.  Those hyper-parameters were likely wrong.  In this notebook we use [Optuna](https://optuna.org/) to perform hyper-parameter optimization (HPO) over a space of parameters to find the best model.  This involves training on the same dataset repeatedly.

The primary difference between this notebook and the previous one is the creation of an `objective` function, and the use of Optuna studies.  At the end we look at the progress during HPO.

In [None]:
from __future__ import annotations

from collections.abc import Iterator
from datetime import datetime

import dask.array as da
import dask.dataframe as dd
import coiled
import distributed
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import optuna
import xgboost
from dask_ml.metrics import mean_squared_error

In [None]:
# Location of the feature table (a Parquet dataset read with dask.dataframe)
FILEPATH = "s3://coiled-datasets/prefect-dask/nyc-uber-lyft/feature_table.parquet"

# Number of Optuna trials to run, one after another, in the study
N_TRIALS = 5
# Number of cross-validation folds in each trial. This also determines the
# train/test split (e.g. N_FOLDS=5 -> train=4/5 of the total data, test=1/5)
N_FOLDS = 5

# Instance type of the Dask workers and how many of them to start
WORKER_INSTANCE_TYPE = "r6i.large"
N_WORKERS = 40

### Start Coiled cluster

**Note:** at the time of writing, the size of the input dataset dictates the amount of RAM that your cluster must provide. With this dataset of ~55 GiB, you need any combination of workers that adds up to 640 GiB of cluster memory, so:
- 80x m6i.large, or
- 40x r6i.large, or
- 40x m6i.xlarge, or
- 20x r6i.xlarge,

and so on.

In [None]:
# Start the Coiled cluster: N_WORKERS workers of WORKER_INSTANCE_TYPE plus a
# small scheduler, then connect a Dask client to it.
cluster = coiled.Cluster(
    # Sizing
    n_workers=N_WORKERS,
    worker_vm_types=[WORKER_INSTANCE_TYPE],
    scheduler_vm_types=["m6i.large"],
    # Software environment
    package_sync=True,  # align remote packages to local ones
    # Cloud placement and pricing options
    backend_options=dict(
        region="us-east-2",
        multizone=True,
        spot=True,
        spot_on_demand_fallback=True,
    ),
    scheduler_options={"idle_timeout": "15 minutes"},
)
client = distributed.Client(cluster)

In [None]:
# Load feature table generated by Feature Engineering.ipynb
ddf = dd.read_parquet(FILEPATH)

# Reduce dataset size. Uncomment to speed up the exercise.
# ddf = ddf.partitions[:20]

# Under the hood, XGBoost converts floats to `float32`.
# Let's do it only once here, instead of once per training run.
float_cols = ddf.select_dtypes(include="float").columns.tolist()
ddf = ddf.astype({c: np.float32 for c in float_cols})

# We need the categories to be known, so collect the categorical columns
# that categorize() will be applied to below.
categorical_vars = ddf.select_dtypes(include="category").columns.tolist()

# categorize() reads the whole input and then discards it.
# Persist first so that the data is read from disk only once.
ddf = ddf.persist()
ddf = ddf.categorize(columns=categorical_vars)

# We will need to access this multiple times (every fold of every trial
# splits `ddf` again). Let's persist it.
ddf = ddf.persist()

# Show the first rows as the notebook cell output
ddf.head()

### Train Model

In [None]:
# Here we subset data for cross-validation
def make_cv_splits(
    n_folds: int = N_FOLDS,
) -> Iterator[tuple[dd.DataFrame, dd.DataFrame]]:
    """Yield one ``(train, test)`` pair of Dask DataFrames per fold.

    The global ``ddf`` is randomly split, with shuffling, into ``n_folds``
    parts of equal fraction. Each part is used once as the test set, while
    the remaining parts are concatenated into the matching training set.
    """
    folds = ddf.random_split([1 / n_folds] * n_folds, shuffle=True)
    for held_out in range(n_folds):
        # Everything except the held-out fold goes into the training set
        training_folds = [
            fold for position, fold in enumerate(folds) if position != held_out
        ]
        yield dd.concat(training_folds), folds[held_out]

In [None]:
def train_model(**study_params):
    """Cross-validate one set of hyper-parameters and return the mean RMSE.

    For each train/test pair yielded by ``make_cv_splits()``, a booster is
    trained on the training folds with ``xgboost.dask.train`` and then used
    to predict ``trip_time`` on the held-out fold.

    Parameters
    ----------
    **study_params
        Extra XGBoost parameters. They are merged over
        ``{"tree_method": "hist"}`` and can therefore override it.

    Returns
    -------
    The mean, over all folds, of the root-mean-squared error between the
    predictions and the true ``trip_time`` of the held-out fold.
    """
    # NOTE(review): the number of boosting rounds is fixed by
    # ``num_boost_round=4`` below, while the study also suggests
    # ``n_estimators``. Confirm whether that parameter has any effect when
    # passed through the params dict of ``xgboost.dask.train``.
    scores = []

    for i, (train, test) in enumerate(make_cv_splits()):
        print(f"Training/Test split #{i}")
        y_train = train["trip_time"]
        X_train = train.drop(columns=["trip_time"])
        y_test = test["trip_time"]
        X_test = test.drop(columns=["trip_time"])

        print("Building DMatrix...")
        d_train = xgboost.dask.DaskDMatrix(
            None, X_train, y_train, enable_categorical=True
        )

        print("Training model...")
        model = xgboost.dask.train(
            None,
            {"tree_method": "hist", **study_params},
            d_train,
            num_boost_round=4,
            evals=[(d_train, "train")],
        )

        print("Running model on test data...")
        predictions = xgboost.dask.predict(None, model, X_test)

        print("Measuring accuracy of model vs. ground truth...")
        # squared=False asks for the root of the mean squared error;
        # compute=False keeps the result lazy so that it can be persisted.
        score = mean_squared_error(
            y_test.to_dask_array(),
            predictions.to_dask_array(),
            squared=False,
            compute=False,
        )
        # Compute predictions and mean squared error for this iteration
        # while we start the next one
        scores.append(score.reshape(1).persist())
        # Drop the local references to this fold's objects before moving on
        del d_train, model, predictions, score
        print("-" * 80)

    # Gather the per-fold scores into a single local array
    scores = da.concatenate(scores).compute()
    mean_score = scores.mean()
    print(f"RMSE={mean_score} +/- {scores.std()}")
    # Print the closing separator before returning; it used to sit after the
    # ``return`` statement and was never executed.
    print("-" * 80)
    return mean_score


def objective(trial):
    """Optuna objective: sample hyper-parameters, train, and return the score.

    One value per hyper-parameter is requested from ``trial`` (in the order
    listed below), the chosen values are printed, and the result of
    ``train_model`` called with them is returned.
    """
    # (parameter name, suggestion method, lower bound, upper bound)
    search_space = (
        ("n_estimators", trial.suggest_int, 75, 125),
        ("learning_rate", trial.suggest_float, 0.5, 0.7),
        ("colsample_bytree", trial.suggest_float, 0.5, 1),
        ("colsample_bynode", trial.suggest_float, 0.5, 1),
        ("colsample_bylevel", trial.suggest_float, 0.5, 1),
        ("reg_lambda", trial.suggest_float, 0, 1),
        ("max_depth", trial.suggest_int, 1, 6),
        ("max_leaves", trial.suggest_int, 0, 2),
        ("max_cat_to_onehot", trial.suggest_int, 1, 10),
    )
    params = {
        name: suggest(name, low, high)
        for name, suggest, low, high in search_space
    }

    print(f"Training model (trial #{trial.number})")
    print("\n".join(f"  {name}={value}" for name, value in params.items()))
    return train_model(**params)

In [None]:
# Create a single study, run N_TRIALS trials of `objective` in it, and report
# how long the whole search took.
start = datetime.now()
study = optuna.create_study(
    study_name="nyc-travel-time-model",
    direction="minimize",  # Optuna's default, spelled out: lower RMSE is better
)
study.optimize(objective, n_trials=N_TRIALS)

print(f"Total time:  {datetime.now() - start}")

In [None]:
# All trials have run, so the cluster is no longer needed. The cells below
# only read from the local `study` object.
cluster.shutdown()

In [None]:
# Hyper-parameter values of the best trial in the study
study.best_params

In [None]:
# Objective value of the best trial, i.e. the mean cross-validated RMSE
# returned by train_model() for that trial
study.best_value

In [None]:
# The best trial object itself
study.best_trial

In [None]:
# Save the results of your study to examine later.
# The file is a pickle: restore it with joblib.load(), and only load files
# that come from a source you trust.
joblib.dump(study, "study_single_cluster.pickle")

In [None]:
# Plot the objective value of each trial to see how the search progressed.
# NOTE(review): Optuna documents the matplotlib backend of
# plot_optimization_history as returning an Axes rather than a Figure, so
# despite its name `fig` is presumably an Axes - confirm.
fig = optuna.visualization.matplotlib.plot_optimization_history(study)
fig.legend(loc="upper right")
# Write the plot to a PNG file
plt.savefig("optimization_history_study_1.png")