# HPO of XGBoost with Optuna and Dask

In [None]:
from datetime import datetime
import uuid

from distributed import Client
import dask.dataframe as dd
import coiled

import dask
import numpy as np
import pandas as pd
from dask_ml.metrics import mean_squared_error
import xgboost as xgb
from xgboost.dask import DaskDMatrix

import dask.dataframe as dd

In [None]:
# Coiled account the cluster is launched under (passed as `account=` to
# `coiled.Cluster` below); replace with your own account name.
ACCOUNT = "dask-engineering"
# S3 location of the Parquet feature table read with `dd.read_parquet`
FILEPATH = "s3://coiled-datasets/prefect-dask/nyc-uber-lyft/feature_table.parquet"

In [None]:
# Launch a Coiled-managed Dask cluster, then connect a client to it.
cluster = coiled.Cluster(
    account=ACCOUNT,  # Add your account
    n_workers=20,  # 20 dask workers
    worker_vm_types=["m6i.xlarge"],  # EC2 instance type used for the workers
    scheduler_vm_types=["m6i.2xlarge"],  # EC2 instance type used for the scheduler
    package_sync=True,  # copy local packages
    show_widget=False,
    scheduler_options={"idle_timeout": "5 minutes"},  # Shutdown if idle to save cost
    backend_options={"region": "us-east-2", "spot": True},  # Prefer spot instances
)
client = Client(cluster)
# Last expression of the cell: the notebook renders the client summary.
client

### Train Model

In [None]:
# Make KFolds for cross-validation


def make_cv_splits(df, num_folds, random_state=None):
    """Yield ``(train, test)`` pairs for k-fold cross-validation.

    ``df`` is cut into ``num_folds`` equally weighted random pieces with
    ``df.random_split(..., shuffle=True)``. Each piece serves once as the
    held-out test set while the remaining pieces form the training set.

    Parameters
    ----------
    df
        Object exposing ``random_split(frac, random_state=..., shuffle=...)``;
        in this notebook a Dask DataFrame.
    num_folds : int
        Number of folds. Must be at least 2, otherwise there is no data left
        to train on.
    random_state : optional
        Forwarded unchanged to ``df.random_split`` so the folds can be made
        reproducible. The default ``None`` keeps the previous unseeded
        behaviour.

    Yields
    ------
    tuple
        ``(train, test)`` where ``train`` is a list of the ``num_folds - 1``
        remaining pieces (not yet concatenated) and ``test`` is the held-out
        piece.

    Raises
    ------
    ValueError
        If ``num_folds`` is smaller than 2. Because this is a generator, the
        error surfaces when iteration starts.
    """
    if num_folds < 2:
        raise ValueError(f"num_folds must be at least 2, got {num_folds!r}")

    frac = [1 / num_folds] * num_folds
    splits = df.random_split(frac, random_state=random_state, shuffle=True)
    for i, test in enumerate(splits):
        # Everything except the i-th piece is training data for this fold.
        train = [part for j, part in enumerate(splits) if j != i]
        yield train, test

In [None]:
# Read the feature table and prepare its dtypes for XGBoost

ddf = dd.read_parquet(FILEPATH)

# Categorical columns must have `known` categories, so categorize them up front.
categorical_vars = list(ddf.select_dtypes(include="category").columns)
ddf = ddf.categorize(columns=categorical_vars)

# XGBoost converts float64 to float32, so downcast the float columns
# ourselves and persist the converted columns.
float_cols = list(ddf.select_dtypes(include="float").columns)
ddf[float_cols] = ddf[float_cols].astype(np.float32).persist()

# 5-fold cross-validation: train an XGBoost model on each fold's training
# pieces and score it on the held-out piece.
val_scores = []  # one score per fold

start = datetime.now()
for train, test in make_cv_splits(ddf, 5):
    # `train` arrives as a list of pieces; stitch them into one DataFrame.
    train = dd.concat(train)
    y_train = train["trip_time"].to_frame().persist()
    X_train = train.drop(columns=["trip_time"]).persist()

    # Make the test data
    y_test = test["trip_time"].to_frame().persist()
    X_test = test.drop(columns=["trip_time"]).persist()

    dtrain = DaskDMatrix(client, X_train, y_train, enable_categorical=True)

    model = xgb.dask.train(
        client,
        {"tree_method": "hist"},
        dtrain,
        num_boost_round=4,
        evals=[(dtrain, "train")],
    )

    predictions = xgb.dask.predict(client, model, X_test)

    # `y_test` is a one-column frame, so flatten it to 1-D before comparing
    # it with the predictions. `squared=False` asks for the root of the mean
    # squared error (RMSE) rather than the MSE itself.
    score = mean_squared_error(
        y_test.to_dask_array(lengths=True).reshape(-1),
        predictions.to_dask_array(lengths=True),
        squared=False,
    )
    val_scores.append(score)

# Report the elapsed time as a number of seconds so it matches the label;
# formatting the raw timedelta would print `H:MM:SS.ffffff seconds`.
elapsed_seconds = (datetime.now() - start).total_seconds()
print(f"Total time:  {elapsed_seconds:.2f} seconds")
print(np.mean(val_scores))

In [None]:
# Training is finished: shut down the cluster that `client` is connected to.
client.shutdown()

In [None]:
# Display the value returned by `xgb.dask.train` for the last CV fold
# (`model` is reassigned on every iteration of the loop above).
model