# HPO of XGBoost with Optuna and Dask

In [1]:
from datetime import datetime
import uuid

from distributed import Client
import dask.dataframe as dd
import coiled

import dask
import numpy as np
import pandas as pd
from dask_ml.metrics import mean_squared_error
import xgboost as xgb
from xgboost.dask import DaskDMatrix

import dask.dataframe as dd

In [2]:
# Coiled account used to launch the cluster -- replace with your own account.
ACCOUNT = "dask-engineering"

# Location of the feature table (a Parquet dataset on S3).
FILEPATH = "s3://coiled-datasets/prefect-dask/nyc-uber-lyft/feature_table.parquet"

In [3]:
# Provision a Coiled cluster and connect a Dask client to it.
cluster = coiled.Cluster(
    # --- who pays / where it runs ---
    account=ACCOUNT,                                        # Add your account
    backend_options={"region": "us-east-2", "spot": True},  # Prefer spot instances
    # --- cluster shape ---
    n_workers=20,                                           # 20 dask workers
    worker_vm_types=["m6i.xlarge"],                         # EC2 instance type for workers
    scheduler_vm_types=["m6i.2xlarge"],                     # EC2 instance type for the scheduler
    # --- behaviour ---
    package_sync=True,                                      # copy local packages
    scheduler_options={"idle_timeout": "5 minutes"},        # Shutdown if idle to save cost
    show_widget=False,
)
client = Client(cluster)

# Leave `client` as the last expression so the notebook renders its summary.
client

0,1
Connection method: Cluster object,Cluster type: coiled.ClusterBeta
Dashboard: http://3.12.154.143:8787,

0,1
Dashboard: http://3.12.154.143:8787,Workers: 12
Total threads: 48,Total memory: 178.29 GiB

0,1
Comm: tls://10.0.13.160:8786,Workers: 12
Dashboard: http://10.0.13.160:8787/status,Total threads: 48
Started: Just now,Total memory: 178.29 GiB

0,1
Comm: tls://10.0.1.191:35343,Total threads: 4
Dashboard: http://10.0.1.191:8787/status,Memory: 14.86 GiB
Nanny: tls://10.0.1.191:44303,
Local directory: /scratch/dask-worker-space/worker-0k733f0p,Local directory: /scratch/dask-worker-space/worker-0k733f0p

0,1
Comm: tls://10.0.7.249:35103,Total threads: 4
Dashboard: http://10.0.7.249:8787/status,Memory: 14.86 GiB
Nanny: tls://10.0.7.249:42109,
Local directory: /scratch/dask-worker-space/worker-yg8vxq6m,Local directory: /scratch/dask-worker-space/worker-yg8vxq6m

0,1
Comm: tls://10.0.3.74:38039,Total threads: 4
Dashboard: http://10.0.3.74:8787/status,Memory: 14.86 GiB
Nanny: tls://10.0.3.74:33241,
Local directory: /scratch/dask-worker-space/worker-2uocveef,Local directory: /scratch/dask-worker-space/worker-2uocveef

0,1
Comm: tls://10.0.15.57:41433,Total threads: 4
Dashboard: http://10.0.15.57:8787/status,Memory: 14.86 GiB
Nanny: tls://10.0.15.57:37231,
Local directory: /scratch/dask-worker-space/worker-cej5xy9j,Local directory: /scratch/dask-worker-space/worker-cej5xy9j

0,1
Comm: tls://10.0.7.217:36587,Total threads: 4
Dashboard: http://10.0.7.217:8787/status,Memory: 14.85 GiB
Nanny: tls://10.0.7.217:36987,
Local directory: /scratch/dask-worker-space/worker-h1m721pw,Local directory: /scratch/dask-worker-space/worker-h1m721pw

0,1
Comm: tls://10.0.8.20:34863,Total threads: 4
Dashboard: http://10.0.8.20:8787/status,Memory: 14.85 GiB
Nanny: tls://10.0.8.20:38649,
Local directory: /scratch/dask-worker-space/worker-rlhmxffn,Local directory: /scratch/dask-worker-space/worker-rlhmxffn

0,1
Comm: tls://10.0.4.22:37279,Total threads: 4
Dashboard: http://10.0.4.22:8787/status,Memory: 14.86 GiB
Nanny: tls://10.0.4.22:42209,
Local directory: /scratch/dask-worker-space/worker-rrewpdys,Local directory: /scratch/dask-worker-space/worker-rrewpdys

0,1
Comm: tls://10.0.11.181:42659,Total threads: 4
Dashboard: http://10.0.11.181:8787/status,Memory: 14.86 GiB
Nanny: tls://10.0.11.181:40561,
Local directory: /scratch/dask-worker-space/worker-fmxvgl2o,Local directory: /scratch/dask-worker-space/worker-fmxvgl2o

0,1
Comm: tls://10.0.15.214:37113,Total threads: 4
Dashboard: http://10.0.15.214:8787/status,Memory: 14.86 GiB
Nanny: tls://10.0.15.214:35473,
Local directory: /scratch/dask-worker-space/worker-ooemzvsg,Local directory: /scratch/dask-worker-space/worker-ooemzvsg

0,1
Comm: tls://10.0.10.219:38259,Total threads: 4
Dashboard: http://10.0.10.219:8787/status,Memory: 14.86 GiB
Nanny: tls://10.0.10.219:43617,
Local directory: /scratch/dask-worker-space/worker-fvtpf9p4,Local directory: /scratch/dask-worker-space/worker-fvtpf9p4

0,1
Comm: tls://10.0.10.143:34691,Total threads: 4
Dashboard: http://10.0.10.143:8787/status,Memory: 14.86 GiB
Nanny: tls://10.0.10.143:40849,
Local directory: /scratch/dask-worker-space/worker-_6auoiva,Local directory: /scratch/dask-worker-space/worker-_6auoiva

0,1
Comm: tls://10.0.12.163:45659,Total threads: 4
Dashboard: http://10.0.12.163:8787/status,Memory: 14.86 GiB
Nanny: tls://10.0.12.163:44529,
Local directory: /scratch/dask-worker-space/worker-fej_krl1,Local directory: /scratch/dask-worker-space/worker-fej_krl1


### Train Model

In [5]:
# Make KFolds for cross-validation

def make_cv_splits(df, num_folds, random_state=None):
    """Yield ``(train, test)`` pairs for k-fold cross-validation.

    The frame is split once into ``num_folds`` equally weighted random
    partitions via ``df.random_split(..., shuffle=True)``. Each partition is
    then used as the test set exactly once, with the remaining partitions
    forming the training set.

    Parameters
    ----------
    df
        Object exposing ``random_split(frac, random_state=..., shuffle=...)``,
        e.g. a Dask DataFrame.
    num_folds : int
        Number of folds; must be at least 2 so the training set is non-empty.
    random_state : optional
        Forwarded to ``random_split``. Pass a fixed seed to get the same folds
        on every run; the default ``None`` keeps the previous unseeded behaviour.

    Yields
    ------
    tuple
        ``(train, test)`` where ``train`` is a list of the ``num_folds - 1``
        partitions not held out (concatenate them before use) and ``test`` is
        the held-out partition.

    Raises
    ------
    ValueError
        If ``num_folds`` is smaller than 2. Because this is a generator, the
        error surfaces when iteration starts.
    """
    if num_folds < 2:
        raise ValueError(f"num_folds must be at least 2, got {num_folds}")

    frac = [1 / num_folds] * num_folds
    splits = df.random_split(frac, random_state=random_state, shuffle=True)
    for held_out, test in enumerate(splits):
        # Every partition except the held-out one goes into the training set.
        train = [part for j, part in enumerate(splits) if j != held_out]
        yield train, test

In [6]:
# Load and pre-process the DataFrame, then cross-validate an XGBoost model.

N_FOLDS = 5            # number of cross-validation folds
NUM_BOOST_ROUND = 4    # boosting rounds trained per fold
TARGET = "trip_time"   # column the model predicts

ddf = dd.read_parquet(FILEPATH)
categorical_vars = ddf.select_dtypes(include="category").columns.tolist()
ddf = ddf.categorize(columns=categorical_vars)                   # We need the categories to be `known`
float_cols = ddf.select_dtypes(include="float").columns.tolist()
ddf[float_cols] = ddf[float_cols].astype(np.float32).persist()   # XGBoost converts float64 to float32

val_scores = []

start = datetime.now()
for train, test in make_cv_splits(ddf, N_FOLDS):
    # `train` arrives as a list of partitions; stitch them into one frame.
    train = dd.concat(train)
    y_train = train[TARGET].to_frame().persist()
    X_train = train.drop(columns=[TARGET]).persist()

    # Make the test data
    y_test = test[TARGET].to_frame().persist()
    X_test = test.drop(columns=[TARGET]).persist()

    dtrain = DaskDMatrix(client, X_train, y_train, enable_categorical=True)

    # `model` is rebound on every iteration, so after the loop it holds the
    # result of the last fold only.
    model = xgb.dask.train(
        client,
        {"tree_method": "hist"},
        dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(dtrain, "train")],
    )

    predictions = xgb.dask.predict(client, model, X_test)

    # squared=False asks for the root of the mean squared error (RMSE).
    score = mean_squared_error(
        y_test.to_dask_array(lengths=True).reshape(-1,),
        predictions.to_dask_array(lengths=True),
        squared=False,
    )
    val_scores.append(score)

# Report the elapsed time as a number of seconds so the value matches its label
# (formatting the timedelta directly would print h:mm:ss followed by "seconds").
elapsed = datetime.now() - start
print(f"Total time:  {elapsed.total_seconds():.1f} seconds")
print(np.mean(val_scores))

Total time:  0:05:51.447481 seconds
433.0238076392043


In [7]:
# Training is finished: shut down via the client so the cluster does not keep
# running. Per the distributed docs, Client.shutdown() stops the connected
# scheduler and workers, not just this client's connection.
client.shutdown()

In [8]:
# Display the result of the last `xgb.dask.train` call. `model` is reassigned on
# every cross-validation iteration, so this is the final fold's booster and
# its training-metric history.
model

{'booster': <xgboost.core.Booster at 0x13e2d8e20>,
 'history': {'train': OrderedDict([('rmse',
                [927.6812390694637,
                 688.6282248043086,
                 531.5322083567262,
                 431.284455877504])])}}