# HPO of XGBoost with Optuna and Dask

In [1]:
import datetime
import joblib
import uuid

from distributed import Client, wait
import dask.dataframe as dd
import coiled

import dask
import numpy as np
import pandas as pd
import optuna
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from xgboost.dask import DaskDMatrix

import dask.dataframe as dd
from xgboost.core import XGBoostError

In [2]:
ACCOUNT="dask-engineering"                                  # <-- This is your account
FILEPATH="s3://coiled-datasets/prefect-dask/nyc-uber-lyft/feature_table.parquet"  # <-- Location of the feature table
WORKER_INSTANCE_TYPE = "m6i.4xlarge"                        # <-- EC2 instance size
CLUSTER_NAME = f"nyc-uber-lyft-{uuid.uuid1().hex}"          # <-- This allows us to resuse the cluster across trials
CLUSTERS = {}    # Dict for collecting clusters for shutting down when done

### Train Model

In [3]:
# Here we subset data for cross-validation
def _make_cv_split(df, num_folds):
    frac = [1 / num_folds]*num_folds
    splits = df.random_split(frac, shuffle=True)
    for i in range(num_folds):
        train = [splits[j] for j in range(num_folds) if j != i]
        test = splits[i]
        yield train, test

In [4]:
def train_model(trial_number, study_params, n_splits=5):

    cluster = CLUSTERS.get(CLUSTER_NAME, None)
    if cluster is None:
        cluster = coiled.Cluster(
            worker_vm_types=[WORKER_INSTANCE_TYPE],
            scheduler_vm_types=["m6i.2xlarge"],
            package_sync=True, # copy local packages
            name=CLUSTER_NAME,
            shutdown_on_close=True,
            show_widget=False,
            n_workers=10,
            use_best_zone=True,
            account=ACCOUNT,
            backend_options={"region": "us-east-2", "spot": True},
            scheduler_options={"idle_timeout": "10 minutes"},
            )
    CLUSTERS[CLUSTER_NAME] = cluster

    print("starting run")
    with Client(cluster) as client:
        
        # Load and pre-process the DataFrame
        ddf = dd.read_parquet(FILEPATH)
        categorical_vars = ddf.select_dtypes(include="category").columns.tolist()
        ddf = ddf.categorize(columns=categorical_vars)     # We need to categories to be `known`
        float_cols = ddf.select_dtypes(include="float").columns.tolist()
        ddf[float_cols] = ddf[float_cols].astype(np.float32).persist()  # Under the hood, XGBoost converts floats to `float32`
        
        val_scores = []

        for i, (train, test) in enumerate(_make_cv_split(ddf, n_splits)):
            print(f"Starting training run {i}")
            start = datetime.datetime.now()
            train = dd.concat(train)

            try:
                assert all(train[c].cat.known for c in categorical_vars)
                assert all(test[c].cat.known for c in categorical_vars)
            except Exception as e:
                cluster.shutdown()
                raise RuntimeError(f"Categorical_vars are not known")

            y_train = train['trip_time'].to_frame().persist()
            X_train = train.drop(columns=['trip_time']).persist()
            
            # Make the test data
            y_test = test['trip_time'].to_frame().persist()
            X_test = test.drop(columns='trip_time').persist()

            try:
                print("Make dtrain")
                dtrain = DaskDMatrix(client, X_train, y_train, enable_categorical=True)

                print("Training model")
                model = xgb.dask.train(
                    client,
                    {
                        'verbosity': 1,
                        'tree_method': 'hist', 
                        "objective": "reg:squarederror",
                        **study_params
                    },
                    dtrain,
                    num_boost_round=4,
                    evals=[(dtrain, "train")],
                )

                print("Make predictions")
                predictions = xgb.dask.predict(client, model, X_test)

                # Materialize the predictions and y_test
                y_test = y_test.compute().to_numpy().reshape(-1,)
                predictions = predictions.compute().to_numpy()

                print("Score the model")
                score = mean_squared_error(y_test, predictions, squared=False)
                
                print(f"rmse_score:  {score}")
                val_scores.append(score)
                print(f"val_scores:  {val_scores}")
                print(f"Finished training run in:  {datetime.datetime.now() - start} seconds")

            except XGBoostError as e:
                print(f"Trial {i} failed with {e}")

        return np.mean(val_scores)

In [5]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 125),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.9),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 3),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0, 1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
    }
    rmse = train_model(
        trial_number=trial.number,
        study_params=params, 
        n_splits=5,
    )
    print(f"final mse:  {rmse}")
    return rmse

In [None]:
# Create a single study and run some trials

study = optuna.create_study(study_name="nyc-travel-time-model")    
study.optimize(objective, n_trials=2)

for c in CLUSTERS.values():
    c.shutdown()

[32m[I 2023-01-12 20:49:46,864][0m A new study created in memory with name: nyc-travel-time-model[0m


starting run
Starting training run 0
Make dtrain
Training model
Make predictions
Score the model
rmse_score:  543.8064939574474
val_scores:  [543.8064939574474]
Finished training run in:  0:04:26.521681 seconds
Starting training run 1
Make dtrain
Training model
Make predictions
Score the model
rmse_score:  566.2642121296011
val_scores:  [543.8064939574474, 566.2642121296011]
Finished training run in:  0:04:16.372493 seconds
Starting training run 2
Make dtrain
Training model
Make predictions
Score the model
rmse_score:  576.8750530144338
val_scores:  [543.8064939574474, 566.2642121296011, 576.8750530144338]
Finished training run in:  0:04:35.971731 seconds
Starting training run 3
Make dtrain
Training model
Make predictions


In [None]:
len(study.trials)

In [None]:
study.best_params

In [None]:
study.best_value

In [None]:
study.best_trial

In [None]:
# Uncomment this if you want to save the results of your study to examine later.

#joblib.dump(study, "data/study_m6i4xlarge.pkl")