# XGBoost.Dask in many threads

Sometimes we want to train many large XGBoost models in parallel.  We do so in this example with ...

1.  The `xgboost.dask` project to do large training runs
2.  Optuna to do hyper-parameter optimization
3.  A thread pool, to run many of these in parallel
4.  Coiled to launch Dask clusters (but you could swap in your favorite Dask deployment technology as you like)


In [None]:
# Parquet feature table on S3; read by `train_model` at the start of every run.
FILEPATH="s3://prefect-dask-examples/nyc-uber-lyft/feature_table_fixed_upper_bound.parquet"

# VM type requested for the Dask workers.
# Instance type can be one of "m6i.xlarge", "m6i.2xlarge", or "m6i.4xlarge"
WORKER_INSTANCE_TYPE = "m6i.2xlarge"

In [None]:
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor

from distributed import Client, wait

import dask.dataframe as dd
from coiled import Cluster
import coiled

import optuna
from dask_ml.metrics import mean_squared_error as lazy_mse
import xgboost as xgb
from xgboost.dask import DaskDMatrix

import dask.array as da
import dask.dataframe as dd
from s3fs import S3FileSystem
from xgboost.core import XGBoostError
import numpy as np
import joblib
import pandas as pd
import dask

In [None]:
# Report the versions of the key libraries so a run can be reproduced.
# Each package is listed exactly once.
print("coiled:", coiled.__version__)
print("dask:", dask.__version__)
print("dask.distributed:", dask.distributed.__version__)
print("optuna:", optuna.__version__)
print("xgboost:", xgb.__version__)

### Train Model

In [None]:
# Here we subset data for cross-validation

def _make_cv(df, num_folds):
    """Yield ``(train_parts, test_part)`` pairs for k-fold cross-validation.

    ``df`` is cut into ``num_folds`` equally weighted pieces with
    ``df.random_split(..., shuffle=True)``.  Every piece serves as the test
    part exactly once; the remaining pieces are yielded as a list and are
    NOT concatenated here -- that is left to the caller.
    """
    fold_weight = 1 / num_folds
    pieces = df.random_split([fold_weight] * num_folds, shuffle=True)
    fold_ids = range(num_folds)
    for held_out in fold_ids:
        training_parts = [pieces[k] for k in fold_ids if k != held_out]
        yield training_parts, pieces[held_out]


In [None]:
def train_model(trial_number, study_params, n_splits=5, cluster_name=None):
    """Cross-validate one XGBoost configuration on a Coiled-hosted Dask cluster.

    The feature table at ``FILEPATH`` is loaded, categorical columns are
    categorized, float columns are cast to ``float32``, and the data is split
    into ``n_splits`` folds by ``_make_cv``.  For every fold a model is trained
    with ``xgb.dask.train`` and scored on the held-out part.

    Parameters
    ----------
    trial_number
        Identifier of the calling trial; used only in log messages.
    study_params : dict
        Hyper-parameters merged over the base XGBoost params (they win on
        key collisions).
    n_splits : int
        Number of cross-validation folds.
    cluster_name : str, optional
        Name of the Coiled cluster.  Defaults to ``"xgb-nyc-taxi-<thread id>"``
        so each calling thread gets its own cluster name.

    Returns
    -------
    float
        Mean of the per-fold RMSE scores (``squared=False``), or NaN when no
        fold produced a score.

    Raises
    ------
    RuntimeError
        If the categories of any categorical column are not known on the
        train or test split.  The cluster is shut down before raising.

    Notes
    -----
    Folds that fail with ``XGBoostError`` are logged and skipped.  The cluster
    is created with ``shutdown_on_close=False`` and is left running on normal
    return so it can be reused across runs.
    """
    if cluster_name is None:
        cluster_name = f"xgb-nyc-taxi-{threading.get_ident()}"
    cluster = coiled.Cluster(
        worker_vm_types=[WORKER_INSTANCE_TYPE],
        scheduler_vm_types=["m6i.2xlarge"],
        package_sync=True,  # copy local packages
        name=cluster_name,
        shutdown_on_close=False,  # reuse cluster across runs
        show_widget=False,
        n_workers=10,
        use_best_zone=True,
        account="dask-engineering",
        backend_options={"region": "us-east-2", "spot": True, "spot_on_demand_fallback": True},
        scheduler_options={"idle_timeout": "10 minutes"},
    )

    print("starting run")
    with Client(cluster) as client:
        with client.as_current():
            # Load and pre-process the DataFrame
            ddf = dd.read_parquet(FILEPATH)
            categorical_vars = ddf.select_dtypes(include="category").columns.tolist()
            ddf = ddf.categorize(columns=categorical_vars)
            float_cols = ddf.select_dtypes(include="float").columns.tolist()
            # Under the hood, XGBoost converts floats to `float32`
            ddf[float_cols] = ddf[float_cols].astype(np.float32).persist()

            val_scores = []

            for i, (train, test) in enumerate(_make_cv(ddf, n_splits)):
                print(f"Starting training run {i}")
                start = datetime.datetime.now()
                train = dd.concat(train)

                # Explicit check instead of `assert`, which is stripped under
                # `python -O`.  Any error raised while inspecting the columns
                # is reported the same way, with the cause chained.
                try:
                    unknown = [
                        c for c in categorical_vars
                        if not (train[c].cat.known and test[c].cat.known)
                    ]
                    if unknown:
                        raise ValueError(f"categories not known for columns: {unknown}")
                except Exception as e:
                    cluster.shutdown()
                    raise RuntimeError("Categorical_vars are not known") from e

                # Training split: target and features
                y_train = train['trip_time'].to_frame().persist()
                X_train = train.drop(columns=['trip_time']).persist()

                # Held-out split: target and features
                y_test = test['trip_time'].to_frame().persist()
                X_test = test.drop(columns='trip_time').persist()

                try:
                    print("Make dtrain")
                    dtrain = DaskDMatrix(client, X_train, y_train, enable_categorical=True)

                    print("Training model")

                    # NOTE(review): the number of boosting rounds is fixed at 4;
                    # an `n_estimators` entry in study_params is only passed
                    # through in the params dict -- confirm whether it is meant
                    # to drive `num_boost_round`.
                    model = xgb.dask.train(
                        client,
                        {
                            'verbosity': 2,
                            'tree_method': 'hist',
                            "objective": "reg:squarederror",
                            **study_params
                        },
                        dtrain,
                        num_boost_round=4,
                        evals=[(dtrain, "train")],
                    )

                    print("Make predictions")
                    # It's faster to run the prediction directly on X_test DataFrame
                    # We also need to confirm that predictions on a DaskDMatrix
                    # containing categoricals perform as expected
                    predictions = xgb.dask.predict(client, model, X_test)

                    print("Score the model")
                    score = lazy_mse(
                        y_test.to_dask_array(lengths=True).reshape(-1,),
                        predictions.to_dask_array(lengths=True),
                        squared=False,
                    )
                    wait(score)  # Explicitly waiting avoids https://github.com/dask/distributed/issues/4612
                    print(f"rmse_score:  {score}")
                    val_scores.append(score)
                    print(f"val_scores:  {val_scores}")
                    elapsed = datetime.datetime.now() - start
                    print(f"Finished training run in:  {elapsed.total_seconds():.1f} seconds")

                except XGBoostError as e:
                    # Best effort: a failed fold is logged and skipped.
                    print(f"Fold {i} of trial {trial_number} failed with {e}")

        if not val_scores:
            # Every fold failed; return NaN explicitly rather than relying on
            # np.mean([]) and its RuntimeWarning.
            print(f"Trial {trial_number}: no fold produced a score")
            return float("nan")
        return np.mean(val_scores)

In [None]:
# Keyword options forwarded to `train_model` by the Optuna objective.
train_options = {"n_splits": 5}

In [None]:
def objective(trial):
    """Optuna objective: sample hyper-parameters and return the CV score.

    Draws one hyper-parameter set from ``trial``, runs ``train_model`` with
    ``train_options["n_splits"]`` folds, and returns the value it reports
    (the mean per-fold RMSE).
    """
    # NOTE(review): the colsample_* ranges start at 0, while the XGBoost
    # parameter docs give the valid range as (0, 1] -- confirm that a sampled
    # value of exactly 0 is acceptable.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 125),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.9),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 3),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0, 1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
    }
    rmse = train_model(
        trial_number=trial.number,
        study_params=params,
        n_splits=train_options["n_splits"],
    )
    # The value is a root-mean-squared error, so label it as such.
    print(f"final rmse:  {rmse}")
    return rmse

In [None]:
# One study shared by all worker threads.
study = optuna.create_study(study_name="nyc-taxi-study-smaller-instances")

# Four pool threads; each is handed one single-trial `optimize` call against
# the shared study.  The futures are kept so their outcome can be collected
# later.
executor = ThreadPoolExecutor(max_workers=4)

futures = [
    executor.submit(study.optimize, objective, n_trials=1)
    for _ in range(4)
]


In [None]:
# Show the hyper-parameters of the best trial recorded in the study.
# NOTE(review): the `optimize` calls submitted to the thread pool are not
# awaited before this cell; presumably it is run by hand once trials finish.
study.best_params

In [None]:
# Show the objective value of the best trial (the value `objective` returned).
study.best_value

In [None]:
# Show the full record of the best trial.
study.best_trial

In [None]:
# Number of trials currently recorded in the study.
len(study.trials)

In [None]:
# Persist the study object to disk.
# NOTE(review): assumes a `data/` directory already exists relative to the
# working directory -- confirm.
joblib.dump(study, "data/study.pkl")

In [None]:
# Block until the first submitted `optimize` call has finished; `.result()`
# re-raises any exception that call hit.  Only futures[0] is checked here;
# the other futures are not inspected.
f = futures[0].result()

In [None]:
# NOTE(review): no module-level `cluster` is defined in the visible notebook;
# the only binding is a local variable inside `train_model`.  As written this
# raises NameError unless `cluster` was created elsewhere in the session.
# The clusters started by `train_model` are named per thread and are left
# running on normal return (`shutdown_on_close=False`), so they need their own
# cleanup -- TODO confirm.
cluster.shutdown()

In [None]:
# Exploratory: list the attributes of the `distributed.Client` class.
dir(Client)

In [None]:
# Exploratory: list the attributes of the `coiled.Cluster` class.
dir(Cluster)