# XGBoost.Dask in many threads

Sometimes we want to train many large XGBoost models in parallel.  We do so in this example with ...

1.  The `xgboost.dask` project to do large training runs
2.  Optuna to do hyper-parameter-optimization
3.  A thread pool, to run many of these in parallel
4.  Coiled to launch Dask clusters (but you could swap in your favorite Dask deployment technology as you like)

Using `xgboost.dask` from many threads tooks a couple of small tweaks across projects.  This notebook resulted in the following PRs and issues:

-  https://github.com/dask/distributed/issues/7377
-  https://github.com/dask/dask/pull/9723
-  https://github.com/dask/distributed/pull/7369
-  https://github.com/dmlc/xgboost/pull/8558 (mostly cosmetic, not necessary)
-  Also something in Coiled to allow package_sync to be thread-safe, should be released by 2022-12-07

In [7]:
from distributed import LocalCluster, Client
import dask.dataframe as dd

import optuna
from sklearn.metrics import roc_auc_score, mean_squared_error
import xgboost as xgb
from xgboost.dask import DaskDMatrix
from xgboost import DMatrix

from dask_ml.model_selection import train_test_split, KFold
from dask.datasets import timeseries

In [2]:
import dask, coiled
print("coiled:", coiled.__version__)
print("dask:", dask.__version__)
print("dask.distributed:", dask.distributed.__version__)
print("optuna:", optuna.__version__)
print("xgboost:", xgb.__version__)
print("coiled:", coiled.__version__)

coiled: 0.2.55
dask: 2022.12.0+13.g0d8e12be
dask.distributed: 2022.12.0+17.gf8302593
optuna: 3.0.4
xgboost: 1.7.2
coiled: 0.2.55


In [3]:
def objective(trial):
    params = {
        # 'n_estimators': trial.suggest_int('n_estimators', 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.99),
        'subsample': trial.suggest_float('subsample', 0.1, 0.9),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 9),
    }
    accuracy = cv_estimate(
        trial_number=trial.number,
        clf_params=params, 
        n_splits=train_options["n_splits"]
    ) 
    return accuracy

train_options = dict(
    n_splits = 5 
)

### Load data

In [None]:
ddf = timeseries()
ddf.head()

In [35]:
import dask.dataframe as dd

def load_data():
    ddf = timeseries()
    ddf = ddf.reset_index(drop=True)
    ddf.name = ddf.name.astype("category")
    ddf = ddf.categorize(columns="name")
    X = ddf.drop(columns="y")
    y = ddf.y

    return X.to_dask_array(lengths=True), y.to_dask_array(lengths=True)
    # return X.compute(), y.compute()

In [37]:
A, b = load_data()

In [38]:
A.compute()

array([['Jerry', 997, 0.5703340236376859],
       ['Hannah', 975, -0.9435919675257181],
       ['Zelda', 1027, -0.03062696729018244],
       ...,
       ['Quinn', 1000, -0.22060738702394134],
       ['Hannah', 995, 0.9723931932620258],
       ['Frank', 994, -0.4874377956771685]], dtype=object)


## Dask Dataframe

In [39]:
def load_data():
    ddf = timeseries()
    ddf.name = ddf.name.astype("category")
    ddf = ddf.categorize(columns="name")

    X = ddf.drop(columns="y")
    y = ddf.y
    return X.to_dask_array(lengths=True), y.to_dask_array(lengths=True)
    # return X, y

train_options = dict(
    n_splits = 5 
)

def cv_estimate(trial_number, clf_params, n_splits=5):
    with LocalCluster() as cluster:  # for testing
        with Client(cluster) as client:

            # Load data here
            X, y = load_data()
            X = X.persist()
            y = y.persist()

            val_scores = 0

            dtrain = DaskDMatrix(client, X, y, enable_categorical=True)
            dtest = DaskDMatrix(client, X, y)#, enable_categorical=True)

            model = xgb.dask.train(
                client,
                {
                    'verbosity': 1,
                    'tree_method': 'hist', 
                    "objective": "reg:squarederror",
                    **clf_params
                },
                dtrain,
                num_boost_round=4, 
                evals=[(dtrain, 'train')],
            )
            predictions = xgb.dask.predict(client, model, dtest)

            actual = dask.compute(y)
            predictions = dask.compute(predictions)

            score = mean_squared_error(actual, predictions)
            val_scores += score
    return val_scores / n_splits


study = optuna.create_study()
study.optimize(objective, n_trials=1)

[32m[I 2022-12-17 07:24:16,693][0m A new study created in memory with name: no-name-6404397c-f37f-4057-a6a5-792f06714f1a[0m
Perhaps you already have a cluster running?
Hosting the HTTP server on port 60125 instead
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:60126
INFO:distributed.scheduler:  dashboard at:           127.0.0.1:60125
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:60129'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:60130'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:60131'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:60132'
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:60137', name: 0, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:60137
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:60148
INFO:distributed.scheduler:Regis

ValueError: could not convert string to float: 'Oliver'

cv_estimate(1, {}, data_kwargs)

In [None]:
ddf.name.to_frame().to_dask_array(lengths=True)

In [4]:
## Pandas Version

In [5]:
def load_data():
    ddf = timeseries()
    ddf.name = ddf.name.astype("category")
    ddf = ddf.categorize(columns="name")
    X = ddf.drop(columns="y")
    y = ddf.y
    return X.compute(), y.compute()


def cv_estimate(trial_number, clf_params, n_splits=5):

    # Load data here
    X, y = load_data()
    val_scores = 0

    dtrain = DMatrix(X, y, enable_categorical=True)
    dtest = DMatrix(X, y, enable_categorical=True)

    model = xgb.train(
        {
            'verbosity': 1,
            'tree_method': 'hist', 
            "objective": "reg:squarederror",
            **clf_params
        },
        dtrain,
        num_boost_round=4, 
        evals=[(dtrain, 'train')],
        early_stopping_rounds=1
    )

    predictions = model.predict(dtest)

    score = mean_squared_error(y, predictions)
    val_scores += score
    return val_scores / n_splits


# create a single study
study = optuna.create_study()
study.optimize(objective, n_trials=1)

[32m[I 2022-12-17 07:02:13,081][0m A new study created in memory with name: no-name-16ec0b47-4bf1-40a0-8bef-a09ed7f385b8[0m


[0]	train-rmse:0.57769
[1]	train-rmse:0.57753


[32m[I 2022-12-17 07:02:14,591][0m Trial 0 finished with value: 0.06672154306662705 and parameters: {'learning_rate': 0.9739727695409763, 'subsample': 0.15840055436977452, 'max_depth': 6, 'colsample_bytree': 0.47696236144894666, 'min_child_weight': 5}. Best is trial 0 with value: 0.06672154306662705.[0m


In [20]:
X, y= load_data()

In [11]:
dask.compute(X)

(                        name    id         x
 timestamp                                   
 2000-01-01 00:00:00    Kevin   962 -0.022696
 2000-01-01 00:00:01    Laura  1049  0.455030
 2000-01-01 00:00:02    Kevin   986  0.340772
 2000-01-01 00:00:03  Michael   977 -0.537244
 2000-01-01 00:00:04    Quinn   978  0.512202
 ...                      ...   ...       ...
 2000-01-30 23:59:55  Charlie  1003  0.969770
 2000-01-30 23:59:56    Alice  1025 -0.224312
 2000-01-30 23:59:57   Oliver  1013 -0.003289
 2000-01-30 23:59:58  Charlie  1020  0.729158
 2000-01-30 23:59:59   Oliver   988  0.868284
 
 [2592000 rows x 3 columns],)

In [12]:
dask.compute(y)

(timestamp
 2000-01-01 00:00:00   -0.979115
 2000-01-01 00:00:01   -0.654596
 2000-01-01 00:00:02    0.180137
 2000-01-01 00:00:03    0.432194
 2000-01-01 00:00:04    0.614216
                          ...   
 2000-01-30 23:59:55   -0.735426
 2000-01-30 23:59:56   -0.366024
 2000-01-30 23:59:57    0.715517
 2000-01-30 23:59:58    0.870851
 2000-01-30 23:59:59   -0.693440
 Freq: S, Name: y, Length: 2592000, dtype: float64,)

In [19]:
c = Client()

INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:59017
INFO:distributed.scheduler:  dashboard at:            127.0.0.1:8787
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:59020'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:59021'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:59022'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:59023'
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:59029', name: 0, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:59029
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:59034
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:59028', name: 1, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:59028
INFO:distributed.core:Starting establi

In [22]:
x_ = DaskDMatrix(c, X, enable_categorical=True)

In [23]:
dir(x_)

['__await__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_create_fn_args',
 '_init',
 '_map_local_data',
 '_n_cols',
 'enable_categorical',
 'feature_names',
 'feature_types',
 'feature_weights',
 'is_quantile',
 'missing',
 'num_col',
 'partition_order',
 'worker_map']

In [24]:
dask.compute(x_)

(<xgboost.dask.DaskDMatrix at 0x2985cb2b0>,)