# XGBoost.Dask in many threads

Sometimes we want to train many large XGBoost models in parallel.  We do so in this example with ...

1.  The `xgboost.dask` project to do large training runs
2.  Optuna to do hyper-parameter-optimization
3.  A thread pool, to run many of these in parallel
4.  Coiled to launch Dask clusters (but you could swap in your favorite Dask deployment technology as you like)

Using `xgboost.dask` from many threads took a couple of small tweaks across projects.  This notebook resulted in the following PRs and issues:

-  https://github.com/dask/distributed/issues/7377
-  https://github.com/dask/dask/pull/9723
-  https://github.com/dask/distributed/pull/7369
-  https://github.com/dmlc/xgboost/pull/8558 (mostly cosmetic, not necessary)
-  A change in Coiled that makes `package_sync` thread-safe (expected to be released by 2022-12-07)

In [1]:
from distributed import LocalCluster, Client
import dask.dataframe as dd

import optuna
from sklearn.metrics import roc_auc_score, mean_squared_error
import xgboost as xgb
from xgboost.dask import DaskDMatrix
from xgboost import DMatrix

from dask_ml.model_selection import train_test_split, KFold
from dask.datasets import timeseries
import pandas as pd

In [2]:
import dask
import coiled

# Report the version of every library this notebook relies on, one per line.
# "coiled" appears twice on purpose: it is printed both first and last.
for label, module in (
    ("coiled:", coiled),
    ("dask:", dask),
    ("dask.distributed:", dask.distributed),
    ("optuna:", optuna),
    ("xgboost:", xgb),
    ("coiled:", coiled),
):
    print(label, module.__version__)

coiled: 0.2.55
dask: 2022.12.0+13.g0d8e12be
dask.distributed: 2022.12.0+17.gf8302593
optuna: 3.0.4
xgboost: 1.7.2
coiled: 0.2.55


In [3]:
def objective(trial):
    """Optuna objective: sample XGBoost hyper-parameters and score them.

    One candidate configuration is drawn from ``trial`` and handed to
    ``cv_estimate`` together with the trial number and the ``n_splits`` entry
    of the module-level ``train_options``.  Whatever ``cv_estimate`` returns
    is returned unchanged.
    """
    # 'n_estimators' (suggest_int between 5 and 100) is deliberately not tuned.
    search_space = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.99),
        subsample=trial.suggest_float("subsample", 0.1, 0.9),
        max_depth=trial.suggest_int("max_depth", 1, 10),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.1, 0.9),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 9),
    )
    return cv_estimate(
        trial_number=trial.number,
        clf_params=search_space,
        n_splits=train_options["n_splits"],
    )

# Settings shared by every trial; ``objective`` reads ``n_splits`` from here.
train_options = {"n_splits": 5}

### Load data

In [4]:
# A single all-missing row, typed with pandas' nullable extension dtypes
# (Int32 for ``id``, Float32 for ``x`` and ``y``) and wrapped in a
# one-partition Dask DataFrame.
df = pd.DataFrame(data=[[None, None, None]], columns=["id", "x", "y"])
df = df.astype(
    {
        "id": pd.Int32Dtype(),
        "x": pd.Float32Dtype(),
        "y": pd.Float32Dtype(),
    }
)
ddf2 = dd.from_pandas(df, npartitions=1)
ddf2.dtypes

id      Int32
x     Float32
y     Float32
dtype: object

In [5]:
# Demo timeseries without its index and ``name`` column, cast to the same
# nullable dtypes as ``ddf2``, with the all-missing ``ddf2`` row appended.
ddf = timeseries().reset_index(drop=True).drop(columns="name")
ddf[["x", "y"]] = ddf[["x", "y"]].astype(pd.Float32Dtype())
ddf["id"] = ddf["id"].astype(pd.Int32Dtype())
ddf = dd.concat([ddf, ddf2], axis=0)
ddf.dtypes

id      Int32
x     Float32
y     Float32
dtype: object

In [9]:
import dask.dataframe as dd

def load_data():
    """Build ``(X, y)`` Dask arrays from the demo timeseries with nullable dtypes.

    The index and the ``name`` column are discarded, ``x``/``y`` are cast to
    pandas' nullable ``Float32`` dtype and ``id`` to nullable ``Int32``, and
    the module-level ``ddf2`` frame is concatenated underneath.  ``y`` is the
    target; every other column goes into ``X``.

    NOTE: the recorded runs of this function in this notebook end in
    ``TypeError: Cannot interpret 'Float32Dtype()' as a data type``.
    """
    frame = timeseries().reset_index(drop=True).drop(columns="name")
    frame[["x", "y"]] = frame[["x", "y"]].astype(pd.Float32Dtype())
    frame["id"] = frame["id"].astype(pd.Int32Dtype())
    frame = dd.concat([frame, ddf2], axis=0)

    features = frame.drop(columns="y")
    target = frame["y"]
    return features.to_dask_array(), target.to_dask_array()

In [11]:
# Load the feature/target arrays and show the element type of the features.
# A Dask array exposes ``dtype`` (singular); ``dtypes`` only exists on
# DataFrames, so asking an array for it would raise AttributeError.
A, b = load_data()
A.dtype

TypeError: Cannot interpret 'Float32Dtype()' as a data type

In [7]:
# Re-run the loader, keeping the features in ``A`` and the target in ``b``.
A, b = load_data()

TypeError: Cannot interpret 'Float32Dtype()' as a data type

In [None]:
# Materialize the feature array; ``A`` must come from a successful load_data().
A.compute()


## Dask Dataframe

In [None]:
def load_data():
    """Return the demo timeseries as ``(X, y)`` Dask arrays.

    The ``name`` column is cast to ``category`` and categorized first.  ``y``
    is the target and all remaining columns form ``X``.  Both conversions pass
    ``lengths=True`` to ``to_dask_array``.
    """
    frame = timeseries()
    frame["name"] = frame["name"].astype("category")
    frame = frame.categorize(columns="name")

    features, target = frame.drop(columns="y"), frame["y"]
    return (
        features.to_dask_array(lengths=True),
        target.to_dask_array(lengths=True),
    )

# Settings shared by every trial; ``objective`` reads ``n_splits`` from here.
train_options = {"n_splits": 5}

def cv_estimate(trial_number, clf_params, n_splits=5):
    """Train one distributed XGBoost model and return its mean squared error.

    A temporary ``LocalCluster`` and ``Client`` are created for the duration
    of the call.  The data from ``load_data()`` is persisted, a model is
    trained for 4 boosting rounds, and predictions on that same data are
    scored against the true target.

    Parameters
    ----------
    trial_number : int
        Identifier of the calling trial.  Currently unused.
    clf_params : dict
        XGBoost parameters merged over the fixed defaults (``verbosity``,
        ``tree_method``, ``objective``); entries here take precedence.
    n_splits : int, default 5
        Kept for interface compatibility.  Only one fit/evaluation is
        performed, so this value does not influence the result.

    Returns
    -------
    float
        Mean of the collected scores.  With the single evaluation done here
        this is the plain MSE; it is no longer divided by ``n_splits``, which
        previously understated the error by that factor.
    """
    with LocalCluster() as cluster, Client(cluster) as client:  # for testing
        # Load data here
        X, y = load_data()
        X = X.persist()
        y = y.persist()

        dtrain = DaskDMatrix(client, X, y, enable_categorical=True)
        dtest = DaskDMatrix(client, X, y)  # , enable_categorical=True)

        model = xgb.dask.train(
            client,
            {
                "verbosity": 1,
                "tree_method": "hist",
                "objective": "reg:squarederror",
                **clf_params,
            },
            dtrain,
            num_boost_round=4,
            evals=[(dtrain, "train")],
        )
        predictions = xgb.dask.predict(client, model, dtest)

        # dask.compute returns a tuple with one entry per argument; unpack it
        # so the metric receives the arrays themselves, not 1-tuples.
        actual, predicted = dask.compute(y, predictions)
        scores = [mean_squared_error(actual, predicted)]

    # Average over the evaluations actually performed, not over n_splits.
    return sum(scores) / len(scores)


# Run a single Optuna trial as a smoke test of the objective.
study = optuna.create_study()
study.optimize(objective, n_trials=1)

# Direct call with default XGBoost parameters.  The third positional argument
# is ``n_splits``; the previously passed ``data_kwargs`` is not defined
# anywhere in this notebook, so use the shared setting instead.
cv_estimate(1, {}, train_options["n_splits"])

In [None]:
# Convert only the ``name`` column to a Dask array via a one-column frame.
# NOTE(review): the ``ddf`` built earlier in this notebook drops ``name`` --
# confirm which ``ddf`` this cell is meant to run against.
ddf.name.to_frame().to_dask_array(lengths=True)

In [None]:
## Pandas Version

In [None]:
def load_data():
    """Return the demo timeseries as in-memory ``(X, y)`` objects.

    The ``name`` column is cast to ``category`` and categorized, the frame is
    split into the target ``y`` and the remaining columns ``X``, and both are
    computed eagerly before being returned.
    """
    frame = timeseries()
    frame["name"] = frame["name"].astype("category")
    frame = frame.categorize(columns="name")

    features, target = frame.drop(columns="y"), frame["y"]
    return features.compute(), target.compute()


def cv_estimate(trial_number, clf_params, n_splits=5):
    """Train one in-process XGBoost model and return its mean squared error.

    The data from ``load_data()`` is wrapped in ``DMatrix`` objects, a model
    is trained for up to 4 boosting rounds (early stopping after 1 round
    without improvement on the training set), and predictions on that same
    data are scored against the true target.

    Parameters
    ----------
    trial_number : int
        Identifier of the calling trial.  Currently unused.
    clf_params : dict
        XGBoost parameters merged over the fixed defaults (``verbosity``,
        ``tree_method``, ``objective``); entries here take precedence.
    n_splits : int, default 5
        Kept for interface compatibility.  Only one fit/evaluation is
        performed, so this value does not influence the result.

    Returns
    -------
    float
        Mean of the collected scores.  With the single evaluation done here
        this is the plain MSE; it is no longer divided by ``n_splits``, which
        previously understated the error by that factor.
    """
    # Load data here
    X, y = load_data()

    dtrain = DMatrix(X, y, enable_categorical=True)
    dtest = DMatrix(X, y, enable_categorical=True)

    model = xgb.train(
        {
            "verbosity": 1,
            "tree_method": "hist",
            "objective": "reg:squarederror",
            **clf_params,
        },
        dtrain,
        num_boost_round=4,
        evals=[(dtrain, "train")],
        early_stopping_rounds=1,
    )

    predictions = model.predict(dtest)
    scores = [mean_squared_error(y, predictions)]

    # Average over the evaluations actually performed, not over n_splits.
    return sum(scores) / len(scores)


# Create a single Optuna study with default settings and run exactly one
# trial of ``objective`` against it.
study = optuna.create_study()
study.optimize(objective, n_trials=1)

In [None]:
# Load features and target with whichever ``load_data`` was defined last.
X, y= load_data()

In [None]:
# Pass the features through dask.compute to inspect the materialized result.
dask.compute(X)

In [None]:
# Pass the target through dask.compute to inspect the materialized result.
dask.compute(y)

In [None]:
# Create a distributed Client with default arguments for the cells below.
c = Client()

In [None]:
# Wrap the features alone (no labels) in a DaskDMatrix on client ``c``.
# NOTE(review): the ``load_data`` defined most recently above returns computed
# (in-memory) objects -- confirm ``X`` is a Dask collection before this runs.
x_ = DaskDMatrix(c, X, enable_categorical=True)

In [None]:
# List the attributes available on the DaskDMatrix object.
dir(x_)

In [None]:
# Experiment: hand the DaskDMatrix itself to dask.compute and see what returns.
dask.compute(x_)