# XGBoost.Dask in many threads

Sometimes we want to train many large XGBoost models in parallel.  We do so in this example with ...

1.  The `xgboost.dask` project to do large training runs
2.  Optuna to do hyper-parameter-optimization
3.  A thread pool, to run many of these in parallel
4.  Coiled to launch Dask clusters (but you could swap in your favorite Dask deployment technology as you like)

Using `xgboost.dask` from many threads took a couple of small tweaks across projects.  This notebook resulted in the following PRs and issues:

-  https://github.com/dask/distributed/issues/7377
-  https://github.com/dask/dask/pull/9723
-  https://github.com/dask/distributed/pull/7369
-  https://github.com/dmlc/xgboost/pull/8558 (mostly cosmetic, not necessary)
-  Also something in Coiled to allow package_sync to be thread-safe, should be released by 2022-12-07

In [1]:
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor

from distributed import Client
import dask.dataframe as dd
from coiled import Cluster
import coiled

import optuna
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from xgboost.dask import DaskDMatrix

from dask_ml.datasets import make_classification_df
from dask_ml.model_selection import train_test_split, KFold
from dask_ml.preprocessing import OneHotEncoder
import dask.array as da


In [2]:
import dask
import coiled

# Report the installed version of each library this notebook depends on.
# "coiled" appears twice on purpose so the printed lines stay identical to
# the recorded cell output.
for label, module in (
    ("coiled:", coiled),
    ("dask:", dask),
    ("dask.distributed:", dask.distributed),
    ("optuna:", optuna),
    ("xgboost:", xgb),
    ("coiled:", coiled),
):
    print(label, module.__version__)

coiled: 0.2.58
dask: 2022.12.0+13.g0d8e12be
dask.distributed: 2022.12.0+17.gf8302593
optuna: 3.1.0.dev
xgboost: 1.7.2
coiled: 0.2.58


### Load data

In [3]:
import dask.dataframe as dd
from s3fs import S3FileSystem
import dask_optuna

# storage = dask_optuna.DaskStorage()

def load_data():
    """Build the (lazy) feature table for the NYC Uber/Lyft trip dataset.

    Lists every parquet file under the ``nyc-uber-lyft`` prefix on S3, reads
    them into one Dask DataFrame, drops string/category columns and the
    fare-related columns, and adds an ``accessible_vehicle`` indicator.

    Returns
    -------
    dask.dataframe.DataFrame
        The remaining columns plus ``accessible_vehicle``.  Callers in this
        notebook read the ``trip_time`` column from it as the target.
    """
    fs = S3FileSystem()
    listing = fs.glob("s3://prefect-dask-examples/nyc-uber-lyft/**/*.parquet", detail=True)
    # Only the per-entry metadata is needed, so iterate the values directly.
    files = [f"s3://{info['Key']}" for info in listing.values() if info["type"] == "file"]

    ddf = dd.read_parquet(files).select_dtypes(exclude=["string", "category"])
    ddf = ddf.drop(columns=["base_passenger_fare", "sales_tax", "bcf", "congestion_surcharge", "tips", "driver_pay", "dropoff_datetime"])

    # on_scene_datetime only applies if the vehicle is wheelchair accessible,
    # so the flag must be 1 where the timestamp is PRESENT and 0 where it is
    # missing.  (`where` keeps values where the condition is True; the previous
    # `isnull()` condition therefore produced the inverse flag.)
    ddf = ddf.assign(accessible_vehicle=1)
    ddf["accessible_vehicle"] = ddf.accessible_vehicle.where(ddf.on_scene_datetime.notnull(), 0)

    ddf = ddf.assign(request_dow=ddf.request_datetime.dt.dayofweek)
    ddf = ddf.assign(pickup_datetime_dow=ddf.pickup_datetime.dt.dayofweek)
    ddf = ddf.assign(request_hour=ddf.request_datetime.dt.hour)
    ddf = ddf.assign(pickup_datetime_hour=ddf.pickup_datetime.dt.hour)
    ddf = ddf.drop(columns=['on_scene_datetime', 'request_datetime', 'pickup_datetime', "PULocationID", "DOLocationID"])

    categories = ["request_dow", "request_hour", "pickup_datetime_hour", "pickup_datetime_dow"]
    for cat in categories:
        ddf[cat] = ddf[cat].astype('category')

    ddf = ddf.categorize(columns=categories)
    # NOTE(review): the four day-of-week / hour columns built and categorized
    # above are all dropped again here, so they never reach the model.  Kept
    # as-is to preserve the returned schema — confirm whether the drop (or the
    # feature construction) is the part that should go.
    X = ddf.drop(columns=['request_dow', 'pickup_datetime_dow', 'request_hour', 'pickup_datetime_hour'])
    print("Completed data preprocessing")
    return X

In [4]:
# Settings shared by every trial; n_splits is the number of CV folds.
train_options = {"n_splits": 5}

def _make_cv(df, num_folds):
    """Yield ``(train, test)`` pairs for a ``num_folds``-way cross-validation.

    ``df`` is randomly split into ``num_folds`` equally weighted pieces.  Each
    piece is used once as the test set while the remaining pieces are
    concatenated into the training set.
    """
    fold_fractions = [1 / num_folds] * num_folds
    folds = df.random_split(fold_fractions)
    for held_out in range(num_folds):
        # Everything except the held-out fold, in the original fold order.
        training_parts = folds[:held_out] + folds[held_out + 1:]
        yield dd.concat(training_parts), folds[held_out]


In [5]:
def cv_estimate(trial_number, clf_params, n_splits=5):
    """Cross-validate one XGBoost configuration on a per-thread Coiled cluster.

    The cluster name is derived from the calling thread's id and the cluster
    is not shut down on close, so repeated calls from the same thread reuse
    the same cluster while different threads get separate ones.

    Parameters
    ----------
    trial_number : int
        Number of the Optuna trial being evaluated.  Not used by the
        computation; kept so callers can keep passing it.
    clf_params : dict or None
        Hyper-parameters for this trial.  Any key that is missing falls back
        to the value this function previously hard-coded.  ``n_estimators``
        is not forwarded as a booster parameter; it sets the number of
        boosting rounds instead (default 4).
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    float
        Sum of the per-fold mean squared errors divided by ``n_splits``.
    """
    # Merge into a fresh dict so the caller's mapping is never mutated.
    params = {
        'learning_rate': 0.99,
        'subsample': 0.9,
        'max_depth': 10,
        'colsample_bytree': 0.9,
        'min_child_weight': 9,
        **(clf_params or {}),
    }
    # The native training API takes the tree count as `num_boost_round`;
    # `n_estimators` is not a booster parameter there.
    num_boost_round = params.pop('n_estimators', 4)

    thread_id = threading.get_ident()
    with coiled.Cluster(
        package_sync=True,  # copy local packages
        name="xgb-nyc-taxi-" + str(thread_id),
        shutdown_on_close=False,  # reuse cluster across runs
        show_widget=False,
        n_workers=64,
        worker_memory="16 GiB",
        account="dask-engineering",
        backend_options={"region": "us-east-2", "spot": True, "spot_on_demand_fallback": True}
    ) as cluster:
        # `as_current` makes this thread's client the one Dask picks up,
        # since several threads run this function at the same time.
        with Client(cluster) as client, client.as_current():
            print(f"starting run in thread:  {thread_id}")

            ddf = load_data()
            ddf = ddf.persist()

            val_scores = 0
            for i, (train, test) in enumerate(_make_cv(ddf, n_splits)):
                y_train = train['trip_time'].to_frame()
                X_train = train.drop(columns=['trip_time'])
                y_test = test['trip_time'].to_frame()
                X_test = test.drop(columns=['trip_time'])
                start = datetime.datetime.now()
                print(start)

                dtrain = DaskDMatrix(client, X_train, y_train, enable_categorical=True)
                print("created dtrain")
                dtest = DaskDMatrix(client, X_test, y_test, enable_categorical=True)

                model = xgb.dask.train(
                    client,
                    {
                        'verbosity': 1,
                        'tree_method': 'hist',
                        "objective": "reg:squarederror",
                        **params
                    },
                    dtrain,
                    num_boost_round=num_boost_round,
                    evals=[(dtrain, 'train')],
                    early_stopping_rounds=1
                )
                print("made model")
                predictions = xgb.dask.predict(client, model["booster"], dtest)
                print("made predictions")

                actual, predictions = dask.compute(y_test, predictions)
                score = mean_squared_error(actual, predictions)
                val_scores += score
                # Report every fold, not only the last one.
                print(f"KFold {i}, score: {score}")

    return val_scores / n_splits

In [6]:
# cv_estimate(1, {}, data_kwargs)

In [7]:
def objective(trial):
    """Optuna objective: sample one configuration and cross-validate it.

    Six hyper-parameters are drawn from ``trial`` and handed to
    ``cv_estimate`` together with the trial number and the fold count from
    ``train_options``.  Whatever ``cv_estimate`` returns is the trial value.
    """
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 5, 100),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.99),
        subsample=trial.suggest_float('subsample', 0.1, 0.9),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 0.9),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 9),
    )
    trial_value = cv_estimate(
        trial_number=trial.number,
        clf_params=sampled,
        n_splits=train_options["n_splits"],
    )
    return trial_value

In [8]:

# One study shared by all threads.
study = optuna.create_study()

# Four worker threads; each one is handed a single-trial optimize call on the
# shared study.  The futures are kept so later cells can inspect them.
executor = ThreadPoolExecutor(max_workers=4)
futures = [executor.submit(study.optimize, objective, n_trials=1) for _worker in range(4)]

[32m[I 2022-12-20 23:32:03,165][0m A new study created in memory with name: no-name-61032dc6-8240-4c60-871c-cdef6dc6464b[0m


In [16]:
futures

[<Future at 0x283088160 state=running>,
 <Future at 0x283a3bb80 state=running>,
 <Future at 0x283a3b040 state=running>,
 <Future at 0x12313c340 state=running>]

created d train
made model
made predictions
created d train
made model
made predictions
created d train
made model
made predictions
2022-12-20 23:58:06.946915
made model
made predictions
2022-12-21 00:00:07.472986
2022-12-21 00:02:17.285905


In [17]:
futures

[<Future at 0x283088160 state=running>,
 <Future at 0x283a3bb80 state=running>,
 <Future at 0x283a3b040 state=running>,
 <Future at 0x12313c340 state=running>]

created d train
2022-12-21 00:04:35.441473
created d train
made model
made predictions


2022-12-21 00:07:52,902 - distributed.client - ERROR - 
Traceback (most recent call last):
  File "/Users/greghayes/mambaforge/envs/xgboost_test/lib/python3.10/site-packages/distributed/utils.py", line 742, in wrapper
    return await func(*args, **kwargs)
  File "/Users/greghayes/mambaforge/envs/xgboost_test/lib/python3.10/site-packages/distributed/client.py", line 1301, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/Users/greghayes/mambaforge/envs/xgboost_test/lib/python3.10/site-packages/distributed/client.py", line 1331, in _ensure_connected
    comm = await connect(
  File "/Users/greghayes/mambaforge/envs/xgboost_test/lib/python3.10/site-packages/distributed/comm/core.py", line 291, in connect
    comm = await asyncio.wait_for(
  File "/Users/greghayes/mambaforge/envs/xgboost_test/lib/python3.10/asyncio/tasks.py", line 432, in wait_for
    await waiter
asyncio.exceptions.CancelledError

Traceback (most recent call last):
  File "/Users/greghayes/mambafor

## IGNORE BELOW

In [None]:
def make_cv(df, num_folds):
    """Generate ``(train, test)`` frames for ``num_folds``-fold cross-validation.

    The frame is randomly split into ``num_folds`` equally weighted parts;
    each part is yielded once as the test set alongside the concatenation of
    all the other parts.
    """
    parts = df.random_split([1 / num_folds] * num_folds)
    for test_pos, test_part in enumerate(parts):
        train_parts = [part for pos, part in enumerate(parts) if pos != test_pos]
        yield dd.concat(train_parts), test_part

# Walk the folds once and split each one into features / target.
for i, (train, test) in enumerate(make_cv(ddf, 5)):
    print(type(train))
    y_train = train['trip_time'].to_frame()
    # The training features were previously assigned to X_test and then
    # overwritten two lines later, leaving X_train undefined.
    X_train = train.drop(columns=['trip_time'])
    y_test = test['trip_time'].to_frame()
    X_test = test.drop(columns=['trip_time'])

In [None]:
# with coiled.Cluster(
#     package_sync=True, # copy local packages
#     # name="xgb-nyc-taxi-" + str(thread_id), 
#     # name="xgb-nyc-taxi-11005882368",
#     shutdown_on_close=False,  # reuse cluster across runs
#     show_widget=False,
#     n_workers=64,
#     worker_memory="16 GiB",
#     account="dask-engineering",
#     backend_options={"region": "us-east-2", "spot": True, "spot_on_demand_fallback": True}
# ) as cluster:
# Scratch, single-threaded copy of the cross-validation loop in cv_estimate,
# run directly at notebook top level.
# NOTE(review): `cluster` is not assigned in this cell (the coiled.Cluster
# block above is commented out) — it must already exist in the session.
print("starting run")
client = Client(cluster)

ddf = load_data()
ddf = ddf.persist()

# Running sum of the per-fold mean squared errors.
val_scores = 0
for i, (train, test) in enumerate(make_cv(ddf, 5)):
    # 'trip_time' is the target; every other column is a feature.
    y_train = train['trip_time'].to_frame()
    X_train = train.drop(columns=['trip_time'])
    y_test = test['trip_time'].to_frame()
    X_test = test.drop(columns='trip_time')
    # print(f"Trial {trial_number} KFold {i} started")
    start = datetime.datetime.now()
    print(start)

    dtrain = DaskDMatrix(client, X_train, y_train, enable_categorical=True)
    print("created d train")
    dtest = DaskDMatrix(client, X_test, y_test, enable_categorical=True)
    # NOTE(review): 'n_estimators' is passed as a booster parameter here while
    # the round count comes from num_boost_round below — confirm it has any
    # effect with the native training API.
    params = {
        'n_estimators': 5,
        'learning_rate': 0.99,
        'subsample':  0.9,
        'max_depth': 10,
        'colsample_bytree': 0.9,
        'min_child_weight': 9,
    }

    # The training set is also the only evaluation set, so early stopping is
    # driven by the training metric.
    model = xgb.dask.train(
        client,
        {
            'verbosity': 1,
            'tree_method': 'hist', 
            "objective": "reg:squarederror",
            **params
        },
        dtrain,
        num_boost_round=4, 
        evals=[(dtrain, 'train')],
        early_stopping_rounds=1
    )
    print("made model")
    predictions = xgb.dask.predict(client, model["booster"], dtest)
    # predictions = xgb.dask.predict(client, model["booster"], X_test)
    print("made predictions")

    # Materialize targets and predictions together before scoring.
    actual, predictions = dask.compute(y_test, predictions)
    score = mean_squared_error(actual, predictions)
    val_scores += score
    # `end` is recorded but not used anywhere in this cell.
    end = datetime.datetime.now()
    print(f"KFold {i}, score: {score}")


In [None]:
# done

In [None]:
# Scratch cleanup: call shutdown on the `client` created in the scratch run
# above once that run is finished.
client.shutdown()

In [None]:
# dd.from_dask_array(X).head()

In [None]:
# Scratch: build a DaskDMatrix from a feature/label pair.
# NOTE(review): `X` and `y` are not defined in any cell visible here —
# presumably left over from an earlier experiment; confirm before running.
dtrain = DaskDMatrix(client, data=X, label=y)

In [None]:
# List the public names of xgboost's Dask interface (the trailing dot in the
# original expression was a syntax error).
dir(xgb.dask)

In [None]:
!pip install dask_optuna

In [None]:
!pip install git+https://github.com/optuna/optuna.git