# XGBoost.Dask in many threads

Sometimes we want to train many large XGBoost models in parallel.  We do so in this example with ...

1.  The `xgboost.dask` project to do large training runs
2.  Optuna to do hyperparameter optimization
3.  A thread pool, to run many of these in parallel
4.  Coiled to launch Dask clusters (but you could swap in your favorite Dask deployment technology as you like)

Using `xgboost.dask` from many threads took a couple of small tweaks across projects.  This notebook resulted in the following PRs and issues:

-  https://github.com/dask/distributed/issues/7377
-  https://github.com/dask/dask/pull/9723
-  https://github.com/dask/distributed/pull/7369
-  https://github.com/dmlc/xgboost/pull/8558 (mostly cosmetic, not necessary)
-  A change in Coiled that makes `package_sync` thread-safe, which should be released by 2022-12-07

In [1]:
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor

from distributed import Client
import dask.dataframe as dd
from coiled import Cluster
import coiled

import optuna
from dask_ml.metrics import mean_squared_error as lazy_mse
import xgboost as xgb
from xgboost.dask import DaskDMatrix

from dask_ml.datasets import make_classification_df
from dask_ml.model_selection import train_test_split, KFold
from dask_ml.preprocessing import OneHotEncoder
import dask.array as da
import dask.dataframe as dd
import dask
from s3fs import S3FileSystem

import pandas as pd

In [2]:
# Print "<name>: <version>" for each library this notebook depends on.
# NOTE(review): coiled is listed twice, so its version is printed on both the
# first and the last line.
print(
    *(
        f"{name}: {module.__version__}"
        for name, module in (
            ("coiled", coiled),
            ("dask", dask),
            ("dask.distributed", dask.distributed),
            ("optuna", optuna),
            ("xgboost", xgb),
            ("coiled", coiled),
        )
    ),
    sep="\n",
)

coiled: 0.2.58
dask: 2022.12.1
dask.distributed: 2022.12.1
optuna: 3.1.0.dev
xgboost: 1.7.2
coiled: 0.2.58


In [3]:
# Used by load_data() to derive the upper outlier bound for `trip_time`
# (upper_bound = Q3 + 1.5 * (Q3 - lower_bound)).
# Presumably the third quartile of `trip_time` found during earlier EDA — TODO confirm.
Q3 = 1415.0

### Load data

In [4]:
# Maps each borough name to the coarser "superborough" group it belongs to.
# make_cross_borough_cat() looks pickup and drop-off boroughs up in this table.
BOROUGH_MAPPING = {
    borough: superborough
    for superborough, boroughs in (
        ("Superborough 1", ("Manhattan", "Bronx", "EWR")),
        ("Superborough 2", ("Brooklyn", "Queens")),
        ("Superborough 3", ("Staten Island",)),
        ("Unknown", ("Unknown",)),
    )
    for borough in boroughs
}

In [5]:
def load_data():
    """Load the processed NYC Uber/Lyft trips and clean them for modelling.

    Reads the processed parquet dataset from S3, derives the
    ``accessible_vehicle`` flag and the pickup month / day-of-week / hour
    columns, drops the columns that are not used as features, removes every
    row that contains a missing value and filters out ``trip_time`` outliers
    using the module-level ``Q3`` constant.

    Progress messages and the fraction of rows that survive the outlier
    filter are printed along the way.

    Returns:
        The cleaned Dask DataFrame, repartitioned to 128MB partitions and
        persisted.
    """
    print("loading data")
    ddf = dd.read_parquet("s3://prefect-dask-examples/nyc-uber-lyft/processed_files.parquet")
    ddf = ddf.assign(accessible_vehicle=1)
    print("Make accessible feature")
    # Keep the initial 1 only where on_scene_datetime is missing; every row
    # that has an on_scene_datetime gets 0.
    # NOTE(review): on_scene_datetime is described as applying only to
    # wheelchair-accessible vehicles, which would suggest the opposite
    # polarity (1 where the timestamp is present). Behaviour is left
    # unchanged here — confirm against the data dictionary before flipping.
    ddf = ddf.assign(
        accessible_vehicle=ddf.accessible_vehicle.where(ddf.on_scene_datetime.isnull(), 0)
    )
    ddf = ddf.assign(
        pickup_month=ddf.pickup_datetime.dt.month,
        pickup_dow=ddf.pickup_datetime.dt.dayofweek,
        pickup_hour=ddf.pickup_datetime.dt.hour,
    )

    # Drop the raw timestamps (already turned into features above) together
    # with the other columns that are not kept as features.
    ddf = ddf.drop(
        columns=[
            'on_scene_datetime', 'request_datetime',
            'pickup_datetime', 'dispatching_base_num',
            'originating_base_num', 'shared_request_flag',
            'shared_match_flag', 'dropoff_datetime',
            'base_passenger_fare', 'bcf', 'sales_tax',
            'tips', 'driver_pay', 'access_a_ride_flag',
            'wav_match_flag',
        ]
    )

    ddf = ddf.dropna(how="any")
    ddf = ddf.repartition(partition_size="128MB").persist()
    ddf = ddf.reset_index(drop=True)

    # Row count before outlier removal, used for the report printed below.
    original_rowcount = len(ddf.index)

    # Remove outliers.
    # Based on our earlier EDA, the lower bound is zero, which is consistent
    # with our domain knowledge that no trip should have a duration less than
    # zero.  The upper bound is Q3 plus 1.5 times the distance between Q3 and
    # the lower bound; trips outside [lower_bound, upper_bound] are dropped.
    lower_bound = 0
    upper_bound = Q3 + (1.5 * (Q3 - lower_bound))

    within_bounds = (ddf['trip_time'] >= lower_bound) & (ddf['trip_time'] <= upper_bound)
    ddf = ddf.loc[within_bounds]

    ddf = ddf.repartition(partition_size="128MB").persist()
    print(f"Fraction of dataset left after removing outliers:  {len(ddf.index) / original_rowcount}")

    return ddf

In [6]:
def make_cross_borough_cat(df):
    """Add the ``PUSuperborough_DOSuperborough`` column to *df*.

    Each row's ``PUBorough`` and ``DOBorough`` are looked up in
    ``BOROUGH_MAPPING`` and joined as ``"<pickup>-<dropoff>"``.  A borough that
    is not a key of the mapping is looked up as ``None`` and therefore shows up
    as the text ``None`` in the pair.

    Returns a new frame produced by ``df.assign``.
    """
    pickup_boroughs = df.PUBorough.tolist()
    dropoff_boroughs = df.DOBorough.tolist()

    pair_labels = []
    for pickup, dropoff in zip(pickup_boroughs, dropoff_boroughs):
        pair_labels.append(f"{BOROUGH_MAPPING.get(pickup)}-{BOROUGH_MAPPING.get(dropoff)}")

    return df.assign(PUSuperborough_DOSuperborough=pair_labels)

In [7]:
def make_taxi_data(ddf):
    """Attach borough features to the trips and cast the categorical columns.

    Joins the taxi zone lookup CSV onto the pickup and the drop-off location
    IDs (inner joins), adds the combined superborough column via
    ``make_cross_borough_cat``, turns ``airport_fee`` into a float column with
    0 in place of the "None"/"nan" strings and missing values, and converts
    the listed feature columns to categoricals.

    Args:
        ddf: Dask DataFrame of trips containing at least ``PULocationID``,
            ``DOLocationID``, ``airport_fee`` and the columns named in the
            categorical list below.

    Returns:
        The enriched Dask DataFrame, repartitioned to 128MB partitions.
    """
    print("Load taxi data")
    zone_lookup = pd.read_csv("data/taxi+_zone_lookup.csv", usecols=["LocationID", "Borough"])

    # The same join runs twice: once keyed on the pickup zone and once on the
    # drop-off zone, each time renaming the borough column that was brought in.
    for location_col, borough_col in (
        ("PULocationID", "PUBorough"),
        ("DOLocationID", "DOBorough"),
    ):
        joined = dd.merge(ddf, zone_lookup, left_on=location_col, right_on="LocationID", how="inner")
        ddf = joined.rename(columns={"Borough": borough_col}).drop(columns="LocationID")

    print("Make superboroughs")
    ddf = ddf.map_partitions(make_cross_borough_cat)

    # Normalise airport_fee: placeholder strings and missing values become 0.
    ddf["airport_fee"] = (
        ddf["airport_fee"]
        .replace("None", 0)
        .replace("nan", 0)
        .astype(float)
        .fillna(0)
    )

    ddf = ddf.repartition(partition_size="128MB").persist()

    print("Make categoricals")
    categorical_columns = [
        "hvfhs_license_num",
        "PULocationID",
        "DOLocationID",
        "wav_request_flag",
        "accessible_vehicle",
        "pickup_month",
        "pickup_dow",
        "pickup_hour",
        "PUBorough",
        "DOBorough",
        "PUSuperborough_DOSuperborough",
    ]
    ddf[categorical_columns] = ddf[categorical_columns].astype("category")
    ddf = ddf.categorize(columns=categorical_columns)
    return ddf.repartition(partition_size="128MB")

## Test Loading Dataset

In [8]:
# Launch a Coiled cluster with the settings below and connect a Dask client to it.
cluster = coiled.Cluster(
    worker_vm_types=["m6i.4xlarge"],
    scheduler_vm_types=["m6i.2xlarge"],
    package_sync=True, # copy local packages to the cluster
    name="dask-engineering-f799f650-0",
    # NOTE(review): True presumably shuts the cluster down when it is closed;
    # reusing the cluster across runs would need False — confirm intent.
    shutdown_on_close=True,
    show_widget=False,
    n_workers=20,
    use_best_zone=True,
    account="dask-engineering",
    )
client = Client(cluster)

Collecting git+https://github.com/optuna/optuna.git@e8a010bb58aea943866e5f7addf0de953228de99
  Cloning https://github.com/optuna/optuna.git (to revision e8a010bb58aea943866e5f7addf0de953228de99) to /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-req-build-6_zobhy3


  Running command git clone --filter=blob:none --quiet https://github.com/optuna/optuna.git /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-req-build-6_zobhy3
  Running command git rev-parse -q --verify 'sha^e8a010bb58aea943866e5f7addf0de953228de99'
  Running command git fetch -q https://github.com/optuna/optuna.git e8a010bb58aea943866e5f7addf0de953228de99


  Resolved https://github.com/optuna/optuna.git to commit e8a010bb58aea943866e5f7addf0de953228de99
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: optuna
  Building wheel for optuna (pyproject.toml): started
  Building wheel for optuna (pyproject.toml): finished with status 'done'
  Created wheel for optuna: filename=optuna-3.1.0.dev0-py3-none-any.whl size=360987 sha256=b7a53928e721c87e9cdda44ee4bce712974fca62c21d84fb3f9d28c1caa9b7f0
  Stored in directory: /private/var/folders/b5/f_y899x168j7cs2m7szjld5c0000gn/T/pip-ephem-wheel-cache-xtkh6rii/wheels/51/b0/4b/c05d88297ef6716b5a87865bbbc77cf5b3aa7d959460a80772
Successfully built optuna


In [9]:
# Display the client summary for the cluster created above.
client

0,1
Connection method: Cluster object,Cluster type: coiled.ClusterBeta
Dashboard: http://18.188.3.79:8787,

0,1
Dashboard: http://18.188.3.79:8787,Workers: 2
Total threads: 32,Total memory: 121.76 GiB

0,1
Comm: tls://10.0.16.25:8786,Workers: 2
Dashboard: http://10.0.16.25:8787/status,Total threads: 32
Started: Just now,Total memory: 121.76 GiB

0,1
Comm: tls://10.0.21.251:40621,Total threads: 16
Dashboard: http://10.0.21.251:8787/status,Memory: 60.88 GiB
Nanny: tls://10.0.21.251:39197,
Local directory: /scratch/dask-worker-space/worker-h0y5wq95,Local directory: /scratch/dask-worker-space/worker-h0y5wq95

0,1
Comm: tls://10.0.31.189:35221,Total threads: 16
Dashboard: http://10.0.31.189:8787/status,Memory: 60.88 GiB
Nanny: tls://10.0.31.189:38503,
Local directory: /scratch/dask-worker-space/worker-mx8rqobk,Local directory: /scratch/dask-worker-space/worker-mx8rqobk


In [10]:
# Build the feature table in two steps: load and clean the trips, then add
# the borough features and cast the categorical columns.
ddf = load_data()
ddf = make_taxi_data(ddf)

loading data
Make accessible feature
Fraction of dataset left after removing outliers:  0.9842347215803948
Load taxi data
Make superboroughs
Make categoricals


In [11]:
# Preview the first rows of the feature table.
ddf.head()

Unnamed: 0,hvfhs_license_num,PULocationID,DOLocationID,trip_miles,trip_time,tolls,congestion_surcharge,airport_fee,wav_request_flag,accessible_vehicle,pickup_month,pickup_dow,pickup_hour,PUBorough,DOBorough,PUSuperborough_DOSuperborough
0,HV0003,47,152,4.32,1279,0.0,0.0,0.0,N,0,4,1,6,Bronx,Manhattan,Superborough 1-Superborough 1
1,HV0003,47,152,5.56,1547,0.0,0.0,0.0,N,0,4,1,7,Bronx,Manhattan,Superborough 1-Superborough 1
2,HV0003,47,152,5.49,1153,0.0,0.0,0.0,N,0,4,1,12,Bronx,Manhattan,Superborough 1-Superborough 1
3,HV0005,47,152,5.88,1080,0.02,0.0,0.0,N,1,4,1,12,Bronx,Manhattan,Superborough 1-Superborough 1
4,HV0003,47,152,6.53,1372,0.0,0.0,0.0,N,0,4,1,13,Bronx,Manhattan,Superborough 1-Superborough 1


In [12]:
# List the column names of the feature table.
ddf.columns.tolist()

['hvfhs_license_num',
 'PULocationID',
 'DOLocationID',
 'trip_miles',
 'trip_time',
 'tolls',
 'congestion_surcharge',
 'airport_fee',
 'wav_request_flag',
 'accessible_vehicle',
 'pickup_month',
 'pickup_dow',
 'pickup_hour',
 'PUBorough',
 'DOBorough',
 'PUSuperborough_DOSuperborough']

In [13]:
# Show the dtype of every column in the feature table.
ddf.dtypes

hvfhs_license_num                category
PULocationID                     category
DOLocationID                     category
trip_miles                        float64
trip_time                           int64
tolls                             float64
congestion_surcharge              float64
airport_fee                       float64
wav_request_flag                 category
accessible_vehicle               category
pickup_month                     category
pickup_dow                       category
pickup_hour                      category
PUBorough                        category
DOBorough                        category
PUSuperborough_DOSuperborough    category
dtype: object

In [14]:
# Write the feature table to S3 as parquet; overwrite=True replaces whatever
# is already stored at this path.
ddf.to_parquet("s3://prefect-dask-examples/nyc-uber-lyft/feature_table_fixed_upper_bound.parquet", overwrite=True)

In [15]:
# The feature table has been written; shut down the scheduler and workers
# this client is connected to.
client.shutdown()

2023-01-03 15:53:55,811 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


In [None]:
# NOTE(review): in notebook order this cell comes after client.shutdown(), so
# the client presumably has no scheduler left to restart — run it only against
# a live cluster, or remove the cell.
client.restart()