# XGBoost.Dask in many threads

Sometimes we want to train many large XGBoost models in parallel.  We do so in this example with ...

1.  The `xgboost.dask` project to do large training runs
2.  Optuna to do hyper-parameter-optimization
3.  A thread pool, to run many of these in parallel
4.  Coiled to launch Dask clusters (but you could swap in your favorite Dask deployment technology as you like)

Using `xgboost.dask` from many threads took a couple of small tweaks across projects.  This notebook resulted in the following PRs and issues:

-  https://github.com/dask/distributed/issues/7377
-  https://github.com/dask/dask/pull/9723
-  https://github.com/dask/distributed/pull/7369
-  https://github.com/dmlc/xgboost/pull/8558 (mostly cosmetic, not necessary)
-  Also something in Coiled to allow package_sync to be thread-safe, should be released by 2022-12-07

In [None]:
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor

from distributed import Client
import dask.dataframe as dd
from coiled import Cluster
import coiled

import optuna
# from sklearn.metrics import mean_squared_error
from dask_ml.metrics import mean_squared_error as lazy_mse
import xgboost as xgb
from xgboost.dask import DaskDMatrix

from dask_ml.datasets import make_classification_df
from dask_ml.model_selection import train_test_split, KFold
from dask_ml.preprocessing import OneHotEncoder
import dask.array as da
import dask.dataframe as dd
from s3fs import S3FileSystem

import pandas as pd

In [None]:
# Report the versions of the main libraries used by this notebook so a run
# can be reproduced later.  `optuna` and `xgb` come from the import cell above.
import coiled
import dask
# Imported explicitly: `import dask` alone does not guarantee that the
# `dask.distributed` attribute has been populated.
import dask.distributed

print("coiled:", coiled.__version__)
print("dask:", dask.__version__)
print("dask.distributed:", dask.distributed.__version__)
print("optuna:", optuna.__version__)
print("xgboost:", xgb.__version__)

### Load data

In [None]:
# Maps each borough name to the coarser "superborough" group it belongs to.
# Built from (group, members) pairs so the grouping is visible at a glance;
# the resulting dict has one entry per borough, in the order listed here.
BOROUGH_MAPPING = {
    borough: superborough
    for superborough, boroughs in (
        ("Superborough 1", ("Manhattan", "Bronx", "EWR")),
        ("Superborough 2", ("Brooklyn", "Queens")),
        ("Superborough 3", ("Staten Island",)),
        ("Unknown", ("Unknown",)),
    )
    for borough in boroughs
}

In [None]:
def load_data(path="s3://prefect-dask-examples/nyc-uber-lyft/processed_files.parquet"):
    """Load the processed ride-share parquet dataset and clean it up.

    The steps are: read the parquet dataset, derive an ``accessible_vehicle``
    flag and pickup month / day-of-week / hour columns, drop the raw datetime
    and bookkeeping columns, drop rows containing any null, and filter out
    rows whose ``trip_time`` falls outside ``[0, upper_bound]``.

    Parameters
    ----------
    path : str, optional
        Location of the parquet dataset passed to ``dd.read_parquet``.
        Defaults to the dataset this notebook was written against.

    Returns
    -------
    dask.dataframe.DataFrame
        The cleaned, repartitioned frame.

    Notes
    -----
    The two ``len(ddf.index)`` calls each trigger a computation, so this
    function is not purely lazy.
    """
    print("loading data")
    ddf = dd.read_parquet(path)
    ddf = ddf.assign(accessible_vehicle=1)

    print("Make accessible feature")
    # `where` keeps the 1 wherever on_scene_datetime is null and writes 0
    # everywhere else.
    # NOTE(review): the original comment here read "Only applies if the
    # vehicle is wheelchair accessible"; confirm that a *null*
    # on_scene_datetime is really the condition that should yield 1.
    ddf = ddf.assign(
        accessible_vehicle=ddf["accessible_vehicle"].where(
            ddf["on_scene_datetime"].isnull(), 0
        )
    )

    # Calendar features derived from the pickup timestamp.
    pickup = ddf["pickup_datetime"]
    ddf = ddf.assign(
        pickup_month=pickup.dt.month,
        pickup_dow=pickup.dt.dayofweek,
        pickup_hour=pickup.dt.hour,
    )

    ddf = ddf.drop(
        columns=[
            "on_scene_datetime",
            "request_datetime",
            "pickup_datetime",
            "dispatching_base_num",
            "originating_base_num",
            "shared_request_flag",
            "shared_match_flag",
            "dropoff_datetime",
        ]
    )

    ddf = ddf.dropna(how="any")
    ddf = ddf.repartition(partition_size="128MB")
    ddf = ddf.reset_index(drop=True)

    original_rowcount = len(ddf.index)

    # Remove outliers.
    # Based on our earlier EDA, the lower bound is fixed at zero, which is
    # consistent with the domain knowledge that no trip should have a duration
    # below zero.  The upper bound is Q3 plus 1.5 times the spread between Q3
    # and that lower bound (the lower bound stands in for Q1).
    lower_bound = 0
    q3 = ddf["trip_time"].quantile(0.75)
    upper_bound = q3 + (1.5 * (q3 - lower_bound))

    within_bounds = (ddf["trip_time"] >= lower_bound) & (ddf["trip_time"] <= upper_bound)
    ddf = ddf.loc[within_bounds]

    ddf = ddf.repartition(partition_size="128MB")

    remaining_rowcount = len(ddf.index)
    # Guard against an empty input so the progress message cannot raise
    # ZeroDivisionError.
    fraction_left = (
        remaining_rowcount / original_rowcount if original_rowcount else float("nan")
    )
    print(f"Fraction of dataset left after removing outliers:  {fraction_left}")

    return ddf

In [None]:
def get_superborough(df):
    """Add a ``CrossSuperborough`` column to a frame with borough columns.

    Each value of ``PUBorough`` and ``DOBorough`` is looked up in
    ``BOROUGH_MAPPING`` (boroughs missing from the mapping resolve to
    ``None``).  A row gets ``"N"`` when the pickup and drop-off lookups are
    equal and ``"Y"`` otherwise.  A new frame is returned via ``df.assign``.
    """
    to_superborough = BOROUGH_MAPPING.get
    pickup_groups = map(to_superborough, df.PUBorough.tolist())
    dropoff_groups = map(to_superborough, df.DOBorough.tolist())

    crossing_flags = []
    for pickup_group, dropoff_group in zip(pickup_groups, dropoff_groups):
        if pickup_group == dropoff_group:
            crossing_flags.append("N")
        else:
            crossing_flags.append("Y")

    return df.assign(CrossSuperborough=crossing_flags)

In [None]:
def make_taxi_data(ddf, zone_lookup_path="data/taxi+_zone_lookup.csv"):
    """Enrich the trip data with borough features and categorical dtypes.

    Parameters
    ----------
    ddf : dask.dataframe.DataFrame
        Trip data.  The code reads ``PULocationID``, ``DOLocationID`` and
        ``airport_fee``, drops the columns listed in ``to_drop`` below, and
        converts the columns listed in ``categories`` to categoricals.
    zone_lookup_path : str, optional
        CSV file read with pandas; only its ``LocationID`` and ``Borough``
        columns are used.  Defaults to the path this notebook was written
        against.

    Returns
    -------
    dask.dataframe.DataFrame
        Frame with ``PUBorough``, ``DOBorough`` and ``CrossSuperborough``
        added, ``airport_fee`` as float with missing values replaced by 0,
        the unneeded fare columns dropped, and the feature columns converted
        to categoricals.
    """
    print("Load taxi data")
    taxi_df = pd.read_csv(zone_lookup_path, usecols=["LocationID", "Borough"])

    # Attach the borough of the pickup zone, then of the drop-off zone.  The
    # inner join discards trips whose location ID is absent from the lookup.
    for location_col, borough_col in (
        ("PULocationID", "PUBorough"),
        ("DOLocationID", "DOBorough"),
    ):
        ddf = dd.merge(
            ddf, taxi_df, left_on=location_col, right_on="LocationID", how="inner"
        )
        ddf = ddf.rename(columns={"Borough": borough_col})
        ddf = ddf.drop(columns="LocationID")

    print("Make superboroughs")
    ddf = ddf.map_partitions(get_superborough)

    # airport_fee can carry the literal strings "None" / "nan"; turn those
    # into 0, cast to float, then fill any remaining missing values with 0.
    airport_fee = ddf["airport_fee"].replace("None", 0).replace("nan", 0)
    ddf = ddf.assign(airport_fee=airport_fee.astype(float).fillna(0))

    print("Drop unneeded cols")
    to_drop = [
        "base_passenger_fare",
        "bcf",
        "sales_tax",
        "tips",
        "driver_pay",
        "access_a_ride_flag",
        "wav_match_flag",
    ]
    ddf2 = ddf.drop(columns=to_drop)
    ddf2 = ddf2.repartition(partition_size="100MB")

    print("Make categoricals")
    categories = [
        "hvfhs_license_num",
        "PULocationID",
        "DOLocationID",
        "wav_request_flag",
        "accessible_vehicle",
        "pickup_month",
        "pickup_dow",
        "pickup_hour",
        "PUBorough",
        "DOBorough",
        "CrossSuperborough",
    ]
    # Cast to category dtype first, then let `categorize` resolve the
    # categories across partitions.
    ddf2[categories] = ddf2[categories].astype("category")
    ddf2 = ddf2.categorize(columns=categories)
    ddf2 = ddf2.repartition(partition_size="128MB")
    return ddf2

## Test Loading Dataset

In [None]:
# Start a 20-worker Coiled cluster for the data-preparation step and connect a
# Dask client to it.
cluster = coiled.Cluster(
    worker_vm_types=["m6i.4xlarge"],
    scheduler_vm_types=["m6i.2xlarge"],
    package_sync=True, # copy local packages to the cluster
    name="dask-engineering-f799f650-0",
    # NOTE(review): this line was previously commented "reuse cluster across
    # runs", but the flag name says the cluster is shut down on close.
    # Confirm which behaviour is intended.
    shutdown_on_close=True,
    show_widget=False,
    n_workers=20,
    use_best_zone=True,
    account="dask-engineering",
    )
client = Client(cluster)

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# client created above.
client

In [None]:
# Run the full preparation pipeline: load and clean the trips, then add the
# borough features and categorical dtypes.
ddf = make_taxi_data(load_data())

In [None]:
# Sanity check: preview the first rows of the prepared frame.
ddf.head()

In [None]:
# Sanity check: list the column names that survived the preparation steps.
ddf.columns.tolist()

In [None]:
# Sanity check: inspect the column dtypes (make_taxi_data converts the
# feature columns to categoricals).
ddf.dtypes

In [None]:
# Write the prepared feature table to S3 as parquet.
# overwrite=True is passed, so any earlier output at this path is expected to
# be replaced.
ddf.to_parquet("s3://prefect-dask-examples/nyc-uber-lyft/feature_table.parquet", overwrite=True)

In [None]:
# Done with data preparation: shut down through the client.
# NOTE(review): presumably this stops the scheduler and workers, not just the
# local client connection — confirm against the distributed docs.
client.shutdown()