In [None]:
from coiled import Cluster
from distributed import Client
import dask.dataframe as dd

from dask_ml.model_selection import train_test_split
import xgboost as xgb

# Create the Coiled cluster used by every later cell and connect a
# distributed Client to it. Spot instances are requested, with on-demand
# fallback enabled through `backend_options`.
cluster = Cluster(
    n_workers=64,
    account="paul-hobson",  # or dask-engineering/
    package_sync=True,
    name="paul-ml-opt",
    show_widget=False,
    worker_memory="16 GiB",
    backend_options={"region": "us-east-2", "spot": True, "spot_on_demand_fallback": True},
)
client = Client(cluster)
client

In [None]:
# Load the parquet files and keep only the non-string columns.
s3_uri = "s3://coiled-datasets/uber-lyft-tlc/*.parquet"
raw_trips = dd.read_parquet(s3_uri, use_nullable_dtypes=True)
nyc_taxi = raw_trips.select_dtypes(exclude="string")

# The hour must be extracted while `pickup_datetime` is still a datetime;
# the datetime columns are converted to integers just below.
nyc_taxi["pickup_hour"] = nyc_taxi["pickup_datetime"].dt.hour

# Convert every datetime64[ns] column to an integer, scaling by 1e9
# (presumably nanoseconds -> whole seconds; confirm against the data).
cols = nyc_taxi.select_dtypes(include="datetime64[ns]").columns.tolist()
datetimes_as_int = nyc_taxi[cols].astype(int)
nyc_taxi[cols] = datetimes_as_int.div(1e9).astype(int)

# Target variable: difference between the converted dropoff and pickup values.
nyc_taxi["trip_time"] = nyc_taxi["dropoff_datetime"] - nyc_taxi["pickup_datetime"]


In [None]:
# Number of rows in the prepared frame; `.compute()` triggers the actual work.
nyc_taxi.shape[0].compute()

In [None]:
nyc_taxi.map_partitions(lambda x: x.memory_usage(deep=True).sum()).compute()

In [None]:
from dask.utils import format_bytes
_.map(format_bytes)

In [None]:
# Split features/target into train and test sets. `trip_time` is the target,
# so it is dropped from the feature frame. A fixed `random_state` makes the
# split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    nyc_taxi.drop(columns=["trip_time"]),
    nyc_taxi["trip_time"],
    random_state=42,
)

In [None]:
# Build the XGBoost Dask matrices for both splits with identical options.
dmatrix_kwargs = {"enable_categorical": True}
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train, **dmatrix_kwargs)
dtest = xgb.dask.DaskDMatrix(client, X_test, y_test, **dmatrix_kwargs)

In [None]:
# Booster configuration: histogram tree method, squared-error regression.
booster_params = {
    "verbosity": 2,
    "tree_method": "hist",
    "objective": "reg:squarederror",
}

# Train for at most 4 boosting rounds. The only evaluation set is the
# training matrix itself, so early stopping is driven by the training metric.
output = xgb.dask.train(
    client,
    booster_params,
    dtrain,
    num_boost_round=4,
    evals=[(dtrain, "train")],
    early_stopping_rounds=1,
)

In [None]:
# Show the training result; its "booster" entry is used for prediction below.
output

In [None]:
# Predict on the held-out features with the booster from the training result.
y_pred = xgb.dask.predict(client, output["booster"], X_test)

In [None]:
# Compute the predictions and keep the result in `_y`.
_y = y_pred.compute()

In [None]:
# The cluster is intentionally NOT closed here: the residual and histogram
# cells below still compute on it. Teardown happens in the later
# `client.close()` / `cluster.close()` cell.

In [None]:
# Prediction error per test row: actual trip time minus predicted trip time.
residuals = (y_test - y_pred)

In [None]:
import dask.array as da
import dask

In [None]:
# Histogram of the residuals over their full observed span, in 1000 bins.
residual_span = [residuals.min(), residuals.max()]
hist = da.histogram(residuals, bins=1000, range=residual_span)

In [None]:
# Evaluate the histogram and rebind `hist` to the computed result.
# NOTE(review): `hist` is passed as a single argument, so the result is
# presumably a 1-tuple wrapping the histogram output rather than the output
# itself; confirm whether `dask.compute(*hist)` was intended.
hist = dask.compute(hist)

In [None]:
client.close()
cluster.close()

In [None]:
y_train.shape[0].compute()

* early stopping
  * integer `early_stopping_rounds` (easy)
  * callback (more realistic)
* pass a sequence of chunk lengths to `.to_dask_array(lengths=...)`
*