# Train an XGBoost Model with Dask

We start by training a single XGBoost model with Dask using the [`xgboost.dask`](https://xgboost.readthedocs.io/en/stable/tutorials/dask.html) module built into XGBoost. In this notebook we:

-  Load the data
-  Perform basic feature engineering (data type optimization, categorization)
-  Train a single model with XGBoost, using custom cross-validation

In [None]:
from __future__ import annotations

from collections.abc import Iterator
from datetime import datetime

import coiled
import dask.array as da
import dask.dataframe as dd
import distributed
import numpy as np
import pandas as pd
import xgboost
from dask_ml.metrics import mean_squared_error

In [None]:
# Where the feature table lives (Parquet dataset on S3)
FILEPATH = "s3://coiled-datasets/prefect-dask/nyc-uber-lyft/feature_table.parquet"

# Cross-validation fold count per trial. The same number fixes the train/test
# ratio: with N_FOLDS=5 each split trains on 4/5 of the data and tests on 1/5.
N_FOLDS = 5

# Dask cluster sizing: how many workers, and the instance type each one uses
N_WORKERS = 50
WORKER_INSTANCE_TYPE = "r6i.large"

In [None]:
# Start the Coiled cluster that will run the Dask workload.
cluster = coiled.Cluster(
    # Sizing
    n_workers=N_WORKERS,
    worker_vm_types=[WORKER_INSTANCE_TYPE],
    scheduler_vm_types=["m6i.large"],
    scheduler_options={"idle_timeout": "15 minutes"},
    # Cloud placement and pricing options
    backend_options={
        "region": "us-east-2",
        "multizone": True,
        "spot": True,
        "spot_on_demand_fallback": True,
    },
    # Software environment: align remote packages to local ones
    package_sync=True,
)

# Connect a Dask client to the cluster
client = distributed.Client(cluster)

In [None]:
# Read the feature table produced by Feature Engineering.ipynb
ddf = dd.read_parquet(FILEPATH)

# Optional: keep only the first partitions to speed up the exercise.
# ddf = ddf.partitions[:20]

# XGBoost converts floats to `float32` under the hood,
# so do the downcast a single time up front.
float_cols = ddf.select_dtypes(include="float").columns.tolist()
ddf = ddf.astype(dict.fromkeys(float_cols, np.float32))

# The categorical columns need known categories
categorical_vars = ddf.select_dtypes(include="category").columns.tolist()

# Three steps, in this exact order:
#   1. persist()    - categorize() reads the whole input and then discards it,
#                     so persisting first means we read from disk only once;
#   2. categorize() - make the categories of the categorical columns known;
#   3. persist()    - the result is accessed multiple times later on.
ddf = ddf.persist().categorize(columns=categorical_vars).persist()

ddf.head()

### Train Model

In [None]:
def make_cv_splits(
    n_folds: int = N_FOLDS,
    random_state: int | None = None,
) -> Iterator[tuple[dd.DataFrame, dd.DataFrame]]:
    """Yield ``(train, test)`` subsets of the global ``ddf`` for cross-validation.

    ``ddf`` is randomly split (with shuffling) into ``n_folds`` parts of equal
    fraction. Each part is used once as the test set, while the remaining
    ``n_folds - 1`` parts are concatenated into the training set.

    Parameters
    ----------
    n_folds
        Number of folds; must be at least 2 so the training set is not empty.
    random_state
        Optional seed forwarded to ``ddf.random_split`` to make the splits
        reproducible. The default ``None`` keeps the previous behaviour
        (a different random split on every call).

    Yields
    ------
    tuple of (train, test) Dask DataFrames, one pair per fold.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 2. Because this is a generator, the
        error is raised when iteration starts, not when the function is called.
    """
    if n_folds < 2:
        raise ValueError(f"n_folds must be at least 2, got {n_folds}")

    frac = [1 / n_folds] * n_folds
    splits = ddf.random_split(frac, random_state=random_state, shuffle=True)
    for i, test in enumerate(splits):
        # Every split except the i-th one goes into the training set
        train = [part for j, part in enumerate(splits) if j != i]
        yield dd.concat(train), test

In [None]:
# Cross-validation loop: train one model per (train, test) split and collect
# the error of each one on its held-out test data.
start = datetime.now()
scores = []

for i, (train, test) in enumerate(make_cv_splits()):
    print(f"Training/Test split #{i}")
    # "trip_time" is the target; every other column is a feature
    y_train = train["trip_time"]
    X_train = train.drop(columns=["trip_time"])
    y_test = test["trip_time"]
    X_test = test.drop(columns=["trip_time"])

    print("Building DMatrix...")
    # NOTE(review): `None` is passed as the client argument here and below;
    # presumably xgboost.dask then uses the current default client - confirm.
    d_train = xgboost.dask.DaskDMatrix(
        None, X_train, y_train, enable_categorical=True
    )

    print("Training model...")
    model = xgboost.dask.train(
        None,
        {"tree_method": "hist"},
        d_train,
        num_boost_round=4,
        evals=[(d_train, "train")],
    )

    print("Running model on test data...")
    predictions = xgboost.dask.predict(None, model, X_test)

    print("Measuring accuracy of model vs. ground truth...")
    # squared=False requests the root of the mean squared error;
    # compute=False keeps the result lazy so it can be persisted below.
    score = mean_squared_error(
        y_test.to_dask_array(),
        predictions.to_dask_array(),
        squared=False,
        compute=False,
    )
    # Compute predictions and mean squared error for this iteration
    # while we start the next one
    scores.append(score.reshape(1).persist())
    print("-" * 80)

# Gather the per-split scores into a single in-memory array
scores = da.concatenate(scores).compute()
print(f"RMSE={scores.mean()} +/- {scores.std()}")
print(f"Total time:  {datetime.now() - start}")

In [None]:
# Training is done: shut down through the client created above.
# NOTE(review): presumably this also stops the remote scheduler and workers - confirm.
client.shutdown()

In [None]:
# Display the result of the last `xgboost.dask.train` call, i.e. the model
# trained on the final cross-validation split.
model