[dask] Update dask demo for using the new dask backend. #10347

Merged
merged 1 commit on May 31, 2024
33 changes: 18 additions & 15 deletions demo/dask/gpu_training.py
@@ -3,7 +3,7 @@
 ====================================
 """

-import cupy as cp
+import dask
 import dask_cudf
 from dask import array as da
 from dask import dataframe as dd
@@ -24,12 +24,8 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
     # history obtained from evaluation metrics.
     output = dxgb.train(
         client,
-        {
-            "verbosity": 2,
-            "tree_method": "hist",
-            # Golden line for GPU training
-            "device": "cuda",
-        },
+        # Make sure the device is set to CUDA.
+        {"tree_method": "hist", "device": "cuda"},
         dtrain,
         num_boost_round=4,
         evals=[(dtrain, "train")],
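Note on the hunk above: `dxgb.train` returns a dictionary rather than a bare booster; the lines that unpack it are collapsed in this view. A minimal sketch of its use, assuming the `output` variable from the call above:

# `output` bundles the trained booster with the evaluation history from `evals`.
bst = output["booster"]
history = output["history"]
# With the default objective, this prints the per-iteration training RMSE.
print(history["train"]["rmse"])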
@@ -50,18 +46,17 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
     .. versionadded:: 1.2.0

     """
-    X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X))
-    y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y))

     # `DaskQuantileDMatrix` is used instead of `DaskDMatrix`, be careful that it can not
     # be used for anything else other than training unless a reference is specified. See
     # the `ref` argument of `DaskQuantileDMatrix`.
     dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
     output = dxgb.train(
         client,
-        {"verbosity": 2, "tree_method": "hist", "device": "cuda"},
+        # Make sure the device is set to CUDA.
+        {"tree_method": "hist", "device": "cuda"},
         dtrain,
         num_boost_round=4,
         evals=[(dtrain, "train")],
     )

     prediction = dxgb.predict(client, output, X)
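To illustrate the `ref` argument referenced in the comment above (a sketch with a hypothetical validation split, not part of this PR): a second `DaskQuantileDMatrix` can reuse the quantile cuts computed for the training matrix, which is what makes the matrix usable beyond training:

dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
# `ref=dtrain` shares the training matrix's histogram bins with the
# validation matrix; X_valid and y_valid are assumed to exist.
dvalid = dxgb.DaskQuantileDMatrix(client, X_valid, y_valid, ref=dtrain)
output = dxgb.train(
    client,
    {"tree_method": "hist", "device": "cuda"},
    dtrain,
    num_boost_round=4,
    evals=[(dtrain, "train"), (dvalid, "valid")],
)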
@@ -72,15 +67,23 @@
 # `LocalCUDACluster` is used for assigning GPU to XGBoost processes. Here
 # `n_workers` represents the number of GPUs since we use one GPU per worker process.
 with LocalCUDACluster(n_workers=2, threads_per_worker=4) as cluster:
-    with Client(cluster) as client:
-        # generate some random data for demonstration
+    # Create client from cluster, set the backend to GPU array (cupy).
+    with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
+        # Generate some random data for demonstration
         rng = da.random.default_rng(1)

-        m = 100000
+        m = 2**18
         n = 100
-        X = rng.normal(size=(m, n))
+        X = rng.uniform(size=(m, n), chunks=(128**2, -1))
         y = X.sum(axis=1)

+        # XGBoost can take arrays. This is to show that DataFrame uses the GPU
+        # backend as well.
         X = dd.from_dask_array(X)
         y = dd.from_dask_array(y)
+        assert isinstance(X, dask_cudf.DataFrame)
+        assert isinstance(y, dask_cudf.Series)

         print("Using DaskQuantileDMatrix")
         from_ddqdm = using_quantile_device_dmatrix(client, X, y)
         print("Using DMatrix")
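For context on the `array.backend` switch introduced in the hunk above: dask dispatches array creation routines to the configured backend, so the same `default_rng` call yields cupy-backed chunks instead of numpy ones. A minimal check, assuming a machine with a GPU and cupy installed:

import dask
from dask import array as da

with dask.config.set({"array.backend": "cupy"}):
    X = da.random.default_rng(1).uniform(size=(1000, 10))

# `_meta` exposes the array library backing each chunk.
print(type(X._meta))  # expected: <class 'cupy.ndarray'>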
19 changes: 11 additions & 8 deletions demo/dask/sklearn_gpu_training.py
@@ -3,6 +3,7 @@
 ===================================================================
 """

+import dask
 from dask import array as da
 from dask.distributed import Client

@@ -13,17 +14,18 @@

 def main(client: Client) -> dxgb.Booster:
-    # generate some random data for demonstration
-    m = 1000000
+    # Generate some random data for demonstration
+    rng = da.random.default_rng(1)
+
+    m = 2**18
     n = 100
-    partition_size = 10000
-    X = da.random.random((m, n), partition_size)
-    y = da.random.random(m, partition_size)
+    X = rng.uniform(size=(m, n), chunks=(128**2, -1))
+    y = X.sum(axis=1)

     regressor = dxgb.DaskXGBRegressor(verbosity=1)
-    # set the device to CUDA
+    # Set the device to CUDA
     regressor.set_params(tree_method="hist", device="cuda")
-    # assigning client here is optional
+    # Assigning client here is optional
     regressor.client = client

     regressor.fit(X, y, eval_set=[(X, y)])
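As a usage note for the hunk above (a sketch, not part of the diff): `DaskXGBRegressor.predict` returns a lazy dask array, so nothing is materialized until explicitly computed:

prediction = regressor.predict(X)
# Materialize only a small slice on the local process.
print(prediction[:5].compute())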
@@ -42,5 +44,6 @@ def main(client: Client) -> dxgb.Booster:
 # With dask cuda, one can scale up XGBoost to arbitrary GPU clusters.
 # `LocalCUDACluster` used here is only for demonstration purpose.
 with LocalCUDACluster() as cluster:
-    with Client(cluster) as client:
+    # Create client from cluster, set the backend to GPU array (cupy).
+    with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
         main(client)
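A closing sketch (an assumption, not shown in the PR): since `main` returns the trained booster, it can be saved like any single-node XGBoost model from inside the `Client` context:

booster = main(client)
# The file name here is hypothetical.
booster.save_model("sklearn_gpu_demo.json")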