Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ option(BOOST_CAPY_BUILD_TESTS "Build boost::capy tests" ${BUILD_TESTING})
# Build the capy example programs; the default follows BOOST_CAPY_IS_ROOT.
option(BOOST_CAPY_BUILD_EXAMPLES "Build boost::capy examples" ${BOOST_CAPY_IS_ROOT})
# Build the capy benchmarks; the default follows BOOST_CAPY_IS_ROOT.
option(BOOST_CAPY_BUILD_BENCH "Build boost::capy benchmarks" ${BOOST_CAPY_IS_ROOT})
# Opt-in (default OFF): examples that depend on beman-execution (P2300).
option(BOOST_CAPY_BUILD_P2300_EXAMPLES "Build examples that depend on beman-execution (P2300)" OFF)
# Opt-in (default OFF): examples that depend on NVIDIA nvexec (CUDA).
option(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES "Build examples that depend on NVIDIA nvexec (CUDA)" OFF)
# Opt-in (default OFF): build the target used by MrDocs; see mrdocs.yml.
option(BOOST_CAPY_MRDOCS_BUILD "Build the target for MrDocs: see mrdocs.yml" OFF)

if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
Expand All @@ -49,6 +50,24 @@ if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
endif()
endif()

# Opt-in CUDA/nvexec examples: validate the prerequisites, enable the CUDA
# language, and ask NVIDIA/stdexec to provide its nvexec target.
if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
    # The nvexec example is only allowed together with the stdexec examples.
    if(NOT BOOST_CAPY_BUILD_STDEXEC_EXAMPLES)
        message(FATAL_ERROR
            "BOOST_CAPY_BUILD_NVEXEC_EXAMPLES requires "
            "BOOST_CAPY_BUILD_STDEXEC_EXAMPLES=ON")
    endif()

    # CMAKE_CXX_STANDARD must be set explicitly, and to 23 or newer.
    if(NOT DEFINED CMAKE_CXX_STANDARD OR CMAKE_CXX_STANDARD LESS 23)
        message(FATAL_ERROR
            "BOOST_CAPY_BUILD_NVEXEC_EXAMPLES requires CMAKE_CXX_STANDARD >= 23")
    endif()

    enable_language(CUDA)
    find_package(CUDAToolkit REQUIRED)

    # Switch on STDEXEC_ENABLE_CUDA so that NVIDIA/stdexec builds the nvexec
    # target when its FetchContent is processed (bench/ and/or the example
    # itself). FORCE overwrites any value already stored in the cache.
    set(STDEXEC_ENABLE_CUDA ON
        CACHE BOOL "Build nvexec when configuring NVIDIA/stdexec" FORCE)
endif()

# Turn on the global USE_FOLDERS property so targets can be organised by
# their FOLDER property in IDE generators.
set_property(GLOBAL PROPERTY USE_FOLDERS ON)

if(BOOST_CAPY_IS_ROOT AND BUILD_SHARED_LIBS)
Expand Down
4 changes: 4 additions & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
add_subdirectory(awaitable-sender)
endif()

# The GPU pipeline example is added only when the nvexec examples are enabled.
if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
    add_subdirectory(gpu-pipeline)
endif()

# The asio example is added only when a Boost::asio target exists.
if(TARGET Boost::asio)
    add_subdirectory(asio)
endif()
Expand Down
58 changes: 58 additions & 0 deletions example/gpu-pipeline/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#
# Copyright (c) 2026 Steve Gerbino
#
# Distributed under the Boost Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#
# Official repository: https://github.com/cppalliance/capy
#

# The top-level CMakeLists.txt enables CUDA when the option is switched on.
# Fail with a clear message if this directory was reached without it.
if(NOT CMAKE_CUDA_COMPILER)
    message(FATAL_ERROR
        "example/gpu-pipeline requires CUDA; "
        "did you set BOOST_CAPY_BUILD_NVEXEC_EXAMPLES?")
endif()

# Declare NVIDIA/stdexec here as well, so the example does not depend on
# bench/ and still builds with BOOST_CAPY_BUILD_BENCH=OFF. If bench has
# already declared content under the same name, this declaration is a no-op.
include(FetchContent)

FetchContent_Declare(
    stdexec
    GIT_REPOSITORY https://github.com/NVIDIA/stdexec
    GIT_TAG        307b83c5689ea7c2e5b31561cdc428697705333e
    SYSTEM
    FIND_PACKAGE_ARGS NAMES stdexec
)

FetchContent_MakeAvailable(stdexec)

# stdexec must have produced the nvexec target; stop with a hint otherwise.
if(NOT TARGET STDEXEC::nvexec)
    message(FATAL_ERROR
        "STDEXEC::nvexec target not found after configuring stdexec. "
        "Ensure CUDA is enabled and STDEXEC_ENABLE_CUDA=ON.")
endif()

# Collect the example's CUDA sources and headers, plus the build script and
# README so they are listed with the target. CONFIGURE_DEPENDS re-checks the
# glob at build time.
file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
    *.cu
    *.cuh
    *.hpp
    CMakeLists.txt
    README.md)

# Mirror the on-disk directory layout in IDE source groups.
source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})

add_executable(capy_example_gpu_pipeline ${PFILES})

# CUDA sources are compiled as C++20, without separable compilation; the
# target is filed under the "examples" folder.
set_target_properties(capy_example_gpu_pipeline
    PROPERTIES
        FOLDER "examples"
        CUDA_STANDARD 20
        CUDA_STANDARD_REQUIRED ON
        CUDA_SEPARABLE_COMPILATION OFF)

# NOTE(review): cxx_std_23 is a CXX-language feature, and the glob above
# collects no .cpp sources, so it presumably does not change how the .cu
# files are compiled (CUDA_STANDARD 20 governs those) — confirm intended.
target_compile_features(capy_example_gpu_pipeline PRIVATE cxx_std_23)

target_link_libraries(capy_example_gpu_pipeline
    PRIVATE
        Boost::capy
        STDEXEC::stdexec
        STDEXEC::nvexec
        CUDA::cudart)
12 changes: 12 additions & 0 deletions example/gpu-pipeline/Jamfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#
# Copyright (c) 2026 Steve Gerbino
#
# Distributed under the Boost Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#
# Official repository: https://github.com/cppalliance/capy
#

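# This example requires CUDA, a CUDA-capable C++23 compiler (clang as the
# CUDA compiler; nvc++ cannot currently compile capy's coroutines), and
# NVIDIA/stdexec (nvexec).
# It is built only via the CMake build (BOOST_CAPY_BUILD_NVEXEC_EXAMPLES=ON);
# the b2 build does not currently provide CUDA support for capy.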
88 changes: 88 additions & 0 deletions example/gpu-pipeline/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# GPU pipeline example

This example demonstrates that `boost::capy::await_sender` and
`boost::capy::as_sender` compose with NVIDIA's `nvexec::stream_scheduler`,
not just with CPU schedulers. Two scenes:

1. **Scene 1 (Direction 1).** A `boost::capy::task` running on
`boost::capy::thread_pool` `co_await`s a sender whose terminal action is
a real `__global__` SAXPY kernel scheduled on `nvexec::stream_scheduler`.
When the CUDA stream signals completion, the coroutine resumes on the
capy executor with the kernel's result.

2. **Scene 2 (Direction 2).** `boost::capy::test::stream::read_some` is
exposed as a stdexec sender via `boost::capy::as_sender`, composed with
`stdexec::upon_error`, and driven by `stdexec::sync_wait`. Two runs: a
happy-path read, and a peer-close that exercises the `upon_error` arm.

The example wraps `read_some` (a raw IoAwaitable) rather than
`boost::capy::read` (a `task<io_result<size_t>>`). The bridge's `start()`
does not perform symmetric transfer to a wrapped task's own coroutine
handle, so wrapping a task in `as_sender` hangs. Wrapping a raw
IoAwaitable works because its `await_suspend` is either ready-with-data
or returns `noop_coroutine()` after stashing the continuation for the
peer to resume.

The bridge headers (`awaitable_sender.hpp`, `sender_awaitable.hpp`) are
copied verbatim from `bench/stdexec/`; the bridge in the bench was already
written against NVIDIA/stdexec.

## Prerequisites

- NVIDIA GPU and driver visible to `nvidia-smi`.
- CUDA toolkit. On Arch: `pacman -S cuda`. CUDA 13.x works.
- A C++23-capable compiler with both `<coroutine>` support and CUDA
device-side compilation. Verified locally with clang 22 as host *and*
CUDA compiler.
- `CMAKE_CXX_STANDARD=23`.

nvc++ from the NVHPC SDK is the nominally blessed compiler for nvexec,
but nvc++ 26.3 does not enable C++20 coroutines (`__cpp_impl_coroutine` is
not defined, and `co_return` is parsed as an undefined identifier). capy is
built on coroutines, so nvc++
cannot compile capy at present. Clang-cuda is the working alternative.

## Building and running

```
CXX=clang++ cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_STANDARD=23 \
-DCMAKE_CUDA_COMPILER=clang++ \
-DCMAKE_CUDA_HOST_COMPILER=clang++ \
-DCMAKE_CUDA_ARCHITECTURES=89 \
-DCUDAToolkit_ROOT=/opt/cuda \
-DBOOST_CAPY_BUILD_STDEXEC_EXAMPLES=ON \
-DBOOST_CAPY_BUILD_NVEXEC_EXAMPLES=ON
cmake --build build --config Release --target capy_example_gpu_pipeline
./build/example/gpu-pipeline/capy_example_gpu_pipeline
```

Replace `89` with your GPU's compute capability written without the dot
(e.g. `8.9` becomes `89`); query it with
`nvidia-smi --query-gpu=compute_cap --format=csv,noheader`.

## Expected output

The exact thread ids vary, but the structure is fixed:

```
main thread: <tid-main>
--- scene 1: await_sender( gpu sender ) ---
scene1: pre-await on thread <tid-A>
scene1: post-await on thread <tid-B>
scene1: y[0] = 5
--- scene 2a: as_sender( read_some ) happy ---
scene2 happy: read 13 bytes
--- scene 2b: as_sender( read_some ) error ---
scene2 error: upon_error fired with "eof" (n=0)
all scenes passed
```

Exit status is 0 on success and non-zero on any failed assertion or CUDA
error.

## Scope

Correctness only. No performance measurement; no GPU-side cancellation;
no multi-device topologies. See
`docs/superpowers/specs/2026-05-27-stdexec-gpu-example-design.md` for the
full scope statement.
Loading
Loading