Fixes #311: Added a launch configuration builder class, `cuda::launch_config_builder_t` (which you can create using `cuda::launch_config_builder()`). It makes building launch configurations easier:

* Easy to build linear launch configurations.
* Can specify the overall dimensions and the block or grid dims instead of always having to compute block and grid dims yourself.
* When compiling in Debug mode (i.e. with `NDEBUG` undefined), checks compatibility of dimensions and shared memory size with the kernel or device associated with the builder.
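
For instance, here is a minimal sketch of the two typical uses, distilled from the example diffs below (`num_elements` and `wrapped_kernel` are stand-in names):

```cpp
// Build a linear launch configuration from the overall size and a
// chosen block size; the builder computes the grid dimensions:
auto config = cuda::launch_config_builder()
    .overall_size(num_elements)   // total number of threads overall
    .block_size(256)              // threads per block
    .build();

// Or associate a kernel, and let the builder use the maximum
// block size with which that kernel can be launched:
auto config_for_kernel = cuda::launch_config_builder()
    .kernel(&wrapped_kernel)      // e.g. from cuda::kernel::get(device, my_kernel)
    .overall_size(num_elements)
    .use_maximum_linear_block()
    .build();
```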

Remains to be implemented:

* Integration of optimal block size / launch grid functions from the API with this builder.
eyalroz committed Apr 16, 2022
1 parent 4b0a438 commit 7aaf52d
Showing 7 changed files with 477 additions and 22 deletions.
examples/by_runtime_api_module/event_management.cu (8 changes: 5 additions & 3 deletions)
```diff
@@ -98,9 +98,11 @@ int main(int argc, char **argv)
 	auto buffer = cuda::memory::managed::make_unique<char[]>(
 		device, buffer_size, cuda::memory::managed::initial_visibility_t::to_all_devices);
 	auto wrapped_kernel = cuda::kernel::get(device, increment);
-	cuda::grid::block_dimension_t threads_per_block = wrapped_kernel.maximum_threads_per_block();
-	cuda::grid::dimension_t num_blocks = div_rounding_up(buffer_size, threads_per_block);
-	auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);
+	auto launch_config = cuda::launch_config_builder()
+		.kernel(&wrapped_kernel)
+		.overall_size(buffer_size)
+		.use_maximum_linear_block()
+		.build();
 
 	stream.enqueue.kernel_launch(print_message<N,1>, { 1, 1 }, message<N>("I am launched before the first event"));
 	stream.enqueue.event(event_1);
```
examples/by_runtime_api_module/execution_control.cu (2 changes: 1 addition & 1 deletion)
```diff
@@ -101,7 +101,7 @@ int main(int argc, char **argv)
 
 	const int bar = 123;
 	const unsigned num_blocks = 3;
-	std::cout << "Getting kernel attibute CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK" << std::endl;
+	std::cout << "Getting kernel attribute CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK" << std::endl;
 	auto max_threads_per_block = kernel.get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
 	auto launch_config = cuda::make_launch_config(num_blocks, max_threads_per_block);
 	std::cout
```
examples/modified_cuda_samples/asyncAPI/asyncAPI.cu (7 changes: 3 additions & 4 deletions)
```diff
@@ -65,10 +65,9 @@ int main(int, char **)
 
 	auto d_a = cuda::memory::device::make_unique<datum[]>(device, n);
 
-	auto threads = cuda::grid::block_dimensions_t(512, 1);
-	assert_(n % threads.x == 0);
-	auto blocks = cuda::grid::dimensions_t(n / threads.x, 1);
-	auto launch_config = cuda::make_launch_config(blocks, threads);
+	auto launch_config = cuda::launch_config_builder()
+		.overall_size(n)
+		.block_size(512).build();
 
 	// create cuda event handles
 	auto start_event = cuda::event::create(
```
examples/modified_cuda_samples/vectorAdd/vectorAdd.cu (15 changes: 8 additions & 7 deletions)
```diff
@@ -52,16 +52,17 @@ int main()
 	cuda::memory::copy(d_A.get(), h_A.get(), size);
 	cuda::memory::copy(d_B.get(), h_B.get(), size);
 
-	// Launch the Vector Add CUDA Kernel
-	int threadsPerBlock = 256;
-	int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+	auto launch_config = cuda::launch_config_builder()
+		.overall_size(numElements)
+		.block_size(256)
+		.build();
 
 	std::cout
-		<< "CUDA kernel launch with " << blocksPerGrid
-		<< " blocks of " << threadsPerBlock << " threads\n";
+		<< "CUDA kernel launch with " << launch_config.dimensions.grid.x
+		<< " blocks of " << launch_config.dimensions.block.x << " threads each\n";
 
 	cuda::launch(
-		vectorAdd,
-		cuda::make_launch_config( blocksPerGrid, threadsPerBlock ),
+		vectorAdd, launch_config,
 		d_A.get(), d_B.get(), d_C.get(), numElements
 	);
 
```
```diff
@@ -68,16 +68,17 @@ int main(void)
 	cuda::memory::copy(d_A.get(), h_A.get(), size);
 	cuda::memory::copy(d_B.get(), h_B.get(), size);
 
-	// Launch the Vector Add CUDA Kernel
-	int threadsPerBlock = 256;
-	int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+	auto launch_config = cuda::launch_config_builder()
+		.overall_size(numElements)
+		.block_size(256)
+		.build();
 
 	std::cout
-		<< "CUDA kernel launch with " << blocksPerGrid
-		<< " blocks of " << threadsPerBlock << " threads\n";
+		<< "CUDA kernel launch with " << launch_config.dimensions.grid.x
+		<< " blocks of " << launch_config.dimensions.block.x << " threads each\n";
 
 	cuda::launch(
-		vectorAdd,
-		cuda::launch_configuration_t( blocksPerGrid, threadsPerBlock ),
+		vectorAdd, launch_config,
 		d_A.get(), d_B.get(), d_C.get(), numElements
 	);
 
```
src/cuda/api.hpp (3 changes: 3 additions & 0 deletions)
```diff
@@ -43,6 +43,7 @@ static_assert(__cplusplus >= 201103L, "The CUDA API headers can only be compiled
 
 #include <cuda/api/pci_id_impl.hpp>
 #include <cuda/api/apriori_compiled_kernel.hpp>
+#include <cuda/api/launch_configuration.hpp>
 #include <cuda/api/kernel_launch.hpp>
 #include <cuda/api/virtual_memory.hpp>
 
@@ -59,4 +60,6 @@ static_assert(__cplusplus >= 201103L, "The CUDA API headers can only be compiled
 #include <cuda/api/multi_wrapper_impls/apriori_compiled_kernel.hpp>
 #include <cuda/api/multi_wrapper_impls/module.hpp>
 
+#include <cuda/api/launch_config_builder.hpp>
+
 #endif // CUDA_API_WRAPPERS_HPP_
```