Fixes #311: Added a launch configuration builder class, `cuda::launch_config_builder_t` (which you can create using `cuda::launch_config_builder()`). It makes building launch configurations easier:

* Easy to build linear launch configurations.
* Can specify the overall dimensions and the block or grid dims instead of always having to compute block and grid dims yourself.
* When compiling in Debug mode (i.e. with `NDEBUG` undefined), checks compatibility of dimensions and shared memory size with the kernel or device associated with the builder.
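
For instance, here is a minimal sketch of the two typical uses, distilled from the example diffs below (`num_elements` and `wrapped_kernel` are stand-in names):

```cpp
// Build a linear launch configuration from the overall size and a
// chosen block size; the builder computes the grid dimensions:
auto config = cuda::launch_config_builder()
    .overall_size(num_elements)   // total number of threads overall
    .block_size(256)              // threads per block
    .build();

// Or associate a kernel, and let the builder use the maximum
// block size with which that kernel can be launched:
auto config_for_kernel = cuda::launch_config_builder()
    .kernel(&wrapped_kernel)      // e.g. from cuda::kernel::get(device, my_kernel)
    .overall_size(num_elements)
    .use_maximum_linear_block()
    .build();
```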

Remains to be implemented:

* Integration of optimal block size / launch grid functions from the API with this builder.
eyalroz committed Apr 16, 2022
1 parent 4b0a438 commit 7aaf52d
Showing 7 changed files with 477 additions and 22 deletions.
examples/by_runtime_api_module/event_management.cu (8 changes: 5 additions & 3 deletions)
```diff
@@ -98,9 +98,11 @@ int main(int argc, char **argv)
 	auto buffer = cuda::memory::managed::make_unique<char[]>(
 		device, buffer_size, cuda::memory::managed::initial_visibility_t::to_all_devices);
 	auto wrapped_kernel = cuda::kernel::get(device, increment);
-	cuda::grid::block_dimension_t threads_per_block = wrapped_kernel.maximum_threads_per_block();
-	cuda::grid::dimension_t num_blocks = div_rounding_up(buffer_size, threads_per_block);
-	auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);
+	auto launch_config = cuda::launch_config_builder()
+		.kernel(&wrapped_kernel)
+		.overall_size(buffer_size)
+		.use_maximum_linear_block()
+		.build();
 
 	stream.enqueue.kernel_launch(print_message<N,1>, { 1, 1 }, message<N>("I am launched before the first event"));
 	stream.enqueue.event(event_1);
```
examples/by_runtime_api_module/execution_control.cu (2 changes: 1 addition & 1 deletion)
```diff
@@ -101,7 +101,7 @@ int main(int argc, char **argv)
 
 	const int bar = 123;
 	const unsigned num_blocks = 3;
-	std::cout << "Getting kernel attibute CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK" << std::endl;
+	std::cout << "Getting kernel attribute CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK" << std::endl;
 	auto max_threads_per_block = kernel.get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
 	auto launch_config = cuda::make_launch_config(num_blocks, max_threads_per_block);
 	std::cout
```
examples/modified_cuda_samples/asyncAPI/asyncAPI.cu (7 changes: 3 additions & 4 deletions)
```diff
@@ -65,10 +65,9 @@ int main(int, char **)
 
 	auto d_a = cuda::memory::device::make_unique<datum[]>(device, n);
 
-	auto threads = cuda::grid::block_dimensions_t(512, 1);
-	assert_(n % threads.x == 0);
-	auto blocks = cuda::grid::dimensions_t(n / threads.x, 1);
-	auto launch_config = cuda::make_launch_config(blocks, threads);
+	auto launch_config = cuda::launch_config_builder()
+		.overall_size(n)
+		.block_size(512).build();
 
 	// create cuda event handles
 	auto start_event = cuda::event::create(
```
examples/modified_cuda_samples/vectorAdd/vectorAdd.cu (15 changes: 8 additions & 7 deletions)
```diff
@@ -52,16 +52,17 @@ int main()
 	cuda::memory::copy(d_A.get(), h_A.get(), size);
 	cuda::memory::copy(d_B.get(), h_B.get(), size);
 
-	// Launch the Vector Add CUDA Kernel
-	int threadsPerBlock = 256;
-	int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+	auto launch_config = cuda::launch_config_builder()
+		.overall_size(numElements)
+		.block_size(256)
+		.build();
 
 	std::cout
-		<< "CUDA kernel launch with " << blocksPerGrid
-		<< " blocks of " << threadsPerBlock << " threads\n";
+		<< "CUDA kernel launch with " << launch_config.dimensions.grid.x
+		<< " blocks of " << launch_config.dimensions.block.x << " threads each\n";
 
 	cuda::launch(
-		vectorAdd,
-		cuda::make_launch_config( blocksPerGrid, threadsPerBlock ),
+		vectorAdd, launch_config,
 		d_A.get(), d_B.get(), d_C.get(), numElements
 	);
 
```
```diff
@@ -68,16 +68,17 @@ int main(void)
 	cuda::memory::copy(d_A.get(), h_A.get(), size);
 	cuda::memory::copy(d_B.get(), h_B.get(), size);
 
-	// Launch the Vector Add CUDA Kernel
-	int threadsPerBlock = 256;
-	int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+	auto launch_config = cuda::launch_config_builder()
+		.overall_size(numElements)
+		.block_size(256)
+		.build();
 
 	std::cout
-		<< "CUDA kernel launch with " << blocksPerGrid
-		<< " blocks of " << threadsPerBlock << " threads\n";
+		<< "CUDA kernel launch with " << launch_config.dimensions.grid.x
+		<< " blocks of " << launch_config.dimensions.block.x << " threads each\n";
 
 	cuda::launch(
-		vectorAdd,
-		cuda::launch_configuration_t( blocksPerGrid, threadsPerBlock ),
+		vectorAdd, launch_config,
 		d_A.get(), d_B.get(), d_C.get(), numElements
 	);
 
```
src/cuda/api.hpp (3 changes: 3 additions & 0 deletions)
```diff
@@ -43,6 +43,7 @@ static_assert(__cplusplus >= 201103L, "The CUDA API headers can only be compiled
 
 #include <cuda/api/pci_id_impl.hpp>
 #include <cuda/api/apriori_compiled_kernel.hpp>
+#include <cuda/api/launch_configuration.hpp>
 #include <cuda/api/kernel_launch.hpp>
 #include <cuda/api/virtual_memory.hpp>
 
@@ -59,4 +60,6 @@ static_assert(__cplusplus >= 201103L, "The CUDA API headers can only be compiled
 #include <cuda/api/multi_wrapper_impls/apriori_compiled_kernel.hpp>
 #include <cuda/api/multi_wrapper_impls/module.hpp>
 
+#include <cuda/api/launch_config_builder.hpp>
+
 #endif // CUDA_API_WRAPPERS_HPP_
```