Fixes #609, #610: Replace the use of our cuda::dynarray<T> (i.e. `std::vector<T>`'s, effectively) with `cuda::unique_span<T>`'s, and make sure they're padded by an extra allocated character set to '\0'
eyalroz committed Apr 29, 2024
1 parent 9695001 commit 05fbd41
Showing 7 changed files with 48 additions and 41 deletions.
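The core pattern of the change, as a minimal illustrative sketch rather than the library's actual code: query a size, allocate one extra character with make_unique_span, fill the buffer, and set the final character to '\0'. The getter names get_some_output_size / get_some_output are placeholders standing in for the NVRTC log/PTX/LTO-IR/cubin getters used in compilation_output.hpp below.

// Illustrative sketch only, following the pattern this commit applies:
unique_span<char> some_output() const
{
	size_t size = get_some_output_size();            // e.g. program::detail_::get_log_size(...)
	auto result = make_unique_span<char>(size + 1);  // allocate one extra char for a trailing NUL
	if (size == 0) {
		result[size] = '\0';
		return result;
	}
	get_some_output(result.data());                  // e.g. program::detail_::get_log(...)
	result[size] = '\0';                             // pad, so the buffer doubles as a C string
	return result;
}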
4 changes: 2 additions & 2 deletions examples/modified_cuda_samples/clock_nvrtc/clock.cpp
@@ -106,7 +106,7 @@ long double compute_average_elapsed_clocks(const clock_t* timers, std::size_t nu
return offset_sum / num_blocks;
}

- cuda::dynarray<char> compile_to_cubin(
+ cuda::unique_span<char> compile_to_cubin(
const char* kernel_source,
const char* kernel_name,
cuda::device_t target_device)
@@ -134,7 +134,7 @@ int main()
auto device_id { 0 }; // Not bothering with supporting a command-line argument here
auto device = cuda::device::get(device_id);
auto cubin = compile_to_cubin(clock_kernel::source, clock_kernel::name, device);
- auto module = cuda::module::create(device, cubin);
+ auto module = cuda::module::create(device, cubin.get());
auto kernel_in_module = module.get_kernel(clock_kernel::name);

cuda::grid::dimension_t num_blocks { 64 };
2 changes: 1 addition & 1 deletion src/cuda/api/library.hpp
@@ -63,7 +63,7 @@ ::std::string identify(const library_t &library);
* Create a CUDA driver library of compiled code from raw image data.
*
* @param[in] module_data the opaque, raw binary data for the module - in a contiguous container
- * such as a span, a cuda::dynarray etc..
+ * such as a span, a cuda::unique_span etc..
*/
///@{
template <typename ContiguousContainer,
2 changes: 1 addition & 1 deletion src/cuda/api/module.hpp
@@ -69,7 +69,7 @@ inline void destroy(handle_t handle, context::handle_t context_handle, device::i
* can obtain a CUDA context. This is the context into which the module data is to
* be loaded (and in which the module contents may be used)
* @param[in] module_data the opaque, raw binary data for the module - in a contiguous container
- * such as a span, a cuda::dynarray etc..
+ * such as a span, a cuda::unique_span etc..
*/
///@{
template <typename Locus, typename ContiguousContainer,
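Since unique_span<T> owns its buffer and is not an ::std::vector, call sites that previously passed the container itself now pass the span view via .get(), as in the clock_nvrtc example above. A hedged usage sketch (kernel_source and kernel_name are placeholders; compile_to_cubin is the helper from clock.cpp):

// Sketch, reusing the clock.cpp helper shown above:
auto device = cuda::device::get(0);
cuda::unique_span<char> cubin = compile_to_cubin(kernel_source, kernel_name, device);
auto module = cuda::module::create(device, cubin.get()); // pass a span over the owned buffer
auto kernel = module.get_kernel(kernel_name);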
10 changes: 6 additions & 4 deletions src/cuda/api/multi_wrapper_impls/memory.hpp
@@ -538,10 +538,12 @@ inline void set_access_permissions(const cuda::device_t& device, const pool_t& p
template <typename DeviceRange>
void set_access_permissions(DeviceRange devices, const pool_t& pool, access_permissions_t permissions)
{
- cuda::dynarray<cuda::device::id_t> device_ids(devices.size());
- ::std::transform(::std::begin(devices), ::std::end(devices), device_ids.begin());
- span<cuda::device::id_t> device_ids_span {device_ids.data(), device_ids.size()};
- cuda::memory::detail_::set_access_permissions(device_ids_span, pool.handle(), permissions);
+ // Not depending on unique_span here :-(
+ auto device_ids = ::std::unique_ptr<cuda::device::id_t[]>(new cuda::device::id_t[devices.size()]);
+ auto device_to_id = [](device_t const& device){ return device.id(); };
+ ::std::transform(::std::begin(devices), ::std::end(devices), device_ids.get(), device_to_id);
+ cuda::memory::detail_::set_access_permissions(
+     { device_ids.get(), devices.size() }, pool.handle(), permissions);
}
#endif // #if CUDA_VERSION >= 11020

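The ":-(" comment notes that this header cannot depend on unique_span; had it been able to, the body could have kept roughly its original shape. A hypothetical sketch of that alternative, not part of the commit, assuming unique_span is usable here and that its .get() yields a span accepted by detail_::set_access_permissions:

// Hypothetical alternative body, if unique_span were available in this header:
auto device_ids = make_unique_span<cuda::device::id_t>(devices.size());
::std::transform(::std::begin(devices), ::std::end(devices), ::std::begin(device_ids),
	[](device_t const& device) { return device.id(); });
cuda::memory::detail_::set_access_permissions(device_ids.get(), pool.handle(), permissions);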
5 changes: 0 additions & 5 deletions src/cuda/api/types.hpp
@@ -895,11 +895,6 @@ using handle_t = CUfunction;

} // namespace kernel

- // The C++ standard library doesn't offer ::std::dynarray (although it almost did),
- // and we won't introduce our own here. So...
- template <typename T>
- using dynarray = ::std::vector<T>;

} // namespace cuda

#ifndef __CUDACC__
59 changes: 35 additions & 24 deletions src/cuda/rtc/compilation_output.hpp
@@ -311,23 +311,22 @@ class compilation_output_base_t {
return { buffer.data(), size };
}

- dynarray<char> log() const
+ unique_span<char> log() const
{
size_t size = program::detail_::get_log_size<source_kind>(program_handle_, program_name_.c_str());
- ::std::vector<char> result(size+1);
- if (size == 0) { return result; }
+ auto result = make_unique_span<char>(size+1); // Let's append a trailing nul character, to be on the safe side
+ if (size == 0) {
+ result[size] = '\0';
+ return result;
+ }
program::detail_::get_log<source_kind>(result.data(), program_handle_, program_name_.c_str());
- // Q: Isn't it kind of "cheating" to use an ::std::vector, then return it as a dynarray? What
- // if we get a proper dynarray which doesn't alias ::std::vector?
- // A: Well, kind of; it would mean we might have to copy. However - a proper dynarray might
- // allow us to construct it with an arbitrary buffer, or a larger dynarray etc. - and
- // then we could ensure the allocation happens only once.
+ result[size] = '\0';
return result;
}
///@}

#if CUDA_VERSION >= 11010
- virtual dynarray<char> cubin() const = 0;
+ virtual unique_span<char> cubin() const = 0;
virtual bool has_cubin() const = 0;
#endif

@@ -404,12 +403,16 @@ class compilation_output_t<cuda_cpp> : public compilation_output_base_t<cuda_cpp
return { buffer.data(), size };
}

- dynarray<char> ptx() const
+ unique_span<char> ptx() const
{
size_t size = program::detail_::get_ptx_size(program_handle_, program_name_.c_str());
- dynarray<char> result(size);
- if (size == 0) { return result; }
+ auto result = make_unique_span<char>(size+1); // Let's append a trailing nul character, to be on the safe side
+ if (size == 0) {
+ result[size] = '\0';
+ return result;
+ }
program::detail_::get_ptx(result.data(), program_handle_, program_name_.c_str());
+ result[size] = '\0';
return result;
}
///@}
@@ -450,10 +453,10 @@ class compilation_output_t<cuda_cpp> : public compilation_output_base_t<cuda_cpp
return { buffer.data(), size };
}

- dynarray<char> cubin() const override
+ unique_span<char> cubin() const override
{
size_t size = program::detail_::get_cubin_size<source_kind>(program_handle_, program_name_.c_str());
- dynarray<char> result(size);
+ auto result = make_unique_span<char>(size);
if (size == 0) { return result; }
program::detail_::get_cubin<source_kind>(result.data(), program_handle_, program_name_.c_str());
return result;
@@ -500,12 +503,16 @@ class compilation_output_t<cuda_cpp> : public compilation_output_base_t<cuda_cpp
return { buffer.data(), size };
}

- dynarray<char> lto_ir() const
+ unique_span<char> lto_ir() const
{
size_t size = program::detail_::get_lto_ir_size(program_handle_, program_name_.c_str());
- dynarray<char> result(size);
- if (size == 0) { return result; }
+ auto result = make_unique_span<char>(size+1); // Let's append a trailing nul character, to be on the safe side
+ if (size == 0) {
+ result[size] = '\0';
+ return result;
+ }
program::detail_::get_lto_ir(result.data(), program_handle_, program_name_.c_str());
+ result[size] = '\0';
return result;
}
/// @}
@@ -592,12 +599,16 @@ class compilation_output_t<ptx> : public compilation_output_base_t<ptx> {
}

public: // non-mutators
- dynarray<char> cubin() const override
+ unique_span<char> cubin() const override
{
size_t size = program::detail_::get_cubin_size<source_kind>(program_handle_, program_name_.c_str());
- dynarray<char> result(size);
- if (size == 0) { return result; }
+ auto result = make_unique_span<char>(size+1); // Let's append a trailing nul character, to be on the safe side
+ if (size == 0) {
+ result[size] = '\0';
+ return result;
+ }
program::detail_::get_cubin<source_kind>(result.data(), program_handle_, program_name_.c_str());
+ result[size] = '\0';
return result;
}
///@}
@@ -668,15 +679,15 @@ template<> inline module_t create<cuda_cpp>(
// Note: The above won't fail even if no CUBIN was produced
bool has_cubin = (cubin_size > 0);
if (has_cubin) {
- dynarray<char> cubin(cubin_size);
+ auto cubin = make_unique_span<char>(cubin_size);
rtc::program::detail_::get_cubin<cuda_cpp>(cubin.data(), program_handle, program_name);
- return module::create(context, cubin, options);
+ return module::create(context, cubin.get(), options);
}
// Note: At this point, we must have PTX in the output, as otherwise the compilation could
// not have succeeded
#endif
auto ptx = compiled_program.ptx();
- return module::create(context, ptx, options);
+ return module::create(context, ptx.get(), options);
}

#if CUDA_VERSION >= 11010
@@ -690,7 +701,7 @@ template<> inline module_t create<source_kind_t::ptx>(
+ cuda::rtc::program::detail_::identify<source_kind_t::ptx>(compiled_program.program_handle()));
}
auto cubin = compiled_program.cubin();
- return module::create(context, cubin, options);
+ return module::create(context, cubin.get(), options);
}
#endif // CUDA_VERSION >= 11010

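The point of the extra '\0' in log(), ptx() and lto_ir() is that the returned buffers can be handed to C-string consumers without an extra copy. A hedged usage sketch (assuming a compilation output obtained from compiling an RTC program; size() on the span includes the padding character):

// Sketch: using the NUL-padded outputs directly as C strings.
auto log = compilation_output.log();  // unique_span<char>, trailing '\0' appended
if (log.size() > 1) {                 // size 1 means an empty log (just the padding NUL)
	::std::fputs(log.data(), stderr); // safe: the buffer is NUL-terminated
}
auto ptx = compilation_output.ptx();  // likewise NUL-padded; ptx.data() is usable as a char* C string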
7 changes: 3 additions & 4 deletions src/cuda/rtc/program.hpp
@@ -604,7 +604,7 @@ inline program_t<Kind> create(const ::std::string& program_name)
} // namespace program

#if CUDA_VERSION >= 11020
- inline dynarray<device::compute_capability_t>
+ inline unique_span<device::compute_capability_t>
supported_targets()
{
int num_supported_archs;
@@ -613,9 +613,8 @@ supported_targets()
auto raw_archs = ::std::unique_ptr<int[]>(new int[num_supported_archs]);
status = nvrtcGetSupportedArchs(raw_archs.get());
throw_if_error<cuda_cpp>(status, "Failed obtaining the architectures supported by NVRTC");
- dynarray<device::compute_capability_t> result;
- result.reserve(num_supported_archs);
- ::std::transform(raw_archs.get(), raw_archs.get() + num_supported_archs, ::std::back_inserter(result),
+ auto result = make_unique_span<device::compute_capability_t>(num_supported_archs);
+ ::std::transform(raw_archs.get(), raw_archs.get() + num_supported_archs, ::std::begin(result),
[](int raw_arch) { return device::compute_capability_t::from_combined_number(raw_arch); });
return result;
}
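A short hedged usage sketch for the changed supported_targets() (CUDA 11.2 and later); device.compute_capability() and comparing compute_capability_t values with == are assumptions about the rest of the API, not part of this commit:

// Sketch: check whether NVRTC can target the current device.
auto targets = cuda::rtc::supported_targets(); // unique_span<device::compute_capability_t>
auto device = cuda::device::get(0);
bool can_target = ::std::any_of(::std::begin(targets), ::std::end(targets),
	[&](cuda::device::compute_capability_t cc) { return cc == device.compute_capability(); });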
