Add OpenCL prefetch support and make stdpar prefetch bypass all SYCL layers

illuhad committed Dec 6, 2023
1 parent 5b3b69c commit 5f22d60
Showing 8 changed files with 110 additions and 8 deletions.
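
For context, this commit makes USM prefetch hints effective on the OpenCL backend and lets the stdpar layer issue them without going through the SYCL command-group machinery. Below is a minimal user-level sketch of the standard SYCL 2020 prefetch API that the OpenCL backend now honors; the header path and the availability of USM/SVM on the target device are assumptions about the build environment, not part of this commit.

#include <sycl/sycl.hpp>
#include <cstddef>

int main() {
  sycl::queue q;
  constexpr std::size_t n = 1 << 20;
  int *data = sycl::malloc_shared<int>(n, q);

  // Hint that the allocation should be migrated towards the device before
  // use. With this commit, the OpenCL backend forwards such prefetches to
  // clEnqueueSVMMigrateMem or the Intel USM extension instead of ignoring them.
  q.prefetch(data, n * sizeof(int));

  q.parallel_for(sycl::range<1>{n},
                 [=](sycl::id<1> i) { data[i] = static_cast<int>(i[0]); });
  q.wait();

  sycl::free(data, q);
  return 0;
}
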
6 changes: 6 additions & 0 deletions include/hipSYCL/runtime/ocl/ocl_usm.hpp
@@ -65,6 +65,12 @@ class ocl_usm {
const std::vector<cl::Event> &wait_events,
cl::Event *out) = 0;

virtual cl_int enqueue_prefetch(cl::CommandQueue &queue, const void *ptr,
std::size_t bytes,
cl_mem_migration_flags flags,
const std::vector<cl::Event> &wait_events,
cl::Event *out) = 0;

virtual cl_int enable_indirect_usm_access(cl::Kernel&) = 0;

static std::unique_ptr<ocl_usm> from_intel_extension(ocl_hardware_manager* hw_mgr, int device_index);
19 changes: 18 additions & 1 deletion include/hipSYCL/std/stdpar/detail/offload.hpp
@@ -28,6 +28,7 @@
#ifndef HIPSYCL_PSTL_OFFLOAD_HPP
#define HIPSYCL_PSTL_OFFLOAD_HPP

#include "hipSYCL/runtime/operations.hpp"
#include "hipSYCL/std/stdpar/detail/execution_fwd.hpp"
#include "hipSYCL/std/stdpar/detail/stdpar_builtins.hpp"
#include "hipSYCL/std/stdpar/detail/sycl_glue.hpp"
@@ -296,6 +297,22 @@ inline constexpr prefetch_mode get_prefetch_mode() noexcept {
return mode;
}

inline void prefetch(sycl::queue& q, const void* ptr, std::size_t bytes) noexcept {
auto* inorder_executor = q.hipSYCL_inorder_executor();
if(inorder_executor) {
// Attempt to invoke backend functionality directly -
// in general we might have to issue multiple prefetches for
// each kernel, so overheads can quickly add up.
HIPSYCL_DEBUG_INFO << "[stdpar] Submitting raw prefetch to backend: "
<< bytes << " bytes @" << ptr << std::endl;
rt::inorder_queue* ordered_q = inorder_executor->get_queue();
rt::prefetch_operation op{ptr, bytes, ordered_q->get_device()};
ordered_q->submit_prefetch(op, nullptr);
} else {
q.prefetch(ptr, bytes);
}
}

template<class AlgorithmType, class Size, typename... Args>
void prepare_offloading(AlgorithmType type, Size problem_size, const Args&... args) {
auto& q = detail::single_device_dispatch::get_queue();
@@ -337,7 +354,7 @@ void prepare_offloading(AlgorithmType type, Size problem_size, const Args&... args) {
should_prefetch = most_recent_offload_batch < current_batch_id;

if (should_prefetch) {
-      q.prefetch(lookup_result.root_address, prefetch_size);
+      prefetch(q, lookup_result.root_address, prefetch_size);
__atomic_store_n(most_recent_offload_batch_ptr, current_batch_id,
__ATOMIC_RELEASE);
}
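
With the offload.hpp change above, a stdpar prefetch no longer constructs a SYCL command group when the queue owns a dedicated in-order executor. A hedged usage sketch of the path that exercises it follows; the compiler flag is an assumption and may differ between versions, everything else mirrors the prepare_offloading() logic shown above.

#include <algorithm>
#include <execution>
#include <vector>

// Build with hipSYCL/AdaptiveCpp stdpar offloading enabled (flag assumed,
// e.g. --hipsycl-stdpar; consult the project documentation for the exact
// spelling on your version).
int main() {
  std::vector<float> data(1 << 24, 1.0f);

  // Before the offloaded kernel is launched, prepare_offloading() looks up
  // the allocation backing `data`. If it has not been touched in the current
  // offload batch, the new prefetch() helper submits an rt::prefetch_operation
  // straight to the backend's in-order queue, bypassing sycl::queue::prefetch
  // and its scheduling layers.
  std::for_each(std::execution::par_unseq, data.begin(), data.end(),
                [](float &x) { x += 1.0f; });
  return 0;
}
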
13 changes: 10 additions & 3 deletions include/hipSYCL/sycl/queue.hpp
@@ -921,6 +921,12 @@ class queue : public detail::property_carrying_object
std::size_t hipSYCL_hash_code() const {
return _node_group_id;
}

rt::inorder_executor* hipSYCL_inorder_executor() const {
if(!_dedicated_inorder_executor)
return nullptr;
return static_cast<rt::inorder_executor*>(_dedicated_inorder_executor.get());
}
private:
template<int Dim>
void apply_preferred_group_size(const property_list& prop_list, handler& cgh) {
@@ -1017,14 +1023,14 @@ class queue : public detail::property_carrying_object
rt::device_id rt_dev = detail::extract_rt_device(this->get_device());
// Dedicated executor may not be supported by all backends,
// so this might return nullptr.
-  std::shared_ptr<rt::backend_executor> dedicated_executor =
+  _dedicated_inorder_executor =
_requires_runtime.get()
->backends()
.get(rt_dev.get_backend())
->create_inorder_executor(rt_dev, priority);

-  if(dedicated_executor) {
-    _default_hints->set_hint(rt::hints::prefer_executor{dedicated_executor});
+  if(_dedicated_inorder_executor) {
+    _default_hints->set_hint(rt::hints::prefer_executor{_dedicated_inorder_executor});
}
}

@@ -1051,6 +1057,7 @@
std::shared_ptr<rt::dag_node_ptr> _previous_submission;
std::shared_ptr<std::mutex> _lock;
std::size_t _node_group_id;
std::shared_ptr<rt::backend_executor> _dedicated_inorder_executor;
};

HIPSYCL_SPECIALIZE_GET_INFO(queue, context)
7 changes: 6 additions & 1 deletion src/runtime/cuda/cuda_queue.cpp
@@ -89,7 +89,9 @@ class cuda_instrumentation_guard {
operation &op, dag_node_ptr node)
: _queue{q}, _operation{&op}, _node{node} {
assert(q);
-  assert(_node);
+  if(!_node)
+    return;

if (_node->get_execution_hints()
.has_hint<
@@ -113,6 +115,9 @@
}

~cuda_instrumentation_guard() {
if(!_node)
return;

if (_node->get_execution_hints()
.has_hint<rt::hints::request_instrumentation_finish_timestamp>()) {
std::shared_ptr<dag_node_event> task_finish = _queue->insert_event();
7 changes: 6 additions & 1 deletion src/runtime/hip/hip_queue.cpp
@@ -79,7 +79,9 @@ class hip_instrumentation_guard {
operation &op, dag_node_ptr node)
: _queue{q}, _operation{&op}, _node{node} {
assert(q);
-  assert(_node);
+  if(!_node)
+    return;

if (_node->get_execution_hints()
.has_hint<
@@ -103,6 +105,9 @@
}

~hip_instrumentation_guard() {
if(!_node)
return;

if (_node->get_execution_hints()
.has_hint<rt::hints::request_instrumentation_finish_timestamp>()) {
std::shared_ptr<dag_node_event> task_finish = _queue->insert_event();
25 changes: 23 additions & 2 deletions src/runtime/ocl/ocl_queue.cpp
@@ -263,8 +263,29 @@ result ocl_queue::submit_kernel(kernel_operation &op, dag_node_ptr node) {
return make_success();
}

-result ocl_queue::submit_prefetch(prefetch_operation &, dag_node_ptr) {
-  // TODO, prefetch is just a hint
+result ocl_queue::submit_prefetch(prefetch_operation &op, dag_node_ptr) {
ocl_hardware_context *ocl_ctx = static_cast<ocl_hardware_context *>(
_hw_manager->get_device(_device_index));
ocl_usm* usm = ocl_ctx->get_usm_provider();

cl::Event evt;
cl_int err = 0;
if(op.get_target().is_host()) {
err = usm->enqueue_prefetch(_queue, op.get_pointer(), op.get_num_bytes(),
CL_MIGRATE_MEM_OBJECT_HOST, {}, &evt);
} else {
err = usm->enqueue_prefetch(_queue, op.get_pointer(), op.get_num_bytes(),
0, {}, &evt);
}

if(err != CL_SUCCESS) {
return make_error(
__hipsycl_here(),
error_info{"ocl_queue: enqueuing prefetch failed",
error_code{"CL", static_cast<int>(err)}});
}

register_submitted_op(evt);
return make_success();
}

38 changes: 38 additions & 0 deletions src/runtime/ocl/ocl_usm.cpp
@@ -251,6 +251,23 @@ class ocl_usm_intel_extension : public ocl_usm {
return err;
}

cl_int enqueue_prefetch(cl::CommandQueue &queue, const void *ptr,
std::size_t bytes,
cl_mem_migration_flags flags,
const std::vector<cl::Event> &wait_events,
cl::Event *event) override {
cl_event tmp;
cl_int err = _migrate_mem(
queue.get(), ptr, bytes, flags, wait_events.size(),
(wait_events.size() > 0) ? (cl_event *)&wait_events.front() : nullptr,
(event != nullptr) ? &tmp : nullptr);

if(event != nullptr && err == CL_SUCCESS) {
*event = tmp;
}
return err;
}

cl_int enable_indirect_usm_access(cl::Kernel& k) override {
auto maybe_ignore = [this](cl_int error_code) {
// Intel CPU OpenCL seems to not understand these flags. We can just ignore USM errors
@@ -427,6 +444,27 @@ class ocl_usm_svm : public ocl_usm {
return queue.enqueueMemFillSVM(ptr, pattern_byte, bytes, &wait_events, out);
}


cl_int enqueue_prefetch(cl::CommandQueue &queue, const void *ptr,
std::size_t bytes,
cl_mem_migration_flags flags,
const std::vector<cl::Event> &wait_events,
cl::Event *event) override {
// Seems there is a bug in CommandQueue::enqueueMigrateSVM, so we directly
// call the OpenCL function
cl_event tmp;
cl_int err = ::clEnqueueSVMMigrateMem(
queue.get(), 1, &ptr, &bytes, flags, wait_events.size(),
(wait_events.size() > 0) ? (cl_event *)&wait_events.front() : nullptr,
(event != nullptr) ? &tmp : nullptr);

if(event != nullptr && err == CL_SUCCESS) {
*event = tmp;
}
return err;
}


cl_int enable_indirect_usm_access(cl::Kernel& k) override {
return k.setExecInfo(CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, cl_bool{true});
}
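
In the Intel extension path, _migrate_mem is presumably the clEnqueueMigrateMemINTEL entry point from cl_intel_unified_shared_memory; only its call site appears in this diff. A hedged sketch of how such a pointer is typically resolved per platform; the typedef and helper below are assumptions, not code from this commit:

#include <CL/cl.h>

// Assumed signature matching the call in enqueue_prefetch() above.
typedef cl_int (*clEnqueueMigrateMemINTEL_fn)(
    cl_command_queue queue, const void *ptr, size_t size,
    cl_mem_migration_flags flags, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *event);

// Hypothetical helper: resolve the extension entry point per platform.
inline clEnqueueMigrateMemINTEL_fn load_migrate_mem(cl_platform_id platform) {
  return reinterpret_cast<clEnqueueMigrateMemINTEL_fn>(
      clGetExtensionFunctionAddressForPlatform(platform,
                                               "clEnqueueMigrateMemINTEL"));
}
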
3 changes: 3 additions & 0 deletions src/runtime/omp/omp_queue.cpp
@@ -129,6 +129,9 @@ class instrumentation_task_guard {
class omp_instrumentation_setup {
public:
omp_instrumentation_setup(operation &op, dag_node_ptr node) {
if(!node)
return;

if (node->get_execution_hints()
.has_hint<
rt::hints::request_instrumentation_submission_timestamp>()) {
