Add OpenCL prefetch support and make stdpar prefetch bypass all SYCL layers

illuhad committed Dec 6, 2023
1 parent 5b3b69c commit 5f22d60
Showing 8 changed files with 110 additions and 8 deletions.
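
For context, this commit makes USM prefetch hints effective on the OpenCL backend and lets the stdpar layer issue them without going through the SYCL command-group machinery. Below is a minimal user-level sketch of the standard SYCL 2020 prefetch API that the OpenCL backend now honors; the header path and the availability of USM/SVM on the target device are assumptions about the build environment, not part of this commit.

#include <sycl/sycl.hpp>
#include <cstddef>

int main() {
  sycl::queue q;
  constexpr std::size_t n = 1 << 20;
  int *data = sycl::malloc_shared<int>(n, q);

  // Hint that the allocation should be migrated towards the device before
  // use. With this commit, the OpenCL backend forwards such prefetches to
  // clEnqueueSVMMigrateMem or the Intel USM extension instead of ignoring them.
  q.prefetch(data, n * sizeof(int));

  q.parallel_for(sycl::range<1>{n},
                 [=](sycl::id<1> i) { data[i] = static_cast<int>(i[0]); });
  q.wait();

  sycl::free(data, q);
  return 0;
}
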
6 changes: 6 additions & 0 deletions include/hipSYCL/runtime/ocl/ocl_usm.hpp
@@ -65,6 +65,12 @@ class ocl_usm {
const std::vector<cl::Event> &wait_events,
cl::Event *out) = 0;

virtual cl_int enqueue_prefetch(cl::CommandQueue &queue, const void *ptr,
std::size_t bytes,
cl_mem_migration_flags flags,
const std::vector<cl::Event> &wait_events,
cl::Event *out) = 0;

virtual cl_int enable_indirect_usm_access(cl::Kernel&) = 0;

static std::unique_ptr<ocl_usm> from_intel_extension(ocl_hardware_manager* hw_mgr, int device_index);
19 changes: 18 additions & 1 deletion include/hipSYCL/std/stdpar/detail/offload.hpp
@@ -28,6 +28,7 @@
#ifndef HIPSYCL_PSTL_OFFLOAD_HPP
#define HIPSYCL_PSTL_OFFLOAD_HPP

#include "hipSYCL/runtime/operations.hpp"
#include "hipSYCL/std/stdpar/detail/execution_fwd.hpp"
#include "hipSYCL/std/stdpar/detail/stdpar_builtins.hpp"
#include "hipSYCL/std/stdpar/detail/sycl_glue.hpp"
@@ -296,6 +297,22 @@ inline constexpr prefetch_mode get_prefetch_mode() noexcept {
return mode;
}

inline void prefetch(sycl::queue& q, const void* ptr, std::size_t bytes) noexcept {
auto* inorder_executor = q.hipSYCL_inorder_executor();
if(inorder_executor) {
// Attempt to invoke backend functionality directly -
// in general we might have to issue multiple prefetches for
// each kernel, so overheads can quickly add up.
HIPSYCL_DEBUG_INFO << "[stdpar] Submitting raw prefetch to backend: "
<< bytes << " bytes @" << ptr << std::endl;
rt::inorder_queue* ordered_q = inorder_executor->get_queue();
rt::prefetch_operation op{ptr, bytes, ordered_q->get_device()};
ordered_q->submit_prefetch(op, nullptr);
} else {
q.prefetch(ptr, bytes);
}
}

template<class AlgorithmType, class Size, typename... Args>
void prepare_offloading(AlgorithmType type, Size problem_size, const Args&... args) {
auto& q = detail::single_device_dispatch::get_queue();
@@ -337,7 +354,7 @@ void prepare_offloading(AlgorithmType type, Size problem_size, const Args&... args) {
should_prefetch = most_recent_offload_batch < current_batch_id;

if (should_prefetch) {
-      q.prefetch(lookup_result.root_address, prefetch_size);
+      prefetch(q, lookup_result.root_address, prefetch_size);
__atomic_store_n(most_recent_offload_batch_ptr, current_batch_id,
__ATOMIC_RELEASE);
}
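
With the offload.hpp change above, a stdpar prefetch no longer constructs a SYCL command group when the queue owns a dedicated in-order executor. A hedged usage sketch of the path that exercises it follows; the compiler flag is an assumption and may differ between versions, everything else mirrors the prepare_offloading() logic shown above.

#include <algorithm>
#include <execution>
#include <vector>

// Build with hipSYCL/AdaptiveCpp stdpar offloading enabled (flag assumed,
// e.g. --hipsycl-stdpar; consult the project documentation for the exact
// spelling on your version).
int main() {
  std::vector<float> data(1 << 24, 1.0f);

  // Before the offloaded kernel is launched, prepare_offloading() looks up
  // the allocation backing `data`. If it has not been touched in the current
  // offload batch, the new prefetch() helper submits an rt::prefetch_operation
  // straight to the backend's in-order queue, bypassing sycl::queue::prefetch
  // and its scheduling layers.
  std::for_each(std::execution::par_unseq, data.begin(), data.end(),
                [](float &x) { x += 1.0f; });
  return 0;
}
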
13 changes: 10 additions & 3 deletions include/hipSYCL/sycl/queue.hpp
@@ -921,6 +921,12 @@ class queue : public detail::property_carrying_object
std::size_t hipSYCL_hash_code() const {
return _node_group_id;
}

rt::inorder_executor* hipSYCL_inorder_executor() const {
if(!_dedicated_inorder_executor)
return nullptr;
return static_cast<rt::inorder_executor*>(_dedicated_inorder_executor.get());
}
private:
template<int Dim>
void apply_preferred_group_size(const property_list& prop_list, handler& cgh) {
@@ -1017,14 +1023,14 @@ class queue : public detail::property_carrying_object
rt::device_id rt_dev = detail::extract_rt_device(this->get_device());
// Dedicated executor may not be supported by all backends,
// so this might return nullptr.
-  std::shared_ptr<rt::backend_executor> dedicated_executor =
+  _dedicated_inorder_executor =
_requires_runtime.get()
->backends()
.get(rt_dev.get_backend())
->create_inorder_executor(rt_dev, priority);

-  if(dedicated_executor) {
-    _default_hints->set_hint(rt::hints::prefer_executor{dedicated_executor});
+  if(_dedicated_inorder_executor) {
+    _default_hints->set_hint(rt::hints::prefer_executor{_dedicated_inorder_executor});
}
}

@@ -1051,6 +1057,7 @@
std::shared_ptr<rt::dag_node_ptr> _previous_submission;
std::shared_ptr<std::mutex> _lock;
std::size_t _node_group_id;
std::shared_ptr<rt::backend_executor> _dedicated_inorder_executor;
};

HIPSYCL_SPECIALIZE_GET_INFO(queue, context)
7 changes: 6 additions & 1 deletion src/runtime/cuda/cuda_queue.cpp
@@ -89,7 +89,9 @@ class cuda_instrumentation_guard {
operation &op, dag_node_ptr node)
: _queue{q}, _operation{&op}, _node{node} {
assert(q);
-  assert(_node);
+  if(!_node)
+    return;

if (_node->get_execution_hints()
.has_hint<
@@ -113,6 +115,9 @@
}

~cuda_instrumentation_guard() {
if(!_node)
return;

if (_node->get_execution_hints()
.has_hint<rt::hints::request_instrumentation_finish_timestamp>()) {
std::shared_ptr<dag_node_event> task_finish = _queue->insert_event();
7 changes: 6 additions & 1 deletion src/runtime/hip/hip_queue.cpp
@@ -79,7 +79,9 @@ class hip_instrumentation_guard {
operation &op, dag_node_ptr node)
: _queue{q}, _operation{&op}, _node{node} {
assert(q);
-  assert(_node);
+  if(!_node)
+    return;

if (_node->get_execution_hints()
.has_hint<
@@ -103,6 +105,9 @@
}

~hip_instrumentation_guard() {
if(!_node)
return;

if (_node->get_execution_hints()
.has_hint<rt::hints::request_instrumentation_finish_timestamp>()) {
std::shared_ptr<dag_node_event> task_finish = _queue->insert_event();
25 changes: 23 additions & 2 deletions src/runtime/ocl/ocl_queue.cpp
@@ -263,8 +263,29 @@ result ocl_queue::submit_kernel(kernel_operation &op, dag_node_ptr node) {
return make_success();
}

-result ocl_queue::submit_prefetch(prefetch_operation &, dag_node_ptr) {
-  // TODO, prefetch is just a hint
+result ocl_queue::submit_prefetch(prefetch_operation &op, dag_node_ptr) {
ocl_hardware_context *ocl_ctx = static_cast<ocl_hardware_context *>(
_hw_manager->get_device(_device_index));
ocl_usm* usm = ocl_ctx->get_usm_provider();

cl::Event evt;
cl_int err = 0;
if(op.get_target().is_host()) {
err = usm->enqueue_prefetch(_queue, op.get_pointer(), op.get_num_bytes(),
CL_MIGRATE_MEM_OBJECT_HOST, {}, &evt);
} else {
err = usm->enqueue_prefetch(_queue, op.get_pointer(), op.get_num_bytes(),
0, {}, &evt);
}

if(err != CL_SUCCESS) {
return make_error(
__hipsycl_here(),
error_info{"ocl_queue: enqueuing prefetch failed",
error_code{"CL", static_cast<int>(err)}});
}

register_submitted_op(evt);
return make_success();
}

38 changes: 38 additions & 0 deletions src/runtime/ocl/ocl_usm.cpp
@@ -251,6 +251,23 @@ class ocl_usm_intel_extension : public ocl_usm {
return err;
}

cl_int enqueue_prefetch(cl::CommandQueue &queue, const void *ptr,
std::size_t bytes,
cl_mem_migration_flags flags,
const std::vector<cl::Event> &wait_events,
cl::Event *event) override {
cl_event tmp;
cl_int err = _migrate_mem(
queue.get(), ptr, bytes, flags, wait_events.size(),
(wait_events.size() > 0) ? (cl_event *)&wait_events.front() : nullptr,
(event != nullptr) ? &tmp : nullptr);

if(event != nullptr && err == CL_SUCCESS) {
*event = tmp;
}
return err;
}

cl_int enable_indirect_usm_access(cl::Kernel& k) override {
auto maybe_ignore = [this](cl_int error_code) {
// Intel CPU OpenCL seems to not understand these flags. We can just ignore USM errors
@@ -427,6 +444,27 @@ class ocl_usm_svm : public ocl_usm {
return queue.enqueueMemFillSVM(ptr, pattern_byte, bytes, &wait_events, out);
}


cl_int enqueue_prefetch(cl::CommandQueue &queue, const void *ptr,
std::size_t bytes,
cl_mem_migration_flags flags,
const std::vector<cl::Event> &wait_events,
cl::Event *event) override {
// Seems there is a bug in CommandQueue::enqueueMigrateSVM, so we directly
// call the OpenCL function
cl_event tmp;
cl_int err = ::clEnqueueSVMMigrateMem(
queue.get(), 1, &ptr, &bytes, flags, wait_events.size(),
(wait_events.size() > 0) ? (cl_event *)&wait_events.front() : nullptr,
(event != nullptr) ? &tmp : nullptr);

if(event != nullptr && err == CL_SUCCESS) {
*event = tmp;
}
return err;
}


cl_int enable_indirect_usm_access(cl::Kernel& k) override {
return k.setExecInfo(CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, cl_bool{true});
}
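
In the Intel extension path, _migrate_mem is presumably the clEnqueueMigrateMemINTEL entry point from cl_intel_unified_shared_memory; only its call site appears in this diff. A hedged sketch of how such a pointer is typically resolved per platform; the typedef and helper below are assumptions, not code from this commit:

#include <CL/cl.h>

// Assumed signature matching the call in enqueue_prefetch() above.
typedef cl_int (*clEnqueueMigrateMemINTEL_fn)(
    cl_command_queue queue, const void *ptr, size_t size,
    cl_mem_migration_flags flags, cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list, cl_event *event);

// Hypothetical helper: resolve the extension entry point per platform.
inline clEnqueueMigrateMemINTEL_fn load_migrate_mem(cl_platform_id platform) {
  return reinterpret_cast<clEnqueueMigrateMemINTEL_fn>(
      clGetExtensionFunctionAddressForPlatform(platform,
                                               "clEnqueueMigrateMemINTEL"));
}
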
3 changes: 3 additions & 0 deletions src/runtime/omp/omp_queue.cpp
@@ -129,6 +129,9 @@ class instrumentation_task_guard {
class omp_instrumentation_setup {
public:
omp_instrumentation_setup(operation &op, dag_node_ptr node) {
if(!node)
return;

if (node->get_execution_hints()
.has_hint<
rt::hints::request_instrumentation_submission_timestamp>()) {
