From 097fe4ab00feb29e17d53db33993ad2533c65d8f Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 19 Aug 2021 11:17:47 +0200
Subject: [PATCH] Fix uploading the EventSetup conditions to multiple CUDA
 devices

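Associate the events used to track whether the conditions have been
transferred to each device with the correct CUDA device: the ESProduct
constructor now selects each device in turn before taking that device's
event from the event cache.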
---
 HeterogeneousCore/CUDACore/interface/ESProduct.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/HeterogeneousCore/CUDACore/interface/ESProduct.h b/HeterogeneousCore/CUDACore/interface/ESProduct.h
index 676d3e9d1c0d9..8740095292380 100644
--- a/HeterogeneousCore/CUDACore/interface/ESProduct.h
+++ b/HeterogeneousCore/CUDACore/interface/ESProduct.h
@@ -9,6 +9,7 @@
 #include "FWCore/Utilities/interface/thread_safety_macros.h"
 #include "HeterogeneousCore/CUDAServices/interface/numberOfDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
@@ -19,10 +20,13 @@ namespace cms {
     class ESProduct {
     public:
       ESProduct() : gpuDataPerDevice_(numberOfDevices()) {
+        cms::cuda::ScopedSetDevice scopedDevice;
         for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
+          scopedDevice.set(i);
           gpuDataPerDevice_[i].m_event = getEventCache().get();
         }
       }
+
       ~ESProduct() = default;
 
       // transferAsync should be a function of (T&, cudaStream_t)
@@ -30,12 +34,10 @@ namespace cms {
       // to the CUDA stream
       template <typename F>
       const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const {
-        auto device = currentDevice();
-
+        int device = currentDevice();
         auto& data = gpuDataPerDevice_[device];
 
-        // If GPU data has already been filled, we can return it
-        // immediately
+        // If the GPU data has already been filled, we can return it immediately
         if (not data.m_filled.load()) {
           // It wasn't, so need to fill it
           std::scoped_lock<std::mutex> lk{data.m_mutex};
@@ -103,4 +105,4 @@ namespace cms {
   }  // namespace cuda
 }  // namespace cms
 
-#endif
+#endif  // HeterogeneousCore_CUDACore_ESProduct_h