cms-sw · cmsbuild · Aug 20, 2021 · Aug 19, 2021
diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -16,8 +16,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
-#include "HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h"
-#include "HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
 
 void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
   // read the current device
@@ -300,8 +299,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config) {
 
   // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
   if constexpr (cms::cuda::allocator::useCaching) {
-    cms::cuda::allocator::getCachingDeviceAllocator();
-    cms::cuda::allocator::getCachingHostAllocator();
+    cms::cuda::allocator::cachingAllocatorsConstruct();
   }
   cms::cuda::getEventCache().clear();
   cms::cuda::getStreamCache().clear();
@@ -319,8 +317,7 @@ CUDAService::~CUDAService() {
   if (enabled_) {
     // Explicitly destruct the allocator before the device resets below
     if constexpr (cms::cuda::allocator::useCaching) {
-      cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
-      cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
+      cms::cuda::allocator::cachingAllocatorsFreeCached();
     }
     cms::cuda::getEventCache().clear();
     cms::cuda::getStreamCache().clear();

diff --git a/HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h b/HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h
@@ -0,0 +1,13 @@
+#ifndef HeterogeneousCore_CUDAUtilities_interface_cachingAllocators_h
+#define HeterogeneousCore_CUDAUtilities_interface_cachingAllocators_h
+
+namespace cms::cuda::allocator {
+  // Use caching or not
+  constexpr bool useCaching = true;
+
+  // these intended to be called only from CUDAService
+  void cachingAllocatorsConstruct();
+  void cachingAllocatorsFreeCached();
+}  // namespace cms::cuda::allocator
+
+#endif
diff --git a/HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h b/HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h
@@ -0,0 +1,42 @@
+#ifndef HeterogeneousCore_CUDACore_src_cachingAllocatorCommon
+#define HeterogeneousCore_CUDACore_src_cachingAllocatorCommon
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h"
+
+#include <algorithm>
+#include <limits>
+
+namespace cms::cuda::allocator {
+  // Growth factor (bin_growth in cub::CachingDeviceAllocator
+  constexpr unsigned int binGrowth = 2;
+  // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
+  constexpr unsigned int minBin = 8;
+  // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
+  constexpr unsigned int maxBin = 30;
+  // Total storage for the allocator. 0 means no limit.
+  constexpr size_t maxCachedBytes = 0;
+  // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
+  constexpr double maxCachedFraction = 0.8;
+  constexpr bool debug = false;
+
+  inline size_t minCachedBytes() {
+    size_t ret = std::numeric_limits<size_t>::max();
+    int currentDevice;
+    cudaCheck(cudaGetDevice(&currentDevice));
+    const int numberOfDevices = deviceCount();
+    for (int i = 0; i < numberOfDevices; ++i) {
+      size_t freeMemory, totalMemory;
+      cudaCheck(cudaSetDevice(i));
+      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
+      ret = std::min(ret, static_cast<size_t>(maxCachedFraction * freeMemory));
+    }
+    cudaCheck(cudaSetDevice(currentDevice));
+    if (maxCachedBytes > 0) {
+      ret = std::min(ret, maxCachedBytes);
+    }
+    return ret;
+  }
+}  // namespace cms::cuda::allocator
+
+#endif
diff --git a/HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc b/HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc
@@ -0,0 +1,16 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
+
+#include "getCachingDeviceAllocator.h"
+#include "getCachingHostAllocator.h"
+
+namespace cms::cuda::allocator {
+  void cachingAllocatorsConstruct() {
+    cms::cuda::allocator::getCachingDeviceAllocator();
+    cms::cuda::allocator::getCachingHostAllocator();
+  }
+
+  void cachingAllocatorsFreeCached() {
+    cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
+    cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
+  }
+}  // namespace cms::cuda::allocator
diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
@@ -4,44 +4,14 @@
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "FWCore/Utilities/interface/thread_safety_macros.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
+
 #include "CachingDeviceAllocator.h"
+#include "cachingAllocatorCommon.h"
 
 #include <iomanip>
 
 namespace cms::cuda::allocator {
-  // Use caching or not
-  constexpr bool useCaching = true;
-  // Growth factor (bin_growth in cub::CachingDeviceAllocator
-  constexpr unsigned int binGrowth = 2;
-  // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
-  constexpr unsigned int minBin = 8;
-  // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
-  constexpr unsigned int maxBin = 30;
-  // Total storage for the allocator. 0 means no limit.
-  constexpr size_t maxCachedBytes = 0;
-  // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
-  constexpr double maxCachedFraction = 0.8;
-  constexpr bool debug = false;
-
-  inline size_t minCachedBytes() {
-    size_t ret = std::numeric_limits<size_t>::max();
-    int currentDevice;
-    cudaCheck(cudaGetDevice(&currentDevice));
-    const int numberOfDevices = deviceCount();
-    for (int i = 0; i < numberOfDevices; ++i) {
-      size_t freeMemory, totalMemory;
-      cudaCheck(cudaSetDevice(i));
-      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
-      ret = std::min(ret, static_cast<size_t>(maxCachedFraction * freeMemory));
-    }
-    cudaCheck(cudaSetDevice(currentDevice));
-    if (maxCachedBytes > 0) {
-      ret = std::min(ret, maxCachedBytes);
-    }
-    return ret;
-  }
-
   inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() {
     LogDebug("CachingDeviceAllocator").log([](auto& log) {
       log << "cub::CachingDeviceAllocator settings\n"

diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h
@@ -4,9 +4,11 @@
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "FWCore/Utilities/interface/thread_safety_macros.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "CachingHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
 
-#include "getCachingDeviceAllocator.h"
+#include "CachingDeviceAllocator.h"
+#include "CachingHostAllocator.h"
+#include "cachingAllocatorCommon.h"
 
 #include <iomanip>