Skip to content

Commit

Permalink
Add public interface for constructing and freeing caching allocators
Browse files Browse the repository at this point in the history
  • Loading branch information
makortel committed Aug 19, 2021
1 parent 43765e2 commit e5ed64c
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 41 deletions.
9 changes: 3 additions & 6 deletions HeterogeneousCore/CUDAServices/src/CUDAService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h"
#include "HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"

void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
// read the current device
Expand Down Expand Up @@ -300,8 +299,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config) {

// Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
if constexpr (cms::cuda::allocator::useCaching) {
cms::cuda::allocator::getCachingDeviceAllocator();
cms::cuda::allocator::getCachingHostAllocator();
cms::cuda::allocator::cachingAllocatorsConstruct();
}
cms::cuda::getEventCache().clear();
cms::cuda::getStreamCache().clear();
Expand All @@ -319,8 +317,7 @@ CUDAService::~CUDAService() {
if (enabled_) {
// Explicitly destruct the allocator before the device resets below
if constexpr (cms::cuda::allocator::useCaching) {
cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
cms::cuda::allocator::cachingAllocatorsFreeCached();
}
cms::cuda::getEventCache().clear();
cms::cuda::getStreamCache().clear();
Expand Down
13 changes: 13 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_cachingAllocators_h
#define HeterogeneousCore_CUDAUtilities_interface_cachingAllocators_h

namespace cms::cuda::allocator {
// Use caching or not
constexpr bool useCaching = true;

// these intended to be called only from CUDAService
void cachingAllocatorsConstruct();
void cachingAllocatorsFreeCached();
} // namespace cms::cuda::allocator

#endif
42 changes: 42 additions & 0 deletions HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#ifndef HeterogeneousCore_CUDACore_src_cachingAllocatorCommon
#define HeterogeneousCore_CUDACore_src_cachingAllocatorCommon

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h"

#include <algorithm>
#include <limits>

namespace cms::cuda::allocator {
// Growth factor (bin_growth in cub::CachingDeviceAllocator
constexpr unsigned int binGrowth = 2;
// Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
constexpr unsigned int minBin = 8;
// Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
constexpr unsigned int maxBin = 30;
// Total storage for the allocator. 0 means no limit.
constexpr size_t maxCachedBytes = 0;
// Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
constexpr double maxCachedFraction = 0.8;
constexpr bool debug = false;

inline size_t minCachedBytes() {
size_t ret = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
const int numberOfDevices = deviceCount();
for (int i = 0; i < numberOfDevices; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
ret = std::min(ret, static_cast<size_t>(maxCachedFraction * freeMemory));
}
cudaCheck(cudaSetDevice(currentDevice));
if (maxCachedBytes > 0) {
ret = std::min(ret, maxCachedBytes);
}
return ret;
}
} // namespace cms::cuda::allocator

#endif
16 changes: 16 additions & 0 deletions HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"

#include "getCachingDeviceAllocator.h"
#include "getCachingHostAllocator.h"

namespace cms::cuda::allocator {
void cachingAllocatorsConstruct() {
cms::cuda::allocator::getCachingDeviceAllocator();
cms::cuda::allocator::getCachingHostAllocator();
}

void cachingAllocatorsFreeCached() {
cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
}
} // namespace cms::cuda::allocator
36 changes: 3 additions & 33 deletions HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,44 +4,14 @@
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/Utilities/interface/thread_safety_macros.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"

#include "CachingDeviceAllocator.h"
#include "cachingAllocatorCommon.h"

#include <iomanip>

namespace cms::cuda::allocator {
// Use caching or not
constexpr bool useCaching = true;
// Growth factor (bin_growth in cub::CachingDeviceAllocator
constexpr unsigned int binGrowth = 2;
// Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
constexpr unsigned int minBin = 8;
// Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
constexpr unsigned int maxBin = 30;
// Total storage for the allocator. 0 means no limit.
constexpr size_t maxCachedBytes = 0;
// Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
constexpr double maxCachedFraction = 0.8;
constexpr bool debug = false;

inline size_t minCachedBytes() {
size_t ret = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
const int numberOfDevices = deviceCount();
for (int i = 0; i < numberOfDevices; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
ret = std::min(ret, static_cast<size_t>(maxCachedFraction * freeMemory));
}
cudaCheck(cudaSetDevice(currentDevice));
if (maxCachedBytes > 0) {
ret = std::min(ret, maxCachedBytes);
}
return ret;
}

inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() {
LogDebug("CachingDeviceAllocator").log([](auto& log) {
log << "cub::CachingDeviceAllocator settings\n"
Expand Down
6 changes: 4 additions & 2 deletions HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/Utilities/interface/thread_safety_macros.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "CachingHostAllocator.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"

#include "getCachingDeviceAllocator.h"
#include "CachingDeviceAllocator.h"
#include "CachingHostAllocator.h"
#include "cachingAllocatorCommon.h"

#include <iomanip>

Expand Down

0 comments on commit e5ed64c

Please sign in to comment.