Fix memory pool implementation
Summary:
The memory pool implementation was written back in the days when I only had
one GPU, and as a result I overlooked two facts:

(1) CNMEM requires the current device to be the same at allocation and at
deallocation for the free to work correctly.
(2) cub needs the device id of the pointer to be passed in for proper
deallocation.

Since C2 currently switches contexts very frequently, I added a global map
that records which device each pointer was allocated on, and use it to
deallocate correctly even when we are on a different device.

I have not benchmarked this, but assuming std::unordered_map lookups are not
too slow, it should be fairly fast.

Differential Revision: D4617300

fbshipit-source-id: e8bb366616cd93504e7d68b7f999011cd49caba5
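To illustrate the bookkeeping described in the summary, here is a minimal sketch of the pointer-to-device map, using plain cudaMalloc/cudaFree as stand-ins for the cnmem and cub pool calls; the names and helpers below are hypothetical and are not the actual Caffe2 code.

#include <cuda_runtime.h>
#include <cstddef>
#include <unordered_map>

// Sketch only: remember which device each pointer was allocated on so it can
// be freed under that device, even if the caller has switched devices since.
static std::unordered_map<void*, int> pointer_device;

void* AllocOnCurrentDevice(size_t nbytes) {
  int device = 0;
  cudaGetDevice(&device);        // device that is current at allocation time
  void* ptr = nullptr;
  cudaMalloc(&ptr, nbytes);      // stand-in for cnmemMalloc / cub DeviceAllocate
  pointer_device[ptr] = device;  // record the affiliation
  return ptr;
}

void FreeOnOwningDevice(void* ptr) {
  auto it = pointer_device.find(ptr);
  if (it == pointer_device.end()) return;
  int saved = 0;
  cudaGetDevice(&saved);
  cudaSetDevice(it->second);     // switch to the pointer's device (like DeviceGuard)
  cudaFree(ptr);                 // stand-in for cnmemFree / cub DeviceFree
  cudaSetDevice(saved);          // restore the caller's device
  pointer_device.erase(it);
}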
Yangqing Jia authored and facebook-github-bot committed Feb 27, 2017
1 parent e53b0c6 commit d4fea53
Showing 1 changed file with 36 additions and 5 deletions.
caffe2/core/context_gpu.cu: 41 changes (36 additions & 5 deletions)
@@ -2,6 +2,7 @@
#include <atomic>
#include <cstdlib>
#include <string>
#include <unordered_map>

#include "cub/util_allocator.cuh"
#include "cnmem.h"
@@ -58,6 +59,19 @@ CudaMemoryPoolType g_cuda_memory_pool_type;
vector<bool> g_cnmem_available_for_device;
// For cub allocator
unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;
// an unordered map that holds the map from the cuda memory pointer to the
// device id that it is allocated from. This is used in the cuda memory pool
// cases, where we need the device id to carry out the deletion.
// Note(jiayq): an alternate approach is to use cudaGetPointerAttributes, but
// that is usually quite slow. We might want to benchmark the speed difference
// though.
// Note(jiayq): another alternate approach is to augment the Tensor class that
// would allow one to record the device id. However, this does not address any
// non-tensor allocation and deallocation.
// Ideally, a memory pool should already have the device id information, as
// long as we are using UVA (as of CUDA 5 and later) so the addresses are
// unique.
static std::unordered_map<void*, uint8_t> g_cuda_device_affiliation;

CudaMemoryPoolType GetCudaMemoryPoolType() {
  return g_cuda_memory_pool_type;
@@ -298,10 +312,16 @@ void* CUDAContext::New(size_t nbytes) {
        gpuId,
        " but cnmem pool is not set up for it.");
    CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, nullptr));
    g_cuda_device_affiliation[ptr] = GetCurrentGPUID();
    VLOG(2) << "CNMEM allocating pointer " << ptr << " on device "
            << GetCurrentGPUID();
    return ptr;
  }
  case CudaMemoryPoolType::CUB:
    CUDA_CHECK(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
    g_cuda_device_affiliation[ptr] = GetCurrentGPUID();
    VLOG(2) << "CUB allocating pointer " << ptr << " on device "
            << GetCurrentGPUID();
    return ptr;
  }
  return nullptr;
@@ -324,15 +344,26 @@ void CUDAContext::Delete(void* ptr) {
    if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
                 << cudaGetErrorString(error);
    }
    }
    break; }
  case CudaMemoryPoolType::CNMEM:
    CNMEM_CHECK(cnmemFree(ptr, nullptr));
  case CudaMemoryPoolType::CNMEM: {
    auto it = g_cuda_device_affiliation.find(ptr);
    DCHECK(it != g_cuda_device_affiliation.end());
    DeviceGuard guard(it->second);
    VLOG(2) << "CNMEM freeing pointer " << ptr << " on device " << it->second;
    CNMEM_CHECK(cnmemFree(ptr, nullptr));
    g_cuda_device_affiliation.erase(it);
    break;
  case CudaMemoryPoolType::CUB:
    CUDA_CHECK(g_cub_allocator->DeviceFree(ptr));
  }
  case CudaMemoryPoolType::CUB: {
    auto it = g_cuda_device_affiliation.find(ptr);
    DCHECK(it != g_cuda_device_affiliation.end());
    VLOG(2) << "CUB freeing pointer " << ptr << " on device " << it->second;
    CUDA_CHECK(g_cub_allocator->DeviceFree(it->second, ptr));
    g_cuda_device_affiliation.erase(it);
    break;
  }
  }
}

} // namespace caffe2
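For comparison, the cudaGetPointerAttributes alternative mentioned in the comment above (the runtime call is actually spelled cudaPointerGetAttributes) would look roughly like the sketch below: it avoids the map but pays for a runtime query on every free, which the comment notes is usually quite slow. This is illustrative only and is not part of this commit.

#include <cuda_runtime.h>

// Ask the CUDA runtime which device owns a (UVA) pointer instead of keeping a map.
int DeviceOfPointer(const void* ptr) {
  cudaPointerAttributes attr;
  cudaPointerGetAttributes(&attr, ptr);
  return attr.device;
}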
