Update CUDAUtilities from the Patatrack development #29908

Merged · 2 commits · May 20, 2020
Changes from all commits
1 change: 0 additions & 1 deletion HeterogeneousCore/CUDAUtilities/BuildFile.xml
@@ -1,5 +1,4 @@
 <iftool name="cuda-gcc-support">
-  <use name="cub"/>
   <use name="cuda"/>
   <use name="eigen"/>
   <use name="FWCore/Utilities"/>
75 changes: 29 additions & 46 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -9,10 +9,6 @@
 #include <cstdint>
 #include <type_traits>
 
-#ifdef __CUDACC__
-#include <cub/cub.cuh>
-#endif
-
 #include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
@@ -55,62 +51,60 @@ namespace cms {
     }
 
     template <typename Histo>
-    inline void launchZero(Histo *__restrict__ h,
-                           cudaStream_t stream
+    inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h,
+                                                          cudaStream_t stream
 #ifndef __CUDACC__
-                           = cudaStreamDefault
+                                                          = cudaStreamDefault
 #endif
     ) {
-      uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
+      uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
> Review comment (Contributor), on the line above: In principle I'd prefer C++-style casts.
+      int32_t size = offsetof(Histo, bins) - offsetof(Histo, off);
+      assert(size >= int(sizeof(uint32_t) * Histo::totbins()));
 #ifdef __CUDACC__
-      cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream));
+      cudaCheck(cudaMemsetAsync(poff, 0, size, stream));
 #else
-      ::memset(off, 0, 4 * Histo::totbins());
+      ::memset(poff, 0, size);
 #endif
     }
 
     template <typename Histo>
-    inline void launchFinalize(Histo *__restrict__ h,
-                               uint8_t *__restrict__ ws
+    inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h,
+                                                              cudaStream_t stream
 #ifndef __CUDACC__
-                               = cudaStreamDefault
-#endif
-                               ,
-                               cudaStream_t stream
-#ifndef __CUDACC__
-                               = cudaStreamDefault
+                                                              = cudaStreamDefault
 #endif
     ) {
 #ifdef __CUDACC__
-      assert(ws);
-      uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
-      size_t wss = Histo::wsSize();
-      assert(wss > 0);
-      CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
+      uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
+      int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws));
+      auto nthreads = 1024;
+      auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads;
+      multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(
+          poff, poff, Histo::totbins(), ppsws);
+      cudaCheck(cudaGetLastError());
 #else
       h->finalize();
 #endif
     }
 
     template <typename Histo, typename T>
-    inline void fillManyFromVector(Histo *__restrict__ h,
-                                   uint8_t *__restrict__ ws,
-                                   uint32_t nh,
-                                   T const *__restrict__ v,
-                                   uint32_t const *__restrict__ offsets,
-                                   uint32_t totSize,
-                                   int nthreads,
-                                   cudaStream_t stream
+    inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
+                                                                  uint32_t nh,
+                                                                  T const *__restrict__ v,
+                                                                  uint32_t const *__restrict__ offsets,
+                                                                  uint32_t totSize,
+                                                                  int nthreads,
+                                                                  cudaStream_t stream
 #ifndef __CUDACC__
-                                   = cudaStreamDefault
+                                                                  = cudaStreamDefault
 #endif
     ) {
       launchZero(h, stream);
 #ifdef __CUDACC__
       auto nblocks = (totSize + nthreads - 1) / nthreads;
       countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
       cudaCheck(cudaGetLastError());
-      launchFinalize(h, ws, stream);
+      launchFinalize(h, stream);
       fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
       cudaCheck(cudaGetLastError());
 #else
@@ -186,18 +180,6 @@ namespace cms {
 
     static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }
 
-    __host__ static size_t wsSize() {
-#ifdef __CUDACC__
-      uint32_t *v = nullptr;
-      void *d_temp_storage = nullptr;
-      size_t temp_storage_bytes = 0;
-      cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins());
-      return temp_storage_bytes;
-#else
-      return 0;
-#endif
-    }
-
     static constexpr UT bin(T t) {
       constexpr uint32_t shift = sizeT() - nbits();
       constexpr uint32_t mask = (1 << nbits()) - 1;
@@ -209,7 +191,7 @@ namespace cms {
       i = 0;
     }
 
-    __host__ __device__ void add(CountersOnly const &co) {
+    __host__ __device__ __forceinline__ void add(CountersOnly const &co) {
       for (uint32_t i = 0; i < totbins(); ++i) {
 #ifdef __CUDA_ARCH__
         atomicAdd(off + i, co.off[i]);
@@ -325,6 +307,7 @@ namespace cms {
     constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; }
 
     Counter off[totbins()];
+    int32_t psws;  // prefix-scan working space
     index_type bins[capacity()];
   };
 
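With the CUB workspace gone, callers of fillManyFromVector pass one argument fewer. A minimal host-side sketch of the new call, matching the signature above (the names h, d_values, d_offsets, nh, totSize and stream are illustrative, not taken from this diff):

```cpp
// Sketch: fill a multi-histogram from device data in one call.
// h         - device pointer to the Histo object
// nh        - number of sub-histograms
// d_values  - device array of totSize values to bin
// d_offsets - device array of nh + 1 offsets delimiting each sub-histogram's values
cms::cuda::fillManyFromVector(h, nh, d_values, d_offsets, totSize,
                              256,      // threads per block for the count/fill kernels
                              stream);  // note: no uint8_t* workspace argument any more
```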
55 changes: 55 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h
@@ -0,0 +1,55 @@
+#ifndef HeterogeneousCore_CUDAUtilities_HostAllocator_h
+#define HeterogeneousCore_CUDAUtilities_HostAllocator_h
+
+#include <memory>
+#include <new>
+#include <cuda_runtime.h>
+
+namespace cms {
+  namespace cuda {
+
+    class bad_alloc : public std::bad_alloc {
+    public:
+      bad_alloc(cudaError_t error) noexcept : error_(error) {}
+
+      const char* what() const noexcept override { return cudaGetErrorString(error_); }
+
+    private:
+      cudaError_t error_;
+    };
+
+    template <typename T, unsigned int FLAGS = cudaHostAllocDefault>
+    class HostAllocator {
+    public:
+      using value_type = T;
+
+      template <typename U>
+      struct rebind {
+        using other = HostAllocator<U, FLAGS>;
+      };
+
+      T* allocate(std::size_t n) const __attribute__((warn_unused_result)) __attribute__((malloc))
+      __attribute__((returns_nonnull)) {
+        void* ptr = nullptr;
+        cudaError_t status = cudaMallocHost(&ptr, n * sizeof(T), FLAGS);
+        if (status != cudaSuccess) {
+          throw bad_alloc(status);
+        }
+        if (ptr == nullptr) {
+          throw std::bad_alloc();
+        }
+        return static_cast<T*>(ptr);
+      }
+
+      void deallocate(T* p, std::size_t n) const {
+        cudaError_t status = cudaFreeHost(p);
+        if (status != cudaSuccess) {
+          throw bad_alloc(status);
+        }
+      }
+    };
+
+  }  // namespace cuda
+}  // namespace cms
+
+#endif  // HeterogeneousCore_CUDAUtilities_HostAllocator_h
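A typical use of this allocator (an illustrative sketch, not part of the diff) is to back a standard container with pinned host memory, so that asynchronous device-to-host and host-to-device copies can overlap with kernel execution:

```cpp
#include <vector>

#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"

// Host-side buffer whose storage comes from cudaMallocHost (page-locked),
// which makes cudaMemcpyAsync to/from it truly asynchronous.
std::vector<float, cms::cuda::HostAllocator<float>> hostBuffer(1024);
```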
37 changes: 24 additions & 13 deletions HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
@@ -127,15 +127,27 @@ namespace cms {
 #endif
     }
 
-    // limited to 1024*1024 elements....
+#ifdef __CUDA_ARCH__
+    // see https://stackoverflow.com/questions/40021086/can-i-obtain-the-amount-of-allocated-dynamic-shared-memory-from-within-a-kernel/40021087#40021087
+    __device__ __forceinline__ unsigned dynamic_smem_size() {
+      unsigned ret;
+      asm volatile("mov.u32 %0, %dynamic_smem_size;" : "=r"(ret));
+      return ret;
+    }
+#endif
+
+    // in principle not limited....
     template <typename T>
-    __global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) {
+    __global__ void multiBlockPrefixScan(T const* ci, T* co, int32_t size, int32_t* pc) {
       __shared__ T ws[32];
-      // first each block does a scan of size 1024; (better be enough blocks....)
-      assert(1024 * gridDim.x >= size);
-      int off = 1024 * blockIdx.x;
+#ifdef __CUDA_ARCH__
+      assert(sizeof(T) * gridDim.x <= dynamic_smem_size());  // size of psum below
+#endif
+      assert(blockDim.x * gridDim.x >= size);
+      // first each block does a scan
+      int off = blockDim.x * blockIdx.x;
       if (size - off > 0)
-        blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws);
+        blockPrefixScan(ci + off, co + off, std::min(int(blockDim.x), size - off), ws);
 
       // count blocks that finished
       __shared__ bool isLastBlockDone;
@@ -149,25 +161,24 @@
       if (!isLastBlockDone)
         return;
 
+      assert(int(gridDim.x) == *pc);
+
       // good each block has done its work and now we are left in last block
 
       // let's get the partial sums from each block
-      __shared__ T psum[1024];
+      extern __shared__ T psum[];
       for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) {
-        auto j = 1024 * i + 1023;
+        auto j = blockDim.x * i + blockDim.x - 1;
         psum[i] = (j < size) ? co[j] : T(0);
       }
       __syncthreads();
       blockPrefixScan(psum, psum, gridDim.x, ws);
 
       // now it would have been handy to have the other blocks around...
-      int first = threadIdx.x;  // + blockDim.x * blockIdx.x
-      for (int i = first + 1024; i < size; i += blockDim.x) {  // *gridDim.x) {
-        auto k = i / 1024;  // block
-        co[i] += psum[k - 1];
+      for (int i = threadIdx.x + blockDim.x, k = 0; i < size; i += blockDim.x, ++k) {
+        co[i] += psum[k];
       }
     }
 
   }  // namespace cuda
 }  // namespace cms

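Because psum is now dynamic shared memory, every launch of multiBlockPrefixScan must reserve sizeof(T) * nblocks bytes via the third launch parameter and pass a zero-initialized device counter pc, exactly as launchFinalize does above. A stand-alone sketch under those assumptions (d_in, d_out and d_pc are illustrative device buffers, not names from this diff):

```cpp
// Sketch: prefix-scan N int32_t values across multiple blocks.
int N = 1 << 20;
int nthreads = 1024;
int nblocks = (N + nthreads - 1) / nthreads;
// d_pc counts the blocks that have finished; it must start at zero.
cudaCheck(cudaMemsetAsync(d_pc, 0, sizeof(int32_t), stream));
// Third launch parameter: dynamic shared memory backing the psum[] partial sums.
cms::cuda::multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(
    d_in, d_out, N, d_pc);
cudaCheck(cudaGetLastError());
```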