
DBM: Migrate to offload_runtime.h
oschuett committed Mar 13, 2022
1 parent 05cf8f2 commit 85bb107
Showing 4 changed files with 113 additions and 100 deletions.
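The diff below drops the backend-specific CHECK macro and the direct cuda* calls in favor of offload* wrappers from offload_runtime.h, and renames the dbm_multiply_cuda_* routines to dbm_multiply_gpu_*. For orientation, here is a minimal sketch of how such wrappers could map onto the CUDA calls they replace; this is an illustration only, not the actual CP2K header, and a HIP build would map the same names onto the corresponding hip* calls.

// Sketch of an offload_runtime.h-style wrapper layer (illustrative, not the
// real CP2K header). Only a few of the wrappers used in this commit are shown;
// the event, memset, and DtoH/DtoD copy wrappers follow the same pattern.
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

typedef cudaStream_t offloadStream_t;
typedef cudaEvent_t offloadEvent_t;

// Abort with a readable message when a runtime call fails.
#define OFFLOAD_CHECK(CALL)                                                    \
  do {                                                                         \
    const cudaError_t error_ = (CALL);                                         \
    if (error_ != cudaSuccess) {                                               \
      fprintf(stderr, "ERROR: %s %s %d\n", cudaGetErrorString(error_),         \
              __FILE__, __LINE__);                                             \
      abort();                                                                 \
    }                                                                          \
  } while (0)

static inline void offloadStreamCreate(offloadStream_t *stream) {
  OFFLOAD_CHECK(cudaStreamCreate(stream));
}

static inline void offloadMemcpyAsyncHtoD(void *dest, const void *src,
                                          const size_t size,
                                          const offloadStream_t stream) {
  OFFLOAD_CHECK(cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
}

static inline void offloadStreamSynchronize(const offloadStream_t stream) {
  OFFLOAD_CHECK(cudaStreamSynchronize(stream));
}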
18 changes: 9 additions & 9 deletions src/dbm/dbm_multiply.c
@@ -15,7 +15,7 @@
#include "dbm_multiply.h"
#include "dbm_multiply_comm.h"
#include "dbm_multiply_cpu.h"
#include "dbm_multiply_cuda.h"
#include "dbm_multiply_gpu.h"
#include "dbm_multiply_internal.h"

/*******************************************************************************
@@ -58,7 +58,7 @@ static float *compute_rows_max_eps(const bool trans, const dbm_matrix_t *matrix,
******************************************************************************/
typedef struct {
#if defined(__DBM_CUDA)
dbm_multiply_cuda_context_t cuda;
dbm_multiply_gpu_context_t gpu;
#endif
} backend_context_t;

@@ -70,8 +70,8 @@ static backend_context_t *backend_start(const dbm_matrix_t *matrix_c) {
backend_context_t *ctx = calloc(1, sizeof(backend_context_t));

#if defined(__DBM_CUDA)
dbm_multiply_cuda_start(MAX_BATCH_SIZE, matrix_c->nshards, matrix_c->shards,
&ctx->cuda);
dbm_multiply_gpu_start(MAX_BATCH_SIZE, matrix_c->nshards, matrix_c->shards,
&ctx->gpu);
#else
(void)matrix_c; // mark as used
#endif
@@ -88,7 +88,7 @@ static void backend_upload_packs(const dbm_pack_t *pack_a,
backend_context_t *ctx) {

#if defined(__DBM_CUDA)
dbm_multiply_cuda_upload_packs(pack_a, pack_b, &ctx->cuda);
dbm_multiply_gpu_upload_packs(pack_a, pack_b, &ctx->gpu);
#else
(void)pack_a; // mark as used
(void)pack_b;
@@ -110,8 +110,8 @@ static void backend_process_batch(const int ntasks, dbm_task_t batch[ntasks],
(void)pack_a; // mark as used
(void)pack_b;
(void)shard_c;
dbm_multiply_cuda_process_batch(ntasks, batch, transa, transb, alpha, kshard,
&ctx->cuda);
dbm_multiply_gpu_process_batch(ntasks, batch, transa, transb, alpha, kshard,
&ctx->gpu);
#else
(void)kshard; // mark as used
(void)ctx;
@@ -126,7 +126,7 @@ static void backend_process_batch(const int ntasks, dbm_task_t batch[ntasks],
******************************************************************************/
static void backend_download_results(backend_context_t *ctx) {
#if defined(__DBM_CUDA)
dbm_multiply_cuda_download_results(&ctx->cuda);
dbm_multiply_gpu_download_results(&ctx->gpu);
#else
(void)ctx; // mark as used
#endif
@@ -138,7 +138,7 @@ static void backend_download_results(backend_context_t *ctx) {
******************************************************************************/
static void backend_stop(backend_context_t *ctx) {
#if defined(__DBM_CUDA)
dbm_multiply_cuda_stop(&ctx->cuda);
dbm_multiply_gpu_stop(&ctx->gpu);
#endif
free(ctx);
}
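The renamed header dbm_multiply_gpu.h itself is not part of this excerpt. Judging from the fields accessed in dbm_multiply_gpu.cu below, its context types presumably look roughly like the following sketch (field names are taken from the diff; the layout and exact types are assumed, and dbm_shard_t, dbm_task_t, and dbm_pack_t come from the other dbm headers).

// Sketch only: inferred from the fields used in dbm_multiply_gpu.cu below.
typedef struct {
  offloadStream_t stream; // per-shard stream for this shard of C
  double *data;           // device buffer holding the shard's block data
  int data_size;          // current capacity of data, in doubles
} dbm_shard_gpu_t;

typedef struct {
  int nshards;
  dbm_shard_t *shards_c_host;    // host-side shards of the result matrix C
  dbm_shard_gpu_t *shards_c_dev; // device-side mirrors, one stream each
  int max_batch_size;
  dbm_task_t *batches_dev;       // device storage for per-shard task batches
  dbm_pack_t pack_a_dev;         // device copies of the current packs
  dbm_pack_t pack_b_dev;
  offloadStream_t main_stream;   // stream used for uploading the packs
} dbm_multiply_gpu_context_t;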
121 changes: 54 additions & 67 deletions src/dbm/dbm_multiply_cuda.cu → src/dbm/dbm_multiply_gpu.cu
@@ -12,18 +12,7 @@

#include "../offload/offload_library.h"
#include "dbm_mempool.h"
#include "dbm_multiply_cuda.h"

/*******************************************************************************
* \brief Check given Cuda status and upon failure abort with a nice message.
* \author Ole Schuett
******************************************************************************/
#define CHECK(status) \
if (status != cudaSuccess) { \
fprintf(stderr, "ERROR: %s %s %d\n", cudaGetErrorString(status), __FILE__, \
__LINE__); \
abort(); \
}
#include "dbm_multiply_gpu.h"

/*******************************************************************************
* \brief Atomic add for doubles that also works prior to compute capability 6.
@@ -33,10 +22,10 @@ __device__ static void atomicAddDouble(double *address, double val) {
if (val == 0.0)
return;

#if __CUDA_ARCH__ >= 600
atomicAdd(address, val); // part of cuda library
#else
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull, assumed;

@@ -52,35 +41,35 @@
}
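The body of the pre-sm_60 fallback is collapsed by the diff viewer above. For reference, the canonical compare-and-swap pattern from the linked CUDA programming guide section looks like this; it is shown here as an illustration, not as the exact hidden lines.

// Canonical CAS-based atomicAdd for doubles on devices before compute
// capability 6.0 (see the programming guide link above).
__device__ static double atomicAddDoubleCAS(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Uses integer comparison to avoid hanging on NaN (since NaN != NaN).
  } while (assumed != old);
  return __longlong_as_double(old);
}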

/*******************************************************************************
* \brief Internal routine for initializing the cuda backend.
* \brief Internal routine for initializing the gpu backend.
* \author Ole Schuett
******************************************************************************/
void dbm_multiply_cuda_start(const int max_batch_size, const int nshards,
dbm_shard_t *shards_c_host,
dbm_multiply_cuda_context_t *ctx) {
void dbm_multiply_gpu_start(const int max_batch_size, const int nshards,
dbm_shard_t *shards_c_host,
dbm_multiply_gpu_context_t *ctx) {
// Select GPU device.
offload_activate_chosen_device();

ctx->nshards = nshards;
ctx->shards_c_host = shards_c_host;
ctx->max_batch_size = max_batch_size;
CHECK(cudaStreamCreate(&ctx->main_stream));
offloadStreamCreate(&ctx->main_stream);

// Allocate device storage for batches.
const size_t size = nshards * max_batch_size * sizeof(dbm_task_t);
ctx->batches_dev = (dbm_task_t *)dbm_mempool_device_malloc(size);

// Allocate and upload shards of result matrix C.
ctx->shards_c_dev =
(dbm_shard_cuda_t *)malloc(nshards * sizeof(dbm_shard_cuda_t));
(dbm_shard_gpu_t *)malloc(nshards * sizeof(dbm_shard_gpu_t));
for (int i = 0; i < nshards; i++) {
CHECK(cudaStreamCreate(&ctx->shards_c_dev[i].stream));
offloadStreamCreate(&ctx->shards_c_dev[i].stream);
ctx->shards_c_dev[i].data_size = ctx->shards_c_host[i].data_size;
const size_t size = ctx->shards_c_dev[i].data_size * sizeof(double);
ctx->shards_c_dev[i].data = (double *)dbm_mempool_device_malloc(size);
CHECK(cudaMemcpyAsync(ctx->shards_c_dev[i].data, ctx->shards_c_host[i].data,
size, cudaMemcpyHostToDevice,
ctx->shards_c_dev[i].stream));
offloadMemcpyAsyncHtoD(ctx->shards_c_dev[i].data,
ctx->shards_c_host[i].data, size,
ctx->shards_c_dev[i].stream);
}
}

@@ -89,44 +78,43 @@ void dbm_multiply_cuda_start(const int max_batch_size, const int nshards,
* \author Ole Schuett
******************************************************************************/
static void upload_pack(const dbm_pack_t *pack_host, dbm_pack_t *pack_dev,
const cudaStream_t stream) {
const offloadStream_t stream) {

const size_t size = pack_host->data_size * sizeof(double);
if (pack_dev->data_size < pack_host->data_size) {
dbm_mempool_free(pack_dev->data);
pack_dev->data = (double *)dbm_mempool_device_malloc(size);
}
CHECK(cudaMemcpyAsync(pack_dev->data, pack_host->data, size,
cudaMemcpyHostToDevice, stream));
offloadMemcpyAsyncHtoD(pack_dev->data, pack_host->data, size, stream);
}

/*******************************************************************************
* \brief Internal routine for uploading newly arrived packs onto the device.
* \author Ole Schuett
******************************************************************************/
void dbm_multiply_cuda_upload_packs(const dbm_pack_t *pack_a,
const dbm_pack_t *pack_b,
dbm_multiply_cuda_context_t *ctx) {
void dbm_multiply_gpu_upload_packs(const dbm_pack_t *pack_a,
const dbm_pack_t *pack_b,
dbm_multiply_gpu_context_t *ctx) {
// Select GPU device.
offload_activate_chosen_device();

// Wait for all c-streams to complete before overwriting old packs.
cudaEvent_t event;
CHECK(cudaEventCreate(&event));
offloadEvent_t event;
offloadEventCreate(&event);
for (int i = 0; i < ctx->nshards; i++) {
CHECK(cudaEventRecord(event, ctx->shards_c_dev[i].stream))
CHECK(cudaStreamWaitEvent(ctx->main_stream, event, 0));
offloadEventRecord(event, ctx->shards_c_dev[i].stream);
offloadStreamWaitEvent(ctx->main_stream, event, 0);
}

upload_pack(pack_a, &ctx->pack_a_dev, ctx->main_stream);
upload_pack(pack_b, &ctx->pack_b_dev, ctx->main_stream);

// Have all c-streams wait until new packs are uploaded.
CHECK(cudaEventRecord(event, ctx->main_stream))
offloadEventRecord(event, ctx->main_stream);
for (int i = 0; i < ctx->nshards; i++) {
CHECK(cudaStreamWaitEvent(ctx->shards_c_dev[i].stream, event, 0));
offloadStreamWaitEvent(ctx->shards_c_dev[i].stream, event, 0);
}
CHECK(cudaEventDestroy(event));
offloadEventDestroy(event);
}
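The upload routine above uses a single event to order work across streams: each shard stream signals the main stream before the old packs may be overwritten, and the main stream then signals the shard streams once the new packs are in place. A stripped-down sketch of that record-and-wait pattern in plain CUDA (error checking omitted for brevity):

#include <cuda_runtime.h>

// Make `consumer` wait, on the GPU and without blocking the host, until all
// work currently enqueued on `producer` has completed.
static void stream_barrier(const cudaStream_t producer,
                           const cudaStream_t consumer) {
  cudaEvent_t event;
  cudaEventCreate(&event);
  cudaEventRecord(event, producer);        // marks the current tail of producer
  cudaStreamWaitEvent(consumer, event, 0); // consumer waits for that point
  cudaEventDestroy(event);                 // safe: the wait is already enqueued
}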

/*******************************************************************************
@@ -163,10 +151,10 @@ process_batch_kernel(const bool transa, const bool transb, const double alpha,
* \brief Internal routine for executing the tasks in given batch on the GPU.
* \author Ole Schuett
******************************************************************************/
void dbm_multiply_cuda_process_batch(const int ntasks, const dbm_task_t *batch,
const bool transa, const bool transb,
const double alpha, const int kshard,
dbm_multiply_cuda_context_t *ctx) {
void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,
const bool transa, const bool transb,
const double alpha, const int kshard,
dbm_multiply_gpu_context_t *ctx) {
if (ntasks == 0) {
return; // Nothing to do.
}
@@ -175,16 +163,15 @@ void dbm_multiply_cuda_process_batch(const int ntasks, const dbm_task_t *batch,
offload_activate_chosen_device();

const dbm_shard_t *shard_c_host = &ctx->shards_c_host[kshard];
dbm_shard_cuda_t *shard_c_dev = &ctx->shards_c_dev[kshard];
dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[kshard];

// Upload new batch.
dbm_task_t *batch_dev = &ctx->batches_dev[kshard * ctx->max_batch_size];
const size_t size = ntasks * sizeof(dbm_task_t);
CHECK(cudaMemcpyAsync(batch_dev, batch, size, cudaMemcpyHostToDevice,
shard_c_dev->stream));
cudaEvent_t batch_uploaded;
CHECK(cudaEventCreate(&batch_uploaded));
CHECK(cudaEventRecord(batch_uploaded, shard_c_dev->stream));
offloadMemcpyAsyncHtoD(batch_dev, batch, size, shard_c_dev->stream);
offloadEvent_t batch_uploaded;
offloadEventCreate(&batch_uploaded);
offloadEventRecord(batch_uploaded, shard_c_dev->stream);

// Grow shard_c_dev->data if necessary.
if (shard_c_dev->data_size != shard_c_host->data_promised) {
@@ -194,12 +181,12 @@ void dbm_multiply_cuda_process_batch(const int ntasks, const dbm_task_t *batch,
shard_c_dev->data_size = shard_c_host->data_promised;
const size_t new_size = shard_c_dev->data_size * sizeof(double);
shard_c_dev->data = (double *)dbm_mempool_device_malloc(new_size);
CHECK(cudaMemsetAsync(shard_c_dev->data, 0, new_size,
shard_c_dev->stream)); // TODO: zero only tail
CHECK(cudaMemcpyAsync(shard_c_dev->data, old_data_dev, old_size,
cudaMemcpyDeviceToDevice, shard_c_dev->stream));
offloadMemsetAsync(shard_c_dev->data, 0, new_size,
shard_c_dev->stream); // TODO: zero only tail
offloadMemcpyAsyncDtoD(shard_c_dev->data, old_data_dev, old_size,
shard_c_dev->stream);
// Wait for copy to complete before freeing old buffer.
CHECK(cudaStreamSynchronize(shard_c_dev->stream));
offloadStreamSynchronize(shard_c_dev->stream);
dbm_mempool_free(old_data_dev);
}

@@ -211,18 +198,18 @@ void dbm_multiply_cuda_process_batch(const int ntasks, const dbm_task_t *batch,
shard_c_dev->stream>>>(
transa, transb, alpha, batch_dev, ctx->pack_a_dev.data,
ctx->pack_b_dev.data, shard_c_dev->data);
CHECK(cudaGetLastError());
OFFLOAD_CHECK(offloadGetLastError());

// Wait for batch to be uploaded before refilling it.
CHECK(cudaEventSynchronize(batch_uploaded));
CHECK(cudaEventDestroy(batch_uploaded));
offloadEventSynchronize(batch_uploaded);
offloadEventDestroy(batch_uploaded);
}
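The growth path above allocates a larger device buffer, zeroes it, copies the old contents over, and frees the old allocation only after the copy has finished. The same idea in plain CUDA, using cudaMalloc/cudaFree instead of the dbm mempool (a sketch, error checking omitted):

#include <cuda_runtime.h>

// Grow a device buffer from old_count to new_count doubles, preserving its
// contents; the whole new buffer is zeroed first, mirroring the TODO above.
static double *grow_device_buffer(double *old_data, const size_t old_count,
                                  const size_t new_count, cudaStream_t stream) {
  double *new_data = NULL;
  cudaMalloc((void **)&new_data, new_count * sizeof(double));
  cudaMemsetAsync(new_data, 0, new_count * sizeof(double), stream);
  cudaMemcpyAsync(new_data, old_data, old_count * sizeof(double),
                  cudaMemcpyDeviceToDevice, stream);
  cudaStreamSynchronize(stream); // wait before freeing the old buffer
  cudaFree(old_data);
  return new_data;
}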

/*******************************************************************************
* \brief Internal routine for downloading results from the device.
* \author Ole Schuett
******************************************************************************/
void dbm_multiply_cuda_download_results(dbm_multiply_cuda_context_t *ctx) {
void dbm_multiply_gpu_download_results(dbm_multiply_gpu_context_t *ctx) {
// Select GPU device.
offload_activate_chosen_device();

@@ -233,36 +220,36 @@ void dbm_multiply_cuda_download_results(dbm_multiply_cuda_context_t *ctx) {
dbm_shard_allocate_promised_blocks(shard_c_host);

// Download results from device.
dbm_shard_cuda_t *shard_c_dev = &ctx->shards_c_dev[i];
dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
assert(shard_c_host->data_size == shard_c_dev->data_size);
const size_t size = shard_c_dev->data_size * sizeof(double);
CHECK(cudaMemcpyAsync(shard_c_host->data, shard_c_dev->data, size,
cudaMemcpyDeviceToHost, shard_c_dev->stream));
offloadMemcpyAsyncDtoH(shard_c_host->data, shard_c_dev->data, size,
shard_c_dev->stream);
}
}

/*******************************************************************************
* \brief Internal routine for shutting down the cuda backend.
* \brief Internal routine for shutting down the gpu backend.
* \author Ole Schuett
******************************************************************************/
void dbm_multiply_cuda_stop(dbm_multiply_cuda_context_t *ctx) {
void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx) {
// Select GPU device.
offload_activate_chosen_device();

// Wait for completion, then free cuda resources.
// Wait for completion, then free gpu resources.
#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < ctx->nshards; i++) {
dbm_shard_cuda_t *shard_c_dev = &ctx->shards_c_dev[i];
CHECK(cudaStreamSynchronize(shard_c_dev->stream));
CHECK(cudaStreamDestroy(shard_c_dev->stream));
dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
offloadStreamSynchronize(shard_c_dev->stream);
offloadStreamDestroy(shard_c_dev->stream);
dbm_mempool_free(shard_c_dev->data);
}
free(ctx->shards_c_dev);

dbm_mempool_free(ctx->pack_a_dev.data);
dbm_mempool_free(ctx->pack_b_dev.data);
dbm_mempool_free(ctx->batches_dev);
CHECK(cudaStreamDestroy(ctx->main_stream));
offloadStreamDestroy(ctx->main_stream);
}

#endif // __DBM_CUDA
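Read together with the backend_* wrappers in dbm_multiply.c above, one multiplication drives the renamed GPU routines in roughly the following order. This is a simplified sketch that would live next to those wrappers; the real driver loops over communicated packs and per-shard task batches, which are elided here as comments.

// Simplified ordering sketch (not the actual driver in dbm_multiply.c).
static void multiply_backend_sketch(const dbm_matrix_t *matrix_c) {
  backend_context_t *ctx = backend_start(matrix_c); // dbm_multiply_gpu_start
  // for each pair of packs arriving from communication:
  //   backend_upload_packs(pack_a, pack_b, ctx);    -> dbm_multiply_gpu_upload_packs
  //   for each batch assembled for a shard of C:
  //     backend_process_batch(...);                 -> dbm_multiply_gpu_process_batch
  backend_download_results(ctx); // dbm_multiply_gpu_download_results
  backend_stop(ctx);             // dbm_multiply_gpu_stop
}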