
DBM: Migrate to offload_runtime.h
oschuett committed Mar 13, 2022
1 parent 05cf8f2 commit 85bb107
Showing 4 changed files with 113 additions and 100 deletions.
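The diff below drops the backend-specific CHECK macro and the direct cuda* calls in favor of offload* wrappers from offload_runtime.h, and renames the dbm_multiply_cuda_* routines to dbm_multiply_gpu_*. For orientation, here is a minimal sketch of how such wrappers could map onto the CUDA calls they replace; this is an illustration only, not the actual CP2K header, and a HIP build would map the same names onto the corresponding hip* calls.

// Sketch of an offload_runtime.h-style wrapper layer (illustrative, not the
// real CP2K header). Only a few of the wrappers used in this commit are shown;
// the event, memset, and DtoH/DtoD copy wrappers follow the same pattern.
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

typedef cudaStream_t offloadStream_t;
typedef cudaEvent_t offloadEvent_t;

// Abort with a readable message when a runtime call fails.
#define OFFLOAD_CHECK(CALL)                                                    \
  do {                                                                         \
    const cudaError_t error_ = (CALL);                                         \
    if (error_ != cudaSuccess) {                                               \
      fprintf(stderr, "ERROR: %s %s %d\n", cudaGetErrorString(error_),         \
              __FILE__, __LINE__);                                             \
      abort();                                                                 \
    }                                                                          \
  } while (0)

static inline void offloadStreamCreate(offloadStream_t *stream) {
  OFFLOAD_CHECK(cudaStreamCreate(stream));
}

static inline void offloadMemcpyAsyncHtoD(void *dest, const void *src,
                                          const size_t size,
                                          const offloadStream_t stream) {
  OFFLOAD_CHECK(cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
}

static inline void offloadStreamSynchronize(const offloadStream_t stream) {
  OFFLOAD_CHECK(cudaStreamSynchronize(stream));
}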
18 changes: 9 additions & 9 deletions src/dbm/dbm_multiply.c
@@ -15,7 +15,7 @@
#include "dbm_multiply.h"
#include "dbm_multiply_comm.h"
#include "dbm_multiply_cpu.h"
#include "dbm_multiply_cuda.h"
#include "dbm_multiply_gpu.h"
#include "dbm_multiply_internal.h"

/*******************************************************************************
@@ -58,7 +58,7 @@ static float *compute_rows_max_eps(const bool trans, const dbm_matrix_t *matrix,
******************************************************************************/
typedef struct {
#if defined(__DBM_CUDA)
dbm_multiply_cuda_context_t cuda;
dbm_multiply_gpu_context_t gpu;
#endif
} backend_context_t;

@@ -70,8 +70,8 @@ static backend_context_t *backend_start(const dbm_matrix_t *matrix_c) {
backend_context_t *ctx = calloc(1, sizeof(backend_context_t));

#if defined(__DBM_CUDA)
dbm_multiply_cuda_start(MAX_BATCH_SIZE, matrix_c->nshards, matrix_c->shards,
&ctx->cuda);
dbm_multiply_gpu_start(MAX_BATCH_SIZE, matrix_c->nshards, matrix_c->shards,
&ctx->gpu);
#else
(void)matrix_c; // mark as used
#endif
@@ -88,7 +88,7 @@ static void backend_upload_packs(const dbm_pack_t *pack_a,
backend_context_t *ctx) {

#if defined(__DBM_CUDA)
dbm_multiply_cuda_upload_packs(pack_a, pack_b, &ctx->cuda);
dbm_multiply_gpu_upload_packs(pack_a, pack_b, &ctx->gpu);
#else
(void)pack_a; // mark as used
(void)pack_b;
@@ -110,8 +110,8 @@ static void backend_process_batch(const int ntasks, dbm_task_t batch[ntasks],
(void)pack_a; // mark as used
(void)pack_b;
(void)shard_c;
dbm_multiply_cuda_process_batch(ntasks, batch, transa, transb, alpha, kshard,
&ctx->cuda);
dbm_multiply_gpu_process_batch(ntasks, batch, transa, transb, alpha, kshard,
&ctx->gpu);
#else
(void)kshard; // mark as used
(void)ctx;
@@ -126,7 +126,7 @@ static void backend_process_batch(const int ntasks, dbm_task_t batch[ntasks],
******************************************************************************/
static void backend_download_results(backend_context_t *ctx) {
#if defined(__DBM_CUDA)
dbm_multiply_cuda_download_results(&ctx->cuda);
dbm_multiply_gpu_download_results(&ctx->gpu);
#else
(void)ctx; // mark as used
#endif
@@ -138,7 +138,7 @@ static void backend_download_results(backend_context_t *ctx) {
******************************************************************************/
static void backend_stop(backend_context_t *ctx) {
#if defined(__DBM_CUDA)
dbm_multiply_cuda_stop(&ctx->cuda);
dbm_multiply_gpu_stop(&ctx->gpu);
#endif
free(ctx);
}
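The renamed header dbm_multiply_gpu.h itself is not part of this excerpt. Judging from the fields accessed in dbm_multiply_gpu.cu below, its context types presumably look roughly like the following sketch (field names are taken from the diff; the layout and exact types are assumed, and dbm_shard_t, dbm_task_t, and dbm_pack_t come from the other dbm headers).

// Sketch only: inferred from the fields used in dbm_multiply_gpu.cu below.
typedef struct {
  offloadStream_t stream; // per-shard stream for this shard of C
  double *data;           // device buffer holding the shard's block data
  int data_size;          // current capacity of data, in doubles
} dbm_shard_gpu_t;

typedef struct {
  int nshards;
  dbm_shard_t *shards_c_host;    // host-side shards of the result matrix C
  dbm_shard_gpu_t *shards_c_dev; // device-side mirrors, one stream each
  int max_batch_size;
  dbm_task_t *batches_dev;       // device storage for per-shard task batches
  dbm_pack_t pack_a_dev;         // device copies of the current packs
  dbm_pack_t pack_b_dev;
  offloadStream_t main_stream;   // stream used for uploading the packs
} dbm_multiply_gpu_context_t;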
121 changes: 54 additions & 67 deletions src/dbm/dbm_multiply_cuda.cu → src/dbm/dbm_multiply_gpu.cu
@@ -12,18 +12,7 @@

#include "../offload/offload_library.h"
#include "dbm_mempool.h"
#include "dbm_multiply_cuda.h"

/*******************************************************************************
* \brief Check given Cuda status and upon failure abort with a nice message.
* \author Ole Schuett
******************************************************************************/
#define CHECK(status) \
if (status != cudaSuccess) { \
fprintf(stderr, "ERROR: %s %s %d\n", cudaGetErrorString(status), __FILE__, \
__LINE__); \
abort(); \
}
#include "dbm_multiply_gpu.h"

/*******************************************************************************
* \brief Atomic add for doubles that also works prior to compute capability 6.
@@ -33,10 +22,10 @@ __device__ static void atomicAddDouble(double *address, double val) {
if (val == 0.0)
return;

#if __CUDA_ARCH__ >= 600
atomicAdd(address, val); // part of cuda library
#else
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull, assumed;

@@ -52,35 +41,35 @@
}
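The body of the pre-sm_60 fallback is collapsed by the diff viewer above. For reference, the canonical compare-and-swap pattern from the linked CUDA programming guide section looks like this; it is shown here as an illustration, not as the exact hidden lines.

// Canonical CAS-based atomicAdd for doubles on devices before compute
// capability 6.0 (see the programming guide link above).
__device__ static double atomicAddDoubleCAS(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Uses integer comparison to avoid hanging on NaN (since NaN != NaN).
  } while (assumed != old);
  return __longlong_as_double(old);
}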

/*******************************************************************************
* \brief Internal routine for initializing the cuda backend.
* \brief Internal routine for initializing the gpu backend.
* \author Ole Schuett
******************************************************************************/
void dbm_multiply_cuda_start(const int max_batch_size, const int nshards,
dbm_shard_t *shards_c_host,
dbm_multiply_cuda_context_t *ctx) {
void dbm_multiply_gpu_start(const int max_batch_size, const int nshards,
dbm_shard_t *shards_c_host,
dbm_multiply_gpu_context_t *ctx) {
// Select GPU device.
offload_activate_chosen_device();

ctx->nshards = nshards;
ctx->shards_c_host = shards_c_host;
ctx->max_batch_size = max_batch_size;
CHECK(cudaStreamCreate(&ctx->main_stream));
offloadStreamCreate(&ctx->main_stream);

// Allocate device storage for batches.
const size_t size = nshards * max_batch_size * sizeof(dbm_task_t);
ctx->batches_dev = (dbm_task_t *)dbm_mempool_device_malloc(size);

// Allocate and upload shards of result matrix C.
ctx->shards_c_dev =
(dbm_shard_cuda_t *)malloc(nshards * sizeof(dbm_shard_cuda_t));
(dbm_shard_gpu_t *)malloc(nshards * sizeof(dbm_shard_gpu_t));
for (int i = 0; i < nshards; i++) {
CHECK(cudaStreamCreate(&ctx->shards_c_dev[i].stream));
offloadStreamCreate(&ctx->shards_c_dev[i].stream);
ctx->shards_c_dev[i].data_size = ctx->shards_c_host[i].data_size;
const size_t size = ctx->shards_c_dev[i].data_size * sizeof(double);
ctx->shards_c_dev[i].data = (double *)dbm_mempool_device_malloc(size);
CHECK(cudaMemcpyAsync(ctx->shards_c_dev[i].data, ctx->shards_c_host[i].data,
size, cudaMemcpyHostToDevice,
ctx->shards_c_dev[i].stream));
offloadMemcpyAsyncHtoD(ctx->shards_c_dev[i].data,
ctx->shards_c_host[i].data, size,
ctx->shards_c_dev[i].stream);
}
}

@@ -89,44 +78,43 @@ void dbm_multiply_cuda_start(const int max_batch_size, const int nshards,
* \author Ole Schuett
******************************************************************************/
static void upload_pack(const dbm_pack_t *pack_host, dbm_pack_t *pack_dev,
const cudaStream_t stream) {
const offloadStream_t stream) {

const size_t size = pack_host->data_size * sizeof(double);
if (pack_dev->data_size < pack_host->data_size) {
dbm_mempool_free(pack_dev->data);
pack_dev->data = (double *)dbm_mempool_device_malloc(size);
}
CHECK(cudaMemcpyAsync(pack_dev->data, pack_host->data, size,
cudaMemcpyHostToDevice, stream));
offloadMemcpyAsyncHtoD(pack_dev->data, pack_host->data, size, stream);
}

/*******************************************************************************
* \brief Internal routine for uploading newly arrived packs onto the device.
* \author Ole Schuett
******************************************************************************/
void dbm_multiply_cuda_upload_packs(const dbm_pack_t *pack_a,
const dbm_pack_t *pack_b,
dbm_multiply_cuda_context_t *ctx) {
void dbm_multiply_gpu_upload_packs(const dbm_pack_t *pack_a,
const dbm_pack_t *pack_b,
dbm_multiply_gpu_context_t *ctx) {
// Select GPU device.
offload_activate_chosen_device();

// Wait for all c-streams to complete before overwriting old packs.
cudaEvent_t event;
CHECK(cudaEventCreate(&event));
offloadEvent_t event;
offloadEventCreate(&event);
for (int i = 0; i < ctx->nshards; i++) {
CHECK(cudaEventRecord(event, ctx->shards_c_dev[i].stream))
CHECK(cudaStreamWaitEvent(ctx->main_stream, event, 0));
offloadEventRecord(event, ctx->shards_c_dev[i].stream);
offloadStreamWaitEvent(ctx->main_stream, event, 0);
}

upload_pack(pack_a, &ctx->pack_a_dev, ctx->main_stream);
upload_pack(pack_b, &ctx->pack_b_dev, ctx->main_stream);

// Have all c-streams wait until new packs are uploaded.
CHECK(cudaEventRecord(event, ctx->main_stream))
offloadEventRecord(event, ctx->main_stream);
for (int i = 0; i < ctx->nshards; i++) {
CHECK(cudaStreamWaitEvent(ctx->shards_c_dev[i].stream, event, 0));
offloadStreamWaitEvent(ctx->shards_c_dev[i].stream, event, 0);
}
CHECK(cudaEventDestroy(event));
offloadEventDestroy(event);
}
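The upload routine above uses a single event to order work across streams: each shard stream signals the main stream before the old packs may be overwritten, and the main stream then signals the shard streams once the new packs are in place. A stripped-down sketch of that record-and-wait pattern in plain CUDA (error checking omitted for brevity):

#include <cuda_runtime.h>

// Make `consumer` wait, on the GPU and without blocking the host, until all
// work currently enqueued on `producer` has completed.
static void stream_barrier(const cudaStream_t producer,
                           const cudaStream_t consumer) {
  cudaEvent_t event;
  cudaEventCreate(&event);
  cudaEventRecord(event, producer);        // marks the current tail of producer
  cudaStreamWaitEvent(consumer, event, 0); // consumer waits for that point
  cudaEventDestroy(event);                 // safe: the wait is already enqueued
}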

/*******************************************************************************
@@ -163,10 +151,10 @@ process_batch_kernel(const bool transa, const bool transb, const double alpha,
* \brief Internal routine for executing the tasks in given batch on the GPU.
* \author Ole Schuett
******************************************************************************/
void dbm_multiply_cuda_process_batch(const int ntasks, const dbm_task_t *batch,
const bool transa, const bool transb,
const double alpha, const int kshard,
dbm_multiply_cuda_context_t *ctx) {
void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,
const bool transa, const bool transb,
const double alpha, const int kshard,
dbm_multiply_gpu_context_t *ctx) {
if (ntasks == 0) {
return; // Nothing to do.
}
@@ -175,16 +163,15 @@ void dbm_multiply_cuda_process_batch(const int ntasks, const dbm_task_t *batch,
offload_activate_chosen_device();

const dbm_shard_t *shard_c_host = &ctx->shards_c_host[kshard];
dbm_shard_cuda_t *shard_c_dev = &ctx->shards_c_dev[kshard];
dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[kshard];

// Upload new batch.
dbm_task_t *batch_dev = &ctx->batches_dev[kshard * ctx->max_batch_size];
const size_t size = ntasks * sizeof(dbm_task_t);
CHECK(cudaMemcpyAsync(batch_dev, batch, size, cudaMemcpyHostToDevice,
shard_c_dev->stream));
cudaEvent_t batch_uploaded;
CHECK(cudaEventCreate(&batch_uploaded));
CHECK(cudaEventRecord(batch_uploaded, shard_c_dev->stream));
offloadMemcpyAsyncHtoD(batch_dev, batch, size, shard_c_dev->stream);
offloadEvent_t batch_uploaded;
offloadEventCreate(&batch_uploaded);
offloadEventRecord(batch_uploaded, shard_c_dev->stream);

// Grow shard_c_dev->data if necessary.
if (shard_c_dev->data_size != shard_c_host->data_promised) {
@@ -194,12 +181,12 @@ void dbm_multiply_cuda_process_batch(const int ntasks, const dbm_task_t *batch,
shard_c_dev->data_size = shard_c_host->data_promised;
const size_t new_size = shard_c_dev->data_size * sizeof(double);
shard_c_dev->data = (double *)dbm_mempool_device_malloc(new_size);
CHECK(cudaMemsetAsync(shard_c_dev->data, 0, new_size,
shard_c_dev->stream)); // TODO: zero only tail
CHECK(cudaMemcpyAsync(shard_c_dev->data, old_data_dev, old_size,
cudaMemcpyDeviceToDevice, shard_c_dev->stream));
offloadMemsetAsync(shard_c_dev->data, 0, new_size,
shard_c_dev->stream); // TODO: zero only tail
offloadMemcpyAsyncDtoD(shard_c_dev->data, old_data_dev, old_size,
shard_c_dev->stream);
// Wait for copy to complete before freeing old buffer.
CHECK(cudaStreamSynchronize(shard_c_dev->stream));
offloadStreamSynchronize(shard_c_dev->stream);
dbm_mempool_free(old_data_dev);
}

@@ -211,18 +198,18 @@ void dbm_multiply_cuda_process_batch(const int ntasks, const dbm_task_t *batch,
shard_c_dev->stream>>>(
transa, transb, alpha, batch_dev, ctx->pack_a_dev.data,
ctx->pack_b_dev.data, shard_c_dev->data);
CHECK(cudaGetLastError());
OFFLOAD_CHECK(offloadGetLastError());

// Wait for batch to be uploaded before refilling it.
CHECK(cudaEventSynchronize(batch_uploaded));
CHECK(cudaEventDestroy(batch_uploaded));
offloadEventSynchronize(batch_uploaded);
offloadEventDestroy(batch_uploaded);
}
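The growth path above allocates a larger device buffer, zeroes it, copies the old contents over, and frees the old allocation only after the copy has finished. The same idea in plain CUDA, using cudaMalloc/cudaFree instead of the dbm mempool (a sketch, error checking omitted):

#include <cuda_runtime.h>

// Grow a device buffer from old_count to new_count doubles, preserving its
// contents; the whole new buffer is zeroed first, mirroring the TODO above.
static double *grow_device_buffer(double *old_data, const size_t old_count,
                                  const size_t new_count, cudaStream_t stream) {
  double *new_data = NULL;
  cudaMalloc((void **)&new_data, new_count * sizeof(double));
  cudaMemsetAsync(new_data, 0, new_count * sizeof(double), stream);
  cudaMemcpyAsync(new_data, old_data, old_count * sizeof(double),
                  cudaMemcpyDeviceToDevice, stream);
  cudaStreamSynchronize(stream); // wait before freeing the old buffer
  cudaFree(old_data);
  return new_data;
}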

/*******************************************************************************
* \brief Internal routine for downloading results from the device.
* \author Ole Schuett
******************************************************************************/
void dbm_multiply_cuda_download_results(dbm_multiply_cuda_context_t *ctx) {
void dbm_multiply_gpu_download_results(dbm_multiply_gpu_context_t *ctx) {
// Select GPU device.
offload_activate_chosen_device();

@@ -233,36 +220,36 @@ void dbm_multiply_cuda_download_results(dbm_multiply_cuda_context_t *ctx) {
dbm_shard_allocate_promised_blocks(shard_c_host);

// Download results from device.
dbm_shard_cuda_t *shard_c_dev = &ctx->shards_c_dev[i];
dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
assert(shard_c_host->data_size == shard_c_dev->data_size);
const size_t size = shard_c_dev->data_size * sizeof(double);
CHECK(cudaMemcpyAsync(shard_c_host->data, shard_c_dev->data, size,
cudaMemcpyDeviceToHost, shard_c_dev->stream));
offloadMemcpyAsyncDtoH(shard_c_host->data, shard_c_dev->data, size,
shard_c_dev->stream);
}
}

/*******************************************************************************
* \brief Internal routine for shutting down the cuda backend.
* \brief Internal routine for shutting down the gpu backend.
* \author Ole Schuett
******************************************************************************/
void dbm_multiply_cuda_stop(dbm_multiply_cuda_context_t *ctx) {
void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx) {
// Select GPU device.
offload_activate_chosen_device();

// Wait for completion, then free cuda resources.
// Wait for completion, then free gpu resources.
#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < ctx->nshards; i++) {
dbm_shard_cuda_t *shard_c_dev = &ctx->shards_c_dev[i];
CHECK(cudaStreamSynchronize(shard_c_dev->stream));
CHECK(cudaStreamDestroy(shard_c_dev->stream));
dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
offloadStreamSynchronize(shard_c_dev->stream);
offloadStreamDestroy(shard_c_dev->stream);
dbm_mempool_free(shard_c_dev->data);
}
free(ctx->shards_c_dev);

dbm_mempool_free(ctx->pack_a_dev.data);
dbm_mempool_free(ctx->pack_b_dev.data);
dbm_mempool_free(ctx->batches_dev);
CHECK(cudaStreamDestroy(ctx->main_stream));
offloadStreamDestroy(ctx->main_stream);
}

#endif // __DBM_CUDA
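Read together with the backend_* wrappers in dbm_multiply.c above, one multiplication drives the renamed GPU routines in roughly the following order. This is a simplified sketch that would live next to those wrappers; the real driver loops over communicated packs and per-shard task batches, which are elided here as comments.

// Simplified ordering sketch (not the actual driver in dbm_multiply.c).
static void multiply_backend_sketch(const dbm_matrix_t *matrix_c) {
  backend_context_t *ctx = backend_start(matrix_c); // dbm_multiply_gpu_start
  // for each pair of packs arriving from communication:
  //   backend_upload_packs(pack_a, pack_b, ctx);    -> dbm_multiply_gpu_upload_packs
  //   for each batch assembled for a shard of C:
  //     backend_process_batch(...);                 -> dbm_multiply_gpu_process_batch
  backend_download_results(ctx); // dbm_multiply_gpu_download_results
  backend_stop(ctx);             // dbm_multiply_gpu_stop
}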