DBM: Speedup GPU kernel for tall and skinny blocks
oschuett committed Jul 3, 2022
1 parent b1c14e6 commit 30093aa
Showing 2 changed files with 135 additions and 38 deletions.
src/dbm/dbm_miniapp.c (4 changes: 3 additions & 1 deletion)
@@ -201,7 +201,6 @@ int main(int argc, char *argv[]) {
   }
 
   bechmark_multiply(4, 4, 4, comm);
-
   bechmark_multiply(128, 4, 4, comm);
   bechmark_multiply(4, 128, 4, comm);
   bechmark_multiply(4, 4, 128, comm);
@@ -210,6 +209,9 @@ int main(int argc, char *argv[]) {
   bechmark_multiply(128, 128, 4, comm);
   bechmark_multiply(128, 128, 128, comm);
 
+  bechmark_multiply(23, 23, 23, comm);
+  bechmark_multiply(32, 32, 32, comm);
+
   dbm_library_print_stats(dbm_mpi_comm_c2f(comm), &print_func, 0);
   dbm_library_finalize();
   dbm_mpi_comm_free(&comm);
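A note on the new benchmark shapes: 23x23x23 and 32x32x32 fall through to the generic 16x16 tile of the reworked kernel, while the existing 128x4x4-style shapes exercise the new tall-and-skinny code paths added in dbm_multiply_gpu.cu below. Each process_task<M, N> instantiation stages a K x M tile of A and a K x N tile of B in shared memory, with K = NUM_THREADS / max(M, N), so both tiles always fit into the fixed 2 * NUM_THREADS doubles of shared memory. The host-side sketch below simply mirrors the kernel's dispatch ladder to show which instantiation a given block shape selects; choose_tile is a hypothetical helper for illustration only, not part of the commit.

#include <stdio.h>

/* Hypothetical helper mirroring the if/else dispatch in process_batch_kernel
 * (see dbm_multiply_gpu.cu below); for illustration only. */
static void choose_tile(const int m, const int n, int *M, int *N) {
  if (m <= 4 && n <= 4)       { *M = 4;  *N = 4;  }
  else if (m <= 4 && n <= 8)  { *M = 4;  *N = 8;  }
  else if (m <= 4 && n <= 16) { *M = 4;  *N = 16; }
  else if (m <= 4 && n <= 32) { *M = 4;  *N = 32; }
  else if (m <= 4)            { *M = 4;  *N = 64; }
  else if (m <= 8 && n <= 4)  { *M = 8;  *N = 4;  }
  else if (m <= 16 && n <= 4) { *M = 16; *N = 4;  }
  else if (m <= 32 && n <= 4) { *M = 32; *N = 4;  }
  else if (n <= 4)            { *M = 64; *N = 4;  }
  else if (m <= 8 && n <= 8)  { *M = 8;  *N = 8;  }
  else if (m <= 8 && n <= 16) { *M = 8;  *N = 16; }
  else if (m <= 8)            { *M = 8;  *N = 32; }
  else if (m <= 16 && n <= 8) { *M = 16; *N = 8;  }
  else if (n <= 8)            { *M = 32; *N = 8;  }
  else                        { *M = 16; *N = 16; }
}

int main(void) {
  const int shapes[][2] = {{128, 4}, {4, 128}, {23, 23}, {32, 32}};
  for (int s = 0; s < 4; s++) {
    int M, N;
    choose_tile(shapes[s][0], shapes[s][1], &M, &N);
    /* K is the depth of the shared-memory tiles: 256 threads / max(M, N). */
    const int K = 256 / (M > N ? M : N);
    printf("m=%3d n=%3d -> process_task<%d, %d>, K=%d\n",
           shapes[s][0], shapes[s][1], M, N, K);
  }
  return 0;
}

Compiled as plain C, this prints, for example, "m=128 n=  4 -> process_task<64, 4>, K=4" and "m= 23 n= 23 -> process_task<16, 16>, K=16".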
src/dbm/dbm_multiply_gpu.cu (169 changes: 132 additions & 37 deletions)
@@ -17,6 +17,8 @@
 #include <assert.h>
 #include <stdio.h>
 
+#define NUM_THREADS 256
+
 /*******************************************************************************
  * \brief Atomic add for doubles that also works prior to compute capability 6.
  * \author Ole Schuett
@@ -43,65 +45,158 @@ __device__ static void atomicAddDouble(double *address, double val) {
 #endif
 }
 
-#define TILE_DIM 8
-
 /*******************************************************************************
- * \brief A generic matrix multiplication kernel.
+ * \brief Returns the larger of two given integers (missing from the C standard)
  * \author Ole Schuett
  ******************************************************************************/
-__global__ static void process_batch_kernel(const double alpha,
-                                            const dbm_task_t *batch,
-                                            const double *pack_a_data,
-                                            const double *pack_b_data,
-                                            double *shard_c_data) {
-
-  __shared__ double tile_a[TILE_DIM][TILE_DIM], tile_b[TILE_DIM][TILE_DIM];
-
-  const dbm_task_t task = batch[blockIdx.x];
-  const double *block_a = &pack_a_data[task.offset_a];
-  const double *block_b = &pack_b_data[task.offset_b];
-  double *block_c = &shard_c_data[task.offset_c];
+__device__ constexpr static inline int imax(int x, int y) {
+  return (x > y ? x : y);
+}
 
-  for (int i_tile = 0; i_tile < task.m; i_tile += TILE_DIM) {
-    for (int j_tile = 0; j_tile < task.n; j_tile += TILE_DIM) {
+/*******************************************************************************
+ * \brief Processes a task using tile dimensions given by template parameters.
+ * \author Ole Schuett
+ ******************************************************************************/
+template <int M, int N>
+__device__ static void process_task(const int m, const int n, const int k,
+                                    const double alpha, const double *block_a,
+                                    const double *block_b, double *block_c,
+                                    double *shared_mem) {
+
+  constexpr int K = NUM_THREADS / imax(M, N);
+
+  static_assert(K * imax(M, N) == NUM_THREADS, "Wasting threads.");
+  static_assert(M * N <= NUM_THREADS, "Not enough threads to cover tile.");
+
+  double *tile_a = shared_mem;
+  double *tile_b = &shared_mem[K * M];
+
+  // Layout threads to cover the entire tile.
+  // Indices x,y,z mark the position within the tile.
+  const int y = threadIdx.x / M;       // Can exceed N, guaranteed to reach K.
+  const int x = threadIdx.x - y * M;   // Fastest index, does not exceed M.
+  const int xT = threadIdx.x / N;      // Can exceed M, guaranteed to reach K.
+  const int yT = threadIdx.x - xT * N; // Fastest index, does not exceed N.
+
+  // Indices {ijl}_tile mark the beginning of the current tile.
+  for (int i_tile = 0; i_tile < m; i_tile += M) {
+    for (int j_tile = 0; j_tile < n; j_tile += N) {
       double result = 0.0;
-      for (int l_tile = 0; l_tile < task.k; l_tile += TILE_DIM) {
-        // Map indicies to threads such that memory reads are coalesced.
-        const int i = i_tile + threadIdx.x;
-        const int j = j_tile + threadIdx.x; // Different from j in final loop!
-        const int l = l_tile + threadIdx.y;
+      for (int l_tile = 0; l_tile < k; l_tile += K) {

         // Load tile_a from global into shared memory.
-        const int idx_a = l * task.m + i; // transa = "N"
-        const bool load_a = (l < task.k && i < task.m);
-        tile_a[threadIdx.y][threadIdx.x] = (load_a) ? block_a[idx_a] : 0.0;
+        if (x < M && y < K) {
+          const int i = i_tile + x;
+          const int l = l_tile + y;
+          const int idx = l * m + i; // transa = "N"
+          const bool load = (l < k && i < m);
+          tile_a[y * M + x] = (load) ? block_a[idx] : 0.0;
+        }

         // Load tile_b from global into shared memory.
-        const int idx_b = l * task.n + j; // transb = "T"
-        const bool load_b = (l < task.k && j < task.n);
-        tile_b[threadIdx.y][threadIdx.x] = (load_b) ? block_b[idx_b] : 0.0;
+        // Use transposed thread mapping to achieve coalesced memory reads.
+        if (yT < N && xT < K) {
+          const int j = j_tile + yT;
+          const int l = l_tile + xT;
+          const int idx = l * n + j; // transb = "T"
+          const bool load = (l < k && j < n);
+          tile_b[xT * N + yT] = (load) ? block_b[idx] : 0.0;
+        }

         // Multiply tiles from shared memory.
         __syncthreads();
+        if (x < M && y < N) {
 #pragma unroll
-        for (int z = 0; z < TILE_DIM; z++) {
-          result += tile_a[z][threadIdx.x] * tile_b[z][threadIdx.y];
+          for (int z = 0; z < K; z++) {
+            result += tile_a[z * M + x] * tile_b[z * N + y];
+          }
         }
         __syncthreads();
       }

       // Add result tile to block_c in global memory.
-      const int i = i_tile + threadIdx.x;
-      const int j = j_tile + threadIdx.y; // Different from j in inner loop!
-      if (i < task.m && j < task.n) {
-        const int idx_c = j * task.m + i;
-        // Need atomics because other thread blocks might work on same block_c.
-        atomicAddDouble(&block_c[idx_c], alpha * result);
+      if (x < M && y < N) {
+        const int i = i_tile + x;
+        const int j = j_tile + y;
+        if (i < m && j < n) {
+          const int idx = j * m + i;
+          // Need atomics because other thread blocks might work on same C.
+          atomicAddDouble(&block_c[idx], alpha * result);
+        }
       }
     }
   }
 }

+/*******************************************************************************
+ * \brief A generic matrix multiplication kernel.
+ * \author Ole Schuett
+ ******************************************************************************/
+__global__ static void process_batch_kernel(const double alpha,
+                                            const dbm_task_t *batch,
+                                            const double *pack_a_data,
+                                            const double *pack_b_data,
+                                            double *shard_c_data) {
+
+  __shared__ double shared_mem[2 * NUM_THREADS];
+
+  const dbm_task_t task = batch[blockIdx.x];
+
+  const int m = task.m;
+  const int n = task.n;
+  const int k = task.k;
+
+  const double *blk_a = &pack_a_data[task.offset_a];
+  const double *blk_b = &pack_b_data[task.offset_b];
+  double *blk_c = &shard_c_data[task.offset_c];
+
+  if (m <= 4 && n <= 4) {
+    process_task<4, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 4 && n <= 8) {
+    process_task<4, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 4 && n <= 16) {
+    process_task<4, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 4 && n <= 32) {
+    process_task<4, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 4) {
+    process_task<4, 64>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 8 && n <= 4) {
+    process_task<8, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 16 && n <= 4) {
+    process_task<16, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 32 && n <= 4) {
+    process_task<32, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (n <= 4) {
+    process_task<64, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 8 && n <= 8) {
+    process_task<8, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 8 && n <= 16) {
+    process_task<8, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 8) {
+    process_task<8, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 16 && n <= 8) {
+    process_task<16, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (n <= 8) {
+    process_task<32, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else {
+    process_task<16, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+  }
+}
+
 /*******************************************************************************
  * \brief Internal routine for intializing the gpu backend.
  * \author Ole Schuett
@@ -231,7 +326,7 @@ void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,

   // Launch kernel.
   const int nblocks = ntasks; // TODO tune launch parameters.
-  const dim3 threads_per_block(TILE_DIM, TILE_DIM, 1);
+  const int threads_per_block = NUM_THREADS;
   const size_t smem_per_block = 0;
   process_batch_kernel<<<nblocks, threads_per_block, smem_per_block,
                          shard_c_dev->stream>>>(
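For reference, the index formulas in the new process_task (idx = l * m + i for A with transa = "N", idx = l * n + j for B with transb = "T", and idx = j * m + i for C) correspond to the per-task computation below, written as a plain host-side loop. dbm_task_ref is a hypothetical helper for illustration and validation only, not part of the DBM API; comparing it against the kernel output for a few benchmark shapes (e.g. 128x4x4 or 23x23x23) is a quick sanity check of the new tiling.

/* Host-side reference for what one thread block computes per task:
 * C(i,j) += alpha * sum_l A(i,l) * B(j,l), with A stored as block_a[l * m + i]
 * and B stored transposed as block_b[l * n + j], matching the idx formulas in
 * process_task. Hypothetical helper, not part of the DBM API. */
static void dbm_task_ref(const int m, const int n, const int k,
                         const double alpha, const double *block_a,
                         const double *block_b, double *block_c) {
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      double result = 0.0;
      for (int l = 0; l < k; l++) {
        result += block_a[l * m + i] * block_b[l * n + j];
      }
      block_c[j * m + i] += alpha * result; /* same layout as idx = j * m + i */
    }
  }
}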
