DBM: Speedup GPU kernel for tall and skinny blocks
oschuett committed Jul 3, 2022
1 parent b1c14e6 commit 30093aa
Showing 2 changed files with 135 additions and 38 deletions.
src/dbm/dbm_miniapp.c (4 changes: 3 additions & 1 deletion)
@@ -201,7 +201,6 @@ int main(int argc, char *argv[]) {
   }
 
   bechmark_multiply(4, 4, 4, comm);
-
   bechmark_multiply(128, 4, 4, comm);
   bechmark_multiply(4, 128, 4, comm);
   bechmark_multiply(4, 4, 128, comm);
@@ -210,6 +209,9 @@ int main(int argc, char *argv[]) {
   bechmark_multiply(128, 128, 4, comm);
   bechmark_multiply(128, 128, 128, comm);
 
+  bechmark_multiply(23, 23, 23, comm);
+  bechmark_multiply(32, 32, 32, comm);
+
   dbm_library_print_stats(dbm_mpi_comm_c2f(comm), &print_func, 0);
   dbm_library_finalize();
   dbm_mpi_comm_free(&comm);
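A note on the new benchmark shapes: 23x23x23 and 32x32x32 fall through to the generic 16x16 tile of the reworked kernel, while the existing 128x4x4-style shapes exercise the new tall-and-skinny code paths added in dbm_multiply_gpu.cu below. Each process_task<M, N> instantiation stages a K x M tile of A and a K x N tile of B in shared memory, with K = NUM_THREADS / max(M, N), so both tiles always fit into the fixed 2 * NUM_THREADS doubles of shared memory. The host-side sketch below simply mirrors the kernel's dispatch ladder to show which instantiation a given block shape selects; choose_tile is a hypothetical helper for illustration only, not part of the commit.

#include <stdio.h>

/* Hypothetical helper mirroring the if/else dispatch in process_batch_kernel
 * (see dbm_multiply_gpu.cu below); for illustration only. */
static void choose_tile(const int m, const int n, int *M, int *N) {
  if (m <= 4 && n <= 4)       { *M = 4;  *N = 4;  }
  else if (m <= 4 && n <= 8)  { *M = 4;  *N = 8;  }
  else if (m <= 4 && n <= 16) { *M = 4;  *N = 16; }
  else if (m <= 4 && n <= 32) { *M = 4;  *N = 32; }
  else if (m <= 4)            { *M = 4;  *N = 64; }
  else if (m <= 8 && n <= 4)  { *M = 8;  *N = 4;  }
  else if (m <= 16 && n <= 4) { *M = 16; *N = 4;  }
  else if (m <= 32 && n <= 4) { *M = 32; *N = 4;  }
  else if (n <= 4)            { *M = 64; *N = 4;  }
  else if (m <= 8 && n <= 8)  { *M = 8;  *N = 8;  }
  else if (m <= 8 && n <= 16) { *M = 8;  *N = 16; }
  else if (m <= 8)            { *M = 8;  *N = 32; }
  else if (m <= 16 && n <= 8) { *M = 16; *N = 8;  }
  else if (n <= 8)            { *M = 32; *N = 8;  }
  else                        { *M = 16; *N = 16; }
}

int main(void) {
  const int shapes[][2] = {{128, 4}, {4, 128}, {23, 23}, {32, 32}};
  for (int s = 0; s < 4; s++) {
    int M, N;
    choose_tile(shapes[s][0], shapes[s][1], &M, &N);
    /* K is the depth of the shared-memory tiles: 256 threads / max(M, N). */
    const int K = 256 / (M > N ? M : N);
    printf("m=%3d n=%3d -> process_task<%d, %d>, K=%d\n",
           shapes[s][0], shapes[s][1], M, N, K);
  }
  return 0;
}

Compiled as plain C, this prints, for example, "m=128 n=  4 -> process_task<64, 4>, K=4" and "m= 23 n= 23 -> process_task<16, 16>, K=16".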
src/dbm/dbm_multiply_gpu.cu (169 changes: 132 additions & 37 deletions)
@@ -17,6 +17,8 @@
 #include <assert.h>
 #include <stdio.h>
 
+#define NUM_THREADS 256
+
 /*******************************************************************************
  * \brief Atomic add for doubles that also works prior to compute capability 6.
  * \author Ole Schuett
@@ -43,65 +45,158 @@ __device__ static void atomicAddDouble(double *address, double val) {
 #endif
 }
 
-#define TILE_DIM 8
-
 /*******************************************************************************
- * \brief A generic matrix multiplication kernel.
+ * \brief Returns the larger of two given integers (missing from the C standard)
  * \author Ole Schuett
  ******************************************************************************/
-__global__ static void process_batch_kernel(const double alpha,
-                                            const dbm_task_t *batch,
-                                            const double *pack_a_data,
-                                            const double *pack_b_data,
-                                            double *shard_c_data) {
-
-  __shared__ double tile_a[TILE_DIM][TILE_DIM], tile_b[TILE_DIM][TILE_DIM];
-
-  const dbm_task_t task = batch[blockIdx.x];
-  const double *block_a = &pack_a_data[task.offset_a];
-  const double *block_b = &pack_b_data[task.offset_b];
-  double *block_c = &shard_c_data[task.offset_c];
+__device__ constexpr static inline int imax(int x, int y) {
+  return (x > y ? x : y);
+}
 
-  for (int i_tile = 0; i_tile < task.m; i_tile += TILE_DIM) {
-    for (int j_tile = 0; j_tile < task.n; j_tile += TILE_DIM) {
+/*******************************************************************************
+ * \brief Processes a task using tile dimensions given by template parameters.
+ * \author Ole Schuett
+ ******************************************************************************/
+template <int M, int N>
+__device__ static void process_task(const int m, const int n, const int k,
+                                    const double alpha, const double *block_a,
+                                    const double *block_b, double *block_c,
+                                    double *shared_mem) {
+
+  constexpr int K = NUM_THREADS / imax(M, N);
+
+  static_assert(K * imax(M, N) == NUM_THREADS, "Wasting threads.");
+  static_assert(M * N <= NUM_THREADS, "Not enough threads to cover tile.");
+
+  double *tile_a = shared_mem;
+  double *tile_b = &shared_mem[K * M];
+
+  // Layout threads to cover the entire tile.
+  // Indices x,y,z mark the position within the tile.
+  const int y = threadIdx.x / M;       // Can exceed N, guaranteed to reach K.
+  const int x = threadIdx.x - y * M;   // Fastest index, does not exceed M.
+  const int xT = threadIdx.x / N;      // Can exceed M, guaranteed to reach K.
+  const int yT = threadIdx.x - xT * N; // Fastest index, does not exceed N.
+
+  // Indices {ijl}_tile mark the beginning of the current tile.
+  for (int i_tile = 0; i_tile < m; i_tile += M) {
+    for (int j_tile = 0; j_tile < n; j_tile += N) {
       double result = 0.0;
-      for (int l_tile = 0; l_tile < task.k; l_tile += TILE_DIM) {
-        // Map indicies to threads such that memory reads are coalesced.
-        const int i = i_tile + threadIdx.x;
-        const int j = j_tile + threadIdx.x; // Different from j in final loop!
-        const int l = l_tile + threadIdx.y;
+      for (int l_tile = 0; l_tile < k; l_tile += K) {

         // Load tile_a from global into shared memory.
-        const int idx_a = l * task.m + i; // transa = "N"
-        const bool load_a = (l < task.k && i < task.m);
-        tile_a[threadIdx.y][threadIdx.x] = (load_a) ? block_a[idx_a] : 0.0;
+        if (x < M && y < K) {
+          const int i = i_tile + x;
+          const int l = l_tile + y;
+          const int idx = l * m + i; // transa = "N"
+          const bool load = (l < k && i < m);
+          tile_a[y * M + x] = (load) ? block_a[idx] : 0.0;
+        }

         // Load tile_b from global into shared memory.
-        const int idx_b = l * task.n + j; // transb = "T"
-        const bool load_b = (l < task.k && j < task.n);
-        tile_b[threadIdx.y][threadIdx.x] = (load_b) ? block_b[idx_b] : 0.0;
+        // Use transposed thread mapping to achieve coalesced memory reads.
+        if (yT < N && xT < K) {
+          const int j = j_tile + yT;
+          const int l = l_tile + xT;
+          const int idx = l * n + j; // transb = "T"
+          const bool load = (l < k && j < n);
+          tile_b[xT * N + yT] = (load) ? block_b[idx] : 0.0;
+        }

         // Multiply tiles from shared memory.
         __syncthreads();
+        if (x < M && y < N) {
 #pragma unroll
-        for (int z = 0; z < TILE_DIM; z++) {
-          result += tile_a[z][threadIdx.x] * tile_b[z][threadIdx.y];
+          for (int z = 0; z < K; z++) {
+            result += tile_a[z * M + x] * tile_b[z * N + y];
+          }
         }
         __syncthreads();
       }

       // Add result tile to block_c in global memory.
-      const int i = i_tile + threadIdx.x;
-      const int j = j_tile + threadIdx.y; // Different from j in inner loop!
-      if (i < task.m && j < task.n) {
-        const int idx_c = j * task.m + i;
-        // Need atomics because other thread blocks might work on same block_c.
-        atomicAddDouble(&block_c[idx_c], alpha * result);
+      if (x < M && y < N) {
+        const int i = i_tile + x;
+        const int j = j_tile + y;
+        if (i < m && j < n) {
+          const int idx = j * m + i;
+          // Need atomics because other thread blocks might work on same C.
+          atomicAddDouble(&block_c[idx], alpha * result);
+        }
       }
     }
   }
 }

+/*******************************************************************************
+ * \brief A generic matrix multiplication kernel.
+ * \author Ole Schuett
+ ******************************************************************************/
+__global__ static void process_batch_kernel(const double alpha,
+                                            const dbm_task_t *batch,
+                                            const double *pack_a_data,
+                                            const double *pack_b_data,
+                                            double *shard_c_data) {
+
+  __shared__ double shared_mem[2 * NUM_THREADS];
+
+  const dbm_task_t task = batch[blockIdx.x];
+
+  const int m = task.m;
+  const int n = task.n;
+  const int k = task.k;
+
+  const double *blk_a = &pack_a_data[task.offset_a];
+  const double *blk_b = &pack_b_data[task.offset_b];
+  double *blk_c = &shard_c_data[task.offset_c];
+
+  if (m <= 4 && n <= 4) {
+    process_task<4, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 4 && n <= 8) {
+    process_task<4, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 4 && n <= 16) {
+    process_task<4, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 4 && n <= 32) {
+    process_task<4, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 4) {
+    process_task<4, 64>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 8 && n <= 4) {
+    process_task<8, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 16 && n <= 4) {
+    process_task<16, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 32 && n <= 4) {
+    process_task<32, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (n <= 4) {
+    process_task<64, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 8 && n <= 8) {
+    process_task<8, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 8 && n <= 16) {
+    process_task<8, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 8) {
+    process_task<8, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (m <= 16 && n <= 8) {
+    process_task<16, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else if (n <= 8) {
+    process_task<32, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+
+  } else {
+    process_task<16, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
+  }
+}
+
 /*******************************************************************************
  * \brief Internal routine for intializing the gpu backend.
  * \author Ole Schuett
@@ -231,7 +326,7 @@ void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,

   // Launch kernel.
   const int nblocks = ntasks; // TODO tune launch parameters.
-  const dim3 threads_per_block(TILE_DIM, TILE_DIM, 1);
+  const int threads_per_block = NUM_THREADS;
   const size_t smem_per_block = 0;
   process_batch_kernel<<<nblocks, threads_per_block, smem_per_block,
                          shard_c_dev->stream>>>(
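For reference, the index formulas in the new process_task (idx = l * m + i for A with transa = "N", idx = l * n + j for B with transb = "T", and idx = j * m + i for C) correspond to the per-task computation below, written as a plain host-side loop. dbm_task_ref is a hypothetical helper for illustration and validation only, not part of the DBM API; comparing it against the kernel output for a few benchmark shapes (e.g. 128x4x4 or 23x23x23) is a quick sanity check of the new tiling.

/* Host-side reference for what one thread block computes per task:
 * C(i,j) += alpha * sum_l A(i,l) * B(j,l), with A stored as block_a[l * m + i]
 * and B stored transposed as block_b[l * n + j], matching the idx formulas in
 * process_task. Hypothetical helper, not part of the DBM API. */
static void dbm_task_ref(const int m, const int n, const int k,
                         const double alpha, const double *block_a,
                         const double *block_b, double *block_c) {
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      double result = 0.0;
      for (int l = 0; l < k; l++) {
        result += block_a[l * m + i] * block_b[l * n + j];
      }
      block_c[j * m + i] += alpha * result; /* same layout as idx = j * m + i */
    }
  }
}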
