DBM: Add contraction-like benchmarks to miniapp
oschuett committed Aug 14, 2022
1 parent 1b908eb commit 615aca1
Showing 2 changed files with 74 additions and 56 deletions.
44 changes: 23 additions & 21 deletions src/dbm/README.md
@@ -69,33 +69,35 @@ $ cd cp2k/src/dbm
$ make -j
$ OMP_NUM_THREADS=32 ./dbm_miniapp.x

- MPI-ranks: 1 MPI-cart: 1 x 1 OpenMP-threads: 32
-
- multiply 4 x 4 x 4 : 0.553 s => 29.0 GFLOP/s
- multiply 128 x 4 x 4 : 0.277 s => 229.6 GFLOP/s
- multiply 4 x 128 x 4 : 0.417 s => 152.2 GFLOP/s
- multiply 4 x 4 x 128 : 0.246 s => 258.0 GFLOP/s
- multiply 4 x 128 x 128 : 1.329 s => 189.6 GFLOP/s
- multiply 128 x 4 x 128 : 0.566 s => 445.3 GFLOP/s
- multiply 128 x 128 x 4 : 6.967 s => 36.2 GFLOP/s
- multiply 128 x 128 x 128 : 1.660 s => 602.1 GFLOP/s
- multiply 23 x 23 x 23 : 5.495 s => 185.0 GFLOP/s
- multiply 32 x 32 x 32 : 4.195 s => 244.1 GFLOP/s
+ OpenMP-threads: 32 GPUs: 1 Libxsmm: n/a MPI-ranks: 1 MPI-cart: 1 x 1
+
+ 16384 x 128 x 128 with 4 x 4 x 4 blocks: 1.621 s => 21.2 GFLOP/s
+ 128 x 16384 x 128 with 4 x 4 x 4 blocks: 1.374 s => 25.0 GFLOP/s
+ 128 x 128 x 16384 with 4 x 4 x 4 blocks: 1.426 s => 24.1 GFLOP/s
+ 60 x 500 x 500 with 128 x 4 x 4 blocks: 0.159 s => 386.7 GFLOP/s
+ 500 x 60 x 500 with 4 x 128 x 4 blocks: 0.191 s => 322.1 GFLOP/s
+ 500 x 500 x 60 with 4 x 4 x 128 blocks: 0.193 s => 317.7 GFLOP/s
+ 500 x 60 x 60 with 4 x 128 x 128 blocks: 0.668 s => 353.2 GFLOP/s
+ 60 x 500 x 60 with 128 x 4 x 128 blocks: 0.351 s => 672.8 GFLOP/s
+ 60 x 60 x 500 with 128 x 128 x 4 blocks: 0.663 s => 355.7 GFLOP/s
+ 60 x 60 x 60 with 128 x 128 x 128 blocks: 0.870 s => 1041.3 GFLOP/s
+ 350 x 350 x 350 with 23 x 23 x 23 blocks: 2.141 s => 487.3 GFLOP/s
+ 250 x 250 x 250 with 32 x 32 x 32 blocks: 0.845 s => 1212.4 GFLOP/s

-------------------------------------------------------------------------------
- -
- DBM STATISTICS -
- -
-------------------------------------------------------------------------------
M x N x K COUNT PERCENT
- ? x ? x ? 125000000 53.21%
- ?? x ?? x ?? 57406923 24.44%
- ? x ? x ??? 15500000 6.60%
- ? x ??? x ? 15500000 6.60%
- ??? x ? x ? 15500000 6.60%
- ? x ??? x ??? 1922000 0.82%
- ??? x ? x ??? 1922000 0.82%
- ??? x ??? x ? 1922000 0.82%
- ??? x ??? x ??? 238328 0.10%
+ ? x ? x ? 805306368 88.07%
+ ?? x ?? x ?? 58500000 6.40%
+ ? x ? x ??? 15000000 1.64%
+ ? x ??? x ? 15000000 1.64%
+ ??? x ? x ? 15000000 1.64%
+ ? x ??? x ??? 1800000 0.20%
+ ??? x ? x ??? 1800000 0.20%
+ ??? x ??? x ? 1800000 0.20%
+ ??? x ??? x ??? 216000 0.02%
-------------------------------------------------------------------------------
```
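With every block reserved (as `reserve_all_blocks` is used below), one multiplication of an M x N x K block grid performs M*N*K small block products, which is where the COUNT column comes from: the ? x ? x ? row, for instance, is the three 4 x 4 x 4-block runs combined. A standalone sketch of that bookkeeping, assuming dense block occupancy (not part of the miniapp):

```c
// Sanity check of the statistics table: with every block reserved, one
// multiplication of an M x N x K block grid performs M*N*K block products.
#include <stdio.h>

int main(void) {
  // The three benchmarks that use 4 x 4 x 4 blocks.
  const long grids[3][3] = {
      {16384, 128, 128}, {128, 16384, 128}, {128, 128, 16384}};
  long count = 0;
  for (int i = 0; i < 3; i++) {
    count += grids[i][0] * grids[i][1] * grids[i][2];
  }
  printf("? x ? x ? count: %ld\n", count); // prints 805306368, as in the table
  return 0;
}
```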
86 changes: 51 additions & 35 deletions src/dbm/dbm_miniapp.c
@@ -12,6 +12,10 @@
#include <stdlib.h>
#include <string.h>

+ #if defined(__LIBXSMM)
+ #include <libxsmm.h>
+ #endif
+
#include "../offload/offload_library.h"
#include "dbm_library.h"
#include "dbm_matrix.h"
@@ -37,41 +41,38 @@ static inline int imin(int x, int y) { return (x < y ? x : y); }
* \brief Private routine for creating a distribution and an empty matrix.
* \author Ole Schuett
******************************************************************************/
- static dbm_matrix_t *create_some_matrix(const int row_size, const int col_size,
+ static dbm_matrix_t *create_some_matrix(const int nrows, const int ncols,
+ const int row_size, const int col_size,
const dbm_mpi_comm_t comm) {
- const int N = 8000;
- const int nrow = imin(500, N / row_size);
- const int ncol = imin(500, N / col_size);
-
int cart_dims[2], cart_periods[2], cart_coords[2];
dbm_mpi_cart_get(comm, 2, cart_dims, cart_periods, cart_coords);

// Create distribution.
- int *row_dist = malloc(nrow * sizeof(int));
- int *col_dist = malloc(ncol * sizeof(int));
- for (int i = 0; i < nrow; i++) {
+ int *row_dist = malloc(nrows * sizeof(int));
+ int *col_dist = malloc(ncols * sizeof(int));
+ for (int i = 0; i < nrows; i++) {
row_dist[i] = i % cart_dims[0];
}
- for (int i = 0; i < ncol; i++) {
+ for (int i = 0; i < ncols; i++) {
col_dist[i] = i % cart_dims[1];
}
const int fortran_comm = dbm_mpi_comm_c2f(comm);
dbm_distribution_t *dist = NULL;
- dbm_distribution_new(&dist, fortran_comm, nrow, ncol, row_dist, col_dist);
+ dbm_distribution_new(&dist, fortran_comm, nrows, ncols, row_dist, col_dist);
free(row_dist);
free(col_dist);

// Create matrix.
- int *row_sizes = malloc(nrow * sizeof(int));
- int *col_sizes = malloc(ncol * sizeof(int));
- for (int i = 0; i < nrow; i++) {
+ int *row_sizes = malloc(nrows * sizeof(int));
+ int *col_sizes = malloc(ncols * sizeof(int));
+ for (int i = 0; i < nrows; i++) {
row_sizes[i] = row_size;
}
- for (int i = 0; i < ncol; i++) {
+ for (int i = 0; i < ncols; i++) {
col_sizes[i] = col_size;
}
dbm_matrix_t *matrix = NULL;
- dbm_create(&matrix, dist, "some name", nrow, ncol, row_sizes, col_sizes);
+ dbm_create(&matrix, dist, "some name", nrows, ncols, row_sizes, col_sizes);
dbm_distribution_release(dist);
free(row_sizes);
free(col_sizes);
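For context, the matrix dimensions used to be derived from the block size alone via the removed `imin(500, 8000 / row_size)` heuristic, so the block-grid shape could not be chosen independently of the block size; the explicit nrows/ncols parameters are what allow the tall-skinny grids in the new benchmark list. A standalone sketch of that old rule, assuming the removed constants above (not part of the miniapp):

```c
// What the removed heuristic produced for the block sizes used in the old
// benchmarks: 4 -> 500, 23 -> 347, 32 -> 250, 128 -> 62.
#include <stdio.h>

static inline int imin(int x, int y) { return (x < y ? x : y); }

int main(void) {
  const int N = 8000; // old hard-coded target for grid_dim * block_size
  const int block_sizes[] = {4, 23, 32, 128};
  for (int i = 0; i < 4; i++) {
    const int bs = block_sizes[i];
    printf("block size %3d -> old grid dimension %d\n", bs, imin(500, N / bs));
  }
  return 0;
}
```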
@@ -146,11 +147,11 @@ static void set_all_blocks(dbm_matrix_t *matrix) {
* \brief Run a benchmark of dbm_multiply with given block sizes.
* \author Ole Schuett
******************************************************************************/
- void benchmark_multiply(const int m, const int n, const int k,
- const dbm_mpi_comm_t comm) {
- dbm_matrix_t *matrix_a = create_some_matrix(m, k, comm);
- dbm_matrix_t *matrix_b = create_some_matrix(k, n, comm);
- dbm_matrix_t *matrix_c = create_some_matrix(m, n, comm);
+ void benchmark_multiply(const int M, const int N, const int K, const int m,
+ const int n, const int k, const dbm_mpi_comm_t comm) {
+ dbm_matrix_t *matrix_a = create_some_matrix(M, K, m, k, comm);
+ dbm_matrix_t *matrix_b = create_some_matrix(K, N, k, n, comm);
+ dbm_matrix_t *matrix_c = create_some_matrix(M, N, m, n, comm);
reserve_all_blocks(matrix_a);
reserve_all_blocks(matrix_b);
set_all_blocks(matrix_a);
@@ -169,8 +170,9 @@ void benchmark_multiply(const int m, const int n, const int k,
dbm_mpi_sum_int64(&flop, 1, comm);
if (dbm_mpi_comm_rank(comm) == 0) {
const double duration = time_end_multiply - time_start_multiply;
- printf("multiply %3i x %3i x %3i : %6.3f s => %5.1f GFLOP/s \n", m,
- n, k, duration, 1e-9 * flop / duration);
+ printf("%5i x %5i x %5i with %3i x %3i x %3i blocks: %6.3f s => %6.1f "
+ "GFLOP/s\n",
+ M, N, K, m, n, k, duration, 1e-9 * flop / duration);
fflush(stdout);
}
}
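The reported GFLOP/s corresponds to 2*(M*m)*(N*n)*(K*k) floating-point operations per multiplication when all blocks are dense. A standalone back-of-the-envelope check against the 60 x 60 x 60 run with 128 x 128 x 128 blocks from the README output above, assuming that dense-block operation count (not part of the miniapp):

```c
// Reproduces one GFLOP/s figure from the README output, assuming all blocks
// are reserved and treated as dense.
#include <stdio.h>

int main(void) {
  const double M = 60, N = 60, K = 60;    // block grid dimensions
  const double m = 128, n = 128, k = 128; // block sizes
  const double seconds = 0.870;           // measured time from the README
  const double flop = 2.0 * (M * m) * (N * n) * (K * k);
  printf("%.1f GFLOP/s\n", 1e-9 * flop / seconds); // ~1041.3, as reported
  return 0;
}
```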
@@ -199,22 +201,36 @@ int main(int argc, char *argv[]) {
dbm_mpi_cart_create(world_comm, 2, dims, periods, false);

if (my_rank == 0) {
- printf("MPI-ranks: %i MPI-cart: %i x %i OpenMP-threads: %i\n\n", nranks,
- dims[0], dims[1], omp_get_max_threads());
+ printf("OpenMP-threads: %i GPUs: %i", omp_get_max_threads(),
+ imin(offload_get_device_count(), nranks));
+ #if defined(__LIBXSMM)
+ printf(" Libxsmm: %s", LIBXSMM_VERSION);
+ #else
+ printf(" Libxsmm: n/a");
+ #endif
+ #if defined(__parallel)
+ printf(" MPI-ranks: %i MPI-cart: %i x %i", nranks, dims[0], dims[1]);
+ #else
+ printf(" MPI: n/a");
+ #endif
+ printf("\n\n");
fflush(stdout);
}

- benchmark_multiply(4, 4, 4, comm);
- benchmark_multiply(128, 4, 4, comm);
- benchmark_multiply(4, 128, 4, comm);
- benchmark_multiply(4, 4, 128, comm);
- benchmark_multiply(4, 128, 128, comm);
- benchmark_multiply(128, 4, 128, comm);
- benchmark_multiply(128, 128, 4, comm);
- benchmark_multiply(128, 128, 128, comm);
-
- benchmark_multiply(23, 23, 23, comm);
- benchmark_multiply(32, 32, 32, comm);
+ benchmark_multiply(16384, 128, 128, 4, 4, 4, comm);
+ benchmark_multiply(128, 16384, 128, 4, 4, 4, comm);
+ benchmark_multiply(128, 128, 16384, 4, 4, 4, comm);
+
+ benchmark_multiply(60, 500, 500, 128, 4, 4, comm);
+ benchmark_multiply(500, 60, 500, 4, 128, 4, comm);
+ benchmark_multiply(500, 500, 60, 4, 4, 128, comm);
+ benchmark_multiply(500, 60, 60, 4, 128, 128, comm);
+ benchmark_multiply(60, 500, 60, 128, 4, 128, comm);
+ benchmark_multiply(60, 60, 500, 128, 128, 4, comm);
+ benchmark_multiply(60, 60, 60, 128, 128, 128, comm);
+
+ benchmark_multiply(350, 350, 350, 23, 23, 23, comm);
+ benchmark_multiply(250, 250, 250, 32, 32, 32, comm);

dbm_library_print_stats(dbm_mpi_comm_c2f(comm), &print_func, my_rank);
dbm_library_finalize();
