Skip to content

Commit

Permalink
DBM: Turn miniapp into useful benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
oschuett committed Jun 19, 2022
1 parent 982fd8e commit beca748
Showing 1 changed file with 88 additions and 61 deletions.
149 changes: 88 additions & 61 deletions src/dbm/dbm_miniapp.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,28 @@ static void print_func(char *message, int output_unit) {
printf("%s", message);
}

/*******************************************************************************
* \brief Returns the smaller of two given integer (missing from the C standard)
* \author Ole Schuett
******************************************************************************/
static inline int imin(int x, int y) { return (x < y ? x : y); }

/*******************************************************************************
* \brief Private routine for creating a distribution and an empty matrix.
* \author Ole Schuett
******************************************************************************/
static dbm_matrix_t *create_some_matrix(const dbm_mpi_comm_t comm) {
const int nrow = 200;
const int ncol = 200;
static dbm_matrix_t *create_some_matrix(const int row_size, const int col_size,
const dbm_mpi_comm_t comm) {
const int N = 8000;
const int nrow = imin(500, N / row_size);
const int ncol = imin(500, N / col_size);

int cart_dims[2], cart_periods[2], cart_coords[2];
dbm_mpi_cart_get(comm, 2, cart_dims, cart_periods, cart_coords);

// Create distribution.
int row_dist[nrow];
int col_dist[ncol];
int *row_dist = malloc(nrow * sizeof(int));
int *col_dist = malloc(ncol * sizeof(int));
for (int i = 0; i < nrow; i++) {
row_dist[i] = i % cart_dims[0];
}
Expand All @@ -49,19 +57,23 @@ static dbm_matrix_t *create_some_matrix(const dbm_mpi_comm_t comm) {
const int fortran_comm = dbm_mpi_comm_c2f(comm);
dbm_distribution_t *dist = NULL;
dbm_distribution_new(&dist, fortran_comm, nrow, ncol, row_dist, col_dist);
free(row_dist);
free(col_dist);

// Create matrix.
int row_sizes[nrow];
int col_sizes[ncol];
int *row_sizes = malloc(nrow * sizeof(int));
int *col_sizes = malloc(ncol * sizeof(int));
for (int i = 0; i < nrow; i++) {
row_sizes[i] = 18;
row_sizes[i] = row_size;
}
for (int i = 0; i < ncol; i++) {
col_sizes[i] = 18;
col_sizes[i] = col_size;
}
dbm_matrix_t *matrix = NULL;
dbm_create(&matrix, dist, "some name", nrow, ncol, row_sizes, col_sizes);
dbm_distribution_release(dist);
free(row_sizes);
free(col_sizes);
return matrix;
}

Expand All @@ -77,22 +89,34 @@ static void reserve_all_blocks(dbm_matrix_t *matrix) {

#pragma omp parallel
{
const int nblocks = nrows * ncols;
int reserve_row[nblocks / omp_get_num_threads() + 1];
int reserve_col[nblocks / omp_get_num_threads() + 1];
int nblocks_reserve = 0;
#pragma omp for
for (int i = 0; i < nblocks; i++) {
const int row = i % nrows;
const int col = i % ncols;
if (dbm_get_stored_coordinates(matrix, row, col) ==
matrix->dist->my_rank) {
reserve_row[nblocks_reserve] = row;
reserve_col[nblocks_reserve] = col;
nblocks_reserve++;
int nblocks = 0;
#pragma omp for collapse(2)
for (int row = 0; row < nrows; row++) {
for (int col = 0; col < ncols; col++) {
if (dbm_get_stored_coordinates(matrix, row, col) ==
matrix->dist->my_rank) {
nblocks++;
}
}
}
int *reserve_row = malloc(nblocks * sizeof(int));
int *reserve_col = malloc(nblocks * sizeof(int));
int iblock = 0;
#pragma omp for collapse(2)
for (int row = 0; row < nrows; row++) {
for (int col = 0; col < ncols; col++) {
if (dbm_get_stored_coordinates(matrix, row, col) ==
matrix->dist->my_rank) {
reserve_row[iblock] = row;
reserve_col[iblock] = col;
iblock++;
}
}
}
dbm_reserve_blocks(matrix, nblocks_reserve, reserve_row, reserve_col);
assert(iblock == nblocks);
dbm_reserve_blocks(matrix, nblocks, reserve_row, reserve_col);
free(reserve_row);
free(reserve_col);
}
}

Expand All @@ -117,21 +141,52 @@ static void set_all_blocks(dbm_matrix_t *matrix) {
dbm_iterator_stop(iter);
}
}
/*******************************************************************************
* \brief Run a benchmark of dbm_multiply with given block sizes.
* \author Ole Schuett
******************************************************************************/
void bechmark_multiply(const int m, const int n, const int k,
const dbm_mpi_comm_t comm) {
dbm_matrix_t *matrix_a = create_some_matrix(m, k, comm);
dbm_matrix_t *matrix_b = create_some_matrix(k, n, comm);
dbm_matrix_t *matrix_c = create_some_matrix(m, n, comm);
reserve_all_blocks(matrix_a);
reserve_all_blocks(matrix_b);
set_all_blocks(matrix_a);
set_all_blocks(matrix_b);

int64_t flop = 0;
const double time_start_multiply = omp_get_wtime();
dbm_multiply(false, false, 1.0, matrix_a, matrix_b, 1.0, matrix_c, false,
1e-8, &flop);
const double time_end_multiply = omp_get_wtime();

dbm_release(matrix_a);
dbm_release(matrix_b);
dbm_release(matrix_c);

if (dbm_mpi_comm_rank(comm) == 0) {
const double duration = time_end_multiply - time_start_multiply;
printf("multiply %3i x %3i x %3i : %6.3f s => %5.1f GFLOP/s \n", m,
n, k, duration, 1e-9 * flop / duration);
}
}

/*******************************************************************************
* \brief Stand-alone miniapp for smoke-testing and benchmarking dbm_multiply.
* \author Ole Schuett
******************************************************************************/
int main(int argc, char *argv[]) {
dbm_mpi_init(&argc, &argv);
dbm_library_init();

const dbm_mpi_comm_t world_comm = dbm_mpi_get_comm_world();
const int nranks = dbm_mpi_comm_size(world_comm);
const int my_rank = dbm_mpi_comm_rank(world_comm);

if (my_rank == 0) {
printf("MPI ranks: %i\n", nranks);
printf("OpenMP threads: %i\n", omp_get_max_threads());
printf("MPI-ranks: %i OpenMP-threads: %i\n\n", nranks,
omp_get_max_threads());
}

if (offload_get_device_count() > 0) {
Expand All @@ -144,43 +199,15 @@ int main(int argc, char *argv[]) {
dbm_mpi_comm_t comm =
dbm_mpi_cart_create(world_comm, 2, dims, periods, false);

dbm_library_init();

// Benchmark reserve blocks.
const double time_start_reserve = omp_get_wtime();
for (int icycle = 0; icycle < 50; icycle++) {
dbm_matrix_t *matrix = create_some_matrix(comm);
reserve_all_blocks(matrix);
dbm_release(matrix);
}
const double time_end_reserve = omp_get_wtime();
if (my_rank == 0) {
const double t = time_end_reserve - time_start_reserve;
printf("reserve blocks: %.3f seconds\n", t);
}

// Benchmark matrix multiply.
dbm_matrix_t *matrix_a = create_some_matrix(comm);
dbm_matrix_t *matrix_b = create_some_matrix(comm);
dbm_matrix_t *matrix_c = create_some_matrix(comm);
reserve_all_blocks(matrix_a);
reserve_all_blocks(matrix_b);
set_all_blocks(matrix_a);
set_all_blocks(matrix_b);
int64_t flop;
const double time_start_multiply = omp_get_wtime();
dbm_multiply(false, false, 1.0, matrix_a, matrix_b, 1.0, matrix_c, false,
1e-8, &flop);
const double time_end_multiply = omp_get_wtime();
dbm_release(matrix_a);
dbm_release(matrix_b);
dbm_release(matrix_c);
bechmark_multiply(4, 4, 4, comm);

if (my_rank == 0) {
const double t = time_end_multiply - time_start_multiply;
printf("matrix multiply: %.3f s, %.1f MFLOP/s \n", t, 1e-6 * flop / t);
printf("done :-)\n");
}
bechmark_multiply(128, 4, 4, comm);
bechmark_multiply(4, 128, 4, comm);
bechmark_multiply(4, 4, 128, comm);
bechmark_multiply(4, 128, 128, comm);
bechmark_multiply(128, 4, 128, comm);
bechmark_multiply(128, 128, 4, comm);
bechmark_multiply(128, 128, 128, comm);

dbm_library_print_stats(dbm_mpi_comm_c2f(comm), &print_func, 0);
dbm_library_finalize();
Expand Down

0 comments on commit beca748

Please sign in to comment.