From 82a440d156576e601d373b29a74ed0c05b700aa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ole=20Sch=C3=BCtt?= Date: Sun, 9 Jan 2022 22:54:06 +0100 Subject: [PATCH] DBM: Add new Distributed Block-sparse Matrix library --- INSTALL.md | 1 + src/PACKAGE | 1 + src/cp2k_info.F | 3 + src/dbm/LICENSE | 29 + src/dbm/Makefile | 41 + src/dbm/PACKAGE | 5 + src/dbm/README.md | 80 + src/dbm/dbm_api.F | 1474 +++++++++++++++++ src/dbm/dbm_distribution.c | 153 ++ src/dbm/dbm_distribution.h | 83 + src/dbm/dbm_hyperparams.h | 22 + src/dbm/dbm_library.c | 51 + src/dbm/dbm_library.h | 26 + src/dbm/dbm_matrix.c | 664 ++++++++ src/dbm/dbm_matrix.h | 229 +++ src/dbm/dbm_mempool.c | 220 +++ src/dbm/dbm_mempool.h | 46 + src/dbm/dbm_miniapp.c | 184 ++ src/dbm/dbm_mpi.c | 421 +++++ src/dbm/dbm_mpi.h | 180 ++ src/dbm/dbm_multiply.c | 319 ++++ src/dbm/dbm_multiply.h | 37 + src/dbm/dbm_multiply_comm.c | 423 +++++ src/dbm/dbm_multiply_comm.h | 68 + src/dbm/dbm_multiply_cpu.c | 152 ++ src/dbm/dbm_multiply_cpu.h | 28 + src/dbm/dbm_multiply_cuda.cu | 270 +++ src/dbm/dbm_multiply_cuda.h | 94 ++ src/dbm/dbm_multiply_internal.h | 39 + src/dbm/dbm_shard.c | 198 +++ src/dbm/dbm_shard.h | 94 ++ src/f77_interface.F | 6 + src/offload/offload_library.h | 2 +- tools/precommit/check_file_properties.py | 3 +- .../toolchain/scripts/generate_arch_files.sh | 2 +- 35 files changed, 5645 insertions(+), 3 deletions(-) create mode 100644 src/dbm/LICENSE create mode 100644 src/dbm/Makefile create mode 100644 src/dbm/PACKAGE create mode 100644 src/dbm/README.md create mode 100644 src/dbm/dbm_api.F create mode 100644 src/dbm/dbm_distribution.c create mode 100644 src/dbm/dbm_distribution.h create mode 100644 src/dbm/dbm_hyperparams.h create mode 100644 src/dbm/dbm_library.c create mode 100644 src/dbm/dbm_library.h create mode 100644 src/dbm/dbm_matrix.c create mode 100644 src/dbm/dbm_matrix.h create mode 100644 src/dbm/dbm_mempool.c create mode 100644 src/dbm/dbm_mempool.h create mode 100644 src/dbm/dbm_miniapp.c create mode 100644 src/dbm/dbm_mpi.c create mode 100644 src/dbm/dbm_mpi.h create mode 100644 src/dbm/dbm_multiply.c create mode 100644 src/dbm/dbm_multiply.h create mode 100644 src/dbm/dbm_multiply_comm.c create mode 100644 src/dbm/dbm_multiply_comm.h create mode 100644 src/dbm/dbm_multiply_cpu.c create mode 100644 src/dbm/dbm_multiply_cpu.h create mode 100644 src/dbm/dbm_multiply_cuda.cu create mode 100644 src/dbm/dbm_multiply_cuda.h create mode 100644 src/dbm/dbm_multiply_internal.h create mode 100644 src/dbm/dbm_shard.c create mode 100644 src/dbm/dbm_shard.h diff --git a/INSTALL.md b/INSTALL.md index 4c92348692..e6f0447af4 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -199,6 +199,7 @@ the FFTW3 threading library libfftw3_threads (or libfftw3_omp) is required. It requires to link `-lnvToolsExt`. - Link to a blas/scalapack library that accelerates large DGEMMs (e.g. libsci_acc) - Use the `-D__GRID_CUDA` to compile the GPU and HYBRID backends for the grid library. +- Use the `-D__DBM_CUDA` to compile the GPU backend for the sparse tensor library. ### 2k. LIBXC (optional, wider choice of xc functionals) diff --git a/src/PACKAGE b/src/PACKAGE index 760282118a..ef3d6ed35d 100644 --- a/src/PACKAGE +++ b/src/PACKAGE @@ -20,6 +20,7 @@ "dbcsrx", "arnoldi", "grid", + "dbm", ], "implicit": "INIT_METADYN|META_FORCE_CALCULATION|plumed_f_installed|plumed_f_gcreate|plumed_f_gcmd", } diff --git a/src/cp2k_info.F b/src/cp2k_info.F index e1fbe0e0fd..18bf819c47 100644 --- a/src/cp2k_info.F +++ b/src/cp2k_info.F @@ -289,6 +289,9 @@ FUNCTION cp2k_flags() RESULT(flags) #if defined(__GRID_HIP) flags = TRIM(flags)//" grid_hip" #endif +#if defined(__DBM_CUDA) + flags = TRIM(flags)//" dbm_cuda" +#endif #if defined(__OFFLOAD_PROFILING) flags = TRIM(flags)//" offload_profiling" #endif diff --git a/src/dbm/LICENSE b/src/dbm/LICENSE new file mode 100644 index 0000000000..cfbe25b4fb --- /dev/null +++ b/src/dbm/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2021, CP2K developers group +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/dbm/Makefile b/src/dbm/Makefile new file mode 100644 index 0000000000..c205ccb5e5 --- /dev/null +++ b/src/dbm/Makefile @@ -0,0 +1,41 @@ +.PHONY : all clean + +all: dbm_miniapp.x + +clean: + rm -fv *.o */*.o *.x ../offload/*.o + +CFLAGS := -fopenmp -g -O3 -march=native -Wall -Wextra +NVFLAGS := -g -O3 -lineinfo -arch sm_70 -Wno-deprecated-gpu-targets -Xcompiler "$(CFLAGS)" -D__DBM_CUDA +LIBS := -lm -lblas -lstdc++ + +ALL_HEADERS := $(shell find . -name "*.h") $(shell find ../offload/ -name "*.h") +ALL_OBJECTS := ../offload/offload_library.o \ + dbm_distribution.o \ + dbm_library.o \ + dbm_matrix.o \ + dbm_mempool.o \ + dbm_mpi.o \ + dbm_multiply.o \ + dbm_multiply_comm.o \ + dbm_multiply_cpu.o \ + dbm_shard.o + +# Enable Cuda when nvcc compiler is present. +NVCC := $(shell which nvcc) +ifneq ($(NVCC),) +LIBS += -lcudart -lcuda -lcublas -L${CUDA_PATH}/lib64 +CFLAGS += -I${CUDA_PATH}/include -D__DBM_CUDA +ALL_OBJECTS += dbm_multiply_cuda.o + +%.o: %.cu $(ALL_HEADERS) + cd $(dir $<); $(NVCC) -c $(NVFLAGS) $(notdir $<) +endif + +%.o: %.c $(ALL_HEADERS) + cd $(dir $<); $(CC) -c -std=c11 $(CFLAGS) $(notdir $<) + +dbm_miniapp.x: dbm_miniapp.o $(ALL_OBJECTS) + $(CC) $(CFLAGS) -o $@ $^ $(LIBS) + +#EOF diff --git a/src/dbm/PACKAGE b/src/dbm/PACKAGE new file mode 100644 index 0000000000..f40f5bb63b --- /dev/null +++ b/src/dbm/PACKAGE @@ -0,0 +1,5 @@ +{ + "description": "Distributed Block-sparse Matrix", + "requires": ["../base", "../mpiwrap", "../offload"], + "public": ["dbm_api.F"], +} diff --git a/src/dbm/README.md b/src/dbm/README.md new file mode 100644 index 0000000000..ae3b670b2d --- /dev/null +++ b/src/dbm/README.md @@ -0,0 +1,80 @@ +# DBM: Distributed Block-sparse Matrices + +The DBM is a drop-in replacement for [DBCSR](https://github.com/cp2k/dbcsr) +written in C. For the time being only features required by [DBT](../dbt/) are implemented. + +## Storage + +The DBM uses [coordinate lists](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)) +as internal storage format. +An existing block is represented by the following data structure: + +```C +typedef struct { + int row; // zero based + int col; + int offset; + float norm; +} dbm_block_t; +``` + +The `norm` is cached for performance reasons. +A negative value indicates that the norm is invalid and needs to be recomputed. + +To allow for efficient OpenMP parallelism the blocks are +[sharded](https://en.wikipedia.org/wiki/Shard_(database_architecture)) via round-robin: + +```C +const int ishard = row % matrix->nshards; +``` + +## MPI Communication + +The communication scheme in [dbm_multiply_comm.c](./dbm_multiply_comm.c) is decoupled +from the local multiplication in [dbm_multiply.c](./dbm_multiply.c) via the +[iterator pattern](https://en.wikipedia.org/wiki/Iterator_pattern): + +```C +while (dbm_comm_iterator_next(iter, &pack_a, &pack_b)) { + backend_upload_packs(pack_a, pack_b, ctx); + multiply_packs(transa, transb, alpha, pack_a, pack_b, matrix_a, matrix_b, + matrix_c, rows_left_max_eps, flop, ctx); +} +``` + +## Backends + +The last stage of the multiplication are the backends for specific hardware, e.g. +[CPU](./dbm_multiply_cpu.c) and [CUDA](./dbm_multiply_cuda.cu). +They are passed batches of task for processing. Each task describes a single block +multiplication. A simplest backend implementation looks like this: + + +```C +for (int itask = 0; itask < ntasks; itask++) { + const dbm_task_t task = batch[itask]; + const int lda = (transa) ? task.k : task.m; + const int ldb = (transb) ? task.n : task.k; + const int ldc = task.m; + const double *data_a = &pack_a->data[task.offset_a]; + const double *data_b = &pack_b->data[task.offset_b]; + double *data_c = &shard_c->data[task.offset_c]; + dgemm(transa, transb, task.m, task.n, task.k, alpha, data_a, lda, data_b, ldb, 1.0, data_c, ldc); +} +``` + + +## MiniApp + +The `dbm_miniapp.x` binary allows to run a simple smoke test. + +```shell +$ cd cp2k/src/dbm +$ make -j +$ OMP_NUM_THREADS=2 ./dbm_miniapp.x +MPI ranks: 1 +OpenMP threads: 2 +reserve blocks: 0.047 seconds +matrix multiply: 0.001 s, 2.1 MFLOP/s +done :-) +``` diff --git a/src/dbm/dbm_api.F b/src/dbm/dbm_api.F new file mode 100644 index 0000000000..67a6362276 --- /dev/null +++ b/src/dbm/dbm_api.F @@ -0,0 +1,1474 @@ +!--------------------------------------------------------------------------------------------------! +! CP2K: A general program to perform molecular dynamics simulations ! +! Copyright 2000-2022 CP2K developers group ! +! ! +! SPDX-License-Identifier: BSD-3-Clause ! +!--------------------------------------------------------------------------------------------------! + +MODULE dbm_api + USE ISO_C_BINDING, ONLY: & + C_ASSOCIATED, C_BOOL, C_CHAR, C_DOUBLE, C_F_POINTER, C_INT, C_LONG, C_NULL_CHAR, C_NULL_PTR, C_PTR + USE kinds, ONLY: default_string_length, & + dp, & + int_8 + USE message_passing, ONLY: mp_cart_rank, & + mp_environ + +! Uncomment the following line to enable validation. +!#define DBM_VALIDATE_AGAINST_DBCSR +#define DBM_VALIDATE_NBLOCKS_MATCH .TRUE. +#define DBM_VALIDATE_THRESHOLD 5e-10_dp + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + USE dbcsr_block_access, ONLY: dbcsr_get_block_p, & + dbcsr_put_block, & + dbcsr_reserve_blocks + USE dbcsr_dist_methods, ONLY: dbcsr_distribution_col_dist, & + dbcsr_distribution_hold, & + dbcsr_distribution_new, & + dbcsr_distribution_release, & + dbcsr_distribution_row_dist + USE dbcsr_dist_operations, ONLY: dbcsr_get_stored_coordinates + USE dbcsr_dist_util, ONLY: dbcsr_checksum + USE dbcsr_iterator_operations, ONLY: dbcsr_iterator_blocks_left, & + dbcsr_iterator_next_block, & + dbcsr_iterator_start, & + dbcsr_iterator_stop + USE dbcsr_methods, ONLY: dbcsr_col_block_sizes, & + dbcsr_get_num_blocks, & + dbcsr_get_nze, & + dbcsr_mp_release, & + dbcsr_release, & + dbcsr_row_block_sizes + USE dbcsr_mp_methods, ONLY: dbcsr_mp_new + USE dbcsr_multiply_api, ONLY: dbcsr_multiply + USE dbcsr_operations, ONLY: dbcsr_add, & + dbcsr_clear, & + dbcsr_copy, & + dbcsr_filter, & + dbcsr_get_info, & + dbcsr_maxabs, & + dbcsr_scale, & + dbcsr_zero + USE dbcsr_transformations, ONLY: dbcsr_redistribute + USE dbcsr_types, ONLY: dbcsr_distribution_obj, & + dbcsr_iterator, & + dbcsr_mp_obj, & + dbcsr_no_transpose, & + dbcsr_transpose, & + dbcsr_type, & + dbcsr_type_no_symmetry, & + dbcsr_type_real_8 + USE dbcsr_work_operations, ONLY: dbcsr_create, & + dbcsr_finalize + USE dbcsr_data_methods, ONLY: dbcsr_scalar + +#endif + +#include "../base/base_uses.f90" + + IMPLICIT NONE + + PRIVATE + + CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbm_api' + + PUBLIC :: dbm_distribution_obj + PUBLIC :: dbm_distribution_new + PUBLIC :: dbm_distribution_hold + PUBLIC :: dbm_distribution_release + PUBLIC :: dbm_distribution_col_dist + PUBLIC :: dbm_distribution_row_dist + + PUBLIC :: dbm_iterator + PUBLIC :: dbm_iterator_start + PUBLIC :: dbm_iterator_stop + PUBLIC :: dbm_iterator_blocks_left + PUBLIC :: dbm_iterator_next_block + + PUBLIC :: dbm_type + PUBLIC :: dbm_release + PUBLIC :: dbm_create + PUBLIC :: dbm_create_from_template + PUBLIC :: dbm_clear + PUBLIC :: dbm_scale + PUBLIC :: dbm_get_block_p + PUBLIC :: dbm_put_block + PUBLIC :: dbm_reserve_blocks + PUBLIC :: dbm_filter + PUBLIC :: dbm_finalize + PUBLIC :: dbm_multiply + PUBLIC :: dbm_redistribute + PUBLIC :: dbm_copy + PUBLIC :: dbm_add + PUBLIC :: dbm_maxabs + PUBLIC :: dbm_zero + PUBLIC :: dbm_checksum + PUBLIC :: dbm_get_name + PUBLIC :: dbm_get_distribution + PUBLIC :: dbm_get_num_blocks + PUBLIC :: dbm_get_nze + PUBLIC :: dbm_get_stored_coordinates + PUBLIC :: dbm_get_row_block_sizes + PUBLIC :: dbm_get_col_block_sizes + PUBLIC :: dbm_get_local_rows + PUBLIC :: dbm_get_local_cols + + PUBLIC :: dbm_library_init + PUBLIC :: dbm_library_finalize + + TYPE dbm_distribution_obj + PRIVATE + TYPE(C_PTR) :: c_ptr = C_NULL_PTR +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + TYPE(dbcsr_distribution_obj) :: dbcsr +#endif + END TYPE dbm_distribution_obj + + TYPE dbm_type + PRIVATE + TYPE(C_PTR) :: c_ptr = C_NULL_PTR +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + TYPE(dbcsr_type) :: dbcsr +#endif + END TYPE dbm_type + + TYPE dbm_iterator + PRIVATE + TYPE(C_PTR) :: c_ptr = C_NULL_PTR + END TYPE dbm_iterator + +CONTAINS + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) +! ************************************************************************************************** +!> \brief Compates the given DBM matrix against its shadow DBCSR matrics. +!> \param matrix ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE validate(matrix) + TYPE(dbm_type), INTENT(IN) :: matrix + + INTEGER :: col, col_size, col_size_dbcsr, i, j, & + num_blocks, num_blocks_dbcsr, & + num_blocks_diff, row, row_size, & + row_size_dbcsr + INTEGER, ALLOCATABLE, DIMENSION(:) :: local_cols, local_rows + LOGICAL :: transposed + REAL(dp) :: norm2, rel_diff + REAL(dp), DIMENSION(:, :), POINTER :: block, block_dbcsr + TYPE(C_PTR) :: block_c + TYPE(dbcsr_iterator) :: iter + INTERFACE + SUBROUTINE dbm_get_block_p_c(matrix, row, col, block, row_size, col_size) & + BIND(C, name="dbm_get_block_p") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: matrix + INTEGER(kind=C_INT), VALUE :: row + INTEGER(kind=C_INT), VALUE :: col + TYPE(C_PTR) :: block + INTEGER(kind=C_INT) :: row_size + INTEGER(kind=C_INT) :: col_size + END SUBROUTINE dbm_get_block_p_c + END INTERFACE + + ! Call some getters to run their validation code. + CALL dbm_get_local_rows(matrix, local_rows) + CALL dbm_get_local_cols(matrix, local_cols) + + num_blocks_dbcsr = dbcsr_get_num_blocks(matrix%dbcsr) + num_blocks = dbm_get_num_blocks(matrix) + num_blocks_diff = ABS(num_blocks - num_blocks_dbcsr) + IF (num_blocks_diff /= 0) THEN + WRITE (*, *) "num_blocks mismatch dbcsr:", num_blocks_dbcsr, "new:", num_blocks + IF (DBM_VALIDATE_NBLOCKS_MATCH) & + CPABORT("num_blocks mismatch") + END IF + + IF (DBM_VALIDATE_NBLOCKS_MATCH) THEN + CPASSERT(dbm_get_nze(matrix) == dbcsr_get_nze(matrix%dbcsr)) + END IF + + ! check all dbcsr blocks + norm2 = 0.0_dp + CALL dbcsr_iterator_start(iter, matrix%dbcsr) + DO WHILE (dbcsr_iterator_blocks_left(iter)) + CALL dbcsr_iterator_next_block(iter, row=row, column=col, block=block_dbcsr, & + transposed=transposed, & + row_size=row_size_dbcsr, col_size=col_size_dbcsr) + CPASSERT(.NOT. transposed) + CALL dbm_get_block_p_c(matrix=matrix%c_ptr, row=row - 1, col=col - 1, & + block=block_c, row_size=row_size, col_size=col_size) + + CPASSERT(row_size == row_size_dbcsr .AND. col_size == col_size_dbcsr) + IF (SIZE(block_dbcsr) == 0) THEN + CYCLE + END IF + IF (.NOT. C_ASSOCIATED(block_c)) THEN + CPASSERT(MAXVAL(ABS(block_dbcsr)) < DBM_VALIDATE_THRESHOLD) + CYCLE + END IF + + CALL C_F_POINTER(block_c, block, shape=(/row_size, col_size/)) + DO i = 1, row_size + DO j = 1, col_size + rel_diff = ABS(block(i, j) - block_dbcsr(i, j))/MAX(1.0_dp, ABS(block_dbcsr(i, j))) + IF (rel_diff > DBM_VALIDATE_THRESHOLD) THEN + WRITE (*, *) "row:", row, "col:", col, "i:", i, "j:", j, "rel_diff:", rel_diff + WRITE (*, *) "values dbcsr:", block_dbcsr(i, j), "new:", block(i, j) + CPABORT("block value mismatch") + END IF + END DO + END DO + norm2 = norm2 + SUM(block**2) + block_dbcsr(:, :) = block(:, :) ! quench numerical noise + END DO + CALL dbcsr_iterator_stop(iter) + + ! Can not call dbcsr_get_block_p because it's INTENT(INOUT) :-( + + !! At least check that the norm (=checksum) of excesive blocks is small. + !TODO: sum norm2 across all mpi ranks. + !TODO: re-add INTERFACE to dbm_checksum_c, which got removed by prettify. + !rel_diff = ABS(dbm_checksum_c(matrix%c_ptr) - norm2)/MAX(1.0_dp, norm2) + !IF (rel_diff > DBM_VALIDATE_THRESHOLD) THEN + ! WRITE (*, *) "num_blocks dbcsr:", num_blocks_dbcsr, "new:", num_blocks + ! WRITE (*, *) "norm2: ", norm2 + ! WRITE (*, *) "relative residual norm diff: ", rel_diff + ! CPABORT("residual norm diff") + !END IF + END SUBROUTINE validate + +#else + +! ************************************************************************************************** +!> \brief Dummy for when DBM_VALIDATE_AGAINST_DBCSR is not defined. +!> \param matrix ... +! ************************************************************************************************** + SUBROUTINE validate(matrix) + TYPE(dbm_type), INTENT(IN) :: matrix + + MARK_USED(matrix) + END SUBROUTINE validate +#endif + +! ************************************************************************************************** +!> \brief Creates a new matrix from given template, reusing dist and row/col_block_sizes. +!> \param matrix ... +!> \param name ... +!> \param template ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_create_from_template(matrix, name, template) + TYPE(dbm_type), INTENT(INOUT) :: matrix + CHARACTER(len=*), INTENT(IN) :: name + TYPE(dbm_type), INTENT(IN) :: template + + CALL dbm_create(matrix, & + name=name, & + dist=dbm_get_distribution(template), & + row_block_sizes=dbm_get_row_block_sizes(template), & + col_block_sizes=dbm_get_col_block_sizes(template)) + + END SUBROUTINE dbm_create_from_template + +! ************************************************************************************************** +!> \brief Creates a new matrix. +!> \param matrix ... +!> \param name ... +!> \param dist ... +!> \param row_block_sizes ... +!> \param col_block_sizes ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_create(matrix, name, dist, row_block_sizes, col_block_sizes) + TYPE(dbm_type), INTENT(INOUT) :: matrix + CHARACTER(len=*), INTENT(IN) :: name + TYPE(dbm_distribution_obj), INTENT(IN) :: dist + INTEGER, CONTIGUOUS, DIMENSION(:), INTENT(IN), & + POINTER :: row_block_sizes, col_block_sizes + + INTERFACE + SUBROUTINE dbm_create_c(matrix, dist, name, nrows, ncols, row_sizes, col_sizes) & + BIND(C, name="dbm_create") + IMPORT :: C_PTR, C_CHAR, C_INT + TYPE(C_PTR) :: matrix + TYPE(C_PTR), VALUE :: dist + CHARACTER(kind=C_CHAR), DIMENSION(*) :: name + INTEGER(kind=C_INT), VALUE :: nrows + INTEGER(kind=C_INT), VALUE :: ncols + INTEGER(kind=C_INT), DIMENSION(*) :: row_sizes + INTEGER(kind=C_INT), DIMENSION(*) :: col_sizes + END SUBROUTINE dbm_create_c + END INTERFACE + + CPASSERT(.NOT. C_ASSOCIATED(matrix%c_ptr)) + CALL dbm_create_c(matrix=matrix%c_ptr, & + dist=dist%c_ptr, & + name=TRIM(name)//C_NULL_CHAR, & + nrows=SIZE(row_block_sizes), & + ncols=SIZE(col_block_sizes), & + row_sizes=row_block_sizes, & + col_sizes=col_block_sizes) + CPASSERT(C_ASSOCIATED(matrix%c_ptr)) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_create(matrix%dbcsr, name=name, dist=dist%dbcsr, & + matrix_type=dbcsr_type_no_symmetry, & + row_blk_size=row_block_sizes, col_blk_size=col_block_sizes, & + data_type=dbcsr_type_real_8) + + CALL validate(matrix) +#endif + END SUBROUTINE dbm_create + +! ************************************************************************************************** +!> \brief Needed to be called for DBCSR after blocks where inserted. For DBM it's a no-opt. +!> \param matrix ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_finalize(matrix) + TYPE(dbm_type), INTENT(INOUT) :: matrix + + MARK_USED(matrix) ! New implementation does not need finalize. + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_finalize(matrix%dbcsr) +#endif + END SUBROUTINE dbm_finalize + +! ************************************************************************************************** +!> \brief Releases a matrix and all its ressources. +!> \param matrix ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_release(matrix) + TYPE(dbm_type), INTENT(INOUT) :: matrix + + INTERFACE + SUBROUTINE dbm_release_c(matrix) & + BIND(C, name="dbm_release") + IMPORT :: C_PTR + TYPE(C_PTR), VALUE :: matrix + END SUBROUTINE dbm_release_c + END INTERFACE + + CALL dbm_release_c(matrix=matrix%c_ptr) + matrix%c_ptr = C_NULL_PTR + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_release(matrix%dbcsr) +#endif + END SUBROUTINE dbm_release + +! ************************************************************************************************** +!> \brief Copies content of matrix_b into matrix_a. +!> Matrices must have the same row/col block sizes and distribution. +!> \param matrix_a ... +!> \param matrix_b ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_copy(matrix_a, matrix_b) + TYPE(dbm_type), INTENT(INOUT) :: matrix_a + TYPE(dbm_type), INTENT(IN) :: matrix_b + + CHARACTER(LEN=*), PARAMETER :: routineN = 'dbm_copy' + + INTEGER :: handle + INTERFACE + SUBROUTINE dbm_copy_c(matrix_a, matrix_b) & + BIND(C, name="dbm_copy") + IMPORT :: C_PTR + TYPE(C_PTR), VALUE :: matrix_a + TYPE(C_PTR), VALUE :: matrix_b + END SUBROUTINE dbm_copy_c + END INTERFACE + + CALL timeset(routineN, handle) + CALL dbm_copy_c(matrix_a=matrix_a%c_ptr, matrix_b=matrix_b%c_ptr) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_copy(matrix_a%dbcsr, matrix_b%dbcsr) + CALL validate(matrix_a) +#endif + CALL timestop(handle) + END SUBROUTINE dbm_copy + +! ************************************************************************************************** +!> \brief Copies content of matrix_b into matrix_a. Matrices may have different distributions. +!> \param matrix ... +!> \param redist ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_redistribute(matrix, redist) + TYPE(dbm_type), INTENT(IN) :: matrix + TYPE(dbm_type), INTENT(INOUT) :: redist + + CHARACTER(LEN=*), PARAMETER :: routineN = 'dbm_redistribute' + + INTEGER :: handle + INTERFACE + SUBROUTINE dbm_redistribute_c(matrix, redist) & + BIND(C, name="dbm_redistribute") + IMPORT :: C_PTR + TYPE(C_PTR), VALUE :: matrix + TYPE(C_PTR), VALUE :: redist + END SUBROUTINE dbm_redistribute_c + END INTERFACE + + CALL timeset(routineN, handle) + CALL dbm_redistribute_c(matrix=matrix%c_ptr, redist=redist%c_ptr) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_redistribute(matrix%dbcsr, redist%dbcsr) + CALL validate(redist) +#endif + CALL timestop(handle) + END SUBROUTINE dbm_redistribute + +! ************************************************************************************************** +!> \brief Looks up a block from given matrics. This routine is thread-safe. +!> If the block is not found then a null pointer is returned. +!> \param matrix ... +!> \param row ... +!> \param col ... +!> \param block ... +!> \param row_size ... +!> \param col_size ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_get_block_p(matrix, row, col, block, row_size, col_size) + TYPE(dbm_type), INTENT(INOUT) :: matrix + INTEGER, INTENT(IN) :: row, col + REAL(dp), DIMENSION(:, :), INTENT(OUT), POINTER :: block + INTEGER, INTENT(OUT), OPTIONAL :: row_size, col_size + + INTEGER :: my_col_size, my_row_size + TYPE(C_PTR) :: block_c + INTERFACE + SUBROUTINE dbm_get_block_p_c(matrix, row, col, block, row_size, col_size) & + BIND(C, name="dbm_get_block_p") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: matrix + INTEGER(kind=C_INT), VALUE :: row + INTEGER(kind=C_INT), VALUE :: col + TYPE(C_PTR) :: block + INTEGER(kind=C_INT) :: row_size + INTEGER(kind=C_INT) :: col_size + END SUBROUTINE dbm_get_block_p_c + END INTERFACE + + CALL dbm_get_block_p_c(matrix=matrix%c_ptr, row=row - 1, col=col - 1, & + block=block_c, row_size=my_row_size, col_size=my_col_size) + IF (C_ASSOCIATED(block_c)) THEN + CALL C_F_POINTER(block_c, block, shape=(/my_row_size, my_col_size/)) + ELSE + NULLIFY (block) ! block not found + END IF + IF (PRESENT(row_size)) row_size = my_row_size + IF (PRESENT(col_size)) col_size = my_col_size + END SUBROUTINE dbm_get_block_p + +! ************************************************************************************************** +!> \brief Adds a block to given matrix. This routine is thread-safe. +!> If block already exist then it gets overwritten (or summed). +!> \param matrix ... +!> \param row ... +!> \param col ... +!> \param block ... +!> \param summation ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_put_block(matrix, row, col, block, summation) + TYPE(dbm_type), INTENT(INOUT) :: matrix + INTEGER, INTENT(IN) :: row, col + REAL(dp), CONTIGUOUS, DIMENSION(:, :), INTENT(IN) :: block + LOGICAL, INTENT(IN), OPTIONAL :: summation + + LOGICAL :: my_summation + INTERFACE + SUBROUTINE dbm_put_block_c(matrix, row, col, summation, block) & + BIND(C, name="dbm_put_block") + IMPORT :: C_PTR, C_INT, C_BOOL, C_DOUBLE + TYPE(C_PTR), VALUE :: matrix + INTEGER(kind=C_INT), VALUE :: row + INTEGER(kind=C_INT), VALUE :: col + LOGICAL(kind=C_BOOL), VALUE :: summation + REAL(kind=C_DOUBLE), DIMENSION(*) :: block + END SUBROUTINE dbm_put_block_c + END INTERFACE + + my_summation = .FALSE. + IF (PRESENT(summation)) my_summation = summation + + CALL dbm_put_block_c(matrix=matrix%c_ptr, & + row=row - 1, col=col - 1, & + summation=LOGICAL(my_summation, C_BOOL), & + block=block) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_put_block(matrix%dbcsr, row, col, block, summation=summation) + ! Can not call validate(matrix) because the dbcsr matrix needs to be finalized first. +#endif + END SUBROUTINE dbm_put_block + +! ************************************************************************************************** +!> \brief Remove all blocks from given matrix, but does not release the underlying memory. +!> \param matrix ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_clear(matrix) + TYPE(dbm_type), INTENT(INOUT) :: matrix + + INTERFACE + SUBROUTINE dbm_clear_c(matrix) & + BIND(C, name="dbm_clear") + IMPORT :: C_PTR + TYPE(C_PTR), VALUE :: matrix + END SUBROUTINE dbm_clear_c + END INTERFACE + + CALL dbm_clear_c(matrix=matrix%c_ptr) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_clear(matrix%dbcsr) + CALL validate(matrix) +#endif + END SUBROUTINE dbm_clear + +! ************************************************************************************************** +!> \brief Removes all blocks from the given matrix whose block norm is below the given threshold. +!> Blocks of size zero are always kept. +!> \param matrix ... +!> \param eps ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_filter(matrix, eps) + TYPE(dbm_type), INTENT(INOUT) :: matrix + REAL(dp), INTENT(IN) :: eps + + CHARACTER(LEN=*), PARAMETER :: routineN = 'dbm_filter' + + INTEGER :: handle + INTERFACE + SUBROUTINE dbm_filter_c(matrix, eps) & + BIND(C, name="dbm_filter") + IMPORT :: C_PTR, C_DOUBLE + TYPE(C_PTR), VALUE :: matrix + REAL(kind=C_DOUBLE), VALUE :: eps + END SUBROUTINE dbm_filter_c + END INTERFACE + + CALL timeset(routineN, handle) + CALL validate(matrix) + CALL dbm_filter_c(matrix=matrix%c_ptr, eps=eps) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_filter(matrix%dbcsr, eps) + CALL validate(matrix) +#endif + CALL timestop(handle) + END SUBROUTINE dbm_filter + +! ************************************************************************************************** +!> \brief Adds given list of blocks efficiently. The blocks will be filled with zeros. +!> \param matrix ... +!> \param rows ... +!> \param cols ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_reserve_blocks(matrix, rows, cols) + TYPE(dbm_type), INTENT(INOUT) :: matrix + INTEGER, DIMENSION(:), INTENT(IN) :: rows, cols + + CHARACTER(LEN=*), PARAMETER :: routineN = 'dbm_reserve_blocks' + + INTEGER :: handle + INTEGER(kind=C_INT), DIMENSION(SIZE(rows)) :: cols_c, rows_c + INTERFACE + SUBROUTINE dbm_reserve_blocks_c(matrix, nblocks, rows, cols) & + BIND(C, name="dbm_reserve_blocks") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: matrix + INTEGER(kind=C_INT), VALUE :: nblocks + INTEGER(kind=C_INT), DIMENSION(*) :: rows + INTEGER(kind=C_INT), DIMENSION(*) :: cols + END SUBROUTINE dbm_reserve_blocks_c + END INTERFACE + + CALL timeset(routineN, handle) + CPASSERT(SIZE(rows) == SIZE(cols)) + rows_c = rows - 1 + cols_c = cols - 1 + + CALL dbm_reserve_blocks_c(matrix=matrix%c_ptr, & + nblocks=SIZE(rows), & + rows=rows_c, & + cols=cols_c) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_reserve_blocks(matrix%dbcsr, rows, cols) + CALL validate(matrix) +#endif + CALL timestop(handle) + END SUBROUTINE dbm_reserve_blocks + +! ************************************************************************************************** +!> \brief Multiplies all entries in the given matrix by the given factor alpha. +!> \param matrix ... +!> \param alpha ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_scale(matrix, alpha) + TYPE(dbm_type), INTENT(INOUT) :: matrix + REAL(dp), INTENT(IN) :: alpha + + CHARACTER(LEN=*), PARAMETER :: routineN = 'dbm_scale' + + INTEGER :: handle + INTERFACE + SUBROUTINE dbm_scale_c(matrix, alpha) & + BIND(C, name="dbm_scale") + IMPORT :: C_PTR, C_DOUBLE + TYPE(C_PTR), VALUE :: matrix + REAL(kind=C_DOUBLE), VALUE :: alpha + END SUBROUTINE dbm_scale_c + END INTERFACE + + CALL timeset(routineN, handle) + CALL dbm_scale_c(matrix=matrix%c_ptr, alpha=alpha) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_scale(matrix%dbcsr, alpha) + CALL validate(matrix) +#endif + CALL timestop(handle) + END SUBROUTINE dbm_scale + +! ************************************************************************************************** +!> \brief Sets all blocks in the given matrix to zero. +!> \param matrix ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_zero(matrix) + TYPE(dbm_type), INTENT(INOUT) :: matrix + + CHARACTER(LEN=*), PARAMETER :: routineN = 'dbm_zero' + + INTEGER :: handle + INTERFACE + SUBROUTINE dbm_zero_c(matrix) & + BIND(C, name="dbm_zero") + IMPORT :: C_PTR + TYPE(C_PTR), VALUE :: matrix + END SUBROUTINE dbm_zero_c + END INTERFACE + + CALL timeset(routineN, handle) + CALL dbm_zero_c(matrix=matrix%c_ptr) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_zero(matrix%dbcsr) + CALL validate(matrix) +#endif + CALL timestop(handle) + END SUBROUTINE dbm_zero + +! ************************************************************************************************** +!> \brief Adds matrix_b to matrix_a. +!> \param matrix_a ... +!> \param matrix_b ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_add(matrix_a, matrix_b) + TYPE(dbm_type), INTENT(INOUT) :: matrix_a + TYPE(dbm_type), INTENT(IN) :: matrix_b + + CHARACTER(LEN=*), PARAMETER :: routineN = 'dbm_add' + + INTEGER :: handle + INTERFACE + SUBROUTINE dbm_add_c(matrix_a, matrix_b) & + BIND(C, name="dbm_add") + IMPORT :: C_PTR, C_DOUBLE + TYPE(C_PTR), VALUE :: matrix_a + TYPE(C_PTR), VALUE :: matrix_b + END SUBROUTINE dbm_add_c + END INTERFACE + + CALL timeset(routineN, handle) + CALL validate(matrix_a) + CALL validate(matrix_b) + CALL dbm_add_c(matrix_a=matrix_a%c_ptr, matrix_b=matrix_b%c_ptr) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_add(matrix_a%dbcsr, matrix_b%dbcsr) + CALL validate(matrix_a) +#endif + CALL timestop(handle) + END SUBROUTINE dbm_add + +! ************************************************************************************************** +!> \brief Computes matrix product: matrix_c = alpha * matrix_a * matrix_b + beta * matrix_c. +!> \param transa ... +!> \param transb ... +!> \param alpha ... +!> \param matrix_a ... +!> \param matrix_b ... +!> \param beta ... +!> \param matrix_c ... +!> \param retain_sparsity ... +!> \param filter_eps ... +!> \param flop ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_multiply(transa, transb, & + alpha, matrix_a, matrix_b, beta, matrix_c, & + retain_sparsity, filter_eps, flop) + LOGICAL, INTENT(IN) :: transa, transb + REAL(kind=dp), INTENT(IN) :: alpha + TYPE(dbm_type), INTENT(INOUT) :: matrix_a, matrix_b + REAL(kind=dp), INTENT(IN) :: beta + TYPE(dbm_type), INTENT(INOUT) :: matrix_c + LOGICAL, INTENT(IN), OPTIONAL :: retain_sparsity + REAL(kind=dp), INTENT(IN), OPTIONAL :: filter_eps + INTEGER(int_8), INTENT(OUT), OPTIONAL :: flop + + CHARACTER(LEN=*), PARAMETER :: routineN = 'dbm_multiply' + + CHARACTER(LEN=1) :: transa_char, transb_char + INTEGER :: handle + INTEGER(int_8) :: flop_dbcsr, my_flop + LOGICAL :: my_retain_sparsity + REAL(kind=dp) :: my_filter_eps + INTERFACE + SUBROUTINE dbm_multiply_c(transa, transb, alpha, & + matrix_a, matrix_b, & + beta, matrix_c, & + retain_sparsity, filter_eps, flop) & + BIND(C, name="dbm_multiply") + IMPORT :: C_PTR, C_DOUBLE, C_BOOL, C_LONG + LOGICAL(kind=C_BOOL), VALUE :: transa + LOGICAL(kind=C_BOOL), VALUE :: transb + REAL(kind=C_DOUBLE), VALUE :: alpha + TYPE(C_PTR), VALUE :: matrix_a + TYPE(C_PTR), VALUE :: matrix_b + REAL(kind=C_DOUBLE), VALUE :: beta + TYPE(C_PTR), VALUE :: matrix_c + LOGICAL(kind=C_BOOL), VALUE :: retain_sparsity + REAL(kind=C_DOUBLE), VALUE :: filter_eps + INTEGER(kind=C_LONG) :: flop + END SUBROUTINE dbm_multiply_c + END INTERFACE + + CALL timeset(routineN, handle) + + IF (PRESENT(retain_sparsity)) THEN + my_retain_sparsity = retain_sparsity + ELSE + my_retain_sparsity = .FALSE. + END IF + + IF (PRESENT(filter_eps)) THEN + my_filter_eps = filter_eps + ELSE + my_filter_eps = 0.0_dp + END IF + + CALL validate(matrix_a) + CALL validate(matrix_b) + CALL validate(matrix_c) + CALL dbm_multiply_c(transa=LOGICAL(transa, C_BOOL), & + transb=LOGICAL(transb, C_BOOL), & + alpha=alpha, & + matrix_a=matrix_a%c_ptr, & + matrix_b=matrix_b%c_ptr, & + beta=beta, & + matrix_c=matrix_c%c_ptr, & + retain_sparsity=LOGICAL(my_retain_sparsity, C_BOOL), & + filter_eps=my_filter_eps, & + flop=my_flop) + + IF (PRESENT(flop)) THEN + flop = my_flop + END IF + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + IF (transa) THEN + transa_char = dbcsr_transpose + ELSE + transa_char = dbcsr_no_transpose + END IF + IF (transb) THEN + transb_char = dbcsr_transpose + ELSE + transb_char = dbcsr_no_transpose + END IF + CALL dbcsr_multiply(transa=transa_char, transb=transb_char, & + alpha=alpha, matrix_a=matrix_a%dbcsr, & + matrix_b=matrix_b%dbcsr, beta=beta, matrix_c=matrix_c%dbcsr, & + retain_sparsity=retain_sparsity, filter_eps=filter_eps, flop=flop_dbcsr) + CPASSERT(my_flop == flop_dbcsr) + CALL validate(matrix_c) +#else + ! Can not use preprocessor's ifdefs before INTERFACE because it confuses prettify. + MARK_USED(transa_char) + MARK_USED(transb_char) + MARK_USED(flop_dbcsr) +#endif + CALL timestop(handle) + END SUBROUTINE dbm_multiply + +! ************************************************************************************************** +!> \brief Creates an iterator for the blocks of the given matrix. The iteration order is not stable. +!> \param iterator ... +!> \param matrix ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_iterator_start(iterator, matrix) + TYPE(dbm_iterator), INTENT(OUT) :: iterator + TYPE(dbm_type), INTENT(IN) :: matrix + + INTERFACE + SUBROUTINE dbm_iterator_start_c(iterator, matrix) & + BIND(C, name="dbm_iterator_start") + IMPORT :: C_PTR + TYPE(C_PTR) :: iterator + TYPE(C_PTR), VALUE :: matrix + END SUBROUTINE dbm_iterator_start_c + END INTERFACE + + CPASSERT(.NOT. C_ASSOCIATED(iterator%c_ptr)) + CALL dbm_iterator_start_c(iterator=iterator%c_ptr, matrix=matrix%c_ptr) + CPASSERT(C_ASSOCIATED(iterator%c_ptr)) + CALL validate(matrix) + END SUBROUTINE dbm_iterator_start + +! ************************************************************************************************** +!> \brief Tests whether the given iterator has any block left. +!> \param iterator ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + FUNCTION dbm_iterator_blocks_left(iterator) RESULT(blocks_left) + TYPE(dbm_iterator), INTENT(IN) :: iterator + LOGICAL :: blocks_left + + INTERFACE + FUNCTION dbm_iterator_blocks_left_c(iterator) & + BIND(C, name="dbm_iterator_blocks_left") + IMPORT :: C_PTR, C_BOOL + TYPE(C_PTR), VALUE :: iterator + LOGICAL(C_BOOL) :: dbm_iterator_blocks_left_c + END FUNCTION dbm_iterator_blocks_left_c + END INTERFACE + + blocks_left = dbm_iterator_blocks_left_c(iterator%c_ptr) + END FUNCTION dbm_iterator_blocks_left + +! ************************************************************************************************** +!> \brief Returns the next block from the given iterator. +!> \param iterator ... +!> \param row ... +!> \param column ... +!> \param block ... +!> \param row_size ... +!> \param col_size ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_iterator_next_block(iterator, row, column, block, row_size, col_size) + TYPE(dbm_iterator), INTENT(INOUT) :: iterator + INTEGER, INTENT(OUT) :: row, column + REAL(dp), DIMENSION(:, :), INTENT(OUT), OPTIONAL, & + POINTER :: block + INTEGER, INTENT(OUT), OPTIONAL :: row_size, col_size + + INTEGER :: col0, my_col_size, my_row_size, row0 + TYPE(C_PTR) :: block_c + INTERFACE + SUBROUTINE dbm_iterator_next_block_c(iterator, row, col, block, row_size, col_size) & + BIND(C, name="dbm_iterator_next_block") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: iterator + INTEGER(kind=C_INT) :: row + INTEGER(kind=C_INT) :: col + TYPE(C_PTR) :: block + INTEGER(kind=C_INT) :: row_size + INTEGER(kind=C_INT) :: col_size + END SUBROUTINE dbm_iterator_next_block_c + END INTERFACE + + CALL dbm_iterator_next_block_c(iterator%c_ptr, row=row0, col=col0, block=block_c, & + row_size=my_row_size, col_size=my_col_size) + + CPASSERT(C_ASSOCIATED(block_c)) + IF (PRESENT(block)) CALL C_F_POINTER(block_c, block, shape=(/my_row_size, my_col_size/)) + row = row0 + 1 + column = col0 + 1 + IF (PRESENT(row_size)) row_size = my_row_size + IF (PRESENT(col_size)) col_size = my_col_size + END SUBROUTINE dbm_iterator_next_block + +! ************************************************************************************************** +!> \brief Releases the given iterator. +!> \param iterator ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_iterator_stop(iterator) + TYPE(dbm_iterator), INTENT(INOUT) :: iterator + + INTERFACE + SUBROUTINE dbm_iterator_stop_c(iterator) & + BIND(C, name="dbm_iterator_stop") + IMPORT :: C_PTR + TYPE(C_PTR), VALUE :: iterator + END SUBROUTINE dbm_iterator_stop_c + END INTERFACE + + CALL dbm_iterator_stop_c(iterator%c_ptr) + iterator%c_ptr = C_NULL_PTR + END SUBROUTINE dbm_iterator_stop + +! ************************************************************************************************** +!> \brief Computes a checksum of the given matrix. +!> \param matrix ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + FUNCTION dbm_checksum(matrix) RESULT(res) + TYPE(dbm_type), INTENT(IN) :: matrix + REAL(KIND=dp) :: res + + INTERFACE + FUNCTION dbm_checksum_c(matrix) & + BIND(C, name="dbm_checksum") + IMPORT :: C_PTR, C_DOUBLE + TYPE(C_PTR), VALUE :: matrix + REAL(C_DOUBLE) :: dbm_checksum_c + END FUNCTION dbm_checksum_c + END INTERFACE + + CALL validate(matrix) + res = dbm_checksum_c(matrix%c_ptr) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CPASSERT(ABS(res - dbcsr_checksum(matrix%dbcsr))/MAX(1.0_dp, ABS(res)) < DBM_VALIDATE_THRESHOLD) +#endif + END FUNCTION dbm_checksum + +! ************************************************************************************************** +!> \brief Returns the absolute value of the larges element of the entire given matrix. +!> \param matrix ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + FUNCTION dbm_maxabs(matrix) RESULT(res) + TYPE(dbm_type), INTENT(INOUT) :: matrix + REAL(KIND=dp) :: res + + INTERFACE + FUNCTION dbm_maxabs_c(matrix) & + BIND(C, name="dbm_maxabs") + IMPORT :: C_PTR, C_DOUBLE + TYPE(C_PTR), VALUE :: matrix + REAL(C_DOUBLE) :: dbm_maxabs_c + END FUNCTION dbm_maxabs_c + END INTERFACE + + CALL validate(matrix) + res = dbm_maxabs_c(matrix%c_ptr) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CPASSERT(ABS(res - dbcsr_maxabs(matrix%dbcsr))/MAX(1.0_dp, ABS(res)) < DBM_VALIDATE_THRESHOLD) +#endif + END FUNCTION dbm_maxabs + +! ************************************************************************************************** +!> \brief Returns the name of the matrix of the given matrix. +!> \param matrix ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + FUNCTION dbm_get_name(matrix) RESULT(res) + TYPE(dbm_type), INTENT(IN) :: matrix + CHARACTER(len=default_string_length) :: res + + CHARACTER(LEN=1, KIND=C_CHAR), DIMENSION(:), & + POINTER :: name_f + INTEGER :: i + TYPE(C_PTR) :: name_c + INTERFACE + FUNCTION dbm_get_name_c(matrix) BIND(C, name="dbm_get_name") + IMPORT :: C_PTR + TYPE(C_PTR), VALUE :: matrix + TYPE(C_PTR) :: dbm_get_name_c + END FUNCTION dbm_get_name_c + END INTERFACE + + name_c = dbm_get_name_c(matrix%c_ptr) + + CALL C_F_POINTER(name_c, name_f, shape=(/default_string_length/)) + + res = "" + DO i = 1, default_string_length + IF (name_f(i) == C_NULL_CHAR) EXIT + res(i:i) = name_f(i) + END DO + + END FUNCTION dbm_get_name + +! ************************************************************************************************** +!> \brief Returns the number of local Non-Zero Elements of the given matrix. +!> \param matrix ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + PURE FUNCTION dbm_get_nze(matrix) RESULT(res) + TYPE(dbm_type), INTENT(IN) :: matrix + INTEGER :: res + + INTERFACE + PURE FUNCTION dbm_get_nze_c(matrix) & + BIND(C, name="dbm_get_nze") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: matrix + INTEGER(C_INT) :: dbm_get_nze_c + END FUNCTION dbm_get_nze_c + END INTERFACE + + res = dbm_get_nze_c(matrix%c_ptr) + + END FUNCTION dbm_get_nze + +! ************************************************************************************************** +!> \brief Returns the number of local blocks of the given matrix. +!> \param matrix ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + PURE FUNCTION dbm_get_num_blocks(matrix) RESULT(res) + TYPE(dbm_type), INTENT(IN) :: matrix + INTEGER :: res + + INTERFACE + PURE FUNCTION dbm_get_num_blocks_c(matrix) & + BIND(C, name="dbm_get_num_blocks") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: matrix + INTEGER(C_INT) :: dbm_get_num_blocks_c + END FUNCTION dbm_get_num_blocks_c + END INTERFACE + + res = dbm_get_num_blocks_c(matrix%c_ptr) + + END FUNCTION dbm_get_num_blocks + +! ************************************************************************************************** +!> \brief Returns the row block sizes of the given matrix. +!> \param matrix ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + FUNCTION dbm_get_row_block_sizes(matrix) RESULT(res) + TYPE(dbm_type), INTENT(IN) :: matrix + INTEGER, CONTIGUOUS, DIMENSION(:), POINTER :: res + + INTEGER :: nrows + TYPE(C_PTR) :: row_sizes + INTERFACE + SUBROUTINE dbm_get_row_sizes_c(matrix, nrows, row_sizes) & + BIND(C, name="dbm_get_row_sizes") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: matrix + INTEGER(C_INT) :: nrows + TYPE(C_PTR) :: row_sizes + END SUBROUTINE dbm_get_row_sizes_c + END INTERFACE + + CALL dbm_get_row_sizes_c(matrix%c_ptr, nrows, row_sizes) + CALL C_F_POINTER(row_sizes, res, shape=(/nrows/)) + ! TODO: maybe return an ALLOCATABLE + END FUNCTION dbm_get_row_block_sizes + +! ************************************************************************************************** +!> \brief Returns the column block sizes of the given matrix. +!> \param matrix ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + FUNCTION dbm_get_col_block_sizes(matrix) RESULT(res) + TYPE(dbm_type), INTENT(IN) :: matrix + INTEGER, CONTIGUOUS, DIMENSION(:), POINTER :: res + + INTEGER :: ncols + TYPE(C_PTR) :: col_sizes + INTERFACE + SUBROUTINE dbm_get_col_sizes_c(matrix, ncols, col_sizes) & + BIND(C, name="dbm_get_col_sizes") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: matrix + INTEGER(C_INT) :: ncols + TYPE(C_PTR) :: col_sizes + END SUBROUTINE dbm_get_col_sizes_c + END INTERFACE + + CALL dbm_get_col_sizes_c(matrix%c_ptr, ncols, col_sizes) + CALL C_F_POINTER(col_sizes, res, shape=(/ncols/)) + ! TODO: maybe return an ALLOCATABLE + END FUNCTION dbm_get_col_block_sizes + +! ************************************************************************************************** +!> \brief Returns the local row block sizes of the given matrix. +!> \param matrix ... +!> \param local_rows ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_get_local_rows(matrix, local_rows) + TYPE(dbm_type), INTENT(IN) :: matrix + INTEGER, ALLOCATABLE, DIMENSION(:) :: local_rows + + INTEGER :: nlocal_rows + INTEGER, DIMENSION(:), POINTER :: local_rows_dbcsr, local_rows_ptr + TYPE(C_PTR) :: local_rows_c + INTERFACE + SUBROUTINE dbm_get_local_rows_c(matrix, nlocal_rows, local_rows) & + BIND(C, name="dbm_get_local_rows") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: matrix + INTEGER(C_INT) :: nlocal_rows + TYPE(C_PTR) :: local_rows + END SUBROUTINE dbm_get_local_rows_c + END INTERFACE + + CALL dbm_get_local_rows_c(matrix%c_ptr, nlocal_rows, local_rows_c) + CALL C_F_POINTER(local_rows_c, local_rows_ptr, shape=(/nlocal_rows/)) + ALLOCATE (local_rows(nlocal_rows)) + local_rows(:) = local_rows_ptr(:) + 1 + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_get_info(matrix%dbcsr, local_rows=local_rows_dbcsr) + CPASSERT(ALL(local_rows == local_rows_dbcsr)) +#else + MARK_USED(local_rows_dbcsr) +#endif + END SUBROUTINE dbm_get_local_rows + +! ************************************************************************************************** +!> \brief Returns the local column block sizes of the given matrix. +!> \param matrix ... +!> \param local_cols ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_get_local_cols(matrix, local_cols) + TYPE(dbm_type), INTENT(IN) :: matrix + INTEGER, ALLOCATABLE, DIMENSION(:) :: local_cols + + INTEGER :: nlocal_cols + INTEGER, DIMENSION(:), POINTER :: local_cols_dbcsr, local_cols_ptr + TYPE(C_PTR) :: local_cols_c + INTERFACE + SUBROUTINE dbm_get_local_cols_c(matrix, nlocal_cols, local_cols) & + BIND(C, name="dbm_get_local_cols") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: matrix + INTEGER(C_INT) :: nlocal_cols + TYPE(C_PTR) :: local_cols + END SUBROUTINE dbm_get_local_cols_c + END INTERFACE + + CALL dbm_get_local_cols_c(matrix%c_ptr, nlocal_cols, local_cols_c) + CALL C_F_POINTER(local_cols_c, local_cols_ptr, shape=(/nlocal_cols/)) + ALLOCATE (local_cols(nlocal_cols)) + local_cols(:) = local_cols_ptr(:) + 1 + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_get_info(matrix%dbcsr, local_cols=local_cols_dbcsr) + CPASSERT(ALL(local_cols == local_cols_dbcsr)) +#else + MARK_USED(local_cols_dbcsr) +#endif + END SUBROUTINE dbm_get_local_cols + +! ************************************************************************************************** +!> \brief Returns the MPI rank on which the given block should be stored. +!> \param matrix ... +!> \param row ... +!> \param column ... +!> \param processor ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_get_stored_coordinates(matrix, row, column, processor) + TYPE(dbm_type), INTENT(IN) :: matrix + INTEGER, INTENT(IN) :: row, column + INTEGER, INTENT(OUT) :: processor + + INTEGER :: processor_dbcsr + INTERFACE + PURE FUNCTION dbm_get_stored_coordinates_c(matrix, row, col) & + BIND(C, name="dbm_get_stored_coordinates") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: matrix + INTEGER(C_INT), VALUE :: row + INTEGER(C_INT), VALUE :: col + INTEGER(C_INT) :: dbm_get_stored_coordinates_c + END FUNCTION dbm_get_stored_coordinates_c + END INTERFACE + + processor = dbm_get_stored_coordinates_c(matrix%c_ptr, row=row - 1, col=column - 1) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_get_stored_coordinates(matrix%dbcsr, row, column, processor_dbcsr) + CPASSERT(processor == processor_dbcsr) +#else + MARK_USED(processor_dbcsr) +#endif + END SUBROUTINE dbm_get_stored_coordinates + +! ************************************************************************************************** +!> \brief Returns the distribution of the given matrix. +!> \param matrix ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + FUNCTION dbm_get_distribution(matrix) RESULT(res) + TYPE(dbm_type), INTENT(IN) :: matrix + TYPE(dbm_distribution_obj) :: res + + INTERFACE + FUNCTION dbm_get_distribution_c(matrix) BIND(C, name="dbm_get_distribution") + IMPORT :: C_PTR + TYPE(C_PTR), VALUE :: matrix + TYPE(C_PTR) :: dbm_get_distribution_c + END FUNCTION dbm_get_distribution_c + END INTERFACE + + res%c_ptr = dbm_get_distribution_c(matrix%c_ptr) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_get_info(matrix%dbcsr, distribution=res%dbcsr) +#endif + + END FUNCTION dbm_get_distribution + +! ************************************************************************************************** +!> \brief Creates a new two dimensional distribution. +!> \param dist ... +!> \param mp_comm ... +!> \param row_dist_block ... +!> \param col_dist_block ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_distribution_new(dist, mp_comm, row_dist_block, col_dist_block) + TYPE(dbm_distribution_obj), INTENT(OUT) :: dist + INTEGER, INTENT(IN) :: mp_comm + INTEGER, CONTIGUOUS, DIMENSION(:), INTENT(IN), & + POINTER :: row_dist_block, col_dist_block + + INTERFACE + SUBROUTINE dbm_distribution_new_c(dist, fortran_comm, nrows, ncols, row_dist, col_dist) & + BIND(C, name="dbm_distribution_new") + IMPORT :: C_PTR, C_CHAR, C_INT + TYPE(C_PTR) :: dist + INTEGER(kind=C_INT), VALUE :: fortran_comm + INTEGER(kind=C_INT), VALUE :: nrows + INTEGER(kind=C_INT), VALUE :: ncols + INTEGER(kind=C_INT), DIMENSION(*) :: row_dist + INTEGER(kind=C_INT), DIMENSION(*) :: col_dist + END SUBROUTINE dbm_distribution_new_c + END INTERFACE + + CPASSERT(.NOT. C_ASSOCIATED(dist%c_ptr)) + CALL dbm_distribution_new_c(dist=dist%c_ptr, & + fortran_comm=mp_comm, & + nrows=SIZE(row_dist_block), & + ncols=SIZE(col_dist_block), & + row_dist=row_dist_block, & + col_dist=col_dist_block) + CPASSERT(C_ASSOCIATED(dist%c_ptr)) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_distribution_new_wrapper(dist, mp_comm, row_dist_block, col_dist_block) +#endif + END SUBROUTINE dbm_distribution_new + +! ************************************************************************************************** +!> \brief Helper for creating a new DBCSR distribution. Only needed for DBM_VALIDATE_AGAINST_DBCSR. +!> \param dist ... +!> \param mp_comm ... +!> \param row_dist_block ... +!> \param col_dist_block ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbcsr_distribution_new_wrapper(dist, mp_comm, row_dist_block, col_dist_block) + TYPE(dbm_distribution_obj), INTENT(INOUT) :: dist + INTEGER, INTENT(IN) :: mp_comm + INTEGER, CONTIGUOUS, DIMENSION(:), INTENT(IN), & + POINTER :: row_dist_block, col_dist_block + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + INTEGER :: mynode, numnodes, pcol, prow + INTEGER, ALLOCATABLE, DIMENSION(:, :) :: pgrid + INTEGER, DIMENSION(2) :: coord, mycoord, npdims + TYPE(dbcsr_mp_obj) :: mp_env + + ! Create a dbcsr mp environment from communicator + CALL mp_environ(numnodes, npdims, mycoord, mp_comm) + CALL mp_environ(numnodes, mynode, mp_comm) + ALLOCATE (pgrid(0:npdims(1) - 1, 0:npdims(2) - 1)) + DO prow = 0, npdims(1) - 1 + DO pcol = 0, npdims(2) - 1 + coord = (/prow, pcol/) + CALL mp_cart_rank(mp_comm, coord, pgrid(prow, pcol)) + END DO + END DO + CPASSERT(mynode == pgrid(mycoord(1), mycoord(2))) + + CALL dbcsr_mp_new(mp_env, mp_comm, pgrid, mynode, numnodes, mycoord(1), mycoord(2)) + CALL dbcsr_distribution_new(dist=dist%dbcsr, mp_env=mp_env, & + row_dist_block=row_dist_block, col_dist_block=col_dist_block) + CALL dbcsr_mp_release(mp_env) +#else + MARK_USED(dist) + MARK_USED(mp_comm) + MARK_USED(row_dist_block) + MARK_USED(col_dist_block) +#endif + END SUBROUTINE dbcsr_distribution_new_wrapper + +! ************************************************************************************************** +!> \brief Increases the reference counter of the given distribution. +!> \param dist ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_distribution_hold(dist) + TYPE(dbm_distribution_obj) :: dist + + INTERFACE + SUBROUTINE dbm_distribution_hold_c(dist) & + BIND(C, name="dbm_distribution_hold") + IMPORT :: C_PTR + TYPE(C_PTR), VALUE :: dist + END SUBROUTINE dbm_distribution_hold_c + END INTERFACE + + CALL dbm_distribution_hold_c(dist%c_ptr) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_distribution_hold(dist%dbcsr) +#endif + END SUBROUTINE dbm_distribution_hold + +! ************************************************************************************************** +!> \brief Decreases the reference counter of the given distribution. +!> \param dist ... +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_distribution_release(dist) + TYPE(dbm_distribution_obj) :: dist + + INTERFACE + SUBROUTINE dbm_distribution_release_c(dist) & + BIND(C, name="dbm_distribution_release") + IMPORT :: C_PTR + TYPE(C_PTR), VALUE :: dist + END SUBROUTINE dbm_distribution_release_c + END INTERFACE + + CALL dbm_distribution_release_c(dist%c_ptr) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CALL dbcsr_distribution_release(dist%dbcsr) +#endif + END SUBROUTINE dbm_distribution_release + +! ************************************************************************************************** +!> \brief Returns the rows of the given distribution. +!> \param dist ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + FUNCTION dbm_distribution_row_dist(dist) RESULT(res) + TYPE(dbm_distribution_obj), INTENT(IN) :: dist + INTEGER, CONTIGUOUS, DIMENSION(:), POINTER :: res + + INTEGER :: nrows + TYPE(C_PTR) :: row_dist + INTERFACE + SUBROUTINE dbm_distribution_row_dist_c(dist, nrows, row_dist) & + BIND(C, name="dbm_distribution_row_dist") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: dist + INTEGER(C_INT) :: nrows + TYPE(C_PTR) :: row_dist + END SUBROUTINE dbm_distribution_row_dist_c + END INTERFACE + + CALL dbm_distribution_row_dist_c(dist%c_ptr, nrows, row_dist) + CALL C_F_POINTER(row_dist, res, shape=(/nrows/)) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CPASSERT(ALL(res == dbcsr_distribution_row_dist(dist%dbcsr))) +#endif + END FUNCTION dbm_distribution_row_dist + +! ************************************************************************************************** +!> \brief Returns the columns of the given distribution. +!> \param dist ... +!> \return ... +!> \author Ole Schuett +! ************************************************************************************************** + FUNCTION dbm_distribution_col_dist(dist) RESULT(res) + TYPE(dbm_distribution_obj), INTENT(IN) :: dist + INTEGER, CONTIGUOUS, DIMENSION(:), POINTER :: res + + INTEGER :: ncols + TYPE(C_PTR) :: col_dist + INTERFACE + SUBROUTINE dbm_distribution_col_dist_c(dist, ncols, col_dist) & + BIND(C, name="dbm_distribution_col_dist") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), VALUE :: dist + INTEGER(C_INT) :: ncols + TYPE(C_PTR) :: col_dist + END SUBROUTINE dbm_distribution_col_dist_c + END INTERFACE + + CALL dbm_distribution_col_dist_c(dist%c_ptr, ncols, col_dist) + CALL C_F_POINTER(col_dist, res, shape=(/ncols/)) + +#if defined(DBM_VALIDATE_AGAINST_DBCSR) + CPASSERT(ALL(res == dbcsr_distribution_col_dist(dist%dbcsr))) +#endif + END FUNCTION dbm_distribution_col_dist + +! ************************************************************************************************** +!> \brief Initialize DBM library +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_library_init() + INTERFACE + SUBROUTINE dbm_library_init_c() BIND(C, name="dbm_library_init") + END SUBROUTINE dbm_library_init_c + END INTERFACE + + CALL dbm_library_init_c() + + END SUBROUTINE dbm_library_init + +! ************************************************************************************************** +!> \brief Finalize DBM library +!> \author Ole Schuett +! ************************************************************************************************** + SUBROUTINE dbm_library_finalize() + INTERFACE + SUBROUTINE dbm_library_finalize_c() BIND(C, name="dbm_library_finalize") + END SUBROUTINE dbm_library_finalize_c + END INTERFACE + + CALL dbm_library_finalize_c() + + END SUBROUTINE dbm_library_finalize + +END MODULE dbm_api diff --git a/src/dbm/dbm_distribution.c b/src/dbm/dbm_distribution.c new file mode 100644 index 0000000000..cc39efbb5d --- /dev/null +++ b/src/dbm/dbm_distribution.c @@ -0,0 +1,153 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include + +#include "dbm_distribution.h" + +/******************************************************************************* + * \brief Private routine for creating a new one dimensional distribution. + * \author Ole Schuett + ******************************************************************************/ +static void dbm_dist_1d_new(dbm_dist_1d_t *dist, const int length, + const int coords[length], + const dbm_mpi_comm_t comm) { + dist->comm = comm; + dist->my_rank = dbm_mpi_comm_rank(comm); + dist->nranks = dbm_mpi_comm_size(comm); + dist->length = length; + dist->index2coord = malloc(length * sizeof(int)); + memcpy(dist->index2coord, coords, length * sizeof(int)); + + // Check that cart coordinates and ranks are equivalent. + int cart_dims[1], cart_periods[1], cart_coords[1]; + dbm_mpi_cart_get(comm, 1, cart_dims, cart_periods, cart_coords); + assert(dist->nranks == cart_dims[0]); + assert(dist->my_rank == cart_coords[0]); + + // Count local rows/columns. + for (int i = 0; i < length; i++) { + assert(0 <= coords[i] && coords[i] < dist->nranks); + if (coords[i] == dist->my_rank) { + dist->nlocals++; + } + } + + // Store local rows/columns. + dist->local_indicies = malloc(dist->nlocals * sizeof(int)); + int j = 0; + for (int i = 0; i < length; i++) { + if (coords[i] == dist->my_rank) { + dist->local_indicies[j++] = i; + } + } + assert(j == dist->nlocals); +} + +/******************************************************************************* + * \brief Private routine for releasing a one dimensional distribution. + * \author Ole Schuett + ******************************************************************************/ +static void dbm_dist_1d_free(dbm_dist_1d_t *dist) { + free(dist->index2coord); + free(dist->local_indicies); + dbm_mpi_comm_free(&dist->comm); +} + +/******************************************************************************* + * \brief Creates a new two dimensional distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_distribution_new(dbm_distribution_t **dist_out, const int fortran_comm, + const int nrows, const int ncols, + const int row_dist[nrows], + const int col_dist[ncols]) { + assert(omp_get_num_threads() == 1); + dbm_distribution_t *dist = calloc(1, sizeof(dbm_distribution_t)); + dist->ref_count = 1; + + dist->comm = dbm_mpi_comm_f2c(fortran_comm); + dist->my_rank = dbm_mpi_comm_rank(dist->comm); + dist->nranks = dbm_mpi_comm_size(dist->comm); + + const int row_dim_remains[2] = {1, 0}; + const dbm_mpi_comm_t row_comm = dbm_mpi_cart_sub(dist->comm, row_dim_remains); + + const int col_dim_remains[2] = {0, 1}; + const dbm_mpi_comm_t col_comm = dbm_mpi_cart_sub(dist->comm, col_dim_remains); + + dbm_dist_1d_new(&dist->rows, nrows, row_dist, row_comm); + dbm_dist_1d_new(&dist->cols, ncols, col_dist, col_comm); + + assert(*dist_out == NULL); + *dist_out = dist; +} + +/******************************************************************************* + * \brief Increases the reference counter of the given distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_distribution_hold(dbm_distribution_t *dist) { + assert(dist->ref_count > 0); + dist->ref_count++; +} + +/******************************************************************************* + * \brief Decreases the reference counter of the given distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_distribution_release(dbm_distribution_t *dist) { + assert(dist->ref_count > 0); + dist->ref_count--; + if (dist->ref_count == 0) { + dbm_dist_1d_free(&dist->rows); + dbm_dist_1d_free(&dist->cols); + free(dist); + } +} + +/******************************************************************************* + * \brief Returns the rows of the given distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_distribution_row_dist(const dbm_distribution_t *dist, int *nrows, + const int **row_dist) { + assert(dist->ref_count > 0); + *nrows = dist->rows.length; + *row_dist = dist->rows.index2coord; +} + +/******************************************************************************* + * \brief Returns the columns of the given distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_distribution_col_dist(const dbm_distribution_t *dist, int *ncols, + const int **col_dist) { + assert(dist->ref_count > 0); + *ncols = dist->cols.length; + *col_dist = dist->cols.index2coord; +} + +/******************************************************************************* + * \brief Returns the MPI rank on which the given block should be stored. + * \author Ole Schuett + ******************************************************************************/ +int dbm_distribution_stored_coords(const dbm_distribution_t *dist, + const int row, const int col) { + assert(dist->ref_count > 0); + assert(0 <= row && row < dist->rows.length); + assert(0 <= col && col < dist->cols.length); + int coords[2] = {dist->rows.index2coord[row], dist->cols.index2coord[col]}; + return dbm_mpi_cart_rank(dist->comm, coords); +} + +// EOF diff --git a/src/dbm/dbm_distribution.h b/src/dbm/dbm_distribution.h new file mode 100644 index 0000000000..13ab9b7d25 --- /dev/null +++ b/src/dbm/dbm_distribution.h @@ -0,0 +1,83 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#ifndef DBM_DISTRIBUTION_H +#define DBM_DISTRIBUTION_H + +#include "dbm_mpi.h" + +/******************************************************************************* + * \brief Internal struct for storing a one dimensional distribution. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + int length; // global number of rows/cols + int *index2coord; // maps row/col indicies to cart coordinate + int nlocals; + int *local_indicies; // list of row/col indicies that reside locally. + dbm_mpi_comm_t comm; // 1D communicator + int nranks; + int my_rank; +} dbm_dist_1d_t; + +/******************************************************************************* + * \brief Internal struct for storing a two dimensional distribution. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + int ref_count; + dbm_dist_1d_t rows; + dbm_dist_1d_t cols; + dbm_mpi_comm_t comm; + int nranks; + int my_rank; +} dbm_distribution_t; + +/******************************************************************************* + * \brief Creates a new two dimensional distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_distribution_new(dbm_distribution_t **dist_out, const int fortran_comm, + const int nrows, const int ncols, + const int row_dist[nrows], const int col_dist[ncols]); + +/******************************************************************************* + * \brief Increases the reference counter of the given distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_distribution_hold(dbm_distribution_t *dist); + +/******************************************************************************* + * \brief Decreases the reference counter of the given distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_distribution_release(dbm_distribution_t *dist); + +/******************************************************************************* + * \brief Returns the rows of the given distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_distribution_row_dist(const dbm_distribution_t *dist, int *nrows, + const int **row_dist); + +/******************************************************************************* + * \brief Returns the columns of the given distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_distribution_col_dist(const dbm_distribution_t *dist, int *ncols, + const int **col_dist); + +/******************************************************************************* + * \brief Returns the MPI rank on which the given block should be stored. + * \author Ole Schuett + ******************************************************************************/ +int dbm_distribution_stored_coords(const dbm_distribution_t *dist, + const int row, const int col); + +#endif + +// EOF diff --git a/src/dbm/dbm_hyperparams.h b/src/dbm/dbm_hyperparams.h new file mode 100644 index 0000000000..c318cc7190 --- /dev/null +++ b/src/dbm/dbm_hyperparams.h @@ -0,0 +1,22 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#ifndef DBM_HYPERPARAMS_H + +// TODO: Tune parameters, check if dynamic OpenMP scheduling is really faster? + +static const float HASHTABLE_FACTOR = 3.0; +static const float ALLOCATION_FACTOR = 1.5; +static const float SHARDS_PER_THREAD = 1.0; +static const int MAX_BATCH_SIZE = 10000; +static const int BATCH_NUM_BUCKETS = 1000; +static const int INITIAL_NBLOCKS_ALLOCATED = 100; +static const int INITIAL_DATA_ALLOCATED = 1024; + +#endif + +// EOF diff --git a/src/dbm/dbm_library.c b/src/dbm/dbm_library.c new file mode 100644 index 0000000000..3f80b1019f --- /dev/null +++ b/src/dbm/dbm_library.c @@ -0,0 +1,51 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include + +#include "dbm_library.h" +#include "dbm_mempool.h" + +static bool library_initialized = false; + +#if !defined(_OPENMP) +#error "OpenMP is required. Please add -fopenmp to your C compiler flags." +#endif + +/******************************************************************************* + * \brief Initializes the DBM library. + * \author Ole Schuett + ******************************************************************************/ +void dbm_library_init(void) { + if (library_initialized) { + fprintf(stderr, "DBM library was already initialized.\n"); + abort(); + } + + // Nothing to do, yet. + library_initialized = true; +} + +/******************************************************************************* + * \brief Finalizes the DBM library. + * \author Ole Schuett + ******************************************************************************/ +void dbm_library_finalize(void) { + if (!library_initialized) { + fprintf(stderr, "Error: DBM library is not initialized.\n"); + abort(); + } + + dbm_mempool_clear(); + library_initialized = false; +} + +// EOF diff --git a/src/dbm/dbm_library.h b/src/dbm/dbm_library.h new file mode 100644 index 0000000000..7c8d8e8681 --- /dev/null +++ b/src/dbm/dbm_library.h @@ -0,0 +1,26 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ +#ifndef DBM_LIBRARY_H +#define DBM_LIBRARY_H + +#include "dbm_multiply.h" + +/******************************************************************************* + * \brief Initializes the DBM library. + * \author Ole Schuett + ******************************************************************************/ +void dbm_library_init(void); + +/******************************************************************************* + * \brief Finalizes the DBM library. + * \author Ole Schuett + ******************************************************************************/ +void dbm_library_finalize(void); + +#endif + +// EOF diff --git a/src/dbm/dbm_matrix.c b/src/dbm/dbm_matrix.c new file mode 100644 index 0000000000..cc75c0fe4a --- /dev/null +++ b/src/dbm/dbm_matrix.c @@ -0,0 +1,664 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "dbm_hyperparams.h" +#include "dbm_matrix.h" +#include "dbm_mpi.h" + +/******************************************************************************* + * \brief Creates a new matrix. + * \author Ole Schuett + ******************************************************************************/ +void dbm_create(dbm_matrix_t **matrix_out, dbm_distribution_t *dist, + const char name[], const int nrows, const int ncols, + const int row_sizes[nrows], const int col_sizes[ncols]) { + assert(omp_get_num_threads() == 1); + + dbm_matrix_t *matrix = calloc(1, sizeof(dbm_matrix_t)); + + assert(dist->rows.length == nrows); + assert(dist->cols.length == ncols); + dbm_distribution_hold(dist); + matrix->dist = dist; + + size_t size = (strlen(name) + 1) * sizeof(char); + matrix->name = malloc(size); + memcpy(matrix->name, name, size); + + matrix->nrows = nrows; + matrix->ncols = ncols; + + size = nrows * sizeof(int); + matrix->row_sizes = malloc(size); + memcpy(matrix->row_sizes, row_sizes, size); + + size = ncols * sizeof(int); + matrix->col_sizes = malloc(size); + memcpy(matrix->col_sizes, col_sizes, size); + + matrix->nshards = SHARDS_PER_THREAD * omp_get_max_threads(); + matrix->shards = malloc(matrix->nshards * sizeof(dbm_shard_t)); +#pragma omp parallel for + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_init(&matrix->shards[ishard]); + } + + assert(*matrix_out == NULL); + *matrix_out = matrix; +} + +/******************************************************************************* + * \brief Releases a matrix and all its ressources. + * \author Ole Schuett + ******************************************************************************/ +void dbm_release(dbm_matrix_t *matrix) { + assert(omp_get_num_threads() == 1); + dbm_distribution_release(matrix->dist); + free(matrix->name); + free(matrix->row_sizes); + free(matrix->col_sizes); + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_release(&matrix->shards[ishard]); + } + free(matrix->shards); + free(matrix); +} + +/******************************************************************************* + * \brief Copies content of matrix_b into matrix_a. + * Matrices must have the same row/col block sizes and distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_copy(dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b) { + assert(omp_get_num_threads() == 1); + + assert(matrix_b->nrows == matrix_a->nrows); + for (int i = 0; i < matrix_b->nrows; i++) { + assert(matrix_b->row_sizes[i] == matrix_a->row_sizes[i]); + } + assert(matrix_b->ncols == matrix_a->ncols); + for (int i = 0; i < matrix_b->ncols; i++) { + assert(matrix_b->col_sizes[i] == matrix_a->col_sizes[i]); + } + + assert(matrix_a->nshards == matrix_b->nshards); + assert(matrix_a->dist == matrix_b->dist); + +#pragma omp parallel for schedule(dynamic) + for (int ishard = 0; ishard < matrix_a->nshards; ishard++) { + dbm_shard_t *shard_a = &matrix_a->shards[ishard]; + const dbm_shard_t *shard_b = &matrix_b->shards[ishard]; + + free(shard_a->blocks); + shard_a->nblocks = shard_b->nblocks; + shard_a->nblocks_allocated = shard_b->nblocks_allocated; + shard_a->blocks = malloc(shard_b->nblocks_allocated * sizeof(dbm_block_t)); + memcpy(shard_a->blocks, shard_b->blocks, + shard_b->nblocks * sizeof(dbm_block_t)); + + free(shard_a->hashtable); + shard_a->hashtable_size = shard_b->hashtable_size; + shard_a->hashtable = malloc(shard_b->hashtable_size * sizeof(int)); + memcpy(shard_a->hashtable, shard_b->hashtable, + shard_b->hashtable_size * sizeof(int)); + + free(shard_a->data); + shard_a->data_allocated = shard_b->data_allocated; + shard_a->data = malloc(shard_b->data_allocated * sizeof(double)); + shard_a->data_size = shard_b->data_size; + memcpy(shard_a->data, shard_b->data, shard_b->data_size * sizeof(double)); + } +} + +/******************************************************************************* + * \brief Copies content of matrix_b into matrix_a. + * Matrices may have different distributions. + * \author Ole Schuett + ******************************************************************************/ +void dbm_redistribute(const dbm_matrix_t *matrix, dbm_matrix_t *redist) { + assert(omp_get_num_threads() == 1); + assert(matrix->nrows == redist->nrows); + for (int i = 0; i < matrix->nrows; i++) { + assert(matrix->row_sizes[i] == redist->row_sizes[i]); + } + assert(matrix->ncols == redist->ncols); + for (int i = 0; i < matrix->ncols; i++) { + assert(matrix->col_sizes[i] == redist->col_sizes[i]); + } + + assert(dbm_mpi_comms_are_similar(matrix->dist->comm, redist->dist->comm)); + const dbm_mpi_comm_t comm = redist->dist->comm; + const int nranks = dbm_mpi_comm_size(comm); + + // 1st pass: Compute send_count. + int send_count[nranks]; + memset(send_count, 0, nranks * sizeof(int)); + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + for (int iblock = 0; iblock < shard->nblocks; iblock++) { + const dbm_block_t *blk = &shard->blocks[iblock]; + const int row_size = matrix->row_sizes[blk->row]; + const int col_size = matrix->col_sizes[blk->col]; + const int block_size = row_size * col_size; + const int rank = dbm_get_stored_coordinates(redist, blk->row, blk->col); + assert(0 <= rank && rank < nranks); + send_count[rank] += 2 + block_size; + } + } + + // 1st communication: Exchange counts. + int recv_count[nranks]; + dbm_mpi_alltoall_int(send_count, 1, recv_count, 1, comm); + + // Compute displacements and allocate data buffers. + int send_displ[nranks + 1], recv_displ[nranks + 1]; + send_displ[0] = recv_displ[0] = 0; + for (int irank = 1; irank < nranks + 1; irank++) { + send_displ[irank] = send_displ[irank - 1] + send_count[irank - 1]; + recv_displ[irank] = recv_displ[irank - 1] + recv_count[irank - 1]; + } + const int total_send_count = send_displ[nranks]; + const int total_recv_count = recv_displ[nranks]; + double *data_send = malloc(total_send_count * sizeof(double)); + double *data_recv = malloc(total_recv_count * sizeof(double)); + + // 2nd pass: Fill send_data. + int send_data_positions[nranks]; + memcpy(send_data_positions, send_displ, nranks * sizeof(int)); + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + for (int iblock = 0; iblock < shard->nblocks; iblock++) { + const dbm_block_t *blk = &shard->blocks[iblock]; + const double *blk_data = &shard->data[blk->offset]; + const int row_size = matrix->row_sizes[blk->row]; + const int col_size = matrix->col_sizes[blk->col]; + const int block_size = row_size * col_size; + const int rank = dbm_get_stored_coordinates(redist, blk->row, blk->col); + const int pos = send_data_positions[rank]; + data_send[pos + 0] = blk->row; // send integers as doubles + data_send[pos + 1] = blk->col; + memcpy(&data_send[pos + 2], blk_data, block_size * sizeof(double)); + send_data_positions[rank] += 2 + block_size; + } + } + for (int irank = 0; irank < nranks; irank++) { + assert(send_data_positions[irank] == send_displ[irank + 1]); + } + + // 2nd communication: Exchange data. + dbm_mpi_alltoallv_double(data_send, send_count, send_displ, data_recv, + recv_count, recv_displ, comm); + free(data_send); + + // 3rd pass: Unpack data. + dbm_clear(redist); + int recv_data_pos = 0; + while (recv_data_pos < total_recv_count) { + const int row = (int)data_recv[recv_data_pos + 0]; + const int col = (int)data_recv[recv_data_pos + 1]; + assert(data_recv[recv_data_pos + 0] - (double)row == 0.0); + assert(data_recv[recv_data_pos + 1] - (double)col == 0.0); + dbm_put_block(redist, row, col, false, &data_recv[recv_data_pos + 2]); + const int row_size = matrix->row_sizes[row]; + const int col_size = matrix->col_sizes[col]; + const int block_size = row_size * col_size; + recv_data_pos += 2 + block_size; + } + assert(recv_data_pos == total_recv_count); + free(data_recv); +} + +/******************************************************************************* + * \brief Looks up a block from given matrics. This routine is thread-safe. + * If the block is not found then a null pointer is returned. + * \author Ole Schuett + ******************************************************************************/ +void dbm_get_block_p(dbm_matrix_t *matrix, const int row, const int col, + double **block, int *row_size, int *col_size) { + assert(0 <= row && row < matrix->nrows); + assert(0 <= col && col < matrix->ncols); + assert(dbm_get_stored_coordinates(matrix, row, col) == matrix->dist->my_rank); + *row_size = matrix->row_sizes[row]; + *col_size = matrix->col_sizes[col]; + *block = NULL; + + const int ishard = row % matrix->nshards; + const dbm_shard_t *shard = &matrix->shards[ishard]; + dbm_block_t *blk = dbm_shard_lookup(shard, row, col); + if (blk != NULL) { + blk->norm = -1.0; // Invalidate norm because caller might modify block data. + *block = &shard->data[blk->offset]; + } +} + +/******************************************************************************* + * \brief Adds a block to given matrix. This routine is thread-safe. + * If block already exist then it gets overwritten (or summed). + * \author Ole Schuett + ******************************************************************************/ +void dbm_put_block(dbm_matrix_t *matrix, const int row, const int col, + const bool summation, const double *block) { + assert(0 <= row && row < matrix->nrows); + assert(0 <= col && col < matrix->ncols); + assert(dbm_get_stored_coordinates(matrix, row, col) == matrix->dist->my_rank); + const int row_size = matrix->row_sizes[row]; + const int col_size = matrix->col_sizes[col]; + const int block_size = row_size * col_size; + + const int ishard = row % matrix->nshards; + dbm_shard_t *shard = &matrix->shards[ishard]; + omp_set_lock(&shard->lock); + dbm_block_t *blk = + dbm_shard_get_or_allocate_block(shard, row, col, block_size); + double *blk_data = &shard->data[blk->offset]; + if (summation) { + for (int i = 0; i < block_size; i++) { + blk_data[i] += block[i]; + } + } else { + memcpy(blk_data, block, block_size * sizeof(double)); + } + blk->norm = -1.0; // Invalidate norm because block data has changed. + omp_unset_lock(&shard->lock); +} + +/******************************************************************************* + * \brief Remove all blocks from matrix, but does not release underlying memory. + * \author Ole Schuett + ******************************************************************************/ +void dbm_clear(dbm_matrix_t *matrix) { + assert(omp_get_num_threads() == 1); + +#pragma omp parallel for schedule(dynamic) + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + shard->nblocks = 0; + shard->data_size = 0; + shard->data_promised = 0; + // Does not deallocate memory, hence data_allocated remains unchanged. + memset(shard->hashtable, 0, shard->hashtable_size * sizeof(int)); + } +} + +/******************************************************************************* + * \brief Internal routine for re-computing any invalid block norms. + * \author Ole Schuett + ******************************************************************************/ +void dbm_compute_block_norms(dbm_matrix_t *matrix) { + assert(omp_get_num_threads() == 1); + +#pragma omp parallel for schedule(dynamic) + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + for (int iblock = 0; iblock < shard->nblocks; iblock++) { + dbm_block_t *blk = &shard->blocks[iblock]; + if (blk->norm < 0.0) { // negative norms are invalid + const double *blk_data = &shard->data[blk->offset]; + const int row_size = matrix->row_sizes[blk->row]; + const int col_size = matrix->col_sizes[blk->col]; + double norm = 0.0; // Compute as double ... + for (int i = 0; i < row_size * col_size; i++) { + norm += blk_data[i] * blk_data[i]; + } + blk->norm = (float)norm; // ...store as float. + } + } + } +} + +/******************************************************************************* + * \brief Removes all blocks from the matrix whose norm is below the threshold. + * Blocks of size zero are always kept. + * \author Ole Schuett + ******************************************************************************/ +void dbm_filter(dbm_matrix_t *matrix, const double eps) { + assert(omp_get_num_threads() == 1); + + if (eps == 0.0) { + return; + } + + dbm_compute_block_norms(matrix); + +#pragma omp parallel for schedule(dynamic) + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + const int old_nblocks = shard->nblocks; + shard->nblocks = 0; + shard->data_promised = 0; + memset(shard->hashtable, 0, shard->hashtable_size * sizeof(int)); + + for (int iblock = 0; iblock < old_nblocks; iblock++) { + const dbm_block_t old_blk = shard->blocks[iblock]; + const int row_size = matrix->row_sizes[old_blk.row]; + const int col_size = matrix->col_sizes[old_blk.col]; + const int block_size = row_size * col_size; + // For historic reasons zero-sized blocks are never filtered. + if (sqrt(old_blk.norm) < eps && block_size > 0) { + continue; // filter the block + } + // Re-create block. + dbm_block_t *new_blk = dbm_shard_promise_new_block( + shard, old_blk.row, old_blk.col, block_size); + new_blk->norm = old_blk.norm; + assert(new_blk->offset <= old_blk.offset); + if (new_blk->offset != old_blk.offset) { + // Using memmove instead of memcpy because it handles overlap correctly. + const double *old_blk_data = &shard->data[old_blk.offset]; + double *new_blk_data = &shard->data[new_blk->offset]; + memmove(new_blk_data, old_blk_data, block_size * sizeof(double)); + } + } + shard->data_size = shard->data_promised; + // TODO: Could call realloc to release excess memory. + } +} + +/******************************************************************************* + * \brief Adds list of blocks efficiently. The blocks will be filled with zeros. + * \author Ole Schuett + ******************************************************************************/ +void dbm_reserve_blocks(dbm_matrix_t *matrix, const int nblocks, + const int rows[], const int cols[]) { + assert(omp_get_num_threads() == 1); + const int my_rank = matrix->dist->my_rank; + +#pragma omp parallel for schedule(dynamic) + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + for (int i = 0; i < nblocks; i++) { + if (rows[i] % matrix->nshards == ishard) { + assert(0 <= rows[i] && rows[i] < matrix->nrows); + assert(0 <= cols[i] && cols[i] < matrix->ncols); + assert(dbm_get_stored_coordinates(matrix, rows[i], cols[i]) == my_rank); + const int row_size = matrix->row_sizes[rows[i]]; + const int col_size = matrix->col_sizes[cols[i]]; + const int block_size = row_size * col_size; + dbm_shard_get_or_promise_block(shard, rows[i], cols[i], block_size); + } + } + dbm_shard_allocate_promised_blocks(shard); + } +} + +/******************************************************************************* + * \brief Multiplies all entries in the given matrix by the given factor alpha. + * \author Ole Schuett + ******************************************************************************/ +void dbm_scale(dbm_matrix_t *matrix, const double alpha) { + assert(omp_get_num_threads() == 1); + if (alpha == 1.0) { + return; + } + if (alpha == 0.0) { + dbm_zero(matrix); + return; + } + +#pragma omp parallel for schedule(dynamic) + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + for (int i = 0; i < shard->data_size; i++) { + shard->data[i] *= alpha; + } + const double alpha2 = alpha * alpha; + for (int iblock = 0; iblock < shard->nblocks; iblock++) { + dbm_block_t *blk = &shard->blocks[iblock]; + blk->norm *= alpha2; + } + } +} + +/******************************************************************************* + * \brief Sets all blocks in the given matrix to zero. + * \author Ole Schuett + ******************************************************************************/ +void dbm_zero(dbm_matrix_t *matrix) { + assert(omp_get_num_threads() == 1); + +#pragma omp parallel for schedule(dynamic) + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + memset(shard->data, 0, shard->data_size * sizeof(double)); + for (int iblock = 0; iblock < shard->nblocks; iblock++) { + shard->blocks[iblock].norm = 0.0; + } + } +} + +/******************************************************************************* + * \brief Adds matrix_b to matrix_a. + * \author Ole Schuett + ******************************************************************************/ +void dbm_add(dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b) { + assert(omp_get_num_threads() == 1); + assert(matrix_a->nshards == matrix_b->nshards); + assert(matrix_a->dist == matrix_b->dist); + +#pragma omp parallel for schedule(dynamic) + for (int ishard = 0; ishard < matrix_b->nshards; ishard++) { + dbm_shard_t *shard_a = &matrix_a->shards[ishard]; + const dbm_shard_t *shard_b = &matrix_b->shards[ishard]; + for (int iblock = 0; iblock < shard_b->nblocks; iblock++) { + const dbm_block_t blk_b = shard_b->blocks[iblock]; + + const int row_size = matrix_b->row_sizes[blk_b.row]; + const int col_size = matrix_b->col_sizes[blk_b.col]; + assert(row_size == matrix_a->row_sizes[blk_b.row]); + assert(col_size == matrix_a->col_sizes[blk_b.col]); + const int block_size = row_size * col_size; + dbm_block_t *blk_a = dbm_shard_get_or_allocate_block( + shard_a, blk_b.row, blk_b.col, block_size); + double *data_a = &shard_a->data[blk_a->offset]; + const double *data_b = &shard_b->data[blk_b.offset]; + for (int i = 0; i < block_size; i++) { + data_a[i] += data_b[i]; + } + blk_a->norm = -1.0; // Invalidate norm because block data has changed. + } + } +} + +/******************************************************************************* + * \brief Creates an iterator for the blocks of the given matrix. + * The iteration order is not stable. + * \author Ole Schuett + ******************************************************************************/ +void dbm_iterator_start(dbm_iterator_t **iter_out, const dbm_matrix_t *matrix) { + assert(omp_get_num_threads() == 1); + dbm_iterator_t *iter = malloc(sizeof(dbm_iterator_t)); + iter->matrix = matrix; + iter->next_block = 0; + iter->next_shard = 0; + while (iter->next_shard < matrix->nshards) { + if (matrix->shards[iter->next_shard].nblocks > 0) { + break; + } + iter->next_shard++; + } + assert(*iter_out == NULL); + *iter_out = iter; +} + +/******************************************************************************* + * \brief Tests whether the given iterator has any block left. + * \author Ole Schuett + ******************************************************************************/ +bool dbm_iterator_blocks_left(const dbm_iterator_t *iter) { + assert(omp_get_num_threads() == 1); + return iter->next_shard < iter->matrix->nshards; +} + +/******************************************************************************* + * \brief Returns the next block from the given iterator. + * \author Ole Schuett + ******************************************************************************/ +void dbm_iterator_next_block(dbm_iterator_t *iter, int *row, int *col, + double **block, int *row_size, int *col_size) { + assert(omp_get_num_threads() == 1); + const dbm_matrix_t *matrix = iter->matrix; + assert(iter->next_shard < matrix->nshards); + const dbm_shard_t *shard = &matrix->shards[iter->next_shard]; + assert(iter->next_block < shard->nblocks); + dbm_block_t *blk = &shard->blocks[iter->next_block]; + + *row = blk->row; + *col = blk->col; + *row_size = matrix->row_sizes[blk->row]; + *col_size = matrix->col_sizes[blk->col]; + *block = &shard->data[blk->offset]; + blk->norm = -1.0; // Invalidate norm because caller might modify block data. + + iter->next_block++; + if (iter->next_block >= shard->nblocks) { + // Advance to the next non-empty shard... + iter->next_shard++; + while (iter->next_shard < matrix->nshards && + matrix->shards[iter->next_shard].nblocks == 0) { + iter->next_shard++; + } + iter->next_block = 0; // ...and continue with its first block. + } +} + +/******************************************************************************* + * \brief Releases the given iterator. + * \author Ole Schuett + ******************************************************************************/ +void dbm_iterator_stop(dbm_iterator_t *iter) { + assert(omp_get_num_threads() == 1); + free(iter); +} + +/******************************************************************************* + * \brief Computes a checksum of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +double dbm_checksum(const dbm_matrix_t *matrix) { + double checksum = 0.0; + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + const dbm_shard_t *shard = &matrix->shards[ishard]; + for (int i = 0; i < shard->data_size; i++) { + checksum += shard->data[i] * shard->data[i]; + } + } + dbm_mpi_sum_double(&checksum, 1, matrix->dist->comm); + return checksum; +} + +/******************************************************************************* + * \brief Returns the absolute value of the larges element of the entire matrix. + * \author Ole Schuett + ******************************************************************************/ +double dbm_maxabs(const dbm_matrix_t *matrix) { + double maxabs = 0.0; + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + for (int i = 0; i < shard->data_size; i++) { + maxabs = fmax(maxabs, fabs(shard->data[i])); + } + } + dbm_mpi_max_double(&maxabs, 1, matrix->dist->comm); + return maxabs; +} + +/******************************************************************************* + * \brief Returns the name of the matrix of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +const char *dbm_get_name(const dbm_matrix_t *matrix) { return matrix->name; } + +/******************************************************************************* + * \brief Returns the number of local Non-Zero Elements of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +int dbm_get_nze(const dbm_matrix_t *matrix) { + int nze = 0; + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + nze += matrix->shards[ishard].data_size; + } + return nze; +} + +/******************************************************************************* + * \brief Returns the number of local blocks of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +int dbm_get_num_blocks(const dbm_matrix_t *matrix) { + int nblocks = 0; + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + nblocks += matrix->shards[ishard].nblocks; + } + return nblocks; +} + +/******************************************************************************* + * \brief Returns the row block sizes of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +void dbm_get_row_sizes(const dbm_matrix_t *matrix, int *nrows, + const int **row_sizes) { + *nrows = matrix->nrows; + *row_sizes = matrix->row_sizes; +} + +/******************************************************************************* + * \brief Returns the column block sizes of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +void dbm_get_col_sizes(const dbm_matrix_t *matrix, int *ncols, + const int **col_sizes) { + *ncols = matrix->ncols; + *col_sizes = matrix->col_sizes; +} + +/******************************************************************************* + * \brief Returns the local row block sizes of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +void dbm_get_local_rows(const dbm_matrix_t *matrix, int *nlocal_rows, + const int **local_rows) { + *nlocal_rows = matrix->dist->rows.nlocals; + *local_rows = matrix->dist->rows.local_indicies; +} + +/******************************************************************************* + * \brief Returns the local column block sizes of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +void dbm_get_local_cols(const dbm_matrix_t *matrix, int *nlocal_cols, + const int **local_cols) { + *nlocal_cols = matrix->dist->cols.nlocals; + *local_cols = matrix->dist->cols.local_indicies; +} + +/******************************************************************************* + * \brief Returns the MPI rank on which the given block should be stored. + * \author Ole Schuett + ******************************************************************************/ +int dbm_get_stored_coordinates(const dbm_matrix_t *matrix, const int row, + const int col) { + return dbm_distribution_stored_coords(matrix->dist, row, col); +} + +/******************************************************************************* + * \brief Returns the distribution of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +const dbm_distribution_t *dbm_get_distribution(const dbm_matrix_t *matrix) { + return matrix->dist; +} + +// EOF diff --git a/src/dbm/dbm_matrix.h b/src/dbm/dbm_matrix.h new file mode 100644 index 0000000000..b167f46266 --- /dev/null +++ b/src/dbm/dbm_matrix.h @@ -0,0 +1,229 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#ifndef DBM_MATRIX_H +#define DBM_MATRIX_H + +#include + +#include "dbm_distribution.h" +#include "dbm_shard.h" + +/******************************************************************************* + * \brief Internal struct for storing a matrix. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + dbm_distribution_t *dist; + char *name; + int nrows; + int ncols; + int *row_sizes; + int *col_sizes; + + int nshards; + dbm_shard_t *shards; +} dbm_matrix_t; + +/******************************************************************************* + * \brief Internal struct for storing a block iterator. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + const dbm_matrix_t *matrix; + int next_block; + int next_shard; +} dbm_iterator_t; + +/******************************************************************************* + * \brief Creates a new matrix. + * \author Ole Schuett + ******************************************************************************/ +void dbm_create(dbm_matrix_t **matrix_out, dbm_distribution_t *dist, + const char name[], const int nrows, const int ncols, + const int row_sizes[nrows], const int col_sizes[ncols]); + +/******************************************************************************* + * \brief Releases a matrix and all its ressources. + * \author Ole Schuett + ******************************************************************************/ +void dbm_release(dbm_matrix_t *matrix); + +/******************************************************************************* + * \brief Copies content of matrix_b into matrix_a. + * Matrices must have the same row/col block sizes and distribution. + * \author Ole Schuett + ******************************************************************************/ +void dbm_copy(dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b); + +/******************************************************************************* + * \brief Copies content of matrix_b into matrix_a. + * Matrices may have different distributions. + * \author Ole Schuett + ******************************************************************************/ +void dbm_redistribute(const dbm_matrix_t *matrix, dbm_matrix_t *redist); + +/******************************************************************************* + * \brief Looks up a block from given matrics. This routine is thread-safe. + * If the block is not found then a null pointer is returned. + * \author Ole Schuett + ******************************************************************************/ +void dbm_get_block_p(dbm_matrix_t *matrix, const int row, const int col, + double **block, int *row_size, int *col_size); + +/******************************************************************************* + * \brief Adds a block to given matrix. This routine is thread-safe. + * If block already exist then it gets overwritten (or summed). + * \author Ole Schuett + ******************************************************************************/ +void dbm_put_block(dbm_matrix_t *matrix, const int row, const int col, + const bool summation, const double *block); + +/******************************************************************************* + * \brief Remove all blocks from matrix, but does not release underlying memory. + * \author Ole Schuett + ******************************************************************************/ +void dbm_clear(dbm_matrix_t *matrix); + +/******************************************************************************* + * \brief Internal routine for re-computing any invalid block norms. + * \author Ole Schuett + ******************************************************************************/ +void dbm_compute_block_norms(dbm_matrix_t *matrix); + +/******************************************************************************* + * \brief Removes all blocks from the matrix whose norm is below the threshold. + * Blocks of size zero are always kept. + * \author Ole Schuett + ******************************************************************************/ +void dbm_filter(dbm_matrix_t *matrix, const double eps); + +/******************************************************************************* + * \brief Adds list of blocks efficiently. The blocks will be filled with zeros. + * \author Ole Schuett + ******************************************************************************/ +void dbm_reserve_blocks(dbm_matrix_t *matrix, const int nblocks, + const int rows[], const int cols[]); + +/******************************************************************************* + * \brief Multiplies all entries in the given matrix by the given factor alpha. + * \author Ole Schuett + ******************************************************************************/ +void dbm_scale(dbm_matrix_t *matrix, const double alpha); + +/******************************************************************************* + * \brief Sets all blocks in the given matrix to zero. + * \author Ole Schuett + ******************************************************************************/ +void dbm_zero(dbm_matrix_t *matrix); + +/******************************************************************************* + * \brief Adds matrix_b to matrix_a. + * \author Ole Schuett + ******************************************************************************/ +void dbm_add(dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b); + +/******************************************************************************* + * \brief Creates an iterator for the blocks of the given matrix. + * The iteration order is not stable. + * \author Ole Schuett + ******************************************************************************/ +void dbm_iterator_start(dbm_iterator_t **iter_out, const dbm_matrix_t *matrix); + +/******************************************************************************* + * \brief Tests whether the given iterator has any block left. + * \author Ole Schuett + ******************************************************************************/ +bool dbm_iterator_blocks_left(const dbm_iterator_t *iter); + +/******************************************************************************* + * \brief Returns the next block from the given iterator. + * \author Ole Schuett + ******************************************************************************/ +void dbm_iterator_next_block(dbm_iterator_t *iter, int *row, int *col, + double **block, int *row_size, int *col_size); + +/******************************************************************************* + * \brief Releases the given iterator. + * \author Ole Schuett + ******************************************************************************/ +void dbm_iterator_stop(dbm_iterator_t *iter); + +/******************************************************************************* + * \brief Computes a checksum of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +double dbm_checksum(const dbm_matrix_t *matrix); + +/******************************************************************************* + * \brief Returns the absolute value of the larges element of the entire matrix. + * \author Ole Schuett + ******************************************************************************/ +double dbm_maxabs(const dbm_matrix_t *matrix); + +/******************************************************************************* + * \brief Returns the name of the matrix of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +const char *dbm_get_name(const dbm_matrix_t *matrix); + +/******************************************************************************* + * \brief Returns the number of local Non-Zero Elements of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +int dbm_get_nze(const dbm_matrix_t *matrix); + +/******************************************************************************* + * \brief Returns the number of local blocks of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +int dbm_get_num_blocks(const dbm_matrix_t *matrix); + +/******************************************************************************* + * \brief Returns the row block sizes of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +void dbm_get_row_sizes(const dbm_matrix_t *matrix, int *nrows, + const int **row_sizes); + +/******************************************************************************* + * \brief Returns the column block sizes of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +void dbm_get_col_sizes(const dbm_matrix_t *matrix, int *ncols, + const int **col_sizes); + +/******************************************************************************* + * \brief Returns the local row block sizes of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +void dbm_get_local_rows(const dbm_matrix_t *matrix, int *nlocal_rows, + const int **local_rows); + +/******************************************************************************* + * \brief Returns the local column block sizes of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +void dbm_get_local_cols(const dbm_matrix_t *matrix, int *nlocal_cols, + const int **local_cols); + +/******************************************************************************* + * \brief Returns the MPI rank on which the given block should be stored. + * \author Ole Schuett + ******************************************************************************/ +int dbm_get_stored_coordinates(const dbm_matrix_t *matrix, const int row, + const int col); + +/******************************************************************************* + * \brief Returns the distribution of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +const dbm_distribution_t *dbm_get_distribution(const dbm_matrix_t *matrix); + +#endif + +// EOF diff --git a/src/dbm/dbm_mempool.c b/src/dbm/dbm_mempool.c new file mode 100644 index 0000000000..819508d500 --- /dev/null +++ b/src/dbm/dbm_mempool.c @@ -0,0 +1,220 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include + +#if defined(__DBM_CUDA) +#include +#endif + +#include "../offload/offload_library.h" +#include "dbm_mempool.h" + +/******************************************************************************* + * \brief Check given Cuda status and upon failure abort with a nice message. + * \author Ole Schuett + ******************************************************************************/ +#define CHECK(status) \ + if (status != cudaSuccess) { \ + fprintf(stderr, "ERROR: %s %s %d\n", cudaGetErrorString(status), __FILE__, \ + __LINE__); \ + abort(); \ + } + +/******************************************************************************* + * \brief Private routine for actually allocating system memory. + * \author Ole Schuett + ******************************************************************************/ +static void *actual_malloc(const size_t size, const bool on_device) { + (void)on_device; // mark used + +#if defined(__DBM_CUDA) + if (on_device) { + void *memory; + offload_set_device(); + CHECK(cudaMalloc(&memory, size)); + assert(memory != NULL); + return memory; + } +#else + (void)on_device; // mark used +#endif + + void *memory = malloc(size); + assert(memory != NULL); + return memory; +} + +/******************************************************************************* + * \brief Private routine for actually freeing system memory. + * \author Ole Schuett + ******************************************************************************/ +static void actual_free(void *memory, const bool on_device) { + if (memory == NULL) { + return; + } + +#if defined(__DBM_CUDA) + if (on_device) { + offload_set_device(); + CHECK(cudaFree(memory)); + return; + } +#else + (void)on_device; // mark used +#endif + + free(memory); +} + +/******************************************************************************* + * \brief Private struct for storing a chunk of memory. + * \author Ole Schuett + ******************************************************************************/ +struct dbm_memchunk { + bool on_device; + size_t size; + void *mem; + struct dbm_memchunk *next; +}; +typedef struct dbm_memchunk dbm_memchunk_t; + +/******************************************************************************* + * \brief Private linked list of memory chunks that are available. + * \author Ole Schuett + ******************************************************************************/ +static dbm_memchunk_t *mempool_available_head = NULL; + +/******************************************************************************* + * \brief Private linked list of memory chunks that are in use. + * \author Ole Schuett + ******************************************************************************/ +static dbm_memchunk_t *mempool_allocated_head = NULL; + +/******************************************************************************* + * \brief Private routine for allocating host or device memory from the pool. + * \author Ole Schuett + ******************************************************************************/ +static void *internal_mempool_malloc(const size_t size, const bool on_device) { + if (size == 0) { + return NULL; + } + + dbm_memchunk_t *chunk; + +#pragma omp critical(dbm_mempool_modify) + { + // Find a suitable chuck in mempool_available. + dbm_memchunk_t **indirect = &mempool_available_head; + while (*indirect != NULL && (*indirect)->on_device != on_device) { + indirect = &(*indirect)->next; + } + chunk = *indirect; + + // If a chunck was found, remove it from mempool_available. + if (chunk != NULL) { + assert(chunk->on_device == on_device); + *indirect = chunk->next; + } + + // If no chunk was found, allocate a new one. + if (chunk == NULL) { + chunk = malloc(sizeof(dbm_memchunk_t)); + chunk->on_device = on_device; + chunk->size = 0; + chunk->mem = NULL; + } + + // Resize chunk if needed. + if (chunk->size < size) { + actual_free(chunk->mem, chunk->on_device); + chunk->mem = actual_malloc(size, chunk->on_device); + chunk->size = size; + } + + // Insert chunk into mempool_allocated. + chunk->next = mempool_allocated_head; + mempool_allocated_head = chunk; + } + + return chunk->mem; +} + +/******************************************************************************* + * \brief Internal routine for allocating host memory from the pool. + * \author Ole Schuett + ******************************************************************************/ +void *dbm_mempool_host_malloc(const size_t size) { + return internal_mempool_malloc(size, false); +} + +/******************************************************************************* + * \brief Internal routine for allocating device memory from the pool + * \author Ole Schuett + ******************************************************************************/ +void *dbm_mempool_device_malloc(const size_t size) { + return internal_mempool_malloc(size, true); +} + +/******************************************************************************* + * \brief Internal routine for releasing memory back to the pool. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mempool_free(void *mem) { + if (mem == NULL) { + return; + } + +#pragma omp critical(dbm_mempool_modify) + { + // Find chuck in mempool_allocated. + dbm_memchunk_t **indirect = &mempool_allocated_head; + while (*indirect != NULL && (*indirect)->mem != mem) { + indirect = &(*indirect)->next; + } + dbm_memchunk_t *chunk = *indirect; + assert(chunk != NULL && chunk->mem == mem); + + // Remove chuck from mempool_allocated. + *indirect = chunk->next; + + // Add chuck to mempool_available. + chunk->next = mempool_available_head; + mempool_available_head = chunk; + } +} + +/******************************************************************************* + * \brief Internal routine for freeing all memory in the pool. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mempool_clear(void) { + assert(omp_get_num_threads() == 1); + assert(mempool_allocated_head == NULL); // check for memory leak + + // while (mempool_allocated_head != NULL) { + // dbm_memchunk_t *chunk = mempool_allocated_head; + // mempool_allocated_head = chunk->next; + // printf("Found alloacted memory chunk of size: %lu\n", chunk->size); + // actual_free(chunk->mem, chunk->on_device); + // free(chunk); + //} + + // Free chunks in mempool_avavailable. + while (mempool_available_head != NULL) { + dbm_memchunk_t *chunk = mempool_available_head; + mempool_available_head = chunk->next; + actual_free(chunk->mem, chunk->on_device); + free(chunk); + } +} + +// EOF diff --git a/src/dbm/dbm_mempool.h b/src/dbm/dbm_mempool.h new file mode 100644 index 0000000000..b8d1c511bc --- /dev/null +++ b/src/dbm/dbm_mempool.h @@ -0,0 +1,46 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ +#ifndef DBM_MEMPOOL_H +#define DBM_MEMPOOL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/******************************************************************************* + * \brief Internal routine for allocating host memory from the pool. + * \author Ole Schuett + ******************************************************************************/ +void *dbm_mempool_host_malloc(const size_t size); + +/******************************************************************************* + * \brief Internal routine for allocating device memory from the pool. + * \author Ole Schuett + ******************************************************************************/ +void *dbm_mempool_device_malloc(const size_t size); + +/******************************************************************************* + * \brief Internal routine for releasing memory back to the pool. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mempool_free(void *memory); + +/******************************************************************************* + * \brief Internal routine for freeing all memory in the pool. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mempool_clear(void); + +#ifdef __cplusplus +} +#endif + +#endif + +// EOF diff --git a/src/dbm/dbm_miniapp.c b/src/dbm/dbm_miniapp.c new file mode 100644 index 0000000000..ca0ac23c5b --- /dev/null +++ b/src/dbm/dbm_miniapp.c @@ -0,0 +1,184 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include + +#include "../offload/offload_library.h" +#include "dbm_library.h" +#include "dbm_matrix.h" +#include "dbm_mpi.h" + +/******************************************************************************* + * \brief Private routine for subtracting two timespecs. + * \author Ole Schuett + ******************************************************************************/ +static double time_diff(const struct timespec start, + const struct timespec end) { + return (end.tv_sec - start.tv_sec) + 1e-9 * (end.tv_nsec - start.tv_nsec); +} + +/******************************************************************************* + * \brief Private routine for creating a distribution and an empty matrix. + * \author Ole Schuett + ******************************************************************************/ +static dbm_matrix_t *create_some_matrix(const dbm_mpi_comm_t comm) { + const int nrow = 200; + const int ncol = 200; + + int cart_dims[2], cart_periods[2], cart_coords[2]; + dbm_mpi_cart_get(comm, 2, cart_dims, cart_periods, cart_coords); + + // Create distribution. + int row_dist[nrow]; + int col_dist[ncol]; + for (int i = 0; i < nrow; i++) { + row_dist[i] = i % cart_dims[0]; + } + for (int i = 0; i < ncol; i++) { + col_dist[i] = i % cart_dims[1]; + } + const int fortran_comm = dbm_mpi_comm_c2f(comm); + dbm_distribution_t *dist = NULL; + dbm_distribution_new(&dist, fortran_comm, nrow, ncol, row_dist, col_dist); + + // Create matrix. + int row_sizes[nrow]; + int col_sizes[ncol]; + for (int i = 0; i < nrow; i++) { + row_sizes[i] = 18; + } + for (int i = 0; i < ncol; i++) { + col_sizes[i] = 18; + } + dbm_matrix_t *matrix = NULL; + dbm_create(&matrix, dist, "some name", nrow, ncol, row_sizes, col_sizes); + dbm_distribution_release(dist); + return matrix; +} + +/******************************************************************************* + * \brief Private routine for reserving all blocks of the given matrix. + * \author Ole Schuett + ******************************************************************************/ +static void reserve_all_blocks(dbm_matrix_t *matrix) { + int nrows, ncols; + const int *row_sizes, *col_sizes; + dbm_get_row_sizes(matrix, &nrows, &row_sizes); + dbm_get_col_sizes(matrix, &ncols, &col_sizes); + + const int nblocks = nrows * ncols; + int reserve_row[nblocks]; + int reserve_col[nblocks]; + + int nblocks_reserve = 0; + for (int i = 0; i < nblocks; i++) { + const int row = i % nrows; + const int col = i % ncols; + if (dbm_get_stored_coordinates(matrix, row, col) == matrix->dist->my_rank) { + reserve_row[nblocks_reserve] = row; + reserve_col[nblocks_reserve] = col; + nblocks_reserve++; + } + } + dbm_reserve_blocks(matrix, nblocks_reserve, reserve_row, reserve_col); +} + +/******************************************************************************* + * \brief Private routine for setting all blocks to 1.0. + * \author Ole Schuett + ******************************************************************************/ +static void set_all_blocks(dbm_matrix_t *matrix) { + dbm_iterator_t *iter = NULL; + dbm_iterator_start(&iter, matrix); + while (dbm_iterator_blocks_left(iter)) { + int row, col, row_size, col_size; + double *block; + dbm_iterator_next_block(iter, &row, &col, &block, &row_size, &col_size); + const int block_size = row_size * col_size; + for (int i = 0; i < block_size; i++) { + block[i] = 1.0; + } + } + dbm_iterator_stop(iter); +} + +/******************************************************************************* + * \brief Stand-alone miniapp for smoke-testing and benchmarking dbm_multiply. + * \author Ole Schuett + ******************************************************************************/ +int main(int argc, char *argv[]) { + dbm_mpi_init(&argc, &argv); + + const dbm_mpi_comm_t world_comm = dbm_mpi_get_comm_world(); + const int nranks = dbm_mpi_comm_size(world_comm); + const int my_rank = dbm_mpi_comm_rank(world_comm); + + if (my_rank == 0) { + printf("MPI ranks: %i\n", nranks); + printf("OpenMP threads: %i\n", omp_get_max_threads()); + } + + if (offload_get_device_count() > 0) { + offload_set_device_id(my_rank % offload_get_device_count()); + } + + // Create 2D cart. + const int dims[2] = {nranks, 1}; + const int periods[2] = {true, true}; + dbm_mpi_comm_t comm = + dbm_mpi_cart_create(world_comm, 2, dims, periods, false); + + dbm_library_init(); + + // Benchmark reserve blocks. + struct timespec time_start, time_end; + clock_gettime(CLOCK_REALTIME, &time_start); + for (int icycle = 0; icycle < 50; icycle++) { + dbm_matrix_t *matrix = create_some_matrix(comm); + reserve_all_blocks(matrix); + dbm_release(matrix); + } + clock_gettime(CLOCK_REALTIME, &time_end); + if (my_rank == 0) { + printf("reserve blocks: %.3f seconds\n", time_diff(time_start, time_end)); + } + + // Benchmark matrix multiply. + dbm_matrix_t *matrix_a = create_some_matrix(comm); + dbm_matrix_t *matrix_b = create_some_matrix(comm); + dbm_matrix_t *matrix_c = create_some_matrix(comm); + reserve_all_blocks(matrix_a); + reserve_all_blocks(matrix_b); + set_all_blocks(matrix_a); + set_all_blocks(matrix_b); + long flop; + clock_gettime(CLOCK_REALTIME, &time_start); + dbm_multiply(false, false, 1.0, matrix_a, matrix_b, 1.0, matrix_c, false, + 1e-8, &flop); + clock_gettime(CLOCK_REALTIME, &time_end); + dbm_release(matrix_a); + dbm_release(matrix_b); + dbm_release(matrix_c); + + const double t = time_diff(time_start, time_end); + if (my_rank == 0) { + printf("matrix multiply: %.3f s, %.1f MFLOP/s \n", t, 1e-9 * flop / t); + printf("done :-)\n"); + } + + dbm_library_finalize(); + dbm_mpi_comm_free(&comm); + dbm_mpi_finalize(); + return 0; +} + +// EOF diff --git a/src/dbm/dbm_mpi.c b/src/dbm/dbm_mpi.c new file mode 100644 index 0000000000..1358bf9f13 --- /dev/null +++ b/src/dbm/dbm_mpi.c @@ -0,0 +1,421 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#include +#include +#include +#include + +#include "dbm_mpi.h" + +#if defined(__parallel) +/******************************************************************************* + * \brief Check given MPI status and upon failure abort with a nice message. + * \author Ole Schuett + ******************************************************************************/ +#define CHECK(status) \ + if (status != MPI_SUCCESS) { \ + fprintf(stderr, "MPI error in %s:%i\n", __FILE__, __LINE__); \ + MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); \ + } +#endif + +/******************************************************************************* + * \brief Wrapper around MPI_Init. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_init(int *argc, char ***argv) { +#if defined(__parallel) + CHECK(MPI_Init(argc, argv)); +#else + (void)argc; // mark used + (void)argv; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Finalize. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_finalize() { +#if defined(__parallel) + CHECK(MPI_Finalize()); +#endif +} + +/******************************************************************************* + * \brief Returns MPI_COMM_WORLD. + * \author Ole Schuett + ******************************************************************************/ +dbm_mpi_comm_t dbm_mpi_get_comm_world() { +#if defined(__parallel) + return MPI_COMM_WORLD; +#else + return -1; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_f2c. + * \author Ole Schuett + ******************************************************************************/ +dbm_mpi_comm_t dbm_mpi_comm_f2c(const int fortran_comm) { +#if defined(__parallel) + return MPI_Comm_f2c(fortran_comm); +#else + (void)fortran_comm; // mark used + return -1; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_c2f. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_comm_c2f(const dbm_mpi_comm_t comm) { +#if defined(__parallel) + return MPI_Comm_c2f(comm); +#else + (void)comm; // mark used + return -1; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_rank. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_comm_rank(const dbm_mpi_comm_t comm) { +#if defined(__parallel) + int rank; + CHECK(MPI_Comm_rank(comm, &rank)); + return rank; +#else + (void)comm; // mark used + return 0; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_size. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_comm_size(const dbm_mpi_comm_t comm) { +#if defined(__parallel) + int nranks; + CHECK(MPI_Comm_size(comm, &nranks)); + return nranks; +#else + (void)comm; // mark used + return 1; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Cart_create. + * \author Ole Schuett + ******************************************************************************/ +dbm_mpi_comm_t dbm_mpi_cart_create(const dbm_mpi_comm_t comm_old, + const int ndims, const int dims[], + const int periods[], const int reorder) { +#if defined(__parallel) + dbm_mpi_comm_t comm_cart; + CHECK(MPI_Cart_create(comm_old, ndims, dims, periods, reorder, &comm_cart)); + return comm_cart; +#else + (void)comm_old; // mark used + (void)ndims; + (void)dims; + (void)periods; + (void)reorder; + return -1; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Cart_get. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_cart_get(const dbm_mpi_comm_t comm, int maxdims, int dims[], + int periods[], int coords[]) { +#if defined(__parallel) + CHECK(MPI_Cart_get(comm, maxdims, dims, periods, coords)); +#else + (void)comm; // mark used + for (int i = 0; i < maxdims; i++) { + dims[i] = 1; + periods[i] = 1; + coords[i] = 0; + } +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Cart_rank. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_cart_rank(const dbm_mpi_comm_t comm, const int coords[]) { +#if defined(__parallel) + int rank; + CHECK(MPI_Cart_rank(comm, coords, &rank)); + return rank; +#else + (void)comm; // mark used + (void)coords; + return 0; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Cart_sub. + * \author Ole Schuett + ******************************************************************************/ +dbm_mpi_comm_t dbm_mpi_cart_sub(const dbm_mpi_comm_t comm, + const int remain_dims[]) { +#if defined(__parallel) + dbm_mpi_comm_t newcomm; + CHECK(MPI_Cart_sub(comm, remain_dims, &newcomm)); + return newcomm; +#else + (void)comm; // mark used + (void)remain_dims; + return -1; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_free. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_comm_free(dbm_mpi_comm_t *comm) { +#if defined(__parallel) + CHECK(MPI_Comm_free(comm)); +#else + (void)comm; // mark used +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_compare. + * \author Ole Schuett + ******************************************************************************/ +bool dbm_mpi_comms_are_similar(const dbm_mpi_comm_t comm1, + const dbm_mpi_comm_t comm2) { +#if defined(__parallel) + int res; + CHECK(MPI_Comm_compare(comm1, comm2, &res)); + return res == MPI_IDENT || res == MPI_CONGRUENT || res == MPI_SIMILAR; +#else + (void)comm1; // mark used + (void)comm2; + return true; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Allreduce for op MPI_MAX and datatype MPI_INT. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_max_int(int *values, const int count, const dbm_mpi_comm_t comm) { +#if defined(__parallel) + int *recvbuf = malloc(count * sizeof(int)); + CHECK(MPI_Allreduce(values, recvbuf, count, MPI_INT, MPI_MAX, comm)); + memcpy(values, recvbuf, count * sizeof(int)); + free(recvbuf); +#else + (void)comm; // mark used + (void)values; + (void)count; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Allreduce for op MPI_MAX and datatype MPI_DOUBLE. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_max_double(double *values, const int count, + const dbm_mpi_comm_t comm) { +#if defined(__parallel) + double *recvbuf = malloc(count * sizeof(double)); + CHECK(MPI_Allreduce(values, recvbuf, count, MPI_DOUBLE, MPI_MAX, comm)); + memcpy(values, recvbuf, count * sizeof(double)); + free(recvbuf); +#else + (void)comm; // mark used + (void)values; + (void)count; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_INT. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_sum_int(int *values, const int count, const dbm_mpi_comm_t comm) { +#if defined(__parallel) + int *recvbuf = malloc(count * sizeof(int)); + CHECK(MPI_Allreduce(values, recvbuf, count, MPI_INT, MPI_SUM, comm)); + memcpy(values, recvbuf, count * sizeof(int)); + free(recvbuf); +#else + (void)comm; // mark used + (void)values; + (void)count; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_LONG. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_sum_long(long *values, const int count, + const dbm_mpi_comm_t comm) { +#if defined(__parallel) + long *recvbuf = malloc(count * sizeof(long)); + CHECK(MPI_Allreduce(values, recvbuf, count, MPI_LONG, MPI_SUM, comm)); + memcpy(values, recvbuf, count * sizeof(long)); + free(recvbuf); +#else + (void)comm; // mark used + (void)values; + (void)count; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_DOUBLE. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_sum_double(double *values, const int count, + const dbm_mpi_comm_t comm) { +#if defined(__parallel) + double *recvbuf = malloc(count * sizeof(double)); + CHECK(MPI_Allreduce(values, recvbuf, count, MPI_DOUBLE, MPI_SUM, comm)); + memcpy(values, recvbuf, count * sizeof(double)); + free(recvbuf); +#else + (void)comm; // mark used + (void)values; + (void)count; +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Sendrecv for datatype MPI_BYTE. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_sendrecv_byte(const void *sendbuf, const int sendcount, + const int dest, const int sendtag, void *recvbuf, + const int recvcount, const int source, + const int recvtag, const dbm_mpi_comm_t comm) { +#if defined(__parallel) + MPI_Status status; + CHECK(MPI_Sendrecv(sendbuf, sendcount, MPI_BYTE, dest, sendtag, recvbuf, + recvcount, MPI_BYTE, source, recvtag, comm, &status)) + int count_received; + CHECK(MPI_Get_count(&status, MPI_BYTE, &count_received)); + return count_received; +#else + (void)sendbuf; // mark used + (void)sendcount; + (void)dest; + (void)sendtag; + (void)recvbuf; + (void)recvcount; + (void)source; + (void)recvtag; + (void)comm; + fprintf(stderr, "Error: dbm_mpi_sendrecv_byte not available without MPI\n"); + abort(); +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Sendrecv for datatype MPI_DOUBLE. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_sendrecv_double(const double *sendbuf, const int sendcount, + const int dest, const int sendtag, double *recvbuf, + const int recvcount, const int source, + const int recvtag, const dbm_mpi_comm_t comm) { +#if defined(__parallel) + MPI_Status status; + CHECK(MPI_Sendrecv(sendbuf, sendcount, MPI_DOUBLE, dest, sendtag, recvbuf, + recvcount, MPI_DOUBLE, source, recvtag, comm, &status)) + int count_received; + CHECK(MPI_Get_count(&status, MPI_DOUBLE, &count_received)); + return count_received; +#else + (void)sendbuf; // mark used + (void)sendcount; + (void)dest; + (void)sendtag; + (void)recvbuf; + (void)recvcount; + (void)source; + (void)recvtag; + (void)comm; + fprintf(stderr, "Error: dbm_mpi_sendrecv_double not available without MPI\n"); + abort(); +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Alltoall for datatype MPI_INT. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_alltoall_int(const int *sendbuf, const int sendcount, int *recvbuf, + const int recvcount, const dbm_mpi_comm_t comm) { +#if defined(__parallel) + CHECK(MPI_Alltoall(sendbuf, sendcount, MPI_INT, recvbuf, recvcount, MPI_INT, + comm)); +#else + (void)comm; // mark used + assert(sendcount == recvcount); + memcpy(recvbuf, sendbuf, sendcount * sizeof(int)); +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Alltoallv for datatype MPI_BYTE. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_alltoallv_byte(const void *sendbuf, const int *sendcounts, + const int *sdispls, void *recvbuf, + const int *recvcounts, const int *rdispls, + const dbm_mpi_comm_t comm) { +#if defined(__parallel) + CHECK(MPI_Alltoallv(sendbuf, sendcounts, sdispls, MPI_BYTE, recvbuf, + recvcounts, rdispls, MPI_BYTE, comm)); +#else + (void)comm; // mark used + assert(sendcounts[0] == recvcounts[0]); + assert(sdispls[0] == 0 && rdispls[0] == 0); + memcpy(recvbuf, sendbuf, sendcounts[0]); +#endif +} + +/******************************************************************************* + * \brief Wrapper around MPI_Alltoallv for datatype MPI_DOUBLE. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_alltoallv_double(const double *sendbuf, const int *sendcounts, + const int *sdispls, double *recvbuf, + const int *recvcounts, const int *rdispls, + const dbm_mpi_comm_t comm) { +#if defined(__parallel) + CHECK(MPI_Alltoallv(sendbuf, sendcounts, sdispls, MPI_DOUBLE, recvbuf, + recvcounts, rdispls, MPI_DOUBLE, comm)); +#else + (void)comm; // mark used + assert(sendcounts[0] == recvcounts[0]); + assert(sdispls[0] == 0 && rdispls[0] == 0); + memcpy(recvbuf, sendbuf, sendcounts[0] * sizeof(double)); +#endif +} + +// EOF diff --git a/src/dbm/dbm_mpi.h b/src/dbm/dbm_mpi.h new file mode 100644 index 0000000000..49b9b7ed2c --- /dev/null +++ b/src/dbm/dbm_mpi.h @@ -0,0 +1,180 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#ifndef DBM_MPI_H +#define DBM_MPI_H + +#include + +#if defined(__parallel) +#include +typedef MPI_Comm dbm_mpi_comm_t; +#else +typedef int dbm_mpi_comm_t; +#endif + +/******************************************************************************* + * \brief Wrapper around MPI_Init. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_init(int *argc, char ***argv); + +/******************************************************************************* + * \brief Wrapper around MPI_Finalize. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_finalize(); + +/******************************************************************************* + * \brief Returns MPI_COMM_WORLD. + * \author Ole Schuett + ******************************************************************************/ +dbm_mpi_comm_t dbm_mpi_get_comm_world(); + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_f2c. + * \author Ole Schuett + ******************************************************************************/ +dbm_mpi_comm_t dbm_mpi_comm_f2c(const int fortran_comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_c2f. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_comm_c2f(const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_rank. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_comm_rank(const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_size. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_comm_size(const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Cart_create. + * \author Ole Schuett + ******************************************************************************/ +dbm_mpi_comm_t dbm_mpi_cart_create(const dbm_mpi_comm_t comm_old, + const int ndims, const int dims[], + const int periods[], const int reorder); + +/******************************************************************************* + * \brief Wrapper around MPI_Cart_get. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_cart_get(const dbm_mpi_comm_t comm, int maxdims, int dims[], + int periods[], int coords[]); + +/******************************************************************************* + * \brief Wrapper around MPI_Cart_rank. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_cart_rank(const dbm_mpi_comm_t comm, const int coords[]); + +/******************************************************************************* + * \brief Wrapper around MPI_Cart_sub. + * \author Ole Schuett + ******************************************************************************/ +dbm_mpi_comm_t dbm_mpi_cart_sub(const dbm_mpi_comm_t comm, + const int remain_dims[]); + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_free. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_comm_free(dbm_mpi_comm_t *comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Comm_compare. + * \author Ole Schuett + ******************************************************************************/ +bool dbm_mpi_comms_are_similar(const dbm_mpi_comm_t comm1, + const dbm_mpi_comm_t comm2); + +/******************************************************************************* + * \brief Wrapper around MPI_Allreduce for op MPI_MAX and datatype MPI_INT. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_max_int(int *values, const int count, const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Allreduce for op MPI_MAX and datatype MPI_DOUBLE. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_max_double(double *values, const int count, + const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_INT. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_sum_int(int *values, const int count, const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_LONG. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_sum_long(long *values, const int count, const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_DOUBLE. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_sum_double(double *values, const int count, + const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Sendrecv for datatype MPI_BYTE. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_sendrecv_byte(const void *sendbuf, const int sendcount, + const int dest, const int sendtag, void *recvbuf, + const int recvcount, const int source, + const int recvtag, const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Sendrecv for datatype MPI_DOUBLE. + * \author Ole Schuett + ******************************************************************************/ +int dbm_mpi_sendrecv_double(const double *sendbuf, const int sendcount, + const int dest, const int sendtag, double *recvbuf, + const int recvcount, const int source, + const int recvtag, const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Alltoall for datatype MPI_INT. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_alltoall_int(const int *sendbuf, const int sendcount, int *recvbuf, + const int recvcount, const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Alltoallv for datatype MPI_BYTE. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_alltoallv_byte(const void *sendbuf, const int *sendcounts, + const int *sdispls, void *recvbuf, + const int *recvcounts, const int *rdispls, + const dbm_mpi_comm_t comm); + +/******************************************************************************* + * \brief Wrapper around MPI_Alltoallv for datatype MPI_DOUBLE. + * \author Ole Schuett + ******************************************************************************/ +void dbm_mpi_alltoallv_double(const double *sendbuf, const int *sendcounts, + const int *sdispls, double *recvbuf, + const int *recvcounts, const int *rdispls, + const dbm_mpi_comm_t comm); + +#endif + +// EOF diff --git a/src/dbm/dbm_multiply.c b/src/dbm/dbm_multiply.c new file mode 100644 index 0000000000..4d6ccbffdb --- /dev/null +++ b/src/dbm/dbm_multiply.c @@ -0,0 +1,319 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#include +#include +#include +#include + +#include "dbm_hyperparams.h" +#include "dbm_library.h" +#include "dbm_multiply.h" +#include "dbm_multiply_comm.h" +#include "dbm_multiply_cpu.h" +#include "dbm_multiply_cuda.h" +#include "dbm_multiply_internal.h" + +/******************************************************************************* + * \brief Returns the larger of two given integer (missing from the C standard) + * \author Ole Schuett + ******************************************************************************/ +static inline int imax(int x, int y) { return (x > y ? x : y); } + +/******************************************************************************* + * \brief Private routine for computing the max filter threshold for each row. + * \author Ole Schuett + ******************************************************************************/ +static float *compute_rows_max_eps(const bool trans, const dbm_matrix_t *matrix, + const double filter_eps) { + const int nrows = (trans) ? matrix->ncols : matrix->nrows; + int *nblocks_per_row = calloc(nrows, sizeof(int)); + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + for (int iblock = 0; iblock < shard->nblocks; iblock++) { + const dbm_block_t *blk = &shard->blocks[iblock]; + const int row = (trans) ? blk->col : blk->row; + nblocks_per_row[row]++; + } + } + + dbm_mpi_sum_int(nblocks_per_row, nrows, matrix->dist->comm); + + float *row_max_eps = malloc(nrows * sizeof(float)); + for (int i = 0; i < nrows; i++) { + const float f = ((float)filter_eps) / ((float)imax(1, nblocks_per_row[i])); + row_max_eps[i] = f * f; + } + free(nblocks_per_row); + return row_max_eps; // Ownership of row_max_eps transfers to caller. +} + +/******************************************************************************* + * \brief Private struct for storing the context of the multiplication backend. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { +#if defined(__DBM_CUDA) + dbm_multiply_cuda_context_t cuda; +#endif +} backend_context_t; + +/******************************************************************************* + * \brief Private routine for intializing the multiplication backend. + * \author Ole Schuett + ******************************************************************************/ +static backend_context_t *backend_start(const dbm_matrix_t *matrix_c) { + backend_context_t *ctx = calloc(1, sizeof(backend_context_t)); + +#if defined(__DBM_CUDA) + dbm_multiply_cuda_start(MAX_BATCH_SIZE, matrix_c->nshards, matrix_c->shards, + &ctx->cuda); +#else + (void)matrix_c; // mark as used +#endif + + return ctx; +} + +/******************************************************************************* + * \brief Private routine for handing newly arrived packs to the backend. + * \author Ole Schuett + ******************************************************************************/ +static void backend_upload_packs(const dbm_pack_t *pack_a, + const dbm_pack_t *pack_b, + backend_context_t *ctx) { + +#if defined(__DBM_CUDA) + dbm_multiply_cuda_upload_packs(pack_a, pack_b, &ctx->cuda); +#else + (void)pack_a; // mark as used + (void)pack_b; + (void)ctx; +#endif +} + +/******************************************************************************* + * \brief Private routine for sending a batch to the multiplication backend. + * \author Ole Schuett + ******************************************************************************/ +static void backend_process_batch(const int ntasks, dbm_task_t batch[ntasks], + const bool transa, const bool transb, + const double alpha, const dbm_pack_t *pack_a, + const dbm_pack_t *pack_b, const int kshard, + dbm_shard_t *shard_c, + backend_context_t *ctx) { +#if defined(__DBM_CUDA) + (void)pack_a; // mark as used + (void)pack_b; + (void)shard_c; + dbm_multiply_cuda_process_batch(ntasks, batch, transa, transb, alpha, kshard, + &ctx->cuda); +#else + (void)kshard; // mark as used + (void)ctx; + dbm_multiply_cpu_process_batch(ntasks, batch, transa, transb, alpha, pack_a, + pack_b, shard_c); +#endif +} + +/******************************************************************************* + * \brief Private routine for downloading results of the multiplication backend. + * \author Ole Schuett + ******************************************************************************/ +static void backend_download_results(backend_context_t *ctx) { +#if defined(__DBM_CUDA) + dbm_multiply_cuda_download_results(&ctx->cuda); +#else + (void)ctx; // mark as used +#endif +} + +/******************************************************************************* + * \brief Private routine for shutting down the multiplication backend. + * \author Ole Schuett + ******************************************************************************/ +static void backend_stop(backend_context_t *ctx) { +#if defined(__DBM_CUDA) + dbm_multiply_cuda_stop(&ctx->cuda); +#endif + free(ctx); +} + +/******************************************************************************* + * \brief Private routine for multipling two packs. + * \author Ole Schuett + ******************************************************************************/ +static void multiply_packs(const bool transa, const bool transb, + const double alpha, const dbm_pack_t *pack_a, + const dbm_pack_t *pack_b, + const dbm_matrix_t *matrix_a, + const dbm_matrix_t *matrix_b, dbm_matrix_t *matrix_c, + const float *rows_left_max_eps, long *flop, + backend_context_t *ctx) { + + const float alpha2 = alpha * alpha; + + long flop_sum = 0; +#pragma omp parallel for schedule(dynamic) reduction(+ : flop_sum) + for (int kshard = 0; kshard < matrix_c->nshards; kshard++) { + dbm_shard_t *shard_c = &matrix_c->shards[kshard]; + + dbm_task_t batch[MAX_BATCH_SIZE]; + int ntasks = 0; + + // Essentially a merge sort (assuming blocks are pre-sorted by shared index) + int jblock_start = 0; + for (int iblock = 0; iblock < pack_a->nblocks; iblock++) { + const dbm_block_t *blk_a = &pack_a->blocks[iblock]; + const int row_left = (transa) ? blk_a->col : blk_a->row; + const int col_left = (transa) ? blk_a->row : blk_a->col; + if (row_left % matrix_c->nshards != kshard) { + continue; + } + for (int jblock = jblock_start; jblock < pack_b->nblocks; jblock++) { + const dbm_block_t *blk_b = &pack_b->blocks[jblock]; + const int row_right = (transb) ? blk_b->col : blk_b->row; + const int col_right = (transb) ? blk_b->row : blk_b->col; + if (col_left < row_right) { + break; + } + if (col_left > row_right) { + jblock_start++; + continue; + } + // Found block pair with col_left == row_right. + + // Check norms. + if (alpha2 * blk_a->norm * blk_b->norm < rows_left_max_eps[row_left]) { + continue; + } + + // Check block sizes. + const int row_size_a = matrix_a->row_sizes[blk_a->row]; + const int col_size_a = matrix_a->col_sizes[blk_a->col]; + const int row_size_left = (transa) ? col_size_a : row_size_a; + const int col_size_left = (transa) ? row_size_a : col_size_a; + const int row_size_b = matrix_b->row_sizes[blk_b->row]; + const int col_size_b = matrix_b->col_sizes[blk_b->col]; + const int row_size_right = (transb) ? col_size_b : row_size_b; + const int col_size_right = (transb) ? row_size_b : col_size_b; + const int row_size_c = matrix_c->row_sizes[row_left]; + const int col_size_c = matrix_c->col_sizes[col_right]; + const int m = row_size_left, n = col_size_right, k = col_size_left; + assert(m == row_size_c); + assert(n == col_size_c); + assert(k == row_size_right); + + // Count flops. + assert(m * n * k > 0); + flop_sum += 2 * m * n * k; + + // Get C block. + dbm_block_t *blk_c = + dbm_shard_get_or_promise_block(shard_c, row_left, col_right, m * n); + + // Invalidate norm of C block because its data is going to change. + blk_c->norm = -1.0; + + // Add block multiplication to batch. + batch[ntasks].m = m; + batch[ntasks].n = n; + batch[ntasks].k = k; + batch[ntasks].offset_a = blk_a->offset; + batch[ntasks].offset_b = blk_b->offset; + batch[ntasks].offset_c = blk_c->offset; + ntasks++; + + if (ntasks == MAX_BATCH_SIZE) { + backend_process_batch(ntasks, batch, transa, transb, alpha, pack_a, + pack_b, kshard, shard_c, ctx); + ntasks = 0; + } + } + } + backend_process_batch(ntasks, batch, transa, transb, alpha, pack_a, pack_b, + kshard, shard_c, ctx); + } + *flop += flop_sum; +} + +/******************************************************************************* + * \brief Performs a multiplication of two dbm_matrix_t matrices. + * See dbm_matrix.h for details. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply(const bool transa, const bool transb, const double alpha, + dbm_matrix_t *matrix_a, dbm_matrix_t *matrix_b, + const double beta, dbm_matrix_t *matrix_c, + const bool retain_sparsity, const double filter_eps, + long *flop) { + + assert(omp_get_num_threads() == 1); + assert(retain_sparsity == false); // TODO implement + + // Denote left/right to matrices a/b after possible transpose. + const int nrows_left = (transa) ? matrix_a->ncols : matrix_a->nrows; + const int ncols_left = (transa) ? matrix_a->nrows : matrix_a->ncols; + const int nrows_right = (transb) ? matrix_b->ncols : matrix_b->nrows; + const int ncols_right = (transb) ? matrix_b->nrows : matrix_b->ncols; + + // Sanity check matrix dimensions. + assert(ncols_left == nrows_right); + assert(nrows_left == matrix_c->nrows); + assert(ncols_right == matrix_c->ncols); + + // Prepare matrix_c. + dbm_scale(matrix_c, beta); + + // Start uploading matrix_c to the GPU. + backend_context_t *ctx = backend_start(matrix_c); + + // Compute norms and filter thresholds for each row. + dbm_compute_block_norms(matrix_a); + dbm_compute_block_norms(matrix_b); + float *rows_left_max_eps = compute_rows_max_eps(transa, matrix_a, filter_eps); + + // Redistribute matrix_a and matrix_b across MPI ranks. + dbm_comm_iterator_t *iter = + dbm_comm_iterator_start(transa, transb, matrix_a, matrix_b, matrix_c); + + // Main loop. + *flop = 0; + dbm_pack_t *pack_a, *pack_b; + while (dbm_comm_iterator_next(iter, &pack_a, &pack_b)) { + backend_upload_packs(pack_a, pack_b, ctx); + multiply_packs(transa, transb, alpha, pack_a, pack_b, matrix_a, matrix_b, + matrix_c, rows_left_max_eps, flop, ctx); + } + + // Start downloading matrix_c from the GPU. + backend_download_results(ctx); + + // Sanity check if matrix_c contains the correct blocks. + for (int ishard = 0; ishard < matrix_c->nshards; ishard++) { + dbm_shard_t *shard = &matrix_c->shards[ishard]; + for (int iblock = 0; iblock < shard->nblocks; iblock++) { + const dbm_block_t *blk = &shard->blocks[iblock]; + const int rank = dbm_get_stored_coordinates(matrix_c, blk->row, blk->col); + assert(rank == matrix_c->dist->my_rank); + } + } + + // Wait for all other MPI ranks to complete, then release ressources. + dbm_comm_iterator_stop(iter); + free(rows_left_max_eps); + backend_stop(ctx); + + // Compute average flops per rank. + dbm_mpi_sum_long(flop, 1, matrix_c->dist->comm); + *flop = (*flop + matrix_c->dist->nranks - 1) / matrix_c->dist->nranks; + + // Final filter pass. + dbm_filter(matrix_c, filter_eps); +} + +// EOF diff --git a/src/dbm/dbm_multiply.h b/src/dbm/dbm_multiply.h new file mode 100644 index 0000000000..a37f59d188 --- /dev/null +++ b/src/dbm/dbm_multiply.h @@ -0,0 +1,37 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#ifndef DBM_MULTIPLY_H +#define DBM_MULTIPLY_H + +#include + +#include "dbm_matrix.h" + +/******************************************************************************* + * \brief Performs a multiplication of two dbm_matrix_t matrices, + as C := alpha * op( A ) * op( B ) + beta * C. + + The filter_eps parameter is used to filter the resulting matrix. + The filtering criterion is whether the block-frobenius norm is less + than the specified epsilon. One-the-fly filtering is done such that + individual multiplications are skipped if the product of the frobenius + norms of the left- and right-matrix blocks are less than the specified + epsilon divided by the maximum number of possible multiplies in each + row. In addition a final filtering is done as well with the same + epsilon value. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply(const bool transa, const bool transb, const double alpha, + dbm_matrix_t *matrix_a, dbm_matrix_t *matrix_b, + const double beta, dbm_matrix_t *matrix_c, + const bool retain_sparsity, const double filter_eps, + long *flop); + +#endif + +// EOF diff --git a/src/dbm/dbm_multiply_comm.c b/src/dbm/dbm_multiply_comm.c new file mode 100644 index 0000000000..4323ff2d4b --- /dev/null +++ b/src/dbm/dbm_multiply_comm.c @@ -0,0 +1,423 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#include "dbm_multiply_comm.h" + +#include +#include +#include + +#include "dbm_mempool.h" + +/******************************************************************************* + * \brief Returns the larger of two given integer (missing from the C standard) + * \author Ole Schuett + ******************************************************************************/ +static inline int imax(int x, int y) { return (x > y ? x : y); } + +/******************************************************************************* + * \brief Private routine for computing greatest common divisor of two numbers. + * \author Ole Schuett + ******************************************************************************/ +static int gcd(const int a, const int b) { + if (a == 0) + return b; + return gcd(b % a, a); // Euclid's algorithm. +} + +/******************************************************************************* + * \brief Private routine for computing least common multiple of two numbers. + * \author Ole Schuett + ******************************************************************************/ +static int lcm(const int a, const int b) { return (a * b) / gcd(a, b); } + +/******************************************************************************* + * \brief Private routine for computing the sum of the given integers. + * \author Ole Schuett + ******************************************************************************/ +static inline int isum(const int n, const int input[n]) { + int output = 0; + for (int i = 0; i < n; i++) { + output += input[i]; + } + return output; +} + +/******************************************************************************* + * \brief Private routine for computing the cumulative sums of given numbers. + * \author Ole Schuett + ******************************************************************************/ +static inline void icumsum(const int n, const int input[n], int output[n]) { + output[0] = 0; + for (int i = 1; i < n; i++) { + output[i] = output[i - 1] + input[i - 1]; + } +} + +/******************************************************************************* + * \brief Private struct used for planing during pack_matrix. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + const dbm_block_t *blk; + int rank; // target mpi rank + int offset; // offset in data_send array +} plan_t; + +/******************************************************************************* + * \brief Private comperator passed to qsort to compare two plans by rank. + * \author Ole Schuett + ******************************************************************************/ +static int compare_plan_by_rank(const void *a, const void *b) { + const plan_t *blk_a = (plan_t *)a; + const plan_t *blk_b = (plan_t *)b; + return blk_a->rank - blk_b->rank; +} + +/******************************************************************************* + * \brief Private comperator passed to qsort to compare two blocks by row. + * \author Ole Schuett + ******************************************************************************/ +static int compare_pack_blocks_by_row(const void *a, const void *b) { + const dbm_block_t *blk_a = (dbm_block_t *)a; + const dbm_block_t *blk_b = (dbm_block_t *)b; + return blk_a->row - blk_b->row; +} + +/******************************************************************************* + * \brief Private comperator passed to qsort to compare two blocks by column. + * \author Ole Schuett + ******************************************************************************/ +static int compare_pack_blocks_by_col(const void *a, const void *b) { + const dbm_block_t *blk_a = (dbm_block_t *)a; + const dbm_block_t *blk_b = (dbm_block_t *)b; + return blk_a->col - blk_b->col; +} + +/******************************************************************************* + * \brief Private routing for redistributing a matrix along selected dimensions. + * \author Ole Schuett + ******************************************************************************/ +static dbm_packed_matrix_t pack_matrix(const bool trans_matrix, + const bool trans_dist, + const dbm_matrix_t *matrix, + const dbm_distribution_t *dist, + const int nticks) { + + assert(dbm_mpi_comms_are_similar(matrix->dist->comm, dist->comm)); + const int nranks = dist->nranks; + + // The row/col indicies are distributed along one cart dimension and the + // ticks are distributed along the other cart dimension. + const dbm_dist_1d_t *dist_indices = (trans_dist) ? &dist->cols : &dist->rows; + const dbm_dist_1d_t *dist_ticks = (trans_dist) ? &dist->rows : &dist->cols; + + const int nsend_packs = nticks / dist_ticks->nranks; + assert(nsend_packs * dist_ticks->nranks == nticks); + + // Assemble packed matrix. + dbm_packed_matrix_t packed; + packed.dist_indices = dist_indices; + packed.dist_ticks = dist_ticks; + packed.nsend_packs = nsend_packs; + packed.send_packs = malloc(nsend_packs * sizeof(dbm_pack_t)); + + for (int ipack = 0; ipack < nsend_packs; ipack++) { + + // 1st pass: Compute number of blocks that will be send in total. + int nblocks_send = 0; + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + for (int iblock = 0; iblock < shard->nblocks; iblock++) { + const dbm_block_t *blk = &shard->blocks[iblock]; + const int index_k = (trans_matrix) ? blk->row : blk->col; + const int itick = index_k % nticks; // TODO: smarter load-balancing + if (itick / dist_ticks->nranks == ipack) { + nblocks_send++; // This block belongs to the current set of packs. + } + } + } + + // 2nd pass: Plan where to send which blocks. + plan_t *plans = malloc(nblocks_send * sizeof(plan_t)); + int iplan = 0; // position of current plan + for (int ishard = 0; ishard < matrix->nshards; ishard++) { + dbm_shard_t *shard = &matrix->shards[ishard]; + for (int iblock = 0; iblock < shard->nblocks; iblock++) { + const dbm_block_t *blk = &shard->blocks[iblock]; + const int index_m_or_n = (trans_matrix) ? blk->col : blk->row; + const int index_k = (trans_matrix) ? blk->row : blk->col; + const int itick = index_k % nticks; // Has to be same mapping as above. + if (itick / dist_ticks->nranks == ipack) { + // Compute rank to which this block should be send. + const int coord_left = dist_indices->index2coord[index_m_or_n]; + const int coord_right = itick % dist_ticks->nranks; + const int coords[2] = {(trans_dist) ? coord_right : coord_left, + (trans_dist) ? coord_left : coord_right}; + const int rank = dbm_mpi_cart_rank(dist->comm, coords); + // Create plan. + plans[iplan].blk = blk; + plans[iplan].rank = rank; + iplan++; + } + } + } + assert(iplan == nblocks_send); + + // Sort plans by rank. + qsort(plans, nblocks_send, sizeof(plan_t), &compare_plan_by_rank); + + // 3th pass: Compute per rank send counts and displacements. + int ndata_send = 0; + int blks_send_count[nranks], data_send_count[nranks]; + memset(blks_send_count, 0, nranks * sizeof(int)); + memset(data_send_count, 0, nranks * sizeof(int)); + for (int iblock = 0; iblock < nblocks_send; iblock++) { + plan_t *plan = &plans[iblock]; + const int row_size = matrix->row_sizes[plan->blk->row]; + const int col_size = matrix->col_sizes[plan->blk->col]; + const int block_size = row_size * col_size; + plan->offset = ndata_send; // Offset within data_send array. + ndata_send += block_size; + blks_send_count[plan->rank] += 1; + data_send_count[plan->rank] += block_size; + } + int blks_send_displ[nranks], data_send_displ[nranks]; + icumsum(nranks, blks_send_count, blks_send_displ); + icumsum(nranks, data_send_count, data_send_displ); + + // 4th pass: Fill blks_send and data_send arrays. + double *data_send = dbm_mempool_host_malloc(ndata_send * sizeof(double)); + dbm_block_t *blks_send = malloc(nblocks_send * sizeof(dbm_block_t)); +#pragma omp parallel for schedule(dynamic) + for (int iblock = 0; iblock < nblocks_send; iblock++) { + const plan_t *plan = &plans[iblock]; + const int ishard = plan->blk->row % matrix->nshards; + const dbm_shard_t *shard = &matrix->shards[ishard]; + const double *blk_data = &shard->data[plan->blk->offset]; + const int row_size = matrix->row_sizes[plan->blk->row]; + const int col_size = matrix->col_sizes[plan->blk->col]; + const int block_size = row_size * col_size; + memcpy(&data_send[plan->offset], blk_data, block_size * sizeof(double)); + assert(plan->blk->norm >= 0.0); + blks_send[iblock] = *plan->blk; + blks_send[iblock].offset = plan->offset; // overwrite offset + } + free(plans); + + // Substract data_send_displ from block offsets. + for (int irank = 0; irank < nranks; irank++) { + for (int i = 0; i < blks_send_count[irank]; i++) { + blks_send[blks_send_displ[irank] + i].offset -= data_send_displ[irank]; + } + } + + // 1st communication: Exchange block counts. + int blks_recv_count[nranks], blks_recv_displ[nranks]; + dbm_mpi_alltoall_int(blks_send_count, 1, blks_recv_count, 1, dist->comm); + icumsum(nranks, blks_recv_count, blks_recv_displ); + const int nblocks_recv = isum(nranks, blks_recv_count); + + // 2nd communication: Exchange blocks. + dbm_block_t *blks_recv = malloc(nblocks_recv * sizeof(dbm_block_t)); + int blks_send_count_byte[nranks], blks_send_displ_byte[nranks]; + int blks_recv_count_byte[nranks], blks_recv_displ_byte[nranks]; + for (int i = 0; i < nranks; i++) { // TODO: this is ugly! + blks_send_count_byte[i] = blks_send_count[i] * sizeof(dbm_block_t); + blks_send_displ_byte[i] = blks_send_displ[i] * sizeof(dbm_block_t); + blks_recv_count_byte[i] = blks_recv_count[i] * sizeof(dbm_block_t); + blks_recv_displ_byte[i] = blks_recv_displ[i] * sizeof(dbm_block_t); + } + dbm_mpi_alltoallv_byte( + blks_send, blks_send_count_byte, blks_send_displ_byte, blks_recv, + blks_recv_count_byte, blks_recv_displ_byte, dist->comm); + free(blks_send); + + // 3rd communication: Exchange data counts. + // TODO: could be computed from blks_recv. + int data_recv_count[nranks], data_recv_displ[nranks]; + dbm_mpi_alltoall_int(data_send_count, 1, data_recv_count, 1, dist->comm); + icumsum(nranks, data_recv_count, data_recv_displ); + const int ndata_recv = isum(nranks, data_recv_count); + + // 4th communication: Exchange data. + double *data_recv = dbm_mempool_host_malloc(ndata_recv * sizeof(double)); + dbm_mpi_alltoallv_double(data_send, data_send_count, data_send_displ, + data_recv, data_recv_count, data_recv_displ, + dist->comm); + dbm_mempool_free(data_send); + + // Add data_recv_displ to block offsets. + for (int irank = 0; irank < nranks; irank++) { + for (int i = 0; i < blks_recv_count[irank]; i++) { + blks_recv[blks_recv_displ[irank] + i].offset += data_recv_displ[irank]; + } + } + + // Sort recveived blocks by shared index as required for multiply_packs(). + if (trans_matrix) { + qsort(blks_recv, nblocks_recv, sizeof(dbm_block_t), + &compare_pack_blocks_by_row); + } else { + qsort(blks_recv, nblocks_recv, sizeof(dbm_block_t), + &compare_pack_blocks_by_col); + } + + // Assemble received stuff into a pack. + packed.send_packs[ipack].nblocks = nblocks_recv; + packed.send_packs[ipack].data_size = ndata_recv; + packed.send_packs[ipack].blocks = blks_recv; + packed.send_packs[ipack].data = data_recv; + } + + // Allocate pack_recv. + int max_nblocks = 0, max_data_size = 0; + for (int ipack = 0; ipack < packed.nsend_packs; ipack++) { + max_nblocks = imax(max_nblocks, packed.send_packs[ipack].nblocks); + max_data_size = imax(max_data_size, packed.send_packs[ipack].data_size); + } + dbm_mpi_max_int(&max_nblocks, 1, packed.dist_ticks->comm); + dbm_mpi_max_int(&max_data_size, 1, packed.dist_ticks->comm); + packed.max_nblocks = max_nblocks; + packed.max_data_size = max_data_size; + packed.recv_pack.blocks = malloc(packed.max_nblocks * sizeof(dbm_block_t)); + packed.recv_pack.data = + dbm_mempool_host_malloc(packed.max_data_size * sizeof(double)); + + return packed; // Ownership of packed transfers to caller. +} + +/******************************************************************************* + * \brief Private routine for sending and receiving the pack for the given tick. + * \author Ole Schuett + ******************************************************************************/ +static dbm_pack_t *sendrecv_pack(const int itick, const int nticks, + dbm_packed_matrix_t *packed) { + const int nranks = packed->dist_ticks->nranks; + const int my_rank = packed->dist_ticks->my_rank; + + // Compute send rank and pack. + const int itick_of_rank0 = (itick + nticks - my_rank) % nticks; + const int send_rank = (my_rank + nticks - itick_of_rank0) % nranks; + const int send_itick = (itick_of_rank0 + send_rank) % nticks; + const int send_ipack = send_itick / nranks; + assert(send_itick % nranks == my_rank); + + // Compute recevice rank and pack. + const int recv_rank = itick % nranks; + const int recv_ipack = itick / nranks; + + if (send_rank == my_rank) { + assert(send_rank == recv_rank && send_ipack == recv_ipack); + return &packed->send_packs[send_ipack]; // Local pack, no mpi needed. + } else { + const dbm_pack_t *send_pack = &packed->send_packs[send_ipack]; + + // Exchange blocks. + const int nblocks_in_bytes = dbm_mpi_sendrecv_byte( + /*sendbuf=*/send_pack->blocks, + /*sendcound=*/send_pack->nblocks * sizeof(dbm_block_t), + /*dest=*/send_rank, + /*sendtag=*/send_ipack, + /*recvbuf=*/packed->recv_pack.blocks, + /*recvcount=*/packed->max_nblocks * sizeof(dbm_block_t), + /*source=*/recv_rank, + /*recvtag=*/recv_ipack, + /*comm=*/packed->dist_ticks->comm); + + assert(nblocks_in_bytes % sizeof(dbm_block_t) == 0); + packed->recv_pack.nblocks = nblocks_in_bytes / sizeof(dbm_block_t); + + // Exchange data. + packed->recv_pack.data_size = dbm_mpi_sendrecv_double( + /*sendbuf=*/send_pack->data, + /*sendcound=*/send_pack->data_size, + /*dest=*/send_rank, + /*sendtag=*/send_ipack, + /*recvbuf=*/packed->recv_pack.data, + /*recvcount=*/packed->max_data_size, + /*source=*/recv_rank, + /*recvtag=*/recv_ipack, + /*comm=*/packed->dist_ticks->comm); + + return &packed->recv_pack; + } +} + +/******************************************************************************* + * \brief Private routine for releasing a packed matrix. + * \author Ole Schuett + ******************************************************************************/ +static void free_packed_matrix(dbm_packed_matrix_t *packed) { + free(packed->recv_pack.blocks); + dbm_mempool_free(packed->recv_pack.data); + for (int ipack = 0; ipack < packed->nsend_packs; ipack++) { + free(packed->send_packs[ipack].blocks); + dbm_mempool_free(packed->send_packs[ipack].data); + } + free(packed->send_packs); +} + +/******************************************************************************* + * \brief Internal routine for creating a communication iterator. + * \author Ole Schuett + ******************************************************************************/ +dbm_comm_iterator_t *dbm_comm_iterator_start(const bool transa, + const bool transb, + const dbm_matrix_t *matrix_a, + const dbm_matrix_t *matrix_b, + const dbm_matrix_t *matrix_c) { + + dbm_comm_iterator_t *iter = malloc(sizeof(dbm_comm_iterator_t)); + iter->dist = matrix_c->dist; + + // During each communication tick we'll fetch a pack_a and pack_b. + // Since the cart might be non-squared, the number of communication ticks is + // chosen as the least common multiple of the cart's dimensions. + iter->nticks = lcm(iter->dist->rows.nranks, iter->dist->cols.nranks); + iter->itick = 0; + + // 1.arg=source dimension, 2.arg=target dimension, false=rows, true=columns. + iter->packed_a = + pack_matrix(transa, false, matrix_a, iter->dist, iter->nticks); + iter->packed_b = + pack_matrix(!transb, true, matrix_b, iter->dist, iter->nticks); + + return iter; +} + +/******************************************************************************* + * \brief Internal routine for retriving next pair of packs from given iterator. + * \author Ole Schuett + ******************************************************************************/ +bool dbm_comm_iterator_next(dbm_comm_iterator_t *iter, dbm_pack_t **pack_a, + dbm_pack_t **pack_b) { + if (iter->itick >= iter->nticks) { + return false; // end of iterator reached + } + + // Start each rank at a different tick to spread the load on the sources. + const int shift = iter->dist->rows.my_rank + iter->dist->cols.my_rank; + const int shifted_itick = (iter->itick + shift) % iter->nticks; + *pack_a = sendrecv_pack(shifted_itick, iter->nticks, &iter->packed_a); + *pack_b = sendrecv_pack(shifted_itick, iter->nticks, &iter->packed_b); + + iter->itick++; + return true; +} + +/******************************************************************************* + * \brief Internal routine for releasing the given communication iterator. + * \author Ole Schuett + ******************************************************************************/ +void dbm_comm_iterator_stop(dbm_comm_iterator_t *iter) { + free_packed_matrix(&iter->packed_a); + free_packed_matrix(&iter->packed_b); + free(iter); +} + +// EOF diff --git a/src/dbm/dbm_multiply_comm.h b/src/dbm/dbm_multiply_comm.h new file mode 100644 index 0000000000..c051f482c6 --- /dev/null +++ b/src/dbm/dbm_multiply_comm.h @@ -0,0 +1,68 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#ifndef DBM_MULTIPLY_COMM_H +#define DBM_MULTIPLY_COMM_H + +#include "dbm_distribution.h" +#include "dbm_matrix.h" +#include "dbm_multiply_internal.h" + +#include + +/******************************************************************************* + * \brief Internal struct for storing a packed matrix. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + const dbm_dist_1d_t *dist_indices; + const dbm_dist_1d_t *dist_ticks; + int nsend_packs; + dbm_pack_t *send_packs; + dbm_pack_t recv_pack; + int max_nblocks; // Max across all ranks in dist_ticks. + int max_data_size; +} dbm_packed_matrix_t; + +/******************************************************************************* + * \brief Internal struct for storing a communication iterator. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + int nticks; + int itick; + dbm_distribution_t *dist; + dbm_packed_matrix_t packed_a; + dbm_packed_matrix_t packed_b; +} dbm_comm_iterator_t; + +/******************************************************************************* + * \brief Internal routine for creating a communication iterator. + * \author Ole Schuett + ******************************************************************************/ +dbm_comm_iterator_t *dbm_comm_iterator_start(const bool transa, + const bool transb, + const dbm_matrix_t *matrix_a, + const dbm_matrix_t *matrix_b, + const dbm_matrix_t *matrix_c); + +/******************************************************************************* + * \brief Internal routine for retriving next pair of packs from given iterator. + * \author Ole Schuett + ******************************************************************************/ +bool dbm_comm_iterator_next(dbm_comm_iterator_t *iter, dbm_pack_t **pack_a, + dbm_pack_t **pack_b); + +/******************************************************************************* + * \brief Internal routine for releasing the given communication iterator. + * \author Ole Schuett + ******************************************************************************/ +void dbm_comm_iterator_stop(dbm_comm_iterator_t *iter); + +#endif + +// EOF diff --git a/src/dbm/dbm_multiply_cpu.c b/src/dbm/dbm_multiply_cpu.c new file mode 100644 index 0000000000..93e3ae8d97 --- /dev/null +++ b/src/dbm/dbm_multiply_cpu.c @@ -0,0 +1,152 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#include +#include +#include + +#if defined(__LIBXSMM) +#include +#endif + +#include "dbm_hyperparams.h" +#include "dbm_multiply_cpu.h" + +/******************************************************************************* + * \brief Prototype for BLAS dgemm. + * \author Ole Schuett + ******************************************************************************/ +void dgemm_(const char *transa, const char *transb, const int *m, const int *n, + const int *k, const double *alpha, const double *a, const int *lda, + const double *b, const int *ldb, const double *beta, double *c, + const int *ldc); + +/******************************************************************************* + * \brief Private convenient wrapper to hide Fortran nature of dgemm_. + * \author Ole Schuett + ******************************************************************************/ +static inline void dgemm(const bool transa, const bool transb, const int m, + const int n, const int k, const double alpha, + const double *a, const int lda, const double *b, + const int ldb, const double beta, double *c, + const int ldc) { + const char transa_char = (transa) ? 'T' : 'N'; + const char transb_char = (transb) ? 'T' : 'N'; + +#if defined(__LIBXSMM) + libxsmm_dgemm(&transa_char, &transb_char, &m, &n, &k, &alpha, a, &lda, b, + &ldb, &beta, c, &ldc); +#else + dgemm_(&transa_char, &transb_char, &m, &n, &k, &alpha, a, &lda, b, &ldb, + &beta, c, &ldc); +#endif +} + +/******************************************************************************* + * \brief Private hash function based on Szudzik's elegant pairing. + * Using unsigned int to return a positive number even after overflow. + * https://en.wikipedia.org/wiki/Pairing_function#Other_pairing_functions + * https://stackoverflow.com/a/13871379 + * http://szudzik.com/ElegantPairing.pdf + * \author Ole Schuett + ******************************************************************************/ +static inline unsigned int hash(const dbm_task_t task) { + const unsigned int m = task.m, n = task.n, k = task.k; + const unsigned int mn = (m >= n) ? m * m + m + n : m + n * n; + const unsigned int mnk = (mn >= k) ? mn * mn + mn + k : mn + k * k; + return mnk; +} + +/******************************************************************************* + * \brief Internal routine for executing the tasks in given batch on the CPU. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cpu_process_batch(const int ntasks, dbm_task_t batch[ntasks], + const bool transa, const bool transb, + const double alpha, + const dbm_pack_t *pack_a, + const dbm_pack_t *pack_b, + dbm_shard_t *shard_c) { + + dbm_shard_allocate_promised_blocks(shard_c); + +#if defined(__LIBXSMM) + + // Sort tasks approximately by m,n,k via bucket sort. + int buckets[BATCH_NUM_BUCKETS]; + memset(buckets, 0, BATCH_NUM_BUCKETS * sizeof(int)); + for (int itask = 0; itask < ntasks; itask++) { + const int i = hash(batch[itask]) % BATCH_NUM_BUCKETS; + buckets[i]++; + } + for (int i = 1; i < BATCH_NUM_BUCKETS; i++) { + buckets[i] += buckets[i - 1]; + } + assert(buckets[BATCH_NUM_BUCKETS - 1] == ntasks); + int batch_order[ntasks]; + for (int itask = 0; itask < ntasks; itask++) { + const int i = hash(batch[itask]) % BATCH_NUM_BUCKETS; + buckets[i]--; + batch_order[buckets[i]] = itask; + } + + // Prepare arguments for libxsmm_dmmdispatch(). + const double beta = 1.0; + int flags = 0; + flags |= (transa) ? LIBXSMM_GEMM_FLAG_TRANS_A : 0; + flags |= (transb) ? LIBXSMM_GEMM_FLAG_TRANS_B : 0; + const int prefetch = LIBXSMM_PREFETCH_NONE; // somehow prefetching is slower + + // Loop over tasks. + libxsmm_dmmfunction kernel_func = NULL; + int kernel_m = 0, kernel_n = 0, kernel_k = 0; + for (int itask = 0; itask < ntasks; itask++) { + const dbm_task_t task = batch[batch_order[itask]]; + + if (task.m != kernel_m || task.n != kernel_n || task.k != kernel_k) { + kernel_func = libxsmm_dmmdispatch(task.m, task.n, task.k, NULL /*lda*/, + NULL /*ldb*/, NULL /*ldc*/, &alpha, + &beta, &flags, &prefetch); + kernel_m = task.m; + kernel_n = task.n; + kernel_k = task.k; + } + + const double *data_a = &pack_a->data[task.offset_a]; + const double *data_b = &pack_b->data[task.offset_b]; + double *data_c = &shard_c->data[task.offset_c]; + + if (kernel_func != NULL) { + kernel_func(data_a, data_b, data_c); + } else { + const int lda = (transa) ? task.k : task.m; + const int ldb = (transb) ? task.n : task.k; + const int ldc = task.m; + dgemm(transa, transb, task.m, task.n, task.k, alpha, data_a, lda, data_b, + ldb, 1.0, data_c, ldc); + } + } + +#else + + // Fallback to BLAS when libxsmm is not available. + for (int itask = 0; itask < ntasks; itask++) { + const dbm_task_t task = batch[itask]; + const int lda = (transa) ? task.k : task.m; + const int ldb = (transb) ? task.n : task.k; + const int ldc = task.m; + const double *data_a = &pack_a->data[task.offset_a]; + const double *data_b = &pack_b->data[task.offset_b]; + double *data_c = &shard_c->data[task.offset_c]; + dgemm(transa, transb, task.m, task.n, task.k, alpha, data_a, lda, data_b, + ldb, 1.0, data_c, ldc); + } + +#endif +} + +// EOF diff --git a/src/dbm/dbm_multiply_cpu.h b/src/dbm/dbm_multiply_cpu.h new file mode 100644 index 0000000000..a3ba47e3c5 --- /dev/null +++ b/src/dbm/dbm_multiply_cpu.h @@ -0,0 +1,28 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ +#ifndef DBM_MULTIPLY_CPU_H +#define DBM_MULTIPLY_CPU_H + +#include + +#include "dbm_multiply_internal.h" +#include "dbm_shard.h" + +/******************************************************************************* + * \brief Internal routine for executing the tasks in given batch on the CPU. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cpu_process_batch(const int ntasks, dbm_task_t batch[ntasks], + const bool transa, const bool transb, + const double alpha, + const dbm_pack_t *pack_a, + const dbm_pack_t *pack_b, + dbm_shard_t *shard_c); + +#endif + +// EOF diff --git a/src/dbm/dbm_multiply_cuda.cu b/src/dbm/dbm_multiply_cuda.cu new file mode 100644 index 0000000000..c747a7e2d8 --- /dev/null +++ b/src/dbm/dbm_multiply_cuda.cu @@ -0,0 +1,270 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#ifdef __DBM_CUDA + +#include +#include + +#include "../offload/offload_library.h" +#include "dbm_mempool.h" +#include "dbm_multiply_cuda.h" + +/******************************************************************************* + * \brief Check given Cuda status and upon failure abort with a nice message. + * \author Ole Schuett + ******************************************************************************/ +#define CHECK(status) \ + if (status != cudaSuccess) { \ + fprintf(stderr, "ERROR: %s %s %d\n", cudaGetErrorString(status), __FILE__, \ + __LINE__); \ + abort(); \ + } + +/******************************************************************************* + * \brief Atomic add for doubles that also works prior to compute capability 6. + * \author Ole Schuett + ******************************************************************************/ +__device__ static void atomicAddDouble(double *address, double val) { + if (val == 0.0) + return; + +#if __CUDA_ARCH__ >= 600 + atomicAdd(address, val); // part of cuda library +#else + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + + // Uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + +#endif +} + +/******************************************************************************* + * \brief Internal routine for intializing the cuda backend. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cuda_start(const int max_batch_size, const int nshards, + dbm_shard_t *shards_c_host, + dbm_multiply_cuda_context_t *ctx) { + // Select GPU device. + offload_set_device(); + + ctx->nshards = nshards; + ctx->shards_c_host = shards_c_host; + ctx->max_batch_size = max_batch_size; + CHECK(cudaStreamCreate(&ctx->main_stream)); + + // Allocate device storage for batches. + const size_t size = nshards * max_batch_size * sizeof(dbm_task_t); + ctx->batches_dev = (dbm_task_t *)dbm_mempool_device_malloc(size); + + // Allocate and upload shards of result matrix C. + ctx->shards_c_dev = + (dbm_shard_cuda_t *)malloc(nshards * sizeof(dbm_shard_cuda_t)); + for (int i = 0; i < nshards; i++) { + CHECK(cudaStreamCreate(&ctx->shards_c_dev[i].stream)); + ctx->shards_c_dev[i].data_size = ctx->shards_c_host[i].data_size; + const size_t size = ctx->shards_c_dev[i].data_size * sizeof(double); + ctx->shards_c_dev[i].data = (double *)dbm_mempool_device_malloc(size); + CHECK(cudaMemcpyAsync(ctx->shards_c_dev[i].data, ctx->shards_c_host[i].data, + size, cudaMemcpyHostToDevice, + ctx->shards_c_dev[i].stream)); + } +} + +/******************************************************************************* + * \brief Private routine for uploading a single pack onto the device. + * \author Ole Schuett + ******************************************************************************/ +static void upload_pack(const dbm_pack_t *pack_host, dbm_pack_t *pack_dev, + const cudaStream_t stream) { + + const size_t size = pack_host->data_size * sizeof(double); + if (pack_dev->data_size < pack_host->data_size) { + dbm_mempool_free(pack_dev->data); + pack_dev->data = (double *)dbm_mempool_device_malloc(size); + } + CHECK(cudaMemcpyAsync(pack_dev->data, pack_host->data, size, + cudaMemcpyHostToDevice, stream)); +} + +/******************************************************************************* + * \brief Internal routine for uploading newly arrived packs onto the device. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cuda_upload_packs(const dbm_pack_t *pack_a, + const dbm_pack_t *pack_b, + dbm_multiply_cuda_context_t *ctx) { + // Select GPU device. + offload_set_device(); + + // Wait for all c-streams to complete before overwriting old packs. + cudaEvent_t event; + CHECK(cudaEventCreate(&event)); + for (int i = 0; i < ctx->nshards; i++) { + CHECK(cudaEventRecord(event, ctx->shards_c_dev[i].stream)) + CHECK(cudaStreamWaitEvent(ctx->main_stream, event, 0)); + } + + upload_pack(pack_a, &ctx->pack_a_dev, ctx->main_stream); + upload_pack(pack_b, &ctx->pack_b_dev, ctx->main_stream); + + // Have all c-streams wait until new packs are uploaded. + CHECK(cudaEventRecord(event, ctx->main_stream)) + for (int i = 0; i < ctx->nshards; i++) { + CHECK(cudaStreamWaitEvent(ctx->shards_c_dev[i].stream, event, 0)); + } + CHECK(cudaEventDestroy(event)); +} + +/******************************************************************************* + * \brief A very naive - but generic - matrix multiplication kernel. + * \author Ole Schuett + ******************************************************************************/ +__global__ static void +process_batch_kernel(const bool transa, const bool transb, const double alpha, + const dbm_task_t *batch, const double *pack_a_data, + const double *pack_b_data, double *shard_c_data) { + + const dbm_task_t task = batch[blockIdx.x]; + const int lda = (transa) ? task.k : task.m; + const int ldb = (transb) ? task.n : task.k; + const int ldc = task.m; + const double *data_a = &pack_a_data[task.offset_a]; + const double *data_b = &pack_b_data[task.offset_b]; + double *data_c = &shard_c_data[task.offset_c]; + + for (int i = threadIdx.z; i < task.m; i += blockDim.z) { + for (int j = threadIdx.y; j < task.n; j += blockDim.y) { + for (int l = threadIdx.x; l < task.k; l += blockDim.x) { + const int idx_a = (transa) ? i * lda + l : l * lda + i; + const int idx_b = (transb) ? l * ldb + j : j * ldb + l; + const int idx_c = j * ldc + i; + const double val = alpha * data_a[idx_a] * data_b[idx_b]; + atomicAddDouble(&data_c[idx_c], val); + } + } + } +} + +/******************************************************************************* + * \brief Internal routine for executing the tasks in given batch on the GPU. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cuda_process_batch(const int ntasks, const dbm_task_t *batch, + const bool transa, const bool transb, + const double alpha, const int kshard, + dbm_multiply_cuda_context_t *ctx) { + if (ntasks == 0) { + return; // Nothing to do. + } + + // Select GPU device. + offload_set_device(); + + const dbm_shard_t *shard_c_host = &ctx->shards_c_host[kshard]; + dbm_shard_cuda_t *shard_c_dev = &ctx->shards_c_dev[kshard]; + + // Upload new batch. + dbm_task_t *batch_dev = &ctx->batches_dev[kshard * ctx->max_batch_size]; + const size_t size = ntasks * sizeof(dbm_task_t); + CHECK(cudaMemcpyAsync(batch_dev, batch, size, cudaMemcpyHostToDevice, + shard_c_dev->stream)); + cudaEvent_t batch_uploaded; + CHECK(cudaEventCreate(&batch_uploaded)); + CHECK(cudaEventRecord(batch_uploaded, shard_c_dev->stream)); + + // Grow shard_c_dev->data if nessecary. + if (shard_c_dev->data_size != shard_c_host->data_promised) { + // TODO experiment with over-allocation. + double *old_data_dev = shard_c_dev->data; + const size_t old_size = shard_c_dev->data_size * sizeof(double); + shard_c_dev->data_size = shard_c_host->data_promised; + const size_t new_size = shard_c_dev->data_size * sizeof(double); + shard_c_dev->data = (double *)dbm_mempool_device_malloc(new_size); + CHECK(cudaMemsetAsync(shard_c_dev->data, 0, new_size, + shard_c_dev->stream)); // TODO: zero only tail + CHECK(cudaMemcpyAsync(shard_c_dev->data, old_data_dev, old_size, + cudaMemcpyDeviceToDevice, shard_c_dev->stream)); + // Wait for copy to complete before freeing old buffer. + CHECK(cudaStreamSynchronize(shard_c_dev->stream)); + dbm_mempool_free(old_data_dev); + } + + // Launch kernel. + const int nblocks = ntasks; // TODO tune launch parameters. + const dim3 threads_per_block(4, 4, 4); + const size_t smem_per_block = 0; + process_batch_kernel<<stream>>>( + transa, transb, alpha, batch_dev, ctx->pack_a_dev.data, + ctx->pack_b_dev.data, shard_c_dev->data); + CHECK(cudaGetLastError()); + + // Wait for batch to be uploaded before refilling it. + CHECK(cudaEventSynchronize(batch_uploaded)); + CHECK(cudaEventDestroy(batch_uploaded)); +} + +/******************************************************************************* + * \brief Internal routine for downloading results from the device. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cuda_download_results(dbm_multiply_cuda_context_t *ctx) { + // Select GPU device. + offload_set_device(); + +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < ctx->nshards; i++) { + // Grow host buffer if nessecary. + dbm_shard_t *shard_c_host = &ctx->shards_c_host[i]; + dbm_shard_allocate_promised_blocks(shard_c_host); + + // Download results from device. + dbm_shard_cuda_t *shard_c_dev = &ctx->shards_c_dev[i]; + assert(shard_c_host->data_size == shard_c_dev->data_size); + const size_t size = shard_c_dev->data_size * sizeof(double); + CHECK(cudaMemcpyAsync(shard_c_host->data, shard_c_dev->data, size, + cudaMemcpyDeviceToHost, shard_c_dev->stream)); + } +} + +/******************************************************************************* + * \brief Internal routine for shutting down the cuda backend. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cuda_stop(dbm_multiply_cuda_context_t *ctx) { + // Select GPU device. + offload_set_device(); + + // Wait for completion, then free cuda ressources. +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < ctx->nshards; i++) { + dbm_shard_cuda_t *shard_c_dev = &ctx->shards_c_dev[i]; + CHECK(cudaStreamSynchronize(shard_c_dev->stream)); + CHECK(cudaStreamDestroy(shard_c_dev->stream)); + dbm_mempool_free(shard_c_dev->data); + } + free(ctx->shards_c_dev); + + dbm_mempool_free(ctx->pack_a_dev.data); + dbm_mempool_free(ctx->pack_b_dev.data); + dbm_mempool_free(ctx->batches_dev); + CHECK(cudaStreamDestroy(ctx->main_stream)); +} + +#endif // __DBM_CUDA + +// EOF diff --git a/src/dbm/dbm_multiply_cuda.h b/src/dbm/dbm_multiply_cuda.h new file mode 100644 index 0000000000..67b6842fa0 --- /dev/null +++ b/src/dbm/dbm_multiply_cuda.h @@ -0,0 +1,94 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#ifndef DBM_MULTIPLY_CUDA_H +#define DBM_MULTIPLY_CUDA_H + +#ifdef __DBM_CUDA + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include "dbm_multiply_internal.h" +#include "dbm_shard.h" + +/******************************************************************************* + * \brief Internal struct for storing per shard cuda objects. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + double *data; // on the device + int data_size; + cudaStream_t stream; +} dbm_shard_cuda_t; + +/******************************************************************************* + * \brief Internal struct for storing the cuda backend's context. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + cudaStream_t main_stream; + + int nshards; + dbm_shard_t *shards_c_host; + dbm_shard_cuda_t *shards_c_dev; + + dbm_pack_t pack_a_dev; + dbm_pack_t pack_b_dev; + + int max_batch_size; + dbm_task_t *batches_dev; +} dbm_multiply_cuda_context_t; + +/******************************************************************************* + * \brief Internal routine for intializing the cuda backend. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cuda_start(const int max_batch_size, const int nshards, + dbm_shard_t *shards_c_host, + dbm_multiply_cuda_context_t *ctx); + +/******************************************************************************* + * \brief Internal routine for uploading newly arrived packs onto the device. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cuda_upload_packs(const dbm_pack_t *pack_a, + const dbm_pack_t *pack_b, + dbm_multiply_cuda_context_t *ctx); + +/******************************************************************************* + * \brief Internal routine for executing the tasks in given batch on the GPU. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cuda_process_batch(const int ntasks, const dbm_task_t *batch, + const bool transa, const bool transb, + const double alpha, const int kshard, + dbm_multiply_cuda_context_t *ctx); + +/******************************************************************************* + * \brief Internal routine for downloading results from the device. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cuda_download_results(dbm_multiply_cuda_context_t *ctx); + +/******************************************************************************* + * \brief Internal routine for shutting down the cuda backend. + * \author Ole Schuett + ******************************************************************************/ +void dbm_multiply_cuda_stop(dbm_multiply_cuda_context_t *ctx); + +#ifdef __cplusplus +} +#endif + +#endif // __DBM_CUDA +#endif + +// EOF diff --git a/src/dbm/dbm_multiply_internal.h b/src/dbm/dbm_multiply_internal.h new file mode 100644 index 0000000000..71248b75e5 --- /dev/null +++ b/src/dbm/dbm_multiply_internal.h @@ -0,0 +1,39 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#ifndef DBM_MULTIPLY_INTERNAL_H +#define DBM_MULTIPLY_INTERNAL_H + +#include "dbm_shard.h" + +/******************************************************************************* + * \brief Internal struct for storing a pack - essentially a shard for MPI. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + int nblocks; + int data_size; + dbm_block_t *blocks; + double *data; +} dbm_pack_t; + +/******************************************************************************* + * \brief Internal struct for storing a task, ie. a single block multiplication. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + int m; + int n; + int k; + int offset_a; + int offset_b; + int offset_c; +} dbm_task_t; + +#endif + +// EOF diff --git a/src/dbm/dbm_shard.c b/src/dbm/dbm_shard.c new file mode 100644 index 0000000000..5b0816574a --- /dev/null +++ b/src/dbm/dbm_shard.c @@ -0,0 +1,198 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include + +#include "dbm_hyperparams.h" +#include "dbm_shard.h" + +/******************************************************************************* + * \brief Internal routine for initializing a shard. + * \author Ole Schuett + ******************************************************************************/ +void dbm_shard_init(dbm_shard_t *shard) { + shard->nblocks = 0; + shard->nblocks_allocated = INITIAL_NBLOCKS_ALLOCATED; + shard->blocks = malloc(shard->nblocks_allocated * sizeof(dbm_block_t)); + + shard->hashtable_size = HASHTABLE_FACTOR * shard->nblocks_allocated; + shard->hashtable = calloc(shard->hashtable_size, sizeof(int)); + + shard->data_size = 0; + shard->data_promised = 0; + shard->data_allocated = INITIAL_DATA_ALLOCATED; + shard->data = malloc(shard->data_allocated * sizeof(double)); + + omp_init_lock(&shard->lock); +} + +/******************************************************************************* + * \brief Internal routine for releasing a shard. + * \author Ole Schuett + ******************************************************************************/ +void dbm_shard_release(dbm_shard_t *shard) { + free(shard->blocks); + free(shard->hashtable); + free(shard->data); + omp_destroy_lock(&shard->lock); +} + +/******************************************************************************* + * \brief Private hash function based on Szudzik's elegant pairing. + * Using unsigned int to return a positive number even after overflow. + * https://en.wikipedia.org/wiki/Pairing_function#Other_pairing_functions + * https://stackoverflow.com/a/13871379 + * http://szudzik.com/ElegantPairing.pdf + * \author Ole Schuett + ******************************************************************************/ +static inline unsigned int hash(const unsigned int row, + const unsigned int col) { + return (row >= col) ? row * row + row + col : row + col * col; +} + +/******************************************************************************* + * \brief Private routine for inserting a block into a shard's hashtable. + * \author Ole Schuett + ******************************************************************************/ +static void hashtable_insert(dbm_shard_t *shard, const int block_idx) { + assert(0 <= block_idx && block_idx < shard->nblocks); + const dbm_block_t *blk = &shard->blocks[block_idx]; + int slot = hash(blk->row, blk->col) % shard->hashtable_size; + while (true) { + if (shard->hashtable[slot] == 0) { + shard->hashtable[slot] = block_idx + 1; // 1-based because 0 means empty + return; + } + // linear probing + slot = (slot + 1) % shard->hashtable_size; + } +} + +/******************************************************************************* + * \brief Internal routine for looking up a block from a shard. + * \author Ole Schuett + ******************************************************************************/ +dbm_block_t *dbm_shard_lookup(const dbm_shard_t *shard, const int row, + const int col) { + int slot = hash(row, col) % shard->hashtable_size; + while (true) { + const int block_idx = shard->hashtable[slot] - 1; // 1-based, 0 means empty. + if (block_idx < 0) { + return NULL; // block not found + } + assert(0 <= block_idx && block_idx < shard->nblocks); + dbm_block_t *blk = &shard->blocks[block_idx]; + if (blk->row == row && blk->col == col) { + return blk; + } + // linear probing + slot = (slot + 1) % shard->hashtable_size; + } +} + +/******************************************************************************* + * \brief Internal routine for allocating the metadata of a new block. + * \author Ole Schuett + ******************************************************************************/ +dbm_block_t *dbm_shard_promise_new_block(dbm_shard_t *shard, const int row, + const int col, const int block_size) { + // Grow blocks array if nessecary. + if (shard->nblocks_allocated < shard->nblocks + 1) { + dbm_block_t *old_blocks = shard->blocks; + shard->nblocks_allocated = ALLOCATION_FACTOR * (shard->nblocks + 1); + shard->blocks = malloc(shard->nblocks_allocated * sizeof(dbm_block_t)); + memcpy(shard->blocks, old_blocks, shard->nblocks * sizeof(dbm_block_t)); + free(old_blocks); + + // rebuild hashtable + free(shard->hashtable); + shard->hashtable_size = HASHTABLE_FACTOR * shard->nblocks_allocated; + shard->hashtable = calloc(shard->hashtable_size, sizeof(int)); + for (int i = 0; i < shard->nblocks; i++) { + hashtable_insert(shard, i); + } + } + + const int new_block_idx = shard->nblocks; + shard->nblocks++; + dbm_block_t *new_block = &shard->blocks[new_block_idx]; + new_block->row = row; + new_block->col = col; + new_block->offset = shard->data_promised; + new_block->norm = 0.0; // initially data will be zeroed + shard->data_promised += block_size; + // The data_size will be increase after the memory is allocated and zeroed. + hashtable_insert(shard, new_block_idx); + return new_block; +} + +/******************************************************************************* + * \brief Internal routine for allocating and zeroing any promised block's data. + * \author Ole Schuett + ******************************************************************************/ +void dbm_shard_allocate_promised_blocks(dbm_shard_t *shard) { + + // Reallocate data array if nessecary. + if (shard->data_promised > shard->data_allocated) { + double *old_data = shard->data; + shard->data_allocated = ALLOCATION_FACTOR * shard->data_promised; + shard->data = malloc(shard->data_allocated * sizeof(double)); + memcpy(shard->data, old_data, shard->data_size * sizeof(double)); + free(old_data); + } + + // Zero new blocks. + // The following memset is usually the first touch of the memory, which leads + // to frequent page faults. The executing thread determines the NUMA location + if (shard->data_promised > shard->data_size) { + const int tail = shard->data_promised - shard->data_size; + memset(&shard->data[shard->data_size], 0, tail * sizeof(double)); + shard->data_size = shard->data_promised; + } +} + +/******************************************************************************* + * \brief Internal routine for getting block or promising a new one. + * \author Ole Schuett + ******************************************************************************/ +dbm_block_t *dbm_shard_get_or_promise_block(dbm_shard_t *shard, const int row, + const int col, + const int block_size) { + dbm_block_t *existing_blk = dbm_shard_lookup(shard, row, col); + if (existing_blk != NULL) { + return existing_blk; + } else { + return dbm_shard_promise_new_block(shard, row, col, block_size); + } +} + +/******************************************************************************* + * \brief Internal routine for getting block or allocating a new one. + * \author Ole Schuett + ******************************************************************************/ +dbm_block_t *dbm_shard_get_or_allocate_block(dbm_shard_t *shard, const int row, + const int col, + const int block_size) { + dbm_block_t *existing_blk = dbm_shard_lookup(shard, row, col); + if (existing_blk != NULL) { + return existing_blk; + } + + // Create a new block. + dbm_block_t *new_blk = + dbm_shard_promise_new_block(shard, row, col, block_size); + dbm_shard_allocate_promised_blocks(shard); + + return new_blk; +} + +// EOF diff --git a/src/dbm/dbm_shard.h b/src/dbm/dbm_shard.h new file mode 100644 index 0000000000..82ff677049 --- /dev/null +++ b/src/dbm/dbm_shard.h @@ -0,0 +1,94 @@ +/*----------------------------------------------------------------------------*/ +/* CP2K: A general program to perform molecular dynamics simulations */ +/* Copyright 2000-2022 CP2K developers group */ +/* */ +/* SPDX-License-Identifier: BSD-3-Clause */ +/*----------------------------------------------------------------------------*/ +#ifndef DBM_SHARD_H +#define DBM_SHARD_H + +#include + +/******************************************************************************* + * \brief Internal struct for storing a block's metadata. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + int row; + int col; + int offset; + float norm; +} dbm_block_t; + +/******************************************************************************* + * \brief Internal struct for storing a matrix shard. + * \author Ole Schuett + ******************************************************************************/ +typedef struct { + int nblocks; + int nblocks_allocated; + dbm_block_t *blocks; + + int hashtable_size; + int *hashtable; // maps row/col to block numbers + + int data_promised; // referenced by a dbm_block_t.offset, but not yet + // allocated + int data_allocated; // we over allocate to amortized the resizing cost + int data_size; // actually allocated and initialized + double *data; + + omp_lock_t lock; // used by dbm_put_block +} dbm_shard_t; + +/******************************************************************************* + * \brief Internal routine for initializing a shard. + * \author Ole Schuett + ******************************************************************************/ +void dbm_shard_init(dbm_shard_t *shard); + +/******************************************************************************* + * \brief Internal routine for releasing a shard. + * \author Ole Schuett + ******************************************************************************/ +void dbm_shard_release(dbm_shard_t *shard); + +/******************************************************************************* + * \brief Internal routine for looking up a block from a shard. + * \author Ole Schuett + ******************************************************************************/ +dbm_block_t *dbm_shard_lookup(const dbm_shard_t *shard, const int row, + const int col); + +/******************************************************************************* + * \brief Internal routine for allocating the metadata of a new block. + * \author Ole Schuett + ******************************************************************************/ +dbm_block_t *dbm_shard_promise_new_block(dbm_shard_t *shard, const int row, + const int col, const int block_size); + +/******************************************************************************* + * \brief Internal routine for allocating and zeroing any promised block's data. + * \author Ole Schuett + ******************************************************************************/ +void dbm_shard_allocate_promised_blocks(dbm_shard_t *shard); + +/******************************************************************************* + * \brief Internal routine for getting block or promising a new one. + * \author Ole Schuett + ******************************************************************************/ +dbm_block_t *dbm_shard_get_or_promise_block(dbm_shard_t *shard, const int row, + const int col, + const int block_size); + +/******************************************************************************* + * \brief Internal routine for getting block or allocating a new one. + * \author Ole Schuett + ******************************************************************************/ +dbm_block_t *dbm_shard_get_or_allocate_block(dbm_shard_t *shard, const int row, + const int col, + const int block_size); + +#endif + +// EOF diff --git a/src/f77_interface.F b/src/f77_interface.F index 92700d0d9f..18be4718da 100644 --- a/src/f77_interface.F +++ b/src/f77_interface.F @@ -47,6 +47,8 @@ MODULE f77_interface unpack_subsys_particles USE dbcsr_api, ONLY: dbcsr_finalize_lib,& dbcsr_init_lib + USE dbm_api, ONLY: dbm_library_finalize,& + dbm_library_init USE eip_environment, ONLY: eip_init USE eip_environment_types, ONLY: eip_env_create,& eip_env_release,& @@ -293,6 +295,8 @@ SUBROUTINE init_cp2k(init_mpi, ierr) CALL pw_cuda_init() CALL grid_library_init() + + CALL dbm_library_init() ELSE ierr = cp_failure_level END IF @@ -327,6 +331,8 @@ SUBROUTINE finalize_cp2k(finalize_mpi, ierr) END DO DEALLOCATE (f_envs) + CALL dbm_library_finalize() + CALL grid_library_finalize() ! Finalize the DBCSR configuration diff --git a/src/offload/offload_library.h b/src/offload/offload_library.h index bae6041ba9..9dd53b0252 100644 --- a/src/offload/offload_library.h +++ b/src/offload/offload_library.h @@ -11,7 +11,7 @@ extern "C" { #endif -#if defined(__GRID_CUDA) || defined(__PW_CUDA) +#if defined(__GRID_CUDA) || defined(__DBM_CUDA) || defined(__PW_CUDA) #define __OFFLOAD_CUDA #elif defined(__GRID_HIP) #define __OFFLOAD_HIP diff --git a/tools/precommit/check_file_properties.py b/tools/precommit/check_file_properties.py index 462da9e72b..175782e3a1 100755 --- a/tools/precommit/check_file_properties.py +++ b/tools/precommit/check_file_properties.py @@ -19,6 +19,7 @@ r"\$\{.*\}\$", r"__.*__", r"CUDA_VERSION", + r"DBM_VALIDATE_AGAINST_DBCSR", r"FD_DEBUG", r"GRID_DO_COLLOCATE", r"INTEL_MKL_VERSION", @@ -97,7 +98,7 @@ C_EXTENSIONS = (".c", ".cu", ".cpp", ".cc", ".h", ".hpp") -BSD_DIRECTORIES = ("src/offload/", "src/grid/") +BSD_DIRECTORIES = ("src/offload/", "src/grid/", "src/dbm/") @lru_cache(maxsize=None) diff --git a/tools/toolchain/scripts/generate_arch_files.sh b/tools/toolchain/scripts/generate_arch_files.sh index 59fc295a2e..3542eca046 100755 --- a/tools/toolchain/scripts/generate_arch_files.sh +++ b/tools/toolchain/scripts/generate_arch_files.sh @@ -116,7 +116,7 @@ LIBS="${CP_LIBS} -lstdc++" # CUDA handling CUDA_LIBS="-lcudart -lnvrtc -lcuda -lcufft -lcublas -lrt IF_DEBUG(-lnvToolsExt|)" -CUDA_DFLAGS="-D__GRID_CUDA -D__DBCSR_ACC -D__PW_CUDA IF_DEBUG(-D__OFFLOAD_PROFILING|)" +CUDA_DFLAGS="-D__GRID_CUDA -D__DBM_CUDA -D__DBCSR_ACC -D__PW_CUDA IF_DEBUG(-D__OFFLOAD_PROFILING|)" if [ "${ENABLE_CUDA}" = __TRUE__ ] && [ "${GPUVER}" != no ]; then LIBS="${LIBS} IF_CUDA(${CUDA_LIBS}|)" DFLAGS="IF_CUDA(${CUDA_DFLAGS}|) ${DFLAGS}"