DBM: Add new Distributed Block-sparse Matrix library

cp2k · Jan 17, 2022 · 82a440d · 82a440d
1 parent 48211d0
commit 82a440d
Show file tree

Hide file tree

Showing 35 changed files with 5,645 additions and 3 deletions.
diff --git a/INSTALL.md b/INSTALL.md
@@ -199,6 +199,7 @@ the FFTW3 threading library libfftw3_threads (or libfftw3_omp) is required.
   It requires to link `-lnvToolsExt`.
 - Link to a blas/scalapack library that accelerates large DGEMMs (e.g. libsci_acc)
 - Use the `-D__GRID_CUDA` to compile the GPU and HYBRID backends for the grid library.
+- Use the `-D__DBM_CUDA` to compile the GPU backend for the sparse tensor library.
 
 ### 2k. LIBXC (optional, wider choice of xc functionals)
 

diff --git a/src/PACKAGE b/src/PACKAGE
@@ -20,6 +20,7 @@
         "dbcsrx",
         "arnoldi",
         "grid",
+        "dbm",
     ],
     "implicit": "INIT_METADYN|META_FORCE_CALCULATION|plumed_f_installed|plumed_f_gcreate|plumed_f_gcmd",
 }
diff --git a/src/cp2k_info.F b/src/cp2k_info.F
@@ -289,6 +289,9 @@ FUNCTION cp2k_flags() RESULT(flags)
 #if defined(__GRID_HIP)
       flags = TRIM(flags)//" grid_hip"
 #endif
+#if defined(__DBM_CUDA)
+      flags = TRIM(flags)//" dbm_cuda"
+#endif
 #if defined(__OFFLOAD_PROFILING)
       flags = TRIM(flags)//" offload_profiling"
 #endif

diff --git a/src/dbm/LICENSE b/src/dbm/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2021, CP2K developers group
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/dbm/Makefile b/src/dbm/Makefile
@@ -0,0 +1,41 @@
+.PHONY : all clean
+
+all: dbm_miniapp.x
+
+clean:
+	rm -fv *.o */*.o *.x ../offload/*.o
+
+CFLAGS := -fopenmp -g -O3 -march=native -Wall -Wextra
+NVFLAGS := -g -O3 -lineinfo -arch sm_70 -Wno-deprecated-gpu-targets -Xcompiler "$(CFLAGS)" -D__DBM_CUDA
+LIBS := -lm -lblas -lstdc++
+
+ALL_HEADERS := $(shell find . -name "*.h") $(shell find ../offload/ -name "*.h")
+ALL_OBJECTS := ../offload/offload_library.o \
+               dbm_distribution.o \
+               dbm_library.o \
+               dbm_matrix.o \
+               dbm_mempool.o \
+               dbm_mpi.o \
+               dbm_multiply.o \
+               dbm_multiply_comm.o \
+               dbm_multiply_cpu.o \
+               dbm_shard.o
+
+# Enable Cuda when nvcc compiler is present.
+NVCC := $(shell which nvcc)
+ifneq ($(NVCC),)
+LIBS += -lcudart -lcuda -lcublas -L${CUDA_PATH}/lib64
+CFLAGS += -I${CUDA_PATH}/include -D__DBM_CUDA
+ALL_OBJECTS += dbm_multiply_cuda.o
+
+%.o: %.cu $(ALL_HEADERS)
+	cd $(dir $<); $(NVCC) -c $(NVFLAGS) $(notdir $<)
+endif
+
+%.o: %.c $(ALL_HEADERS)
+	cd $(dir $<); $(CC) -c -std=c11 $(CFLAGS) $(notdir $<)
+
+dbm_miniapp.x: dbm_miniapp.o $(ALL_OBJECTS)
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
+
+#EOF
diff --git a/src/dbm/PACKAGE b/src/dbm/PACKAGE
@@ -0,0 +1,5 @@
+{
+    "description": "Distributed Block-sparse Matrix",
+    "requires": ["../base", "../mpiwrap", "../offload"],
+    "public": ["dbm_api.F"],
+}
diff --git a/src/dbm/README.md b/src/dbm/README.md
@@ -0,0 +1,80 @@
+# DBM: Distributed Block-sparse Matrices
+
+The DBM is a drop-in replacement for [DBCSR](https://github.com/cp2k/dbcsr)
+written in C. For the time being, only the features required by [DBT](../dbt/) are implemented.
+
+## Storage
+
+The DBM uses [coordinate lists](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO))
+as internal storage format.
+An existing block is represented by the following data structure:
+
+```C
+typedef struct {
+  int row; // zero based
+  int col;
+  int offset;
+  float norm;
+} dbm_block_t;
+```
+
+The `norm` is cached for performance reasons.
+A negative value indicates that the norm is invalid and needs to be recomputed.
+
+To allow for efficient OpenMP parallelism the blocks are
+[sharded](https://en.wikipedia.org/wiki/Shard_(database_architecture)) via round-robin:
+
+```C
+const int ishard = row % matrix->nshards;
+```
+
+## MPI Communication
+
+The communication scheme in [dbm_multiply_comm.c](./dbm_multiply_comm.c) is decoupled
+from the local multiplication in [dbm_multiply.c](./dbm_multiply.c) via the
+[iterator pattern](https://en.wikipedia.org/wiki/Iterator_pattern):
+
+```C
+while (dbm_comm_iterator_next(iter, &pack_a, &pack_b)) {
+  backend_upload_packs(pack_a, pack_b, ctx);
+  multiply_packs(transa, transb, alpha, pack_a, pack_b, matrix_a, matrix_b,
+                 matrix_c, rows_left_max_eps, flop, ctx);
+}
+```
+
+## Backends
+
+The last stage of the multiplication is handled by the backends for specific hardware, e.g.
+[CPU](./dbm_multiply_cpu.c) and [CUDA](./dbm_multiply_cuda.cu).
+They are passed batches of tasks for processing. Each task describes a single block
+multiplication. The simplest backend implementation looks like this:
+
+<!-- markdownlint-disable MD013 -->
+```C
+for (int itask = 0; itask < ntasks; itask++) {
+  const dbm_task_t task = batch[itask];
+  const int lda = (transa) ? task.k : task.m;
+  const int ldb = (transb) ? task.n : task.k;
+  const int ldc = task.m;
+  const double *data_a = &pack_a->data[task.offset_a];
+  const double *data_b = &pack_b->data[task.offset_b];
+  double *data_c = &shard_c->data[task.offset_c];
+  dgemm(transa, transb, task.m, task.n, task.k, alpha, data_a, lda, data_b, ldb, 1.0, data_c, ldc);
+}
+```
+<!-- markdownlint-enable MD013 -->
+
+## MiniApp
+
+The `dbm_miniapp.x` binary allows running a simple smoke test.
+
+```shell
+$ cd cp2k/src/dbm
+$ make -j
+$ OMP_NUM_THREADS=2 ./dbm_miniapp.x
+MPI ranks:      1
+OpenMP threads: 2
+reserve blocks: 0.047 seconds
+matrix multiply: 0.001 s, 2.1 MFLOP/s
+done :-)
+```