Update DBCSR to v2.1.0-rc10 (#809)
* Init DBCSR as first to set GPU device

* Add a comment: DBCSR has to be the first GPU call (workaround for multi-GPU)

* Remove MPI allocation in DBCSR

* Update dbcsr to 2.1.0-rc10
alazzaro committed Mar 10, 2020
1 parent 1736ca2 commit 6f13f0d
Showing 7 changed files with 30 additions and 24 deletions.
12 changes: 6 additions & 6 deletions INSTALL.md
@@ -95,23 +95,24 @@ FFTW can be used to improve FFT speed on a wide range of architectures. It is st
* `-D__MAX_CONTR=4` (default=2) can be used to compile efficient contraction kernels up to l=4, but the build time will increase accordingly.

### 2h. libsmm (optional, improved performance for matrix multiplication)
* A library for small matrix multiplies can be built from the included source (see tools/build_libsmm/README). Usually only the double precision real and perhaps complex is needed. Link to the generated libraries. For a couple of architectures prebuilt libsmm are available at https://www.cp2k.org/static/downloads/libsmm/.
* A library for small matrix multiplies can be built from the included source (see exts/dbcsr/tools/build_libsmm/README). Usually only the double precision real and perhaps complex is needed. Link to the generated libraries. For a couple of architectures prebuilt libsmm are available at https://www.cp2k.org/static/downloads/libsmm/.
* Add `-D__HAS_smm_dnn` to the defines to make the code use the double precision real library. Similarly use `-D__HAS_smm_snn` for single precision real and `-D__HAS_smm_znn` / `-D__HAS_smm_cnn` for double / single precision complex.
* Add `-D__HAS_smm_vec` to enable the new vectorized interfaces of libsmm.
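
Purely as an illustration (not part of the commit's diff), an arch-file fragment linking a prebuilt double-precision real libsmm might look like the sketch below; `LIBSMM_DIR` and the library name `smm_dnn` are placeholders for this example, not names taken from the commit.

```make
# Hypothetical arch-file fragment: enable a double-precision real libsmm.
# LIBSMM_DIR and the exact library name are placeholders for this sketch.
LIBSMM_DIR = $(HOME)/libsmm
DFLAGS    += -D__HAS_smm_dnn -D__HAS_smm_vec
FCFLAGS   += $(DFLAGS)
LIBS      += -L$(LIBSMM_DIR)/lib -lsmm_dnn
```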

### 2i. libxsmm (optional, improved performance for matrix multiplication)
* A library for matrix operations and deep learning primitives: https://github.com/hfp/libxsmm/
* Add `-D__LIBXSMM` to enable it (with suitable include and library paths)
* Add `-D__LIBXSMM` to enable it, with suitable include and library paths, e.g. `FCFLAGS += -I${LIBXSMM_DIR}/include -D__LIBXSMM` and `LIBS += -L${LIBXSMM_DIR}/lib -lxsmmf -lxsmm -ldl`

### 2j. CUDA (optional, improved performance on GPU systems)
* Specify NVCC (e.g. `NVCC = nvcc`) and NVFLAGS (e.g. `NVFLAGS = -O3 -g -w --std=c++11`) variables.
* `-D__ACC` needed to enable accelerator support.
* Use `-D__DBCSR_ACC` to enable accelerator support for matrix multiplications.
* Add `-lcudart -lrt -lnvrtc` to LIBS.
* Specify the GPU type (e.g. `GPUVER = P100`)
* Add `-lstdc++ -lcudart -lnvrtc -lcuda -lcublas` to LIBS.
* Specify the GPU type (e.g. `GPUVER = P100`), possible values are K20X, K40, K80, P100, V100.
* Specify the C++ compiler (e.g. `CXX = g++`). Remember to set the flags to support C++11 standard.
* Use `-D__PW_CUDA` for CUDA support for PW (gather/scatter/fft) calculations.
* CUFFT 7.0 has a known bug and is therefore disabled by default. NVIDIA's webpage lists a patch (an upgraded cufft version, i.e. >= 7.0.35) - use this together with `-D__HAS_PATCHED_CUFFT_70`.
* Use `-D__CUDA_PROFILING` to turn on Nvidia Tools Extensions.
* Use `-D__CUDA_PROFILING` to turn on Nvidia Tools Extensions. This requires linking `-lnvToolsExt`.
* Link to a blas/scalapack library that accelerates large DGEMMs (e.g. libsci_acc)
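
Likewise for illustration only (not part of the diff), a CUDA-enabled arch-file fragment combining the settings listed above could be sketched as follows; the choice of a P100 card and the optimization flags are assumptions.

```make
# Hypothetical arch-file fragment for a CUDA-enabled build (P100 as an example).
NVCC     = nvcc
NVFLAGS  = -O3 -g -w --std=c++11
GPUVER   = P100
CXX      = g++
# C++11 support for the C++ compiler, as noted above.
CXXFLAGS = -O3 -std=c++11
# Accelerator support for DBCSR and PW, plus the CUDA runtime libraries.
DFLAGS  += -D__ACC -D__DBCSR_ACC -D__PW_CUDA
FCFLAGS += $(DFLAGS)
LIBS    += -lstdc++ -lcudart -lnvrtc -lcuda -lcublas
```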

### 2k. libxc (optional, wider choice of xc functionals)
@@ -260,7 +261,6 @@ Features useful to deal with legacy systems
* `-D__NO_IPI_DRIVER` disables the socket interface in case of troubles compiling on systems that do not support POSIX sockets.
* `-D__HAS_IEEE_EXCEPTIONS` disables trapping temporarily for libraries like scalapack.
* The Makefile automatically compiles in the path to the data directory via the flag `-D__DATA_DIR`. If you want to compile in a different path, set the variable `DATA_DIR` in your arch-file.
* `-D__HAS_NO_CUDA_STREAM_PRIORITIES` - Needed for CUDA sdk version < 5.5
* `-D__NO_STATM_ACCESS` - Do not try to read from /proc/self/statm to get memory usage information. This is otherwise attempted on several Linux-based architectures or when using the NAG or gfortran compilers.
* `-D__CHECK_DIAG` - Debug option which activates an orthonormality check of the eigenvectors calculated by the selected eigensolver.
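
As a final illustration (again not part of the diff), defines like these are typically appended to `DFLAGS` in the arch file, which CP2K arch files usually fold into `FCFLAGS`; the two flags chosen below are just examples from the list above.

```make
# Hypothetical arch-file fragment enabling two of the flags described above.
DFLAGS  += -D__CHECK_DIAG -D__NO_STATM_ACCESS
FCFLAGS += $(DFLAGS)
```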

17 changes: 10 additions & 7 deletions exts/Makefile.inc
@@ -1,4 +1,4 @@
ifeq ("","$(wildcard $(EXTSHOME)/dbcsr/Makefile)")
ifeq ("","$(wildcard $(EXTSHOME)/dbcsr/.cp2k/Makefile)")
$(error "No DBCSR submodule available, please run 'git submodule update --init --recursive'")
endif

@@ -13,19 +13,22 @@ extclean: dbcsrclean
@echo "Clean EXT"

dbcsr:
+$(MAKE) -C $(EXTSHOME)/$@ \
+$(MAKE) -C $(EXTSHOME)/$@ -f .cp2k/Makefile \
INCLUDEMAKE=$(ARCHDIR)/$(ARCH).$(ONEVERSION) \
LIBDIR=$(LIBEXTSDIR)/$@ \
OBJDIR=$(OBJEXTSDIR)/$@ \
FYPPEXE=$(TOOLSRC)/build_utils/fypp
ACC="$(NVCC)" \
ACCFLAGS="$(NVFLAGS)"

dbcsrversion:
@$(MAKE) -C $(EXTSHOME)/dbcsr \
FYPPEXE=$(TOOLSRC)/build_utils/fypp \
@$(MAKE) -C $(EXTSHOME)/dbcsr -f .cp2k/Makefile \
ACC="$(NVCC)" \
ACCFLAGS="$(NVFLAGS)" \
version

dbcsrclean:
@echo "Clean DBCSR"
@$(MAKE) -C $(EXTSHOME)/dbcsr \
FYPPEXE=$(TOOLSRC)/build_utils/fypp \
@$(MAKE) -C $(EXTSHOME)/dbcsr -f .cp2k/Makefile \
ACC="$(NVCC)" \
ACCFLAGS="$(NVFLAGS)" \
clean
2 changes: 1 addition & 1 deletion exts/dbcsr
Submodule dbcsr updated 57 files
+0 −11 .ci/daint.cscs.ch/Jenkinsfile
+0 −43 .ci/daint.cscs.ch/gnu-cmake-3.10.build.sh
+384 −0 .cp2k/Makefile
+6 −0 .cp2k/README.md
+0 −0 .cp2k/check_archives.py
+0 −0 .cp2k/makedep.py
+0 −0 .cp2k/test_makedep.py
+79 −0 .github/workflows/doc-generation.yml
+64 −0 .github/workflows/release.yml
+78 −0 .github/workflows/testing-linux.yml
+5 −12 .pre-commit-config.yaml
+2 −2 .pre-commit/check_header.py
+0 −168 .travis.yml
+10 −7 CMakeLists.txt
+3 −2 CONTRIBUTING.md
+3 −4 DBCSR.md
+0 −498 Makefile
+0 −257 Makefile.inc
+2 −2 README.md
+1 −1 VERSION
+6 −6 cmake/CompilerConfiguration.cmake
+1 −1 cmake/CustomTargets.cmake
+2 −1 cmake/compiler-tests/f95-reshape-order-allocatable.f90
+8 −1 docs/guide/2-user-guide/1-installation/1-install.md
+4 −4 docs/guide/2-user-guide/1-installation/2-cmake-build-recipes.md
+1 −1 docs/guide/2-user-guide/1-installation/3-supported-compilers.md
+1 −1 docs/guide/2-user-guide/1-installation/4-using-dbcsr-in-a-cmake-project.md
+3 −17 docs/guide/3-developer-guide/1-tooling/index.md
+ docs/guide/3-developer-guide/3-programming/1-overview/dbcsr_mm_overview.png
+29 −0 docs/guide/3-developer-guide/3-programming/1-overview/index.md
+0 −33 examples/Makefile
+1 −1 examples/dbcsr_example_3.cpp
+2 −2 src/acc/acc_event.cpp
+1 −1 src/acc/libsmm_acc/README.md
+1 −2 src/acc/libsmm_acc/libsmm_acc.cpp
+1 −1 src/acc/libsmm_acc/libsmm_acc_benchmark.cpp
+1 −1 src/acc/libsmm_acc/notebooks/libsmm_acc_predictive_modeling_features.png
+1 −1 src/base/dbcsr_machine_internal.F
+1 −1 src/core/dbcsr_list.fypp
+1 −1 src/data/dbcsr_ptr_util.F
+8 −5 src/mm/dbcsr_mm.F
+8 −3 src/mm/dbcsr_mm_cannon.F
+1 −1 src/tas/dbcsr_tas_base.F
+1 −1 src/tas/dbcsr_tas_global.F
+1 −1 src/tas/dbcsr_tas_mm.F
+1 −1 src/tas/dbcsr_tas_reshape_ops.F
+1 −1 src/tas/dbcsr_tas_split.F
+1 −1 src/tas/dbcsr_tas_test.F
+1 −1 src/tas/dbcsr_tas_types.F
+1 −1 src/tas/dbcsr_tas_util.F
+1 −1 src/utils/dbcsr_toollib.F
+4 −0 tests/CMakeLists.txt
+0 −45 tests/Makefile.inc
+1 −1 tests/libsmm_acc_timer_multiply.cpp.template
+1 −1 tools/build_utils/fypp
+1 −0 tools/docker/Dockerfile.build-env-ubuntu
+1 −1 tools/fprettify
3 changes: 0 additions & 3 deletions src/cp2k_info.F
@@ -222,9 +222,6 @@ FUNCTION cp2k_flags() RESULT(flags)
#if defined __HAS_LIBGRID
flags = TRIM(flags)//" libgrid"
#endif
#if defined __HAS_NO_CUDA_STREAM_PRIORITIES
flags = TRIM(flags)//" has_no_cuda_stream_priorities"
#endif
#if defined __MAX_CONTR
CALL integer_to_string(__MAX_CONTR, tmp_str)
flags = TRIM(flags)//" max_contr="//TRIM(tmp_str)
5 changes: 3 additions & 2 deletions src/f77_interface.F
@@ -265,12 +265,13 @@ SUBROUTINE init_cp2k(init_mpi, ierr)
! *** init the bibliography ***
CALL add_all_references()

CALL pw_cuda_init()

! Initialize the DBCSR configuration
! Attach the time handler hooks to DBCSR
! DBCSR sets the device for multi-gpu, make sure it is the first GPU call
CALL dbcsr_init_lib(default_para_env%group, timeset_hook, timestop_hook, &
cp_abort_hook, cp_warn_hook, io_unit=unit_nr)

CALL pw_cuda_init()
ELSE
ierr = cp_failure_level
END IF
8 changes: 5 additions & 3 deletions src/start/cp2k_runs.F
@@ -191,14 +191,15 @@ RECURSIVE SUBROUTINE cp2k_run(input_declaration, input_file_name, output_unit, m
NULLIFY (para_env, f_env, dft_control)
CALL cp_para_env_create(para_env, group=mpi_comm, owns_group=.FALSE.)

! DBCSR sets the GPU device, make sure it is the first GPU call
CALL dbcsr_init_lib(mpi_comm, io_unit=output_unit)

CALL cuda_nvtx_init()

CALL pw_cuda_init()

CALL pw_fpga_init()

CALL dbcsr_init_lib(mpi_comm, io_unit=output_unit)

CALL cp_sirius_init()

NULLIFY (globenv, force_env)
@@ -262,11 +263,12 @@ RECURSIVE SUBROUTINE cp2k_run(input_declaration, input_file_name, output_unit, m
CALL cp_sirius_finalize()
CALL pw_cuda_finalize()
CALL pw_fpga_finalize()
! DBCSR sets the GPU device, make sure it is the first GPU call
CALL farming_run(input_declaration, root_section, para_env, initial_variables)
CALL cp_sirius_init()
CALL dbcsr_init_lib(mpi_comm, io_unit=output_unit)
CALL pw_cuda_init()
CALL pw_fpga_init()
CALL cp_sirius_init()
CASE (do_opt_basis)
CALL run_optimize_basis(input_declaration, root_section, para_env)
globenv%run_type_id = none_run
7 changes: 5 additions & 2 deletions tools/regtesting/do_regtest
@@ -219,6 +219,8 @@ else
numprocs=${numprocs:-1}
fi

mem_limit=unlimited

if [[ $numprocs > 1 ]] ; then
if [[ "$farming" == "yes" ]]; then
cp2k_run_prefix=${cp2k_run_prefix:-"${mpiexec} -np $(($maxtasks + 1)) ${valgrindstring}"}
@@ -395,7 +397,7 @@ function run_regtest_dir() {
cp2k_exit_status=43
fi
else
( ulimit -t ${job_max_time} ; ${cp2k_prefix} ${input_file} ${cp2k_postfix} &> ${output_file} ) >& /dev/null
( ulimit -t ${job_max_time} -v ${mem_limit} ; ${cp2k_prefix} ${input_file} ${cp2k_postfix} &> ${output_file} ) >& /dev/null
(( cp2k_exit_status = $? ))
fi
# check if this was a valgrinded run, and adjust exit status as needed
@@ -536,7 +538,7 @@ function run_unittest() {
output_file=${dir_test}/${dir}.out
unittest=${dir##*/}
testcmd="${cp2k_run_prefix} ${dir_base}/${cp2k_dir}/exe/${dir_triplet}/${unittest}.${cp2k_version} ${dir_base}/${cp2k_dir}"
( ulimit -t ${job_max_time} ; ${testcmd} &> ${output_file} ) >& /dev/null
( ulimit -t ${job_max_time} -v ${mem_limit} ; ${testcmd} &> ${output_file} ) >& /dev/null
(( cp2k_exit_status = $? ))
t2=`date +%s`
timing=$((1+t2-t1))
@@ -691,6 +693,7 @@ echo "skiptest = ${skiptest}"
echo "do_unit_test = ${do_unit_test}"
echo "farming = ${farming}"
echo "maxbuildtasks = ${maxbuildtasks}"
echo "mem_limit = ${mem_limit}"

t=1
while [ $t -le ${ndirstoskip} ]; do