Feature: compute module_gint by GPU (#4109)

* add CUDA code for module_gint and fix bug about cusolver * add comments for code_gen.cpp * add integrated test cases for CUDA gint and cusolver * add some gint test cases * modify cuda tests * modify the location of the CUDA test folder * modify some GPU test cases * modify some STRU and INPUT files for GPU test cases to reduce testing time * format INPUT file in GPU test cases * format some code * add curly brackets to if and for statements * add const to some function arguments * fix comments * fix error in gint force * remove const in gint_rho * Update input_conv_test.cpp modify input test about GPU * remove time.sh in tests/integrate * modify INPUT file in GPU test cases * remove inappropriate changes in tests/performance. * remove USE_CUSOLVER_LCAO flag * enable check in non-debug enviroment * Use abbreviations instead of unreasonable naming * fix error in debug * modify the doc about ks_solver * change the default ks_solver to cusolver * modify cuda.md --------- Co-authored-by: A-006 <3158793232@qq.com> Co-authored-by: Mohan Chen <mohan.chen.chen.mohan@gmail.com>
deepmodeling · May 12, 2024 · a2798b2 · a2798b2
1 parent 922db46
commit a2798b2
Show file tree

Hide file tree

Showing 126 changed files with 14,911 additions and 702 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -13,9 +13,8 @@ project(
 option(ENABLE_LCAO "Enable LCAO calculation." ON)
 option(ENABLE_DEEPKS "Enable DeePKS functionality" OFF)
 option(ENABLE_LIBXC "Enable LibXC functionality" OFF)
-option(USE_CUDA "Enable support to CUDA for PW." OFF)
+option(USE_CUDA "Enable support to CUDA for ABACUS." OFF)
 option(ENABLE_FLOAT_FFTW "Enable support to single precision FFTW library." OFF)
-# option(USE_CUSOLVER_LCAO "Enable support to CUSOLVER for LCAO." OFF)
 option(USE_ROCM "Enable support to ROCm." OFF)
 option(USE_OPENMP "Enable OpenMP in ABACUS." ON)
 option(ENABLE_ASAN "Enable AddressSanitizer" OFF)
@@ -68,11 +67,6 @@ if(ENABLE_RAPIDJSON)
   include_directories(${RapidJSON_INCLUDE_PATH})
 endif()
 
-if(USE_CUDA)
-  set(USE_CUSOLVER_LCAO ON)
-else()
-  set(USE_CUSOLVER_LCAO OFF)
-endif()
 # get commit info
 if(COMMIT_INFO)
   find_package(Git)
@@ -247,37 +241,28 @@ endif()
 include(CheckLanguage)
 check_language(CUDA)
 if(CMAKE_CUDA_COMPILER)
-  if(NOT DEFINED USE_CUDA OR NOT DEFINED USE_CUSOLVER_LCAO)
-    if(NOT DEFINED USE_CUDA AND NOT DEFINED USE_CUSOLVER_LCAO)
-      message(
-        "CUDA components detected. \nWill build the CUDA for PW version of ABACUS by default."
-      )
-      set(USE_CUDA ON)
-      set(USE_CUSOLVER_LCAO OFF)
-    elseif(NOT DEFINED USE_CUDA)
-      set(USE_CUDA OFF)
-    else()
-      set(USE_CUSOLVER_LCAO OFF)
-    endif()
+  if(NOT DEFINED USE_CUDA)
+    message(
+      "CUDA components detected. \nWill build the CUDA version of ABACUS by default."
+    )
+    set(USE_CUDA ON)
   else()
-    if(NOT USE_CUDA AND NOT USE_CUSOLVER_LCAO)
+    if(NOT USE_CUDA)
       message(
         STATUS
-          "CUDA components detected, but both USE_CUDA and USE_CUSOLVER_LCAO set to OFF. NOT building CUDA version of ABACUS."
+          "CUDA components detected, but USE_CUDA is set to OFF. NOT building CUDA version of ABACUS."
       )
     endif()
   endif()
 else() # CUDA not found
-  if(USE_CUDA OR USE_CUSOLVER_LCAO)
+  if(USE_CUDA)
     message(
       FATAL_ERROR
-        "USE_CUDA or USE_CUSOLVER_LCAO set but no CUDA components found.")
-    set(USE_CUDA OFF)
-    set(USE_CUSOLVER_LCAO OFF)
+        "USE_CUDA is set but no CUDA components found.")
   endif()
 endif()
 
-if(USE_CUDA OR USE_CUSOLVER_LCAO)
+if(USE_CUDA)
   cmake_minimum_required(VERSION 3.18) # required by `CUDA_ARCHITECTURES` below
   set_if_higher(CMAKE_CXX_STANDARD 14)
   set(CMAKE_CXX_EXTENSIONS ON)
@@ -317,12 +302,12 @@ if(USE_CUDA OR USE_CUSOLVER_LCAO)
   if(USE_CUDA)
     add_compile_definitions(__CUDA)
     add_compile_definitions(__UT_USE_CUDA)
-    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-      set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G")
+    if (CMAKE_BUILD_TYPE STREQUAL "Debug") # extra nvcc debug flags for single-config Debug builds only
+      string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -g -G") # normal variable instead of a FORCE'd cache write, so reconfiguring does not append the flags again
+    endif()
+    if (USE_OPENMP AND OpenMP_CXX_FOUND) # forward the host compiler's OpenMP flags through nvcc
+      string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler=${OpenMP_CXX_FLAGS}") # leaves the user's cached CMAKE_CUDA_FLAGS untouched; NOTE(review): assumes enable_language(CUDA) already ran above - confirm
     endif()
-  endif()
-  if(USE_CUSOLVER_LCAO)
-    add_compile_definitions(__CUSOLVER_LCAO)
   endif()
 endif()
 
@@ -716,7 +701,7 @@ if(ENABLE_LCAO)
   if(USE_ELPA)
     target_link_libraries(${ABACUS_BIN_NAME} genelpa)
   endif()
-  if(USE_CUSOLVER_LCAO)
+  if(USE_CUDA)
     target_link_libraries(diag_cusolver)
   endif()
 endif()

diff --git a/docs/advanced/acceleration/cuda.md b/docs/advanced/acceleration/cuda.md
@@ -1,7 +1,6 @@
 # CUDA GPU Implementations
 
-In ABACUS, we provide the option to use the GPU devices to accelerate the performance.
-And it has the following general features:
+In ABACUS, we provide the option to use GPU devices to accelerate performance. The implementation of GPU acceleration differs between PW basis and LCAO basis. Specifically, under PW basis, it has the following features:
 
 - **Full gpu implementations**: During the SCF progress, `Psi`, `Hamilt`, `Hsolver`, `DiagCG`, and `DiagoDavid` classes are stored or calculated by the GPU devices.
 
@@ -13,6 +12,8 @@ And it has the following general features:
 
 - **Parallel strategy**: K point parallel.
 
+Unlike PW basis, only the grid integration module (module_gint) and the diagonalization of the Hamiltonian matrix (module_hsolver) have been implemented with GPU acceleration under LCAO basis, and the acceleration is limited to gamma only calculation. Additionally, LCAO basis does not support multi-GPU acceleration. Both the grid integration module and the Hamiltonian matrix solver only support acceleration on a single GPU.
+
 ## Required hardware/software
 
 To compile and use ABACUS in CUDA mode, you currently need to have an NVIDIA GPU and install the corresponding NVIDIA CUDA toolkit software on your system (this is only tested on Linux and unsupported on Windows):
@@ -36,14 +37,11 @@ In `INPUT` file we need to set the value keyword [device](../input_files/input-m
 We provides [examples](https://github.com/deepmodeling/abacus-develop/tree/develop/examples/gpu) of gpu calculations.
 
 ## Known limitations
-
-- CG, BPCG and Davidson methods are supported, so the input keyword `ks_solver` can take the values `cg`, `bpcg` or `dav`,
-- Only PW basis is supported, so the input keyword `basis_type` can only take the value `pw`,
+PW basis:
+- CG, BPCG and Davidson methods are supported, so the input keyword `ks_solver` can take the values `cg`, `bpcg` or `dav`.
 - Only k point parallelization is supported, so the input keyword `kpar` will be set to match the number of MPI tasks automatically.
 - By default, CUDA architectures 60, 70, 75, 80, 86, and 89 are compiled (if supported). It can be overriden using the CMake variable [`CMAKE_CUDA_ARCHITECTURES`](https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html) or the environmental variable [`CUDAARCHS`](https://cmake.org/cmake/help/latest/envvar/CUDAARCHS.html).
 
-## FAQ
-```
-Q: Does the GPU implementations support atomic orbital basis sets?
-A: Currently no.
-```
+LCAO basis:
+- Does not support multi-k calculation, so if the input keyword `device` is set to `gpu`, the input keyword `gamma_only` can only take the value `1`.
+- Does not support multi-GPU acceleration.
diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
@@ -61,6 +61,7 @@
     - [search\_radius](#search_radius)
     - [search\_pbc](#search_pbc)
     - [bx, by, bz](#bx-by-bz)
+    - [num\_stream](#num_stream)
   - [Electronic structure](#electronic-structure)
     - [basis\_type](#basis_type)
     - [ks\_solver](#ks_solver)
@@ -643,10 +644,8 @@ If only one value is set (such as `kspacing 0.5`), then kspacing values of a/b/c
   - cpu: for CPUs via Intel, AMD, or Other supported CPU devices
   - gpu: for GPUs via CUDA or ROCm.
 
-  Known limitations:
+  Known limitations: If using the pw basis, the ks_solver must be cg/bpcg/dav to support `gpu` acceleration. If using the lcao basis, `gamma_only` must be set to `1`, as multi-k calculation is currently not supported for `gpu`. lcao_in_pw also does not support `gpu`.
 
-  - pw basis: required by the `gpu` acceleration options
-  - cg/bpcg/dav ks_solver: required by the `gpu` acceleration options
 - **Default**: cpu
 
 ### precision
@@ -883,6 +882,12 @@ These variables are used to control the numerical atomic orbitals related parame
 - **Description**: In the matrix operation of grid integral, bx/by/bz grids (in x, y, z directions) are treated as a whole as a matrix element. A different value will affect the calculation speed. The default is 0, which means abacus will automatically calculate these values.
 - **Default**: 0
 
+### num_stream
+
+- **Type**: Integer
+- **Description**: The number of GPU streams used in `LCAO` calculations. The best value depends on the device; for most devices, a value greater than 2 is sufficient.
+- **Default**: 4
+
 [back to top](#full-list-of-input-keywords)
 
 ## Electronic structure
@@ -914,7 +919,7 @@ calculations.
 
   - **genelpa**: This method should be used if you choose localized orbitals.
   - **scalapack_gvx**: Scalapack can also be used for localized orbitals.
-  - **cusolver**: (Unavailable currently, it will be fixed in future versions) This method needs building with the cusolver component for lcao and at least one gpu is available.
+  - **cusolver**: This method requires ABACUS to be built with CUDA and at least one GPU to be available.
 
   If you set ks_solver=`genelpa` for basis_type=`pw`, the program will be stopped with an error message:
 

diff --git a/examples/gpu/si16_lcao/INPUT b/examples/gpu/si16_lcao/INPUT
@@ -0,0 +1,29 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix			autotest
+calculation     scf
+device          gpu
+gamma_only      1  # GPU acceleration currently only supports gamma_only set to 1.
+ks_solver		cusolver  # if not set, the default ks_solver is cusolver,
+                          # you can also choose genelpa or scalapack_gvx.
+
+#nbands			8
+symmetry		1
+
+#Parameters (2.Iteration)
+ecutwfc			100
+scf_thr		    1e-6
+scf_nmax		100
+cal_force       1
+cal_stress      1
+
+#Parameters (3.Basis)
+basis_type		lcao
+
+#Parameters (4.Smearing)
+smearing_method		gauss
+smearing_sigma		0.002
+
+#Parameters (5.Mixing)
+mixing_type		broyden
+mixing_beta		0.3
diff --git a/examples/gpu/si16_lcao/KPT b/examples/gpu/si16_lcao/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/examples/gpu/si16_lcao/STRU b/examples/gpu/si16_lcao/STRU
@@ -0,0 +1,37 @@
+ATOMIC_SPECIES
+Si 14.000 ../../../tests/PP_ORB/Si_ONCV_PBE-1.0.upf
+
+NUMERICAL_ORBITAL
+../../../tests/PP_ORB/Si_gga_8au_100Ry_2s2p1d.orb
+
+LATTICE_CONSTANT
+0.999660
+
+LATTICE_VECTORS
+   10.20000    10.20000     0.00000
+   10.20000     0.00000    10.20000
+    0.00000    10.20000    10.20000
+
+ATOMIC_POSITIONS
+Direct
+
+Si
+0.0
+16
+  0.0000000   0.0000000   0.0000000 1 1 1
+  0.1250000   0.1250000   0.1250000 1 1 1
+  0.0000000   0.0000000   0.5000000 1 1 1
+  0.1250000   0.1250000   0.6250000 1 1 1
+  0.0000000   0.5000000   0.0000000 1 1 1
+  0.1250000   0.6250000   0.1250000 1 1 1
+  0.0000000   0.5000000   0.5000000 1 1 1
+  0.1250000   0.6250000   0.6250000 1 1 1
+  0.5000000   0.0000000   0.0000000 1 1 1
+  0.6250000   0.1250000   0.1250000 1 1 1
+  0.5000000   0.0000000   0.5000000 1 1 1
+  0.6250000   0.1250000   0.6250000 1 1 1
+  0.5000000   0.5000000   0.0000000 1 1 1
+  0.6250000   0.6250000   0.1250000 1 1 1
+  0.5000000   0.5000000   0.5000000 1 1 1
+  0.6250000   0.6250000   0.6250000 1 1 1
+
diff --git a/source/module_base/global_variable.cpp b/source/module_base/global_variable.cpp
@@ -47,6 +47,7 @@ int CURRENT_K = 0;
 int CAL_FORCE = 0; // if cal_force >1, means do the grid integration 'cal_force' times.
 double FORCE_THR = 1.0e-3;
 bool CAL_STRESS = false;
+int NUM_STREAM = 4;
 double PRESS1 = 0.0;
 double PRESS2 = 0.0;
 double PRESS3 = 0.0;

diff --git a/source/module_base/global_variable.h b/source/module_base/global_variable.h
@@ -47,6 +47,9 @@ extern int CURRENT_K; // 8
 extern int CAL_FORCE; // 8.1
 extern double FORCE_THR; // 8.2
 extern bool CAL_STRESS; // 8.25 calcualte the stress
+
+extern int NUM_STREAM;
+
 extern double PRESS1;
 extern double PRESS2;
 extern double PRESS3;

diff --git a/source/module_base/scalapack_connector.h b/source/module_base/scalapack_connector.h
@@ -99,12 +99,22 @@ extern "C"
                 		const std::complex<double> *beta,
 		const std::complex<double> *c, const int *ic, const int *jc, const int *descc);
 
-  void pztranc_(
+    void pztranc_(
 		const int *M, const int *N,
 		const std::complex<double> *alpha,
 		const std::complex<double> *A, const int *IA, const int *JA, const int *DESCA,
 		const std::complex<double> *beta,
 		std::complex<double> *C, const int *IC, const int *JC, const int *DESCC);
+
+    void pdgemr2d_(const int *M, const int *N,
+	    double *A, const int *IA, const int *JA, const int *DESCA, 
+		double *B, const int *IB, const int *JB, const int *DESCB,
+		const int *ICTXT);			   
+
+    void pzgemr2d_(const int *M, const int *N,
+	    std::complex<double> *A, const int *IA, const int *JA, const int *DESCA, 
+		std::complex<double> *B, const int *IB, const int *JB, const int *DESCB,
+		const int *ICTXT);
 }
 
 class ScalapackConnector

diff --git a/source/module_basis/module_ao/ORB_control.cpp b/source/module_basis/module_ao/ORB_control.cpp
@@ -340,12 +340,7 @@ void ORB_control::divide_HS_2d(
     pv->dim0 = (int)sqrt((double)dsize); // mohan update 2012/01/13
     // while (GlobalV::NPROC_IN_POOL%dim0!=0)
 
-    if (ks_solver == "cusolver")
-    {
-        pv->dim0 = 1; pv->dim1 = dsize;
-    } // Xu Shu add 2022-03-25
-    else
-        pv->set_proc_dim(dsize);
+    pv->set_proc_dim(dsize);
 
     if (pv->testpb)
         ModuleBase::GlobalFunc::OUT(ofs_running, "dim0", pv->dim0);
@@ -359,8 +354,6 @@ assert(nb2d > 0);
 #endif
     pv->set_block_size(nb2d); // mohan add 2010-06-28
 
-    if (ks_solver == "cusolver")
-        pv->set_block_size(1); // Xu Shu add 2022-03-25
     ModuleBase::GlobalFunc::OUT(ofs_running, "nb2d", pv->get_block_size());
 
     this->set_parameters(ofs_running, ofs_warning);

diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/local_orbital_wfc.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/local_orbital_wfc.cpp
@@ -45,7 +45,7 @@ void Local_Orbital_wfc::gamma_file(psi::Psi<double>* psid, elecstate::ElecState*
      || GlobalV::KS_SOLVER == "lapack_gvx" 
      || GlobalV::KS_SOLVER == "scalapack_gvx" 
      || GlobalV::KS_SOLVER == "cg_in_lcao"
-#ifdef __CUSOLVER_LCAO
+#ifdef __CUDA
         || GlobalV::KS_SOLVER == "cusolver"
 #endif
         )

diff --git a/source/module_hamilt_lcao/module_deepks/test/CMakeLists.txt b/source/module_hamilt_lcao/module_deepks/test/CMakeLists.txt
@@ -31,7 +31,7 @@ if(USE_ELPA)
       genelpa
   )
 endif()
-if(USE_CUSOLVER_LCAO)
+if(USE_CUDA)
   target_link_libraries(diag_cusolver)
 endif()
 

diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt
@@ -1,3 +1,5 @@
+#add_subdirectory(kernels/cuda)
+
 list(APPEND objects
     gint.cpp
     gint_gamma.cpp
@@ -20,6 +22,22 @@ list(APPEND objects
     grid_technique.cpp
 )
 
+if(USE_CUDA)
+  list(APPEND objects
+      kernels/cuda/cuda_tools.cu
+      kernels/cuda/vbatch_matrix_mul.cu
+      kernels/cuda/gint_vl.cu
+      kernels/cuda/gint_rho.cu
+      kernels/cuda/gint_force.cu
+      gint_vl_gpu.cu
+      gint_rho_gpu.cu
+      gint_force_gpu.cu
+      gtask_vl.cpp
+      gtask_rho.cpp
+      gtask_force.cpp
+  )
+endif()
+
 add_library(
     gint
     OBJECT
@@ -29,3 +47,7 @@ add_library(
 if(ENABLE_COVERAGE)
   add_coverage(gint)
 endif()
+
+if(BUILD_TESTING) # only descend into the test directory when BUILD_TESTING is enabled
+  add_subdirectory(test)
+endif()