Skip to content

Commit

Permalink
Feature: compute module_gint by GPU (#4109)
Browse files Browse the repository at this point in the history
* add CUDA code for module_gint and fix bug about cusolver

* add comments for code_gen.cpp

* add integrated test cases for CUDA gint and cusolver

* add some gint test cases

* modify cuda tests

* modify the location of the CUDA test folder

* modify some GPU test cases
* modify some STRU and INPUT files for GPU test cases to reduce testing time

* format INPUT file in GPU test cases

* format some code
* add curly brackets to if and for statements
* add const to some function arguments

* fix comments

* fix error in gint force

* remove const in gint_rho

* Update input_conv_test.cpp

modify input test about GPU

* remove time.sh in tests/integrate

* modify INPUT file in GPU test cases

* remove inappropriate changes in tests/performance.

* remove USE_CUSOLVER_LCAO flag

* enable check in non-debug environment

* Use abbreviations instead of unreasonable naming

* fix error in debug

* modify the doc about ks_solver

* change the default ks_solver to cusolver

* modify cuda.md

---------

Co-authored-by: A-006 <3158793232@qq.com>
Co-authored-by: Mohan Chen <mohan.chen.chen.mohan@gmail.com>
  • Loading branch information
3 people committed May 12, 2024
1 parent 922db46 commit a2798b2
Show file tree
Hide file tree
Showing 126 changed files with 14,911 additions and 702 deletions.
49 changes: 17 additions & 32 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@ project(
option(ENABLE_LCAO "Enable LCAO calculation." ON)
option(ENABLE_DEEPKS "Enable DeePKS functionality" OFF)
option(ENABLE_LIBXC "Enable LibXC functionality" OFF)
option(USE_CUDA "Enable support to CUDA for PW." OFF)
option(USE_CUDA "Enable support to CUDA for ABACUS." OFF)
option(ENABLE_FLOAT_FFTW "Enable support to single precision FFTW library." OFF)
# option(USE_CUSOLVER_LCAO "Enable support to CUSOLVER for LCAO." OFF)
option(USE_ROCM "Enable support to ROCm." OFF)
option(USE_OPENMP "Enable OpenMP in ABACUS." ON)
option(ENABLE_ASAN "Enable AddressSanitizer" OFF)
Expand Down Expand Up @@ -68,11 +67,6 @@ if(ENABLE_RAPIDJSON)
include_directories(${RapidJSON_INCLUDE_PATH})
endif()

if(USE_CUDA)
set(USE_CUSOLVER_LCAO ON)
else()
set(USE_CUSOLVER_LCAO OFF)
endif()
# get commit info
if(COMMIT_INFO)
find_package(Git)
Expand Down Expand Up @@ -247,37 +241,28 @@ endif()
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
if(NOT DEFINED USE_CUDA OR NOT DEFINED USE_CUSOLVER_LCAO)
if(NOT DEFINED USE_CUDA AND NOT DEFINED USE_CUSOLVER_LCAO)
message(
"CUDA components detected. \nWill build the CUDA for PW version of ABACUS by default."
)
set(USE_CUDA ON)
set(USE_CUSOLVER_LCAO OFF)
elseif(NOT DEFINED USE_CUDA)
set(USE_CUDA OFF)
else()
set(USE_CUSOLVER_LCAO OFF)
endif()
if(NOT DEFINED USE_CUDA)
message(
"CUDA components detected. \nWill build the CUDA version of ABACUS by default."
)
set(USE_CUDA ON)
else()
if(NOT USE_CUDA AND NOT USE_CUSOLVER_LCAO)
if(NOT USE_CUDA)
message(
STATUS
"CUDA components detected, but both USE_CUDA and USE_CUSOLVER_LCAO set to OFF. NOT building CUDA version of ABACUS."
"CUDA components detected, but USE_CUDA is set to OFF. NOT building CUDA version of ABACUS."
)
endif()
endif()
else() # CUDA not found
if(USE_CUDA OR USE_CUSOLVER_LCAO)
if(USE_CUDA)
message(
FATAL_ERROR
"USE_CUDA or USE_CUSOLVER_LCAO set but no CUDA components found.")
set(USE_CUDA OFF)
set(USE_CUSOLVER_LCAO OFF)
"USE_CUDA is set but no CUDA components found.")
endif()
endif()

if(USE_CUDA OR USE_CUSOLVER_LCAO)
if(USE_CUDA)
cmake_minimum_required(VERSION 3.18) # required by `CUDA_ARCHITECTURES` below
set_if_higher(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_EXTENSIONS ON)
Expand Down Expand Up @@ -317,12 +302,12 @@ if(USE_CUDA OR USE_CUSOLVER_LCAO)
if(USE_CUDA)
add_compile_definitions(__CUDA)
add_compile_definitions(__UT_USE_CUDA)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G" CACHE STRING "CUDA flags for debug build" FORCE)
endif()
if (USE_OPENMP AND OpenMP_CXX_FOUND)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS}" CACHE STRING "CUDA flags" FORCE)
endif()
endif()
if(USE_CUSOLVER_LCAO)
add_compile_definitions(__CUSOLVER_LCAO)
endif()
endif()

Expand Down Expand Up @@ -716,7 +701,7 @@ if(ENABLE_LCAO)
if(USE_ELPA)
target_link_libraries(${ABACUS_BIN_NAME} genelpa)
endif()
if(USE_CUSOLVER_LCAO)
if(USE_CUDA)
target_link_libraries(diag_cusolver)
endif()
endif()
Expand Down
18 changes: 8 additions & 10 deletions docs/advanced/acceleration/cuda.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# CUDA GPU Implementations

In ABACUS, we provide the option to use the GPU devices to accelerate the performance.
And it has the following general features:
In ABACUS, we provide the option to use GPU devices to accelerate performance. The implementation of GPU acceleration differs between PW basis and LCAO basis. Specifically, under PW basis, it has the following features:

- **Full gpu implementations**: During the SCF progress, `Psi`, `Hamilt`, `Hsolver`, `DiagCG`, and `DiagoDavid` classes are stored or calculated by the GPU devices.

Expand All @@ -13,6 +12,8 @@ And it has the following general features:

- **Parallel strategy**: K point parallel.

Unlike PW basis, only the grid integration module (module_gint) and the diagonalization of the Hamiltonian matrix (module_hsolver) have been implemented with GPU acceleration under LCAO basis, and the acceleration is limited to gamma only calculation. Additionally, LCAO basis does not support multi-GPU acceleration. Both the grid integration module and the Hamiltonian matrix solver only support acceleration on a single GPU.

## Required hardware/software

To compile and use ABACUS in CUDA mode, you currently need to have an NVIDIA GPU and install the corresponding NVIDIA CUDA toolkit software on your system (this is only tested on Linux and unsupported on Windows):
Expand All @@ -36,14 +37,11 @@ In `INPUT` file we need to set the value keyword [device](../input_files/input-m
We provide [examples](https://github.com/deepmodeling/abacus-develop/tree/develop/examples/gpu) of GPU calculations.

## Known limitations

- CG, BPCG and Davidson methods are supported, so the input keyword `ks_solver` can take the values `cg`, `bpcg` or `dav`,
- Only PW basis is supported, so the input keyword `basis_type` can only take the value `pw`,
PW basis:
- CG, BPCG and Davidson methods are supported, so the input keyword `ks_solver` can take the values `cg`, `bpcg` or `dav`.
- Only k point parallelization is supported, so the input keyword `kpar` will be set to match the number of MPI tasks automatically.
- By default, CUDA architectures 60, 70, 75, 80, 86, and 89 are compiled (if supported). It can be overridden using the CMake variable [`CMAKE_CUDA_ARCHITECTURES`](https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html) or the environmental variable [`CUDAARCHS`](https://cmake.org/cmake/help/latest/envvar/CUDAARCHS.html).

## FAQ
```
Q: Does the GPU implementations support atomic orbital basis sets?
A: Currently no.
```
LCAO basis:
- Does not support multi-k calculation, so if the input keyword `device` is set to `gpu`, the input keyword `gamma_only` can only take the value `1`.
- Does not support multi-GPU acceleration.
13 changes: 9 additions & 4 deletions docs/advanced/input_files/input-main.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
- [search\_radius](#search_radius)
- [search\_pbc](#search_pbc)
- [bx, by, bz](#bx-by-bz)
- [num\_stream](#num_stream)
- [Electronic structure](#electronic-structure)
- [basis\_type](#basis_type)
- [ks\_solver](#ks_solver)
Expand Down Expand Up @@ -643,10 +644,8 @@ If only one value is set (such as `kspacing 0.5`), then kspacing values of a/b/c
- cpu: for CPUs via Intel, AMD, or Other supported CPU devices
- gpu: for GPUs via CUDA or ROCm.

Known limitations:
Known limitations: If using the pw basis, the ks_solver must be cg/bpcg/dav to support `gpu` acceleration. If using the lcao basis, `gamma_only` must be set to `1`, as multi-k calculation is currently not supported for `gpu`. lcao_in_pw also does not support `gpu`.

- pw basis: required by the `gpu` acceleration options
- cg/bpcg/dav ks_solver: required by the `gpu` acceleration options
- **Default**: cpu

### precision
Expand Down Expand Up @@ -883,6 +882,12 @@ These variables are used to control the numerical atomic orbitals related parame
- **Description**: In the matrix operation of grid integral, bx/by/bz grids (in x, y, z directions) are treated as a whole as a matrix element. A different value will affect the calculation speed. The default is 0, which means abacus will automatically calculate these values.
- **Default**: 0

### num_stream

- **Type**: Integer
- **Description**: The number of GPU streams used when computing with the `LCAO` basis. The effect may differ between devices; for most devices, a value larger than 2 is sufficient.
- **Default**: 4

[back to top](#full-list-of-input-keywords)

## Electronic structure
Expand Down Expand Up @@ -914,7 +919,7 @@ calculations.

- **genelpa**: This method should be used if you choose localized orbitals.
- **scalapack_gvx**: Scalapack can also be used for localized orbitals.
- **cusolver**: (Unavailable currently, it will be fixed in future versions) This method needs building with the cusolver component for lcao and at least one gpu is available.
- **cusolver**: This method requires ABACUS to be built with CUDA and at least one GPU to be available.

If you set ks_solver=`genelpa` for basis_type=`pw`, the program will be stopped with an error message:

Expand Down
29 changes: 29 additions & 0 deletions examples/gpu/si16_lcao/INPUT
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
INPUT_PARAMETERS
#Parameters (1.General)
suffix autotest
calculation scf
device gpu
gamma_only 1 # GPU acceleration currently only supports gamma_only set to 1.
ks_solver cusolver # if not set, the default ks_solver is cusolver,
# you can also choose genelpa or scalapack_gvx.

#nbands 8
symmetry 1

#Parameters (2.Iteration)
ecutwfc 100
scf_thr 1e-6
scf_nmax 100
cal_force 1
cal_stress 1

#Parameters (3.Basis)
basis_type lcao

#Parameters (4.Smearing)
smearing_method gauss
smearing_sigma 0.002

#Parameters (5.Mixing)
mixing_type broyden
mixing_beta 0.3
4 changes: 4 additions & 0 deletions examples/gpu/si16_lcao/KPT
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
K_POINTS
0
Gamma
1 1 1 0 0 0
37 changes: 37 additions & 0 deletions examples/gpu/si16_lcao/STRU
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
ATOMIC_SPECIES
Si 14.000 ../../../tests/PP_ORB/Si_ONCV_PBE-1.0.upf

NUMERICAL_ORBITAL
../../../tests/PP_ORB/Si_gga_8au_100Ry_2s2p1d.orb

LATTICE_CONSTANT
0.999660

LATTICE_VECTORS
10.20000 10.20000 0.00000
10.20000 0.00000 10.20000
0.00000 10.20000 10.20000

ATOMIC_POSITIONS
Direct

Si
0.0
16
0.0000000 0.0000000 0.0000000 1 1 1
0.1250000 0.1250000 0.1250000 1 1 1
0.0000000 0.0000000 0.5000000 1 1 1
0.1250000 0.1250000 0.6250000 1 1 1
0.0000000 0.5000000 0.0000000 1 1 1
0.1250000 0.6250000 0.1250000 1 1 1
0.0000000 0.5000000 0.5000000 1 1 1
0.1250000 0.6250000 0.6250000 1 1 1
0.5000000 0.0000000 0.0000000 1 1 1
0.6250000 0.1250000 0.1250000 1 1 1
0.5000000 0.0000000 0.5000000 1 1 1
0.6250000 0.1250000 0.6250000 1 1 1
0.5000000 0.5000000 0.0000000 1 1 1
0.6250000 0.6250000 0.1250000 1 1 1
0.5000000 0.5000000 0.5000000 1 1 1
0.6250000 0.6250000 0.6250000 1 1 1

1 change: 1 addition & 0 deletions source/module_base/global_variable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ int CURRENT_K = 0;
int CAL_FORCE = 0; // if cal_force >1, means do the grid integration 'cal_force' times.
double FORCE_THR = 1.0e-3;
bool CAL_STRESS = false;
int NUM_STREAM = 4;
double PRESS1 = 0.0;
double PRESS2 = 0.0;
double PRESS3 = 0.0;
Expand Down
3 changes: 3 additions & 0 deletions source/module_base/global_variable.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ extern int CURRENT_K; // 8
extern int CAL_FORCE; // 8.1
extern double FORCE_THR; // 8.2
extern bool CAL_STRESS; // 8.25 calcualte the stress

extern int NUM_STREAM;

extern double PRESS1;
extern double PRESS2;
extern double PRESS3;
Expand Down
12 changes: 11 additions & 1 deletion source/module_base/scalapack_connector.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,22 @@ extern "C"
const std::complex<double> *beta,
const std::complex<double> *c, const int *ic, const int *jc, const int *descc);

void pztranc_(
void pztranc_(
const int *M, const int *N,
const std::complex<double> *alpha,
const std::complex<double> *A, const int *IA, const int *JA, const int *DESCA,
const std::complex<double> *beta,
std::complex<double> *C, const int *IC, const int *JC, const int *DESCC);

void pdgemr2d_(const int *M, const int *N,
double *A, const int *IA, const int *JA, const int *DESCA,
double *B, const int *IB, const int *JB, const int *DESCB,
const int *ICTXT);

void pzgemr2d_(const int *M, const int *N,
std::complex<double> *A, const int *IA, const int *JA, const int *DESCA,
std::complex<double> *B, const int *IB, const int *JB, const int *DESCB,
const int *ICTXT);
}

class ScalapackConnector
Expand Down
9 changes: 1 addition & 8 deletions source/module_basis/module_ao/ORB_control.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -340,12 +340,7 @@ void ORB_control::divide_HS_2d(
pv->dim0 = (int)sqrt((double)dsize); // mohan update 2012/01/13
// while (GlobalV::NPROC_IN_POOL%dim0!=0)

if (ks_solver == "cusolver")
{
pv->dim0 = 1; pv->dim1 = dsize;
} // Xu Shu add 2022-03-25
else
pv->set_proc_dim(dsize);
pv->set_proc_dim(dsize);

if (pv->testpb)
ModuleBase::GlobalFunc::OUT(ofs_running, "dim0", pv->dim0);
Expand All @@ -359,8 +354,6 @@ assert(nb2d > 0);
#endif
pv->set_block_size(nb2d); // mohan add 2010-06-28

if (ks_solver == "cusolver")
pv->set_block_size(1); // Xu Shu add 2022-03-25
ModuleBase::GlobalFunc::OUT(ofs_running, "nb2d", pv->get_block_size());

this->set_parameters(ofs_running, ofs_warning);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ void Local_Orbital_wfc::gamma_file(psi::Psi<double>* psid, elecstate::ElecState*
|| GlobalV::KS_SOLVER == "lapack_gvx"
|| GlobalV::KS_SOLVER == "scalapack_gvx"
|| GlobalV::KS_SOLVER == "cg_in_lcao"
#ifdef __CUSOLVER_LCAO
#ifdef __CUDA
|| GlobalV::KS_SOLVER == "cusolver"
#endif
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ if(USE_ELPA)
genelpa
)
endif()
if(USE_CUSOLVER_LCAO)
if(USE_CUDA)
target_link_libraries(diag_cusolver)
endif()

Expand Down
22 changes: 22 additions & 0 deletions source/module_hamilt_lcao/module_gint/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#add_subdirectory(kernels/cuda)

list(APPEND objects
gint.cpp
gint_gamma.cpp
Expand All @@ -20,6 +22,22 @@ list(APPEND objects
grid_technique.cpp
)

# GPU sources for the grid-integration module: appended to `objects`
# only when the build is configured with USE_CUDA.
if(USE_CUDA)
  # Sources located under kernels/cuda.
  list(APPEND objects
    kernels/cuda/cuda_tools.cu
    kernels/cuda/vbatch_matrix_mul.cu
    kernels/cuda/gint_vl.cu
    kernels/cuda/gint_rho.cu
    kernels/cuda/gint_force.cu
  )
  # gint_*_gpu.cu sources in this directory.
  list(APPEND objects
    gint_vl_gpu.cu
    gint_rho_gpu.cu
    gint_force_gpu.cu
  )
  # gtask_*.cpp sources in this directory.
  list(APPEND objects
    gtask_vl.cpp
    gtask_rho.cpp
    gtask_force.cpp
  )
endif()

add_library(
gint
OBJECT
Expand All @@ -29,3 +47,7 @@ add_library(
if(ENABLE_COVERAGE)
add_coverage(gint)
endif()

# Descend into the test subdirectory only when BUILD_TESTING is enabled.
if(BUILD_TESTING)
  add_subdirectory(test)
endif()

0 comments on commit a2798b2

Please sign in to comment.