From aea159b0c8818d735f053a8075a1bc3bd5f42581 Mon Sep 17 00:00:00 2001 From: jiyuang Date: Tue, 12 Oct 2021 11:39:45 +0800 Subject: [PATCH 01/27] add PDOS with soc --- source/src_io/energy_dos.cpp | 18 +++++++++++++++--- tools/plot-tools/abacus_plot/dos.py | 25 ++++++++++++++++--------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/source/src_io/energy_dos.cpp b/source/src_io/energy_dos.cpp index d0b2a5fb78a..b8c312dde10 100644 --- a/source/src_io/energy_dos.cpp +++ b/source/src_io/energy_dos.cpp @@ -208,7 +208,7 @@ void energy::perform_dos(void) //scale up a little bit so the end peaks are displaced better double delta=(emax-emin)*dos_scale; - std::cout << dos_scale; + //std::cout << dos_scale; emax=emax+delta/2.0; emin=emin-delta/2.0; @@ -477,7 +477,7 @@ void energy::perform_dos(void) { std::stringstream ps; ps << GlobalV::global_out_dir << "TDOS"; std::ofstream out(ps.str().c_str()); - if (GlobalV::NSPIN==1) + if (GlobalV::NSPIN==1||GlobalV::NSPIN==4) { for (int n=0; n" <" << GlobalV::NSPIN<< "<"<<"/"<<"nspin"<<">"<< std::endl; - out << "<"<<"norbitals"<<">" <"<< std::endl; + if (GlobalV::NSPIN==4) + out << "<"<<"norbitals"<<">" <"<< std::endl; + else + out << "<"<<"norbitals"<<">" <"<< std::endl; out << "<"<<"energy"<<"_"<<"values units"<<"="<<"\""<<"eV"<<"\""<<">"<nw; ++j) { const int L1 = atom1->iw2l[j]; @@ -592,6 +596,14 @@ void energy::perform_dos(void) out <" <" < Date: Mon, 18 Oct 2021 16:54:28 +0800 Subject: [PATCH 02/27] Fix: torch version mismatch with cxx11 abi setting. --- CMakeLists.txt | 2 +- Dockerfile.gnu | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e74b923b6c7..039de03e675 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,7 +135,7 @@ if(ENABLE_DEEPKS) find_package(Torch REQUIRED) include_directories(${TORCH_INCLUDE_DIRS}) target_link_libraries(${ABACUS_BIN_NAME} ${TORCH_LIBRARIES}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") + add_compile_options(${TORCH_CXX_FLAGS}) find_path(libnpy_SOURCE_DIR npy.hpp diff --git a/Dockerfile.gnu b/Dockerfile.gnu index 7505d441b95..69883a36bce 100644 --- a/Dockerfile.gnu +++ b/Dockerfile.gnu @@ -39,8 +39,8 @@ RUN cd /tmp \ && cd /tmp && rm -rf fftw-3.3.9 && rm fftw-3.3.9.tar.gz RUN cd /tmp \ - && wget https://download.pytorch.org/libtorch/cpu/libtorch-shared-with-deps-1.9.0%2Bcpu.zip --no-check-certificate --quiet \ - && unzip libtorch-shared-with-deps-1.9.0+cpu.zip \ + && wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.9.1%2Bcpu.zip --no-check-certificate --quiet \ + && unzip libtorch-cxx11-abi-shared-with-deps-1.9.1+cpu.zip \ && cp -r libtorch/include /usr/local \ && cp -r libtorch/lib /usr/local \ && cp -r libtorch/share /usr/local \ From 958a130c9ad48dc38cd98b6d85130c9e0ab54734 Mon Sep 17 00:00:00 2001 From: Chun Cai Date: Mon, 18 Oct 2021 16:59:26 +0800 Subject: [PATCH 03/27] Add deepks to build test. --- .github/workflows/hosted_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hosted_test.yml b/.github/workflows/hosted_test.yml index 7b4fc13a46e..990ffbcd4b1 100644 --- a/.github/workflows/hosted_test.yml +++ b/.github/workflows/hosted_test.yml @@ -11,7 +11,7 @@ jobs: build_args: "" name: "Build with GNU compilers" - tag: gnu - build_args: "-DENABLE_LIBXC=1" + build_args: "-DENABLE_LIBXC=1 -DENABLE_DEEPKS=1" name: "Build with GNU compilers and extra components" - tag: intel build_args: "" From 52d246fc78282e80d646e7687259f79b025cefaf Mon Sep 17 00:00:00 2001 From: maki49 <1579492865@qq.com> Date: Wed, 20 Oct 2021 11:31:54 +0800 Subject: [PATCH 04/27] fix a input_conv bug for deepks parameters --- source/input_conv.cpp | 5 +++-- source/src_io/write_input.cpp | 3 +++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/source/input_conv.cpp b/source/input_conv.cpp index 17fdde8055f..1755ebd37d6 100644 --- a/source/input_conv.cpp +++ b/source/input_conv.cpp @@ -629,12 +629,13 @@ void Input_Conv::Convert(void) } ModuleBase::timer::tick("Input_Conv","Convert"); - return; - //----------------------------------------------- //caoyu add for DeePKS //----------------------------------------------- + #ifdef __DEEPKS GlobalV::out_descriptor = INPUT.out_descriptor; GlobalV::deepks_scf = INPUT.deepks_scf; + #endif + return; } diff --git a/source/src_io/write_input.cpp b/source/src_io/write_input.cpp index 8d8540d631b..e4c2a2af8db 100644 --- a/source/src_io/write_input.cpp +++ b/source/src_io/write_input.cpp @@ -97,6 +97,9 @@ void Input::Print(const std::string &fn)const // for deepks ModuleBase::GlobalFunc::OUTP(ofs,"out_descriptor",out_descriptor,">0 compute descriptor for deepks"); ModuleBase::GlobalFunc::OUTP(ofs,"lmax_descriptor",lmax_descriptor,">0 lmax used in descriptor for deepks"); + ModuleBase::GlobalFunc::OUTP(ofs,"deepks_scf",deepks_scf,">0 load a model and mix int in SCF"); + ModuleBase::GlobalFunc::OUTP(ofs,"deepks_scf",model_file,"file dir of traced pytorch model: 'model.ptg"); + ofs << "\n#Parameters (4.LCAO)" << std::endl; ModuleBase::GlobalFunc::OUTP(ofs,"basis_type",basis_type,"PW; LCAO in pw; LCAO"); From 9df67e29f48a01528eeb5f1f30457f2ce87d2ce9 Mon Sep 17 00:00:00 2001 From: maki49 <1579492865@qq.com> Date: Wed, 20 Oct 2021 12:05:20 +0800 Subject: [PATCH 05/27] add write_input of deepks parameters --- source/src_io/write_input.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/src_io/write_input.cpp b/source/src_io/write_input.cpp index e4c2a2af8db..74a548c132f 100644 --- a/source/src_io/write_input.cpp +++ b/source/src_io/write_input.cpp @@ -98,7 +98,7 @@ void Input::Print(const std::string &fn)const ModuleBase::GlobalFunc::OUTP(ofs,"out_descriptor",out_descriptor,">0 compute descriptor for deepks"); ModuleBase::GlobalFunc::OUTP(ofs,"lmax_descriptor",lmax_descriptor,">0 lmax used in descriptor for deepks"); ModuleBase::GlobalFunc::OUTP(ofs,"deepks_scf",deepks_scf,">0 load a model and mix int in SCF"); - ModuleBase::GlobalFunc::OUTP(ofs,"deepks_scf",model_file,"file dir of traced pytorch model: 'model.ptg"); + ModuleBase::GlobalFunc::OUTP(ofs,"model_file",model_file,"file dir of traced pytorch model: 'model.ptg"); ofs << "\n#Parameters (4.LCAO)" << std::endl; From 5b365ea4c4eef0dcc914d2df540fb39ecb424df4 Mon Sep 17 00:00:00 2001 From: maki49 <1579492865@qq.com> Date: Wed, 20 Oct 2021 15:14:01 +0800 Subject: [PATCH 06/27] replace 'torch::symeig' by torch::linalg::eigh' --- source/src_lcao/LCAO_descriptor.cpp | 11 +++++++---- source/src_lcao/LCAO_descriptor.h | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/source/src_lcao/LCAO_descriptor.cpp b/source/src_lcao/LCAO_descriptor.cpp index ccf8ae4abe9..09b2714da07 100644 --- a/source/src_lcao/LCAO_descriptor.cpp +++ b/source/src_lcao/LCAO_descriptor.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace GlobalC { @@ -1351,7 +1352,8 @@ void LCAO_Descriptor::cal_descriptor_tensor(void) { torch::Tensor vd; std::tuple d_v(this->d_tensor[inl], vd); - d_v = torch::symeig(pdm_tensor[inl], /*eigenvalues=*/true, /*upper=*/true); + //d_v = torch::symeig(pdm_tensor[inl], /*eigenvalues=*/true, /*upper=*/true); + d_v = torch::linalg::eigh(pdm_tensor[inl], /*uplo*/"U"); d_tensor[inl] = std::get<0>(d_v); } return; @@ -1382,7 +1384,7 @@ void LCAO_Descriptor::cal_gedm(const ModuleBase::matrix &dm) //-----prepare for autograd--------- this->cal_projected_DM(dm); this->cal_descriptor(); - this->cal_descriptor_tensor(); //use torch::symeig + this->cal_descriptor_tensor(); //use torch::linalg::eigh //-----prepared----------------------- //forward std::vector inputs; @@ -1395,7 +1397,7 @@ void LCAO_Descriptor::cal_gedm(const ModuleBase::matrix &dm) //cal gedm std::vector gedm_shell; gedm_shell.push_back(torch::ones_like(ec[0])); - this->gedm_tensor = torch::autograd::grad(ec, this->pdm_tensor, gedm_shell, /*retain_grad=*/true); + this->gedm_tensor = torch::autograd::grad(ec, this->pdm_tensor, gedm_shell, /*retain_grad=*/true, /*create_graph=*/false, /*allow_unused=*/true); //gedm_tensor(Hartree) to gedm(Ry) for (int inl = 0;inl < inlmax;++inl) @@ -1428,7 +1430,8 @@ void LCAO_Descriptor::cal_gvdm() int nm = 2*this->inl_l[inl]+1; //repeat each block for nm times in an additional dimension torch::Tensor tmp_x = this->pdm_tensor[inl].reshape({nm, nm}).unsqueeze(0).repeat({nm, 1, 1}); - torch::Tensor tmp_y = std::get<0>(torch::symeig(tmp_x, true)); + //torch::Tensor tmp_y = std::get<0>(torch::symeig(tmp_x, true)); + torch::Tensor tmp_y = std::get<0>(torch::linalg::eigh(tmp_x, "U")); torch::Tensor tmp_yshell = torch::eye(nm, torch::TensorOptions().dtype(torch::kFloat64)); std::vector tmp_rpt; //repeated-pdm-tensor (x) std::vector tmp_rdt; //repeated-d-tensor (y) diff --git a/source/src_lcao/LCAO_descriptor.h b/source/src_lcao/LCAO_descriptor.h index ca7f8cecccb..7364d0218f1 100644 --- a/source/src_lcao/LCAO_descriptor.h +++ b/source/src_lcao/LCAO_descriptor.h @@ -182,7 +182,7 @@ class LCAO_Descriptor //gvx:d(d)/dX, [natom][3][natom][des_per_atom] torch::Tensor gvx_tensor; - //d(d)/dD, autograd from torch::symeig + //d(d)/dD, autograd from torch::linalg::eigh std::vector gevdm_vector; //dD/dX, tensor form of gdmx From ed42bdd48e3e61f85bdf580ec6d13a13bfee80fb Mon Sep 17 00:00:00 2001 From: dyzheng Date: Thu, 21 Oct 2021 17:27:35 +0800 Subject: [PATCH 07/27] fixed bug of Makefile compiler without __DEEPKS --- source/src_lcao/LCAO_descriptor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/src_lcao/LCAO_descriptor.cpp b/source/src_lcao/LCAO_descriptor.cpp index ccf8ae4abe9..7f45e60f7bc 100644 --- a/source/src_lcao/LCAO_descriptor.cpp +++ b/source/src_lcao/LCAO_descriptor.cpp @@ -1717,7 +1717,6 @@ void LCAO_Descriptor::cal_e_delta_band(const std::vector &dm } return; } -#endif void LCAO_Descriptor::cal_gvx(const ModuleBase::matrix &dm) { @@ -1785,4 +1784,5 @@ void LCAO_Descriptor::cal_gvx(const ModuleBase::matrix &dm) assert(this->gvx_tensor.size(3) == this->des_per_atom); return; -} \ No newline at end of file +} +#endif From 96e65b238e2764c73624ba0d426fe3de24453cd3 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Thu, 21 Oct 2021 17:27:35 +0800 Subject: [PATCH 08/27] fixed bug of Makefile compiler without __DEEPKS --- source/src_lcao/LCAO_descriptor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/src_lcao/LCAO_descriptor.cpp b/source/src_lcao/LCAO_descriptor.cpp index ccf8ae4abe9..7f45e60f7bc 100644 --- a/source/src_lcao/LCAO_descriptor.cpp +++ b/source/src_lcao/LCAO_descriptor.cpp @@ -1717,7 +1717,6 @@ void LCAO_Descriptor::cal_e_delta_band(const std::vector &dm } return; } -#endif void LCAO_Descriptor::cal_gvx(const ModuleBase::matrix &dm) { @@ -1785,4 +1784,5 @@ void LCAO_Descriptor::cal_gvx(const ModuleBase::matrix &dm) assert(this->gvx_tensor.size(3) == this->des_per_atom); return; -} \ No newline at end of file +} +#endif From dbcf965acadced1c9381ebcbbc143566bebc86bc Mon Sep 17 00:00:00 2001 From: dyzheng Date: Thu, 21 Oct 2021 17:34:37 +0800 Subject: [PATCH 09/27] fixed some reference results of 301* and 304* examples --- tests/integrate/301_NO_GO_DJ_Si/result.ref | 6 +++--- tests/integrate/304_NO_GO_AF/INPUT | 2 +- tests/integrate/304_NO_GO_AF/result.ref | 6 +++--- tests/integrate/304_NO_GO_FM/result.ref | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/integrate/301_NO_GO_DJ_Si/result.ref b/tests/integrate/301_NO_GO_DJ_Si/result.ref index cb762430bc5..1f4456885b1 100644 --- a/tests/integrate/301_NO_GO_DJ_Si/result.ref +++ b/tests/integrate/301_NO_GO_DJ_Si/result.ref @@ -1,3 +1,3 @@ -etotref -212.6475140334407 -etotperatomref -106.3237570167 -totaltimeref 1.96 +etotref -212.6400385776964 +etotperatomref -106.3200192888 +totaltimeref 0.96 diff --git a/tests/integrate/304_NO_GO_AF/INPUT b/tests/integrate/304_NO_GO_AF/INPUT index c017b74700c..d1a546b57a7 100644 --- a/tests/integrate/304_NO_GO_AF/INPUT +++ b/tests/integrate/304_NO_GO_AF/INPUT @@ -15,7 +15,7 @@ out_charge 0 #out_band 1 smearing gaussian -sigma 0.07 +sigma 0.02 #force 1 #force_thr_ev 0.01 diff --git a/tests/integrate/304_NO_GO_AF/result.ref b/tests/integrate/304_NO_GO_AF/result.ref index 6654f537121..5c0ad583f82 100644 --- a/tests/integrate/304_NO_GO_AF/result.ref +++ b/tests/integrate/304_NO_GO_AF/result.ref @@ -1,3 +1,3 @@ -etotref -6373.604933471915 -etotperatomref -3186.8024667360 -totaltimeref 15.3 +etotref -6372.520190337425 +etotperatomref -3186.2600951687 +totaltimeref 11. diff --git a/tests/integrate/304_NO_GO_FM/result.ref b/tests/integrate/304_NO_GO_FM/result.ref index d96232ebfe4..a66411d61c6 100644 --- a/tests/integrate/304_NO_GO_FM/result.ref +++ b/tests/integrate/304_NO_GO_FM/result.ref @@ -1,3 +1,3 @@ -etotref -196.0599139708231 -etotperatomref -98.0299569854 -totaltimeref 5.89 +etotref -196.0555462724182 +etotperatomref -98.0277731362 +totaltimeref 2.9 From 4b90c38950e2de1de8d46ce9c6e7250f52c0b213 Mon Sep 17 00:00:00 2001 From: wenfei-li Date: Thu, 21 Oct 2021 19:57:25 +0800 Subject: [PATCH 10/27] deepks parallelization : S overlap matrix --- source/Makefile.Objects | 1 + source/src_lcao/LCAO_descriptor.cpp | 75 +++++++++----- source/src_lcao/LCAO_descriptor.h | 4 +- source/src_lcao/LCAO_matrix.cpp | 10 ++ source/src_parallel/parallel_deepks.cpp | 128 ++++++++++++++++++++++++ source/src_parallel/parallel_deepks.h | 56 +++++++++++ 6 files changed, 248 insertions(+), 26 deletions(-) create mode 100644 source/src_parallel/parallel_deepks.cpp create mode 100644 source/src_parallel/parallel_deepks.h diff --git a/source/Makefile.Objects b/source/Makefile.Objects index e7dbb91317b..303f9dbd0e2 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -225,6 +225,7 @@ parallel_reduce.o\ parallel_pw.o\ ft.o\ parallel_grid.o\ +parallel_deepks.o \ OBJS_FIRST_PRINCIPLES=$(OBJS_MAIN)\ $(OBJS_PW)\ diff --git a/source/src_lcao/LCAO_descriptor.cpp b/source/src_lcao/LCAO_descriptor.cpp index 09b2714da07..c34ed1841a3 100644 --- a/source/src_lcao/LCAO_descriptor.cpp +++ b/source/src_lcao/LCAO_descriptor.cpp @@ -200,6 +200,12 @@ void LCAO_Descriptor::build_S_descriptor(const bool& calc_deri) { ModuleBase::TITLE("LCAO_Descriptor", "build_S_descriptor"); //array to store data + + if (!GlobalV::GAMMA_ONLY_LOCAL) + { + ModuleBase::WARNING_QUIT("LCAO_Descriptor::build_S_descriptor", "muti-kpoint method for descriptor is not implemented yet! "); + } + double olm[3] = {0.0, 0.0, 0.0}; //\sum{T} e**{ikT} <\phi_{ia}|d\phi_{k\beta}(T)> //??? @@ -234,47 +240,66 @@ void LCAO_Descriptor::build_S_descriptor(const bool& calc_deri) const int N1 = atom1->iw2n[jj0]; const int m1 = atom1->iw2m[jj0]; - for (int L2 = 0; L2 <= GlobalC::ORB.Alpha[0].getLmax(); ++L2) + int iw1_local=GlobalC::ParaD.trace_loc_orb[iw1_all]; + if(iw1_local<0) + { + ++iw1_all; + } + else { - for (int N2 = 0; N2 < GlobalC::ORB.Alpha[0].getNchi(L2); ++N2) + for (int L2 = 0; L2 <= GlobalC::ORB.Alpha[0].getLmax(); ++L2) { - for (int m2 = 0; m2 < 2 * L2 + 1; ++m2) + for (int N2 = 0; N2 < GlobalC::ORB.Alpha[0].getNchi(L2); ++N2) { - olm[0] = olm[1] = olm[2] = 0.0; - if (!calc_deri) + for (int m2 = 0; m2 < 2 * L2 + 1; ++m2) { - GlobalC::UOT.snap_psipsi(GlobalC::ORB, olm, 0, 'D', tau1, - T1, L1, m1, N1, GlobalC::GridD.getAdjacentTau(ad), - T2, L2, m2, N2, GlobalV::NSPIN); - if (GlobalV::GAMMA_ONLY_LOCAL) + olm[0] = olm[1] = olm[2] = 0.0; + if (!calc_deri) { - this->set_S_mu_alpha(iw1_all, inl_index[T2](I2,L2,N2), m2, olm[0]); + GlobalC::UOT.snap_psipsi(GlobalC::ORB, olm, 0, 'D', tau1, + T1, L1, m1, N1, GlobalC::GridD.getAdjacentTau(ad), + T2, L2, m2, N2, GlobalV::NSPIN); + if (GlobalV::GAMMA_ONLY_LOCAL) + { + this->set_S_mu_alpha(iw1_all, inl_index[T2](I2,L2,N2), m2, olm[0]); + } } - } - else - { - GlobalC::UOT.snap_psipsi(GlobalC::ORB, olm, 1, 'D', tau1, - T1, L1, m1, N1, GlobalC::GridD.getAdjacentTau(ad), - T2, L2, m2, N2, GlobalV::NSPIN); - if (GlobalV::GAMMA_ONLY_LOCAL) + else { - this->set_DS_mu_alpha(iw1_all, inl_index[T2](I2,L2,N2), m2, olm[0], olm[1], olm[2]); + GlobalC::UOT.snap_psipsi(GlobalC::ORB, olm, 1, 'D', tau1, + T1, L1, m1, N1, GlobalC::GridD.getAdjacentTau(ad), + T2, L2, m2, N2, GlobalV::NSPIN); + if (GlobalV::GAMMA_ONLY_LOCAL) + { + this->set_DS_mu_alpha(iw1_all, inl_index[T2](I2,L2,N2), m2, olm[0], olm[1], olm[2]); + } } - } - } //m2 - } //N2 - } //nw2(L2) - ++iw1_all; + } //m2 + } //N2 + } //nw2(L2) + ++iw1_all; + } } // nw1 } // distance } // ad } // I1 } // T1 - if (!GlobalV::GAMMA_ONLY_LOCAL) + +#ifdef __MPI + GlobalC::ParaD.allsum_S_mu_alpha(this->inlmax,GlobalV::NLOCAL*(2*this->lmaxd+1),this->S_mu_alpha); +#endif + + /* + for(int inl=0;inlinlmax;inl++) { - ModuleBase::WARNING_QUIT("LCAO_Descriptor::build_S_descriptor", "muti-kpoint method for descriptor is not implemented yet! "); + ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"inl:",inl); + for(int j=0;jlmaxd+1);j++) + { + GlobalV::ofs_running << "j,s_mu_alpha: " << j << " " << this->S_mu_alpha[inl][j] << std::endl; + } } + */ return; } diff --git a/source/src_lcao/LCAO_descriptor.h b/source/src_lcao/LCAO_descriptor.h index 7364d0218f1..b93ebec9ce0 100644 --- a/source/src_lcao/LCAO_descriptor.h +++ b/source/src_lcao/LCAO_descriptor.h @@ -7,7 +7,9 @@ #include "../module_base/complexmatrix.h" #include #include "../src_pw/global.h" - +#ifdef __MPI +#include "../src_parallel/parallel_deepks.h" +#endif /// /// This class computes the descriptors for each atom from LCAO basis set, /// interfaces with pytorch to obtain the correction potential in LCAO basis, diff --git a/source/src_lcao/LCAO_matrix.cpp b/source/src_lcao/LCAO_matrix.cpp index 6b0121f9f80..ab741f38264 100644 --- a/source/src_lcao/LCAO_matrix.cpp +++ b/source/src_lcao/LCAO_matrix.cpp @@ -1,5 +1,8 @@ #include "LCAO_matrix.h" #include "global_fp.h" +#ifdef __DEEPKS +#include "../src_parallel/parallel_deepks.h" +#endif LCAO_Matrix::LCAO_Matrix() { @@ -61,6 +64,13 @@ void LCAO_Matrix::divide_HS_in_frag(const bool isGamma, Parallel_Orbitals &po) // for 2d: calculate po.nloc first, then trace_loc_row and trace_loc_col // for O(N): calculate the three together. po.set_trace(); +#ifdef __DEEPKS + if(GlobalV::out_descriptor) + { + GlobalC::ParaD.set_nlocal(MPI_COMM_WORLD); + GlobalC::ParaD.set_loc_orb(MPI_COMM_WORLD); + } +#endif // (3) allocate for S, H_fixed, H, and S_diag if(isGamma) diff --git a/source/src_parallel/parallel_deepks.cpp b/source/src_parallel/parallel_deepks.cpp new file mode 100644 index 00000000000..937ea2ac4d0 --- /dev/null +++ b/source/src_parallel/parallel_deepks.cpp @@ -0,0 +1,128 @@ +#ifdef __DEEPKS + +#include "parallel_deepks.h" + +namespace GlobalC +{ +Parallel_deepks ParaD; +} + +Parallel_deepks::Parallel_deepks() +{ + this->trace_loc_orb=new int[1]; + this->norb_local=new int[1]; +} + +Parallel_deepks::~Parallel_deepks() +{ + delete[] this->trace_loc_orb; + delete[] this->norb_local; +} + +void Parallel_deepks::set_nlocal( +#ifdef __MPI + MPI_Comm MPI_WORLD +#endif +) +{ + ModuleBase::TITLE("Parallel_deepks","cal_nlocal"); + assert(GlobalV::NLOCAL>0); + + delete[] this->norb_local; + + int size; +#ifdef __MPI + assert(GlobalV::NPROC>0); + size = GlobalV::NPROC; +#else + size = 1; +#endif + this->norb_local=new int[size]; + +#ifdef __MPI + int loc_size_base=GlobalV::NLOCAL/GlobalV::NPROC; + if(loc_size_base==0) + { + GlobalV::ofs_warning << " loc_size=0" << " in proc " << GlobalV::MY_RANK+1 << std::endl; + ModuleBase::WARNING_QUIT("Parallel_deepks::set_nlocal","NLOCAL < GlobalV::NPROC"); + } + + int sum_nlocal = 0; + for(int rank=0;ranknorb_local[rank] = loc_size; + sum_nlocal += loc_size; + //GlobalV::ofs_running << " loc_size: " << loc_size << " in proc " << rank << std::endl; + //GlobalV::ofs_running << " sum_nlocal: " << sum_nlocal << " in proc " << GlobalV::MY_RANK+1 << std::endl; + } + + //check if sum of norb_local = tot. nlocal + assert(sum_nlocal == GlobalV::NLOCAL); +#else + this->norb_local[0] = GlobalV::NLOCAL; +#endif +} + +void Parallel_deepks::set_loc_orb +( +#ifdef __MPI + MPI_Comm MPI_WORLD +#endif +) +{ + ModuleBase::TITLE("Parallel_deepks","set_loc_orb"); + assert(GlobalV::NLOCAL>0); + + delete[] this->trace_loc_orb; + ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"trace_loc_orb dimension",GlobalV::NLOCAL); + + this->trace_loc_orb = new int[GlobalV::NLOCAL]; + ModuleBase::Memory::record("Parallel_deepks","trace_loc_orb",GlobalV::NLOCAL,"int"); + + for(int i=0; inorb_local[GlobalV::MY_RANK]; + + int norb_start = 0; + //this is the global index of first local orb on current processor + for(int i=0;inorb_local[i]; + } + + for (int iorb=0; iorb< norb; iorb++) + { + int global_orb = iorb + norb_start; + this->trace_loc_orb[global_orb] = iorb; + } + + //ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"norb:",norb); + //for(int i=0;itrace_loc_orb[i]); + //} +#else + for(int i=0; i Date: Mon, 25 Oct 2021 14:58:46 +0800 Subject: [PATCH 11/27] deepks parallelization : calculating pdm; update CMakelist --- source/src_lcao/LCAO_descriptor.cpp | 141 +++++++++++++++++++++++- source/src_parallel/CMakeLists.txt | 5 + source/src_parallel/parallel_deepks.cpp | 10 +- source/src_parallel/parallel_deepks.h | 8 +- 4 files changed, 151 insertions(+), 13 deletions(-) diff --git a/source/src_lcao/LCAO_descriptor.cpp b/source/src_lcao/LCAO_descriptor.cpp index c34ed1841a3..6f9e7d5a605 100644 --- a/source/src_lcao/LCAO_descriptor.cpp +++ b/source/src_lcao/LCAO_descriptor.cpp @@ -287,7 +287,7 @@ void LCAO_Descriptor::build_S_descriptor(const bool& calc_deri) } // T1 #ifdef __MPI - GlobalC::ParaD.allsum_S_mu_alpha(this->inlmax,GlobalV::NLOCAL*(2*this->lmaxd+1),this->S_mu_alpha); + GlobalC::ParaD.allsum_deepks(this->inlmax,GlobalV::NLOCAL*(2*this->lmaxd+1),this->S_mu_alpha); #endif /* @@ -355,6 +355,134 @@ void LCAO_Descriptor::cal_dm_as_descriptor(const ModuleBase::matrix &dm) void LCAO_Descriptor::cal_projected_DM(const ModuleBase::matrix &dm) { ModuleBase::TITLE("LCAO_Descriptor", "cal_projected_DM"); + ModuleBase::timer::tick("LCAO_Descriptor","cal_projected_DM"); + const int pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1); +#ifdef __MPI + + if(GlobalV::NPROC>1) + { + //This is for first SCF iteration, when density matrix is not available yet + if(dm.nr == 0 && dm.nc ==0) + { + ModuleBase::timer::tick("LCAO_Descriptor","cal_projected_DM"); + return; + } + //step 1: get S_alpha_mu and S_nu_beta + double **ss = this->S_mu_alpha; + + //step 2 : multiply: cal A=ST*DM*S + //A(im1,im2) = sum iw1 sum iw2 S(iw1,im1) * dm(iw1,iw2) * S(iw2,im2) + // = sum iw1 S(iw1,im1) * X(iw1,im2) + // where X(iw1,im2) = sum iw2 dm(iw1,iw2) * S(iw2,im2) + for (int inl = 0;inl < inlmax;inl++) + { + ModuleBase::GlobalFunc::ZEROS(this->pdm[inl], pdm_size); + int nm = 2 * inl_l[inl] + 1; + const int tmp_pdm_size = GlobalV::NLOCAL * nm; + double* tmp_pdm = new double[tmp_pdm_size]; // saves X(iw1,im2) + + //for each pair index1=(iw1,im2) + for(int iw1 = 0; iw1 < GlobalV::NLOCAL; iw1++) + { + int iw1_local = GlobalC::ParaO.trace_loc_col[iw1]; + if(iw1_local < 0) continue; + const int ir1 = iw1; + + ModuleBase::GlobalFunc::ZEROS(tmp_pdm, tmp_pdm_size); + + for(int im2=0;im2pdm[inl][ind] += element; + } + } + } + delete[] tmp_pdm; + } + + //step 3 : gather from all ranks + GlobalC::ParaD.allsum_deepks(this->inlmax,pdm_size,this->pdm); + } + else + { + //step 1: get dm: the coefficient of wfc, not charge density + //now, dm is an input arg of this func, but needed converting to double* + this->getdm_double(dm); + + //step 2: get S_alpha_mu and S_nu_beta + double **ss = this->S_mu_alpha; + + //step 3 : multiply: cal ST*DM*S + + //init tmp_pdm* + const int tmp_pdm_size = GlobalV::NLOCAL * (lmaxd*2+1); + double* tmp_pdm = new double[tmp_pdm_size]; + ModuleBase::GlobalFunc::ZEROS(tmp_pdm, tmp_pdm_size); + for (int inl = 0;inl < inlmax;inl++) + { + int nm = 2 * inl_l[inl] + 1; //1,3,5,... + const char t = 'T'; //transpose + const char nt = 'N'; //non transpose + const double alpha = 1; + const double beta = 0; + double *a = this->dm_double; + double *b = ss[inl]; + double *c = tmp_pdm; + dgemm_(&nt, &nt, &GlobalV::NLOCAL, &nm, &GlobalV::NLOCAL, &alpha, a, &GlobalV::NLOCAL, b, &GlobalV::NLOCAL, &beta, c, &GlobalV::NLOCAL); //DM*S + a = ss[inl]; + b = c; + c = this->pdm[inl]; + dgemm_(&t, &nt, &nm, &nm, &GlobalV::NLOCAL, &alpha, a, &GlobalV::NLOCAL, b, &GlobalV::NLOCAL, &beta, c, &nm); //ST*DM*S + } + delete[] tmp_pdm; + } +#else //step 1: get dm: the coefficient of wfc, not charge density //now, dm is an input arg of this func, but needed converting to double* this->getdm_double(dm); @@ -368,7 +496,6 @@ void LCAO_Descriptor::cal_projected_DM(const ModuleBase::matrix &dm) const int tmp_pdm_size = GlobalV::NLOCAL * (lmaxd*2+1); double* tmp_pdm = new double[tmp_pdm_size]; ModuleBase::GlobalFunc::ZEROS(tmp_pdm, tmp_pdm_size); - for (int inl = 0;inl < inlmax;inl++) { int nm = 2 * inl_l[inl] + 1; //1,3,5,... @@ -385,8 +512,10 @@ void LCAO_Descriptor::cal_projected_DM(const ModuleBase::matrix &dm) c = this->pdm[inl]; dgemm_(&t, &nt, &nm, &nm, &GlobalV::NLOCAL, &alpha, a, &GlobalV::NLOCAL, b, &GlobalV::NLOCAL, &beta, c, &nm); //ST*DM*S } - delete[] tmp_pdm; +#endif + + ModuleBase::timer::tick("LCAO_Descriptor","cal_projected_DM"); return; } @@ -395,6 +524,7 @@ void LCAO_Descriptor::cal_descriptor(void) { ModuleBase::TITLE("LCAO_Descriptor", "cal_descriptor"); delete[] d; + d = new double[this->n_descriptor]; const int lmax = GlobalC::ORB.get_lmax_d(); int id = 0; @@ -1665,7 +1795,10 @@ void LCAO_Descriptor::save_npy_d(void) npy_des.push_back(this->d[i]); } const long unsigned dshape[] = {(long unsigned) GlobalC::ucell.nat, (long unsigned) this->des_per_atom }; - npy::SaveArrayAsNumpy("dm_eig.npy", false, 2, dshape, npy_des); + if (GlobalV::MY_RANK == 0) + { + npy::SaveArrayAsNumpy("dm_eig.npy", false, 2, dshape, npy_des); + } return; } diff --git a/source/src_parallel/CMakeLists.txt b/source/src_parallel/CMakeLists.txt index 0f6c8d295d4..4547ba4b1ca 100644 --- a/source/src_parallel/CMakeLists.txt +++ b/source/src_parallel/CMakeLists.txt @@ -11,3 +11,8 @@ add_library( parallel_reduce.cpp subgrid_oper.cpp ) + +if(ENABLE_DEEPKS) + target_sources(parallel PRIVATE parallel_deepks.cpp) +endif() + diff --git a/source/src_parallel/parallel_deepks.cpp b/source/src_parallel/parallel_deepks.cpp index 937ea2ac4d0..49953462a9c 100644 --- a/source/src_parallel/parallel_deepks.cpp +++ b/source/src_parallel/parallel_deepks.cpp @@ -115,14 +115,14 @@ void Parallel_deepks::set_loc_orb #endif } -void Parallel_deepks::allsum_S_mu_alpha( - int inlmax, //first dimension of S_mu_alpha - int ndim, //secohd dimension of S_mu_alpha - double** S_mu_alpha) +void Parallel_deepks::allsum_deepks( + int inlmax, //first dimension + int ndim, //second dimension + double** mat) { for(int inl=0;inl Date: Mon, 25 Oct 2021 20:58:39 +0800 Subject: [PATCH 12/27] deepks parallelization : gamma point energy --- source/module_orbital/ORB_gen_tables.cpp | 280 +++++++++++++++++++++++ source/module_orbital/ORB_gen_tables.h | 16 +- source/src_lcao/LCAO_descriptor.cpp | 191 +++++++++++++++- source/src_lcao/LCAO_descriptor.h | 1 + source/src_lcao/LCAO_hamilt.cpp | 14 +- 5 files changed, 491 insertions(+), 11 deletions(-) diff --git a/source/module_orbital/ORB_gen_tables.cpp b/source/module_orbital/ORB_gen_tables.cpp index 6c21ec5f889..fa7eb40141d 100644 --- a/source/module_orbital/ORB_gen_tables.cpp +++ b/source/module_orbital/ORB_gen_tables.cpp @@ -1280,6 +1280,286 @@ double ORB_gen_tables::get_distance(const ModuleBase::Vector3 &R1, const } #ifdef __DEEPKS + +void ORB_gen_tables::snap_psialpha_half( + std::vector> &nlm, + const int& job, + const ModuleBase::Vector3& R1, + const int& T1, + const int& L1, + const int& m1, + const int& N1, + const ModuleBase::Vector3& R0, // The projector. + const int& T0, + const int& I0, + ModuleBase::IntArray* inl_index + ) const +{ + ModuleBase::timer::tick("ORB_gen_tables", "snap_psialpha_half"); + + const int ln_per_atom = GlobalC::ORB.Alpha[0].getTotal_nchi(); + assert(ln_per_atom > 0); + + bool *calproj = new bool[ln_per_atom]; + int *rmesh1 = new int[ln_per_atom]; + + if(job==0) + { + nlm.resize(1); //only energy + } + else if(job==1) + { + nlm.resize(4); //energy+force + } + + int nproj = 0; + for (int L0 = 0; L0 <= GlobalC::ORB.Alpha[0].getLmax();++L0) + { + for (int N0 = 0;N0 < GlobalC::ORB.Alpha[0].getNchi(L0);++N0) + { + nproj += (2 * L0 + 1); + } + } + + for(int dim=0;dim + const ModuleBase::Vector3 dRa = (R0 - R1) * this->lat0; + double distance10 = dRa.norm(); + + bool all_out = true; + for (int ip = 0; ip < ln_per_atom; ip++) + { + const double Rcut0 = GlobalC::ORB.Alpha[0].getRcut(); + if (distance10 > (Rcut1 + Rcut0)) + { + calproj[ip] = false; + } + else + { + all_out = false; + calproj[ip] = true; + //length of table for interpolation + rmesh1[ip] = talpha.get_rmesh(Rcut1, Rcut0); + } + } + + if (all_out) + { + delete[] calproj; + delete[] rmesh1; + ModuleBase::timer::tick("ORB_gen_tables", "snap_psialpha_half"); + return; + } + + //FOR INTERPOLATION + double *curr; //current pointer + + double psa = distance10 / talpha.dr; + int iqa = static_cast(psa); + double x0a = psa - static_cast(iqa); + double x1a = 1.0 - x0a; + double x2a = 2.0 - x0a; + double x3a = 3.0 - x0a; + double x123a = x1a * x2a * x3a / 6.0; + double x120a = x1a * x2a * x0a / 6.0; + double x032a = x0a * x3a * x2a / 2.0; + double x031a = x0a * x3a * x1a / 2.0; + + //UNIT VECTOR + + double unit_vec_dRa[3]; + unit_vec_dRa[0] = dRa.x; + unit_vec_dRa[1] = dRa.y; + unit_vec_dRa[2] = dRa.z; + + //special case for R = 0; + const double tiny1 = 1e-12; + const double tiny2 = 1e-10; + + if (distance10 < tiny1) + { + distance10 += tiny1; + } + + // Find three dimension of 'Table_DSR' ' + const int Tpair1 = T1; + const int T1_2Lplus1 = talpha.DS_2Lplus1[T1]; + + //gaunt index + const int gindex1 = L1 * L1 + m1; + std::vector rlya; + std::vector> grlya; + + if (job == 0) + { + ModuleBase::Ylm::rl_sph_harm(T1_2Lplus1 - 1, dRa.x, dRa.y, dRa.z, rlya); + } + else + { + ModuleBase::Ylm::grad_rl_sph_harm(T1_2Lplus1 - 1, dRa.x, dRa.y, dRa.z, rlya, grlya); + } + + ////////////////////////////////////////////////////////////////////////// + /// Formula : T1 T0 + /// \f[ + /// <\psi1_{L1,N1}|\Beta_{L0,m0}> + ///\f] + ////////////////////////////////////////////////////////////////////////// + + int ip = 0; //for L0,N0,m0 + int nb = 0; //for L0, N0 + + for (int L0 = 0; L0 <= GlobalC::ORB.Alpha[0].getLmax();++L0) + { + for (int N0 = 0;N0 < GlobalC::ORB.Alpha[0].getNchi(L0);++N0) + { + if (!calproj[nb]) + { + ip += 2*L0 + 1; + continue; + } + ++nb; + + // + const int Opair1 = talpha.DS_Opair(Tpair1, L1, L0, N1, N0); + const int inl = inl_index[T0](I0, L0, N0); + for (int m0 = 0;m0 < 2 * L0 + 1;m0++) + { + int gindex0 = L0 * L0 + m0; + double term_a = 0.0; + double term_a_gr[3] = {0,0,0}; + + for (int L = 0; L < T1_2Lplus1; L++) + { + //triangle rule for gaunt coefficients + int AL = L1 + L0; + int SL = abs(L1 - L0); + if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1)) + { + continue; + } + + //prefac = (i)^{lphi - lbeta - l} + //R0-R1 ==> + double i_exp = pow(-1.0, (L1 - L0 - L) / 2); + double rl1 = pow(distance10, L); + double Interp_Vnla = 0.0; + double Interp_Vnla_gr = 0.0; + + //this part is for both deri and not deri + if (distance10 > tiny2) + { + curr = talpha.Table_DSR[0][Tpair1][Opair1][L]; + if (iqa >= rmesh1[nb] - 4) + { + Interp_Vnla = 0.0; + } + else + { + Interp_Vnla = i_exp * (x123a * curr[iqa] + + x120a * curr[iqa + 3] + + x032a * curr[iqa + 1] + - x031a * curr[iqa + 2]); + } + Interp_Vnla /= rl1; + } + else + { + Interp_Vnla = i_exp * talpha.Table_DSR[0][Tpair1][Opair1][L][0]; + } + + //this part is for deri only + if(job==1) + { + if (distance10 > tiny2) + { + curr = talpha.Table_DSR[1][Tpair1][Opair1][L]; + + if (iqa >= rmesh1[nb] - 4) + { + Interp_Vnla_gr = 0.0; + } + else + { + Interp_Vnla_gr = i_exp * (x123a * curr[iqa] + + x120a * curr[iqa + 3] + + x032a * curr[iqa + 1] + - x031a * curr[iqa + 2]); + } + Interp_Vnla_gr = Interp_Vnla_gr / pow(distance10, L) - Interp_Vnla * L / distance10; + } + else + { + Interp_Vnla_gr = 0.0; + } + + ///////////////////////////////////// + /// Overlap value = S_from_table * G * Ylm + //////////////////////////////////// + for (int m = 0; m < 2 * L + 1; m++) + { + int gindexa = L * L + m; + //double tmpGaunt = this->MGT.Get_Gaunt_SH(L1, m1, L0, m0, L, m); + double tmpGaunt, tmpGaunt1; + if(job==1) + { + tmpGaunt = this->MGT.Gaunt_Coefficients(gindex1, gindex0, gindexa); + tmpGaunt1= this->MGT.Gaunt_Coefficients(gindex0, gindex1, gindexa); + } + else + { + tmpGaunt = this->MGT.Gaunt_Coefficients(gindex0, gindex1, gindexa); + } + const int lm = MGT.get_lm_index(L, m); + + term_a += tmpGaunt * Interp_Vnla * rlya[lm]; + if(job==1) + { + double tt1 = tmpGaunt1 * Interp_Vnla_gr * rlya[lm] / distance10; + double tt2 = tmpGaunt1 * Interp_Vnla; + + for (int ir = 0; ir < 3; ir++) + { + term_a_gr[ir] += tt1 * unit_vec_dRa[ir] + tt2 * grlya[lm][ir]; + } + } + } + } + }//end L + + //=============================================== + // THIRD PART: SAVE THE VALUE FOR ALL PROJECTS. + //=============================================== + + if(job==0) + { + nlm[0][ip] = term_a; + } + else if(job==1) + { + nlm[0][ip] = term_a; + for(int dim=1;dim<4;dim++) + { + nlm[dim][ip] = term_a_gr[dim-1]; + } + } + + ip+=1; + }//end m0 + }//end N0 + }//end L0 +} //caoyu add 2021-08-30 void ORB_gen_tables::snap_psialpha( double nlm[], diff --git a/source/module_orbital/ORB_gen_tables.h b/source/module_orbital/ORB_gen_tables.h index 91a6a50c3c4..6764fef7d91 100644 --- a/source/module_orbital/ORB_gen_tables.h +++ b/source/module_orbital/ORB_gen_tables.h @@ -93,7 +93,21 @@ class ORB_gen_tables const bool &calc_deri)const; // mohan add 2021-04-25); /// set as public because in hamilt_linear, #ifdef __DEEPKS - void snap_psialpha( + void snap_psialpha_half( + std::vector> &nlm, + const int& job, + const ModuleBase::Vector3& R1, + const int& T1, + const int& L1, + const int& m1, + const int& N1, + const ModuleBase::Vector3& R0, // The projector. + const int& T0, + const int& I0, + ModuleBase::IntArray* inl_index + ) const; + + void snap_psialpha( double nlm[], const int& job, const ModuleBase::Vector3& R1, diff --git a/source/src_lcao/LCAO_descriptor.cpp b/source/src_lcao/LCAO_descriptor.cpp index 6f9e7d5a605..8948b258d09 100644 --- a/source/src_lcao/LCAO_descriptor.cpp +++ b/source/src_lcao/LCAO_descriptor.cpp @@ -808,8 +808,8 @@ void LCAO_Descriptor::deepks_pre_scf(const string& model_file) //initialize the H matrix H_V_delta delete[] this->H_V_delta; - this->H_V_delta = new double[GlobalV::NLOCAL * GlobalV::NLOCAL]; - ModuleBase::GlobalFunc::ZEROS(this->H_V_delta, GlobalV::NLOCAL * GlobalV::NLOCAL); + this->H_V_delta = new double[GlobalC::ParaO.nloc]; + ModuleBase::GlobalFunc::ZEROS(this->H_V_delta, GlobalC::ParaO.nloc); //init gedm** const int pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1); @@ -898,6 +898,179 @@ void LCAO_Descriptor::cal_v_delta(const ModuleBase::matrix& dm) return; } +void LCAO_Descriptor::build_v_delta_alpha_new(const bool& calc_deri) +{ + ModuleBase::TITLE("LCAO_Descriptor", "build_v_delta_alpha_new"); + ModuleBase::GlobalFunc::ZEROS(this->H_V_delta,GlobalC::ParaO.nloc); //init before calculate + + const double Rcut_Alpha = GlobalC::ORB.Alpha[0].getRcut(); + //same for all types of atoms + int job; + if(!calc_deri) + { + job=0; + } + else + { + job=1; + } + + for (int T0 = 0; T0 < GlobalC::ucell.ntype; T0++) + { + Atom* atom0 = &GlobalC::ucell.atoms[T0]; + for (int I0 =0; I0< atom0->na; I0++) + { + //======================================================= + //Step1: + //saves , where beta runs over L0,M0 on atom I0 + //and psi runs over atomic basis sets on the current core + //======================================================= + std::vector>> nlm_tot; + + //GlobalC::GridD.Find_atom( atom0->tau[I0] ); + const ModuleBase::Vector3 tau0 = atom0->tau[I0]; + GlobalC::GridD.Find_atom(GlobalC::ucell, atom0->tau[I0] ,T0, I0); + + //outermost loop : all adjacent atoms + nlm_tot.resize(GlobalC::GridD.getAdjacentNum()+1); + + for (int ad=0; ad tau1 = GlobalC::GridD.getAdjacentTau(ad); + const Atom* atom1 = &GlobalC::ucell.atoms[T1]; + const int nw1_tot = atom1->nw*GlobalV::NPOL; + + //middle loop : atomic basis on current processor (either row or column) + nlm_tot[ad].clear(); + + const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0; + if (dist1 > Rcut_Alpha + Rcut_AO1) + { + continue; + } + + for (int iw1=0; iw1> nlm; + //2D, but first dimension is only 1 here + //for force, the right hand side is the gradient + //and the first dimension is then 3 + //inner loop : all projectors (L0,M0) + GlobalC::UOT.snap_psialpha_half( + nlm, job, tau1, T1, + atom1->iw2l[ iw1_0 ], // L1 + atom1->iw2m[ iw1_0 ], // m1 + atom1->iw2n[ iw1_0 ], // N1 + GlobalC::ucell.atoms[T0].tau[I0], T0, I0, this->inl_index); //R0,T0 + + nlm_tot[ad].insert({iw1_all,nlm[0]}); + }//end iw + }//end ad + //======================================================= + //Step2: + //calculate sum_(L0,M0) alpha + //and accumulate the value to Hloc_fixed(i,j) + //======================================================= + + for (int ad1=0; ad1 tau1 = GlobalC::GridD.getAdjacentTau(ad1); + const Atom* atom1 = &GlobalC::ucell.atoms[T1]; + const int nw1_tot = atom1->nw*GlobalV::NPOL; + const double Rcut_AO1 = GlobalC::ORB.Phi[T1].getRcut(); + + for (int ad2=0; ad2 < GlobalC::GridD.getAdjacentNum()+1 ; ad2++) + { + const int T2 = GlobalC::GridD.getType(ad2); + const int I2 = GlobalC::GridD.getNatom(ad2); + const int start2 = GlobalC::ucell.itiaiw2iwt(T2, I2, 0); + const ModuleBase::Vector3 tau2 = GlobalC::GridD.getAdjacentTau(ad2); + const Atom* atom2 = &GlobalC::ucell.atoms[T2]; + const int nw2_tot = atom2->nw*GlobalV::NPOL; + + const double Rcut_AO2 = GlobalC::ORB.Phi[T2].getRcut(); + const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0; + const double dist2 = (tau2-tau0).norm() * GlobalC::ucell.lat0; + + if (dist1 > Rcut_Alpha + Rcut_AO1 + || dist2 > Rcut_Alpha + Rcut_AO2) + { + continue; + } + + for (int iw1=0; iw1 nlm1 = nlm_tot[ad1][iw1_all]; + std::vector nlm2 = nlm_tot[ad2][iw2_all]; + + assert(nlm1.size()==nlm2.size()); + + if(calc_deri) + { + + } + else + { + double nlm=0.0; + int ib = 0; + + for (int L0 = 0; L0 <= GlobalC::ORB.Alpha[0].getLmax();++L0) + { + for (int N0 = 0;N0 < GlobalC::ORB.Alpha[0].getNchi(L0);++N0) + { + const int inl = this->inl_index[T0](I0, L0, N0); + const int nm = 2*L0+1; + for (int m01 = 0;m01 < 2 * L0 + 1;++m01) + { + for (int m02 = 0; m02 < 2 * L0 + 1; ++m02) + { + nlm += this->gedm[inl][m01*nm+m02] * nlm1[ib+m01] * nlm2[ib+m02]; + } + } + ib+=(2*L0+1); + } + } + int index = iw2_local * GlobalC::ParaO.nrow+ iw1_local; //for genelpa + this->H_V_delta[index] += nlm; + + assert(ib==nlm1.size()); + } + }//iw2 + }//iw1 + }//ad2 + }//ad1 + }//end I0 + }//end T0 + + ModuleBase::timer::tick ("LCAO_gen_fixedH","build_Nonlocal_alpha_new"); + return; + +} + //for GAMMA_ONLY, search adjacent atoms from I0 void LCAO_Descriptor::build_v_delta_alpha(const bool& calc_deri) { @@ -973,8 +1146,7 @@ void LCAO_Descriptor::build_v_delta_alpha(const bool& calc_deri) GlobalC::ucell.atoms[T0].tau[I0], T0, I0, this->inl_index, this->gedm); - //GlobalC::LM.set_HSgamma(iw1_all, iw2_all, nlm[0], 'L'); - int index = iw2_all * GlobalV::NLOCAL + iw1_all; //for genelpa + int index=iw2_local*GlobalC::ParaO.nrow+iw1_local; this->H_V_delta[index] += nlm[0]; } else // calculate force @@ -1238,7 +1410,11 @@ void LCAO_Descriptor::add_v_delta(void) { continue; } - GlobalC::LM.set_HSgamma(iw1, iw2, this->H_V_delta[iw1 * GlobalV::NLOCAL + iw2], 'L'); + + int iw1_local=GlobalC::ParaO.trace_loc_row[iw1]; + int iw2_local=GlobalC::ParaO.trace_loc_col[iw2]; + int index=iw2_local*GlobalC::ParaO.nrow+iw1_local; + GlobalC::LM.set_HSgamma(iw1, iw2, this->H_V_delta[index], 'L'); } } } @@ -1866,12 +2042,13 @@ void LCAO_Descriptor::cal_e_delta_band(const std::vector &dm { const int mu = GlobalC::ParaO.trace_loc_row[j]; const int nu = GlobalC::ParaO.trace_loc_col[i]; + if (mu >= 0 && nu >= 0) { - const int index = mu * GlobalC::ParaO.ncol + nu; + const int index=nu*GlobalC::ParaO.nrow+mu; for (int is = 0; is < GlobalV::NSPIN; ++is) { - this->e_delta_band += dm[is](nu, mu) * this->H_V_delta[i * GlobalV::NLOCAL + j]; + this->e_delta_band += dm[is](nu, mu) * this->H_V_delta[index]; } } } diff --git a/source/src_lcao/LCAO_descriptor.h b/source/src_lcao/LCAO_descriptor.h index b93ebec9ce0..50e747125dc 100644 --- a/source/src_lcao/LCAO_descriptor.h +++ b/source/src_lcao/LCAO_descriptor.h @@ -74,6 +74,7 @@ class LCAO_Descriptor ///calculate \f$\sum_{I}\sum_{nlmm'}\langle\phi_\mu|\alpha^I_{nlm}\rangle{\frac{dE}{dD^I_{nlmm'}}}\langle\alpha^I_{nlm'}|\phi_\nu\rangle\f$ (for gamma_only) void build_v_delta_alpha(const bool& cal_deri/**< [in] 0 for 3-center intergration, 1 for its derivation*/); + void build_v_delta_alpha_new(const bool& cal_deri/**< [in] 0 for 3-center intergration, 1 for its derivation*/); ///calculate \f$\sum_{I}\sum_{nlmm'}\langle\phi_\mu|\alpha^I_{nlm}\rangle{\frac{dE}{dD^I_{nlmm'}}}\langle\alpha^I_{nlm'}|\phi_\nu\rangle\f$ (for multi-k) void build_v_delta_mu(const bool &cal_deri/**< [in] 0 for 3-center intergration, 1 for its derivation*/); diff --git a/source/src_lcao/LCAO_hamilt.cpp b/source/src_lcao/LCAO_hamilt.cpp index 8a335a7e7d4..fb1ca44f54d 100644 --- a/source/src_lcao/LCAO_hamilt.cpp +++ b/source/src_lcao/LCAO_hamilt.cpp @@ -116,9 +116,18 @@ void LCAO_Hamilt::calculate_Hgamma( const int &ik ) // Peize Lin add ik 2016- //ld.cal_gedm(LOC.wfc_dm_2d.dm_gamma[0]); //ld.build_v_delta_alpha(0); GlobalC::ld.cal_gedm(GlobalC::LOC.wfc_dm_2d.dm_gamma[0]); - GlobalC::ld.build_v_delta_mu(0); + if(GlobalV::GAMMA_ONLY_LOCAL) + { + //GlobalC::ld.build_v_delta_alpha(0); + GlobalC::ld.build_v_delta_alpha_new(0); + } + else + { + + } + //GlobalC::ld.build_v_delta_mu(0); - GlobalC::ld.add_v_delta(); + //GlobalC::ld.add_v_delta(); } #endif @@ -126,7 +135,6 @@ void LCAO_Hamilt::calculate_Hgamma( const int &ik ) // Peize Lin add ik 2016- //add T+VNL+Vl matrix. GlobalC::LM.update_Hloc(); - //test if(GlobalV::NURSE) { From 2da74c84d58d1d40e7fbc185678089d7ee3bbe74 Mon Sep 17 00:00:00 2001 From: linpz Date: Tue, 26 Oct 2021 07:22:25 +0800 Subject: [PATCH 13/27] 1. fix bug in ComplexMatrix::print() 2. move cout to ofs_warning for "l>4 eggbox warning" in Numerical_Orbital_Lm::extra_uniform() --- source/module_base/complexmatrix.cpp | 15 ++++++++++----- source/module_base/complexmatrix.h | 2 +- source/module_orbital/ORB_atomic_lm.cpp | 8 ++++---- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/source/module_base/complexmatrix.cpp b/source/module_base/complexmatrix.cpp index 2ad09be443c..bee26fca03b 100644 --- a/source/module_base/complexmatrix.cpp +++ b/source/module_base/complexmatrix.cpp @@ -414,20 +414,25 @@ ComplexMatrix conj(const ComplexMatrix &m) } // Peize Lin add 2021.09.08 -std::ostream & ComplexMatrix::print( std::ostream & os, const double threshold_norm, const double threshold_imag ) const +std::ostream & ComplexMatrix::print( std::ostream & os, const double threshold_abs, const double threshold_imag ) const { for( int ir=0; ir!=this->nr; ++ir ) { for( int ic=0; ic!=this->nc; ++ic ) - if(std::norm((*this)(ir,ic))>threshold_norm) + { + const std::complex & data = (*this)(ir,ic); + if(std::abs(data)>threshold_abs) { - if(std::imag((*this)(ir,ic))>threshold_imag) - os<<(*this)(ir,ic)<<"\t"; + if(std::abs(std::imag(data))>threshold_imag) + os< Date: Tue, 26 Oct 2021 12:19:02 +0800 Subject: [PATCH 14/27] 1. Add "#ifdef _OPENMP" for all OpenMP codes. Now ABACUS can be compiled without OpenMP. --- source/Makefile | 6 +++--- source/module_base/mathzone_add1.cpp | 4 ++++ source/module_orbital/ORB_atomic_lm.cpp | 7 +++++-- source/src_lcao/gint_gamma_rho.cpp | 8 +++++-- source/src_lcao/gint_gamma_vl.cpp | 10 ++++++--- source/src_parallel/parallel_global.cpp | 9 +++++--- source/src_pw/pw_basis.cpp | 5 +++++ source/src_ri/abfs.cpp | 11 ++++++++++ source/src_ri/exx_lcao.cpp | 28 ++++++++++++++++++++++++- 9 files changed, 74 insertions(+), 14 deletions(-) diff --git a/source/Makefile b/source/Makefile index c8c1aa53144..b641f12494f 100644 --- a/source/Makefile +++ b/source/Makefile @@ -32,11 +32,11 @@ HONG_SER = -D__FP -D__LCAO ${HONG_FFTW} HONG_SER_SELINV = -D__FP -D__LCAO ${HONG_FFTW} -D__SELINV HONG_GDB = -g -D__FP -D__LCAO ${HONG_FFTW} #(2)mpi -HONG_MPI = -D__FP -D__LCAO ${HONG_FFTW} -D__MPI -D__OPENMP +HONG_MPI = -D__FP -D__LCAO ${HONG_FFTW} -D__MPI # mohan comment out 2021-02-06, add -DUSE_LIBXC=0 if you want to use LIBXC #HONG_MPI_SELINV = -D__FP ${HONG_FFTW} ${HONG_LAPACK} -D__MPI -D__SELINV -DMETIS -DEXX_DM=3 -DEXX_H_COMM=2 -DUSE_LIBXC=0 -DTEST_EXX_LCAO=0 -DTEST_EXX_RADIAL=1 -DUSE_CEREAL_SERIALIZATION -HONG_MPI_SELINV = -D__FP ${HONG_FFTW} ${HONG_LAPACK} -D__MPI -D__LCAO -D__OPENMP -D__SELINV -DMETIS -DEXX_DM=3 -DEXX_H_COMM=2 -DTEST_EXX_LCAO=0 -DTEST_EXX_RADIAL=1 -DUSE_CEREAL_SERIALIZATION -HONG_MPI_SELINV_20210523 = -D__FP ${HONG_FFTW} ${HONG_LAPACK} -D__LCAO -D__MPI -D__OPENMP -D__SELINV -DMETIS -DEXX_DM=3 -DEXX_H_COMM=2 -DTEST_EXX_LCAO=0 -DTEST_EXX_RADIAL=1 -DUSE_CEREAL_SERIALIZATION -D__EXX +HONG_MPI_SELINV = -D__FP ${HONG_FFTW} ${HONG_LAPACK} -D__MPI -D__LCAO -D__SELINV -DMETIS -DEXX_DM=3 -DEXX_H_COMM=2 -DTEST_EXX_LCAO=0 -DTEST_EXX_RADIAL=1 -DUSE_CEREAL_SERIALIZATION +HONG_MPI_SELINV_20210523 = -D__FP ${HONG_FFTW} ${HONG_LAPACK} -D__LCAO -D__MPI -D__SELINV -DMETIS -DEXX_DM=3 -DEXX_H_COMM=2 -DTEST_EXX_LCAO=0 -DTEST_EXX_RADIAL=1 -DUSE_CEREAL_SERIALIZATION -D__EXX HONG_DEEPKS = ${HONG_MPI_SELINV_20210523} -D__DEEPKS #caoyu add 2021-07-15 , use it in DeePKS. Add LIBTORCH_DIR and LIBNPY_DIR, modify ${LIBS} and build with std=c++14 !! #(3)memory #(3)memory diff --git a/source/module_base/mathzone_add1.cpp b/source/module_base/mathzone_add1.cpp index 614d535ac0a..459f0205b11 100644 --- a/source/module_base/mathzone_add1.cpp +++ b/source/module_base/mathzone_add1.cpp @@ -21,7 +21,9 @@ typedef fftw_complex FFTW_COMPLEX; //#include +#ifdef _OPENMP #include +#endif namespace ModuleBase { @@ -980,7 +982,9 @@ void Mathzone_Add1::Cubic_Spline_Interpolation { ModuleBase::timer::tick("Mathzone_Add1","Cubic_Spline_Interpolation"); +#ifdef _OPENMP #pragma omp parallel for schedule(static) +#endif for(int m = 0; m < rsize ; m++) { int klo = 0; diff --git a/source/module_orbital/ORB_atomic_lm.cpp b/source/module_orbital/ORB_atomic_lm.cpp index a93279467b5..c148848ac4c 100644 --- a/source/module_orbital/ORB_atomic_lm.cpp +++ b/source/module_orbital/ORB_atomic_lm.cpp @@ -4,7 +4,10 @@ #include "../module_base/timer.h" #include "../module_base/math_integral.h" #include "../module_base/math_sphbes.h" + +#ifdef _OPENMP #include +#endif Numerical_Orbital_Lm::Numerical_Orbital_Lm() { @@ -224,7 +227,7 @@ void Numerical_Orbital_Lm::extra_uniform(const double &dr_uniform_in, const bool // do interpolation here to make grid more dense -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp parallel for schedule(static) #endif for (int ir = 0; ir < this->nr_uniform; ir++) @@ -545,7 +548,7 @@ void Numerical_Orbital_Lm::cal_kradial_sbpool(void) { r_tmp[ir] *= (ir&1) ? four_three : two_three; } -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp parallel for schedule(static) #endif for (int ik = 0; ik < nk; ik++) diff --git a/source/src_lcao/gint_gamma_rho.cpp b/source/src_lcao/gint_gamma_rho.cpp index 9c233d730a9..b237b59353d 100644 --- a/source/src_lcao/gint_gamma_rho.cpp +++ b/source/src_lcao/gint_gamma_rho.cpp @@ -10,6 +10,10 @@ #include "global_fp.h" // mohan add 2021-01-30 +#ifdef _OPENMP +#include +#endif + #ifdef __MKL #include #endif @@ -173,7 +177,7 @@ Gint_Tools::Array_Pool Gint_Gamma::gamma_charge(const double*const*const mkl_set_num_threads(std::max(1,mkl_threads/GlobalC::GridT.nbx)); // Peize Lin update 2021.01.20 #endif -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp parallel #endif { @@ -184,7 +188,7 @@ Gint_Tools::Array_Pool Gint_Gamma::gamma_charge(const double*const*const const int ncyz = GlobalC::pw.ncy*GlobalC::pw.nczp; // mohan add 2012-03-25 -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp for #endif for (int i=0; i +#endif + #ifdef __MKL #include #endif @@ -316,7 +320,7 @@ Gint_Tools::Array_Pool Gint_Gamma::gamma_vlocal(const double*const vloca mkl_set_num_threads(std::max(1,mkl_threads/GlobalC::GridT.nbx)); // Peize Lin update 2021.01.20 #endif -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp parallel #endif { @@ -344,7 +348,7 @@ Gint_Tools::Array_Pool Gint_Gamma::gamma_vlocal(const double*const vloca const int LD_pool = max_size*GlobalC::ucell.nwmax; -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp for #endif for (int i=0; i< nbx; i++) @@ -419,7 +423,7 @@ Gint_Tools::Array_Pool Gint_Gamma::gamma_vlocal(const double*const vloca }// j }// i -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp critical(cal_vl) #endif { diff --git a/source/src_parallel/parallel_global.cpp b/source/src_parallel/parallel_global.cpp index 4e02bf0ca7f..91337d792ba 100644 --- a/source/src_parallel/parallel_global.cpp +++ b/source/src_parallel/parallel_global.cpp @@ -157,15 +157,18 @@ void Parallel_Global::read_mpi_parameters(int argc,char **argv) std::cout< +#endif PW_Basis::PW_Basis() { @@ -770,7 +773,9 @@ void PW_Basis::setup_structure_factor(void) // Peize Lin optimize and add Open { const int na = Ucell->atoms[it].na; const ModuleBase::Vector3 * const tau = Ucell->atoms[it].tau; +#ifdef _OPENMP #pragma omp parallel for schedule(static) +#endif for (int ig=0; igngmc; ig++) { const ModuleBase::Vector3 gcar_ig = gcar[ig]; diff --git a/source/src_ri/abfs.cpp b/source/src_ri/abfs.cpp index 82692564661..d40fd05c67f 100644 --- a/source/src_ri/abfs.cpp +++ b/source/src_ri/abfs.cpp @@ -5,7 +5,10 @@ #include "exx_abfs-inverse_matrix_double.h" #include "../src_pw/global.h" #include "../module_base/global_function.h" + +#ifdef _OPENMP #include +#endif #ifdef __MKL #include @@ -44,7 +47,9 @@ std::map,std::shared_pt #endif std::map,std::shared_ptr>>> Cs; +#ifdef _OPENMP #pragma omp parallel for +#endif for( int i_iat1=0; i_iat1,std::shared_pt it1, it2, -tau1+tau2+(box2*GlobalC::ucell.latvec), m_abfs_abfs, m_abfslcaos_lcaos, index_abfs, index_lcaos, threshold, true, rwlock_Cw, rwlock_Vw, Cws, Vws ); +#ifdef _OPENMP #pragma omp critical(Abfs_cal_Cs) +#endif Cs[iat1][iat2][box2] = C; } } @@ -180,7 +187,9 @@ std::map,std::shared_pt std::vector> Coulomb_potential_boxes = get_Coulomb_potential_boxes(rmesh_times); std::map,std::shared_ptr>>> Vs; +#ifdef _OPENMP #pragma omp parallel for +#endif for( int i_atom_pair=0; i_atom_pair,std::shared_pt it1, it2, delta_R, m_abfs_abfs, index_abfs, threshold, true, rwlock_Vw, Vws ); +#ifdef _OPENMP #pragma omp critical(Abfs_cal_Vs) +#endif Vs[iat1][iat2][box2] = V; } } diff --git a/source/src_ri/exx_lcao.cpp b/source/src_ri/exx_lcao.cpp index 61f22493321..d77c5458bca 100644 --- a/source/src_ri/exx_lcao.cpp +++ b/source/src_ri/exx_lcao.cpp @@ -23,6 +23,10 @@ #include +#ifdef _OPENMP +#include +#endif + #ifdef __MKL #include #endif @@ -1443,10 +1447,14 @@ std::vector,Mo #endif std::vector,ModuleBase::matrix>>>> HexxR(GlobalV::NSPIN); +#ifdef _OPENMP omp_lock_t Hexx_lock; omp_init_lock(&Hexx_lock); +#endif +#ifdef _OPENMP #pragma omp parallel +#endif { // m_new( i2, i1, i3 ) = m( i1, i2, i3 ) auto transform = []( @@ -1532,7 +1540,9 @@ std::vector,Mo std::vector,ModuleBase::matrix>>>> HexxR_tmp(GlobalV::NSPIN); +#ifdef _OPENMP #pragma omp for +#endif for(size_t i_atom_pair=0; i_atom_pair,Mo } // end for box4 } // end for box3 +#ifdef _OPENMP if( !vector_empty(HexxR_tmp) && omp_test_lock(&Hexx_lock) ) { insert_matrixes(HexxR,HexxR_tmp); omp_unset_lock(&Hexx_lock); } +#else + if( !vector_empty(HexxR_tmp) ) + { + insert_matrixes(HexxR,HexxR_tmp); + } +#endif } // end for iat4 } // end for iat3 } // end omp for i_atom_pair +#ifdef _OPENMP if(!vector_empty(HexxR_tmp)) { omp_set_lock(&Hexx_lock); insert_matrixes(HexxR,HexxR_tmp); omp_unset_lock(&Hexx_lock); - } + } +#else + if(!vector_empty(HexxR_tmp)) + { + insert_matrixes(HexxR,HexxR_tmp); + } +#endif } // end omp parallel +#ifdef _OPENMP omp_destroy_lock(&Hexx_lock); +#endif #ifdef __MKL mkl_set_num_threads(mkl_threads); From c0984e881ae571f14b781a5776b33d462f685504 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Tue, 26 Oct 2021 22:41:14 +0800 Subject: [PATCH 15/27] auther: Qu Xin; fixed bug of DFT+U force and stress calculation. --- source/src_lcao/dftu_relax.cpp | 147 +++++++++++++++++++++++++++++---- 1 file changed, 130 insertions(+), 17 deletions(-) diff --git a/source/src_lcao/dftu_relax.cpp b/source/src_lcao/dftu_relax.cpp index 5bf853c0259..6adf5eb6840 100644 --- a/source/src_lcao/dftu_relax.cpp +++ b/source/src_lcao/dftu_relax.cpp @@ -182,7 +182,7 @@ void DFTU_RELAX::cal_force_k(const int ik, const std::complex* rho_VU) ModuleBase::TITLE("DFTU_RELAX", "cal_force_k"); ModuleBase::timer::tick("DFTU_RELAX", "cal_force_k"); - const char transN = 'N'; + const char transN = 'N', transC='C'; const int one_int = 1; const std::complex zero(0.0,0.0), one(1.0,0.0); @@ -191,17 +191,17 @@ void DFTU_RELAX::cal_force_k(const int ik, const std::complex* rho_VU) for(int dim=0; dim<3; dim++) { - this->fold_dSm_k(ik, dim, &dSm_k[0]); + this->fold_dSm_k(ik, dim, &dSm_k[0]); - pzgemm_(&transN, &transN, + pzgemm_(&transN, &transC, &GlobalV::NLOCAL, &GlobalV::NLOCAL, &GlobalV::NLOCAL, &one, - rho_VU, &one_int, &one_int, GlobalC::ParaO.desc, - &dSm_k[0], &one_int, &one_int, GlobalC::ParaO.desc, + &dSm_k[0], &one_int, &one_int, GlobalC::ParaO.desc, + rho_VU, &one_int, &one_int, GlobalC::ParaO.desc, &zero, &dm_VU_dSm[0], &one_int, &one_int, GlobalC::ParaO.desc); - for(int ir=0; ir* rho_VU) const int iwt2 = GlobalC::ParaO.MatrixInfo.col_set[ic]; const int irc = ic*GlobalC::ParaO.nrow + ir; - if(iwt1==iwt2) this->force_dftu[iat1][dim] += 2.0*dm_VU_dSm[irc].real(); + if(iwt1==iwt2) this->force_dftu[iat1][dim] += dm_VU_dSm[irc].real(); }//end ic }//end ir + pzgemm_(&transN, &transN, + &GlobalV::NLOCAL, &GlobalV::NLOCAL, &GlobalV::NLOCAL, + &one, + &dSm_k[0], &one_int, &one_int, GlobalC::ParaO.desc, + rho_VU, &one_int, &one_int, GlobalC::ParaO.desc, + &zero, + &dm_VU_dSm[0], &one_int, &one_int, GlobalC::ParaO.desc); + + for(int it=0; itiatlnmipol2iwt[iat][l][n][m][ipol]; + const int mu = GlobalC::ParaO.trace_loc_row[iwt]; + const int nu = GlobalC::ParaO.trace_loc_col[iwt]; + + if(mu<0 || nu<0) continue; + + this->force_dftu[iat][dim] += dm_VU_dSm[nu*GlobalC::ParaO.nrow+mu].real(); + } + }// + }//n + }//l + }//ia + }//it + }//end dim ModuleBase::timer::tick("DFTU_RELAX", "cal_force_k"); @@ -270,7 +326,7 @@ void DFTU_RELAX::cal_force_gamma(const double* rho_VU) { ModuleBase::TITLE("DFTU_RELAX", "cal_force_gamma"); ModuleBase::timer::tick("DFTU_RELAX", "cal_force_gamma"); - const char transN = 'N'; + const char transN = 'N', transT='T'; const int one_int = 1; const double one = 1.0, zero = 0.0, minus_one=-1.0; @@ -278,20 +334,20 @@ void DFTU_RELAX::cal_force_gamma(const double* rho_VU) for(int dim=0; dim<3; dim++) { - double* tmp_ptr; - if(dim==0) tmp_ptr = GlobalC::LM.DSloc_x; - else if(dim==1) tmp_ptr = GlobalC::LM.DSloc_y; - else if(dim==2) tmp_ptr = GlobalC::LM.DSloc_z; + double* tmp_ptr; + if(dim==0) tmp_ptr = GlobalC::LM.DSloc_x; + else if(dim==1) tmp_ptr = GlobalC::LM.DSloc_y; + else if(dim==2) tmp_ptr = GlobalC::LM.DSloc_z; - pdgemm_(&transN, &transN, + pdgemm_(&transN, &transT, &GlobalV::NLOCAL, &GlobalV::NLOCAL, &GlobalV::NLOCAL, &one, - rho_VU, &one_int, &one_int, GlobalC::ParaO.desc, - tmp_ptr, &one_int, &one_int, GlobalC::ParaO.desc, + tmp_ptr, &one_int, &one_int, GlobalC::ParaO.desc, + rho_VU, &one_int, &one_int, GlobalC::ParaO.desc, &zero, &dm_VU_dSm[0], &one_int, &one_int, GlobalC::ParaO.desc); - for(int ir=0; irforce_dftu[iat1][dim] += 2.0*dm_VU_dSm[irc]; + if(iwt1==iwt2) this->force_dftu[iat1][dim] += dm_VU_dSm[irc]; }//end ic }//end ir + + pdgemm_(&transN, &transT, + &GlobalV::NLOCAL, &GlobalV::NLOCAL, &GlobalV::NLOCAL, + &one, + tmp_ptr, &one_int, &one_int, GlobalC::ParaO.desc, + rho_VU, &one_int, &one_int, GlobalC::ParaO.desc, + &zero, + &dm_VU_dSm[0], &one_int, &one_int, GlobalC::ParaO.desc); + + for(int it=0; itiatlnmipol2iwt[iat][l][n][m][ipol]; + const int mu = GlobalC::ParaO.trace_loc_row[iwt]; + const int nu = GlobalC::ParaO.trace_loc_col[iwt]; + + if(mu<0 || nu<0) continue; + + this->force_dftu[iat][dim] += dm_VU_dSm[nu*GlobalC::ParaO.nrow+mu]; + } + }// + }//n + }//l + }//ia + }//it + }// end dim ModuleBase::timer::tick("DFTU_RELAX", "cal_force_gamma"); From 755ef44d5608f642a79ca4fe0a0269973c845b4c Mon Sep 17 00:00:00 2001 From: dyzheng Date: Tue, 26 Oct 2021 22:45:01 +0800 Subject: [PATCH 16/27] reopen test case of DFT+U --- tests/integrate/260_NO_15_PK_PU_AF/result.ref | 6 +++--- tests/integrate/CASES | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integrate/260_NO_15_PK_PU_AF/result.ref b/tests/integrate/260_NO_15_PK_PU_AF/result.ref index 39054365744..d9220f04504 100644 --- a/tests/integrate/260_NO_15_PK_PU_AF/result.ref +++ b/tests/integrate/260_NO_15_PK_PU_AF/result.ref @@ -1,3 +1,3 @@ -etotref -7394.1899828928080751 -etotperatomref -1848.5474957232 -totaltimeref 112.425 +etotref -7250.8489104633490570 +etotperatomref -1812.7122276158 +totaltimeref 7.70 diff --git a/tests/integrate/CASES b/tests/integrate/CASES index 71aa421e181..b7e4344530e 100644 --- a/tests/integrate/CASES +++ b/tests/integrate/CASES @@ -63,7 +63,7 @@ 240_NO_KP_15_SO 250_NO_KP_CR_VDW2 250_NO_KP_CR_VDW3 -#260_NO_15_PK_PU_AF +260_NO_15_PK_PU_AF 301_NO_GO_15_CF_CS 301_NO_GO_DJ_Si #303_NO_GO_HP_15 From ff4699bcf49b82e484beaad6bf437112665b6330 Mon Sep 17 00:00:00 2001 From: wenfei-li Date: Wed, 27 Oct 2021 19:53:15 +0800 Subject: [PATCH 17/27] deepks parallelization : gamma point, energy and force --- source/Makefile.Objects | 2 + source/module_orbital/ORB_gen_tables.cpp | 48 ++- source/src_lcao/CMakeLists.txt | 2 + source/src_lcao/FORCE_STRESS.cpp | 2 +- source/src_lcao/FORCE_gamma.cpp | 5 +- source/src_lcao/LCAO_descriptor.cpp | 427 ++--------------------- source/src_lcao/LCAO_descriptor.h | 1 + source/src_lcao/LCAO_descriptor_new.cpp | 400 +++++++++++++++++++++ source/src_lcao/LCAO_descriptor_old.cpp | 189 ++++++++++ source/src_lcao/LCAO_hamilt.cpp | 8 +- 10 files changed, 655 insertions(+), 429 deletions(-) create mode 100644 source/src_lcao/LCAO_descriptor_new.cpp create mode 100644 source/src_lcao/LCAO_descriptor_old.cpp diff --git a/source/Makefile.Objects b/source/Makefile.Objects index 303f9dbd0e2..44f062de7d3 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -164,6 +164,8 @@ LCAO_nnr.o \ LCAO_diago.o\ LCAO_evolve.o\ LCAO_descriptor.o\ +LCAO_descriptor_old.o\ +LCAO_descriptor_new.o\ ylm.o\ FORCE_STRESS.o\ FORCE_gamma.o\ diff --git a/source/module_orbital/ORB_gen_tables.cpp b/source/module_orbital/ORB_gen_tables.cpp index fa7eb40141d..1e2d269c9b6 100644 --- a/source/module_orbital/ORB_gen_tables.cpp +++ b/source/module_orbital/ORB_gen_tables.cpp @@ -1503,36 +1503,34 @@ void ORB_gen_tables::snap_psialpha_half( { Interp_Vnla_gr = 0.0; } - + } ///////////////////////////////////// /// Overlap value = S_from_table * G * Ylm //////////////////////////////////// - for (int m = 0; m < 2 * L + 1; m++) + for (int m = 0; m < 2 * L + 1; m++) + { + int gindexa = L * L + m; + //double tmpGaunt = this->MGT.Get_Gaunt_SH(L1, m1, L0, m0, L, m); + double tmpGaunt, tmpGaunt1; + if(job==1) { - int gindexa = L * L + m; - //double tmpGaunt = this->MGT.Get_Gaunt_SH(L1, m1, L0, m0, L, m); - double tmpGaunt, tmpGaunt1; - if(job==1) - { - tmpGaunt = this->MGT.Gaunt_Coefficients(gindex1, gindex0, gindexa); - tmpGaunt1= this->MGT.Gaunt_Coefficients(gindex0, gindex1, gindexa); - } - else - { - tmpGaunt = this->MGT.Gaunt_Coefficients(gindex0, gindex1, gindexa); - } - const int lm = MGT.get_lm_index(L, m); - - term_a += tmpGaunt * Interp_Vnla * rlya[lm]; - if(job==1) + tmpGaunt = this->MGT.Gaunt_Coefficients(gindex1, gindex0, gindexa); + tmpGaunt1= this->MGT.Gaunt_Coefficients(gindex0, gindex1, gindexa); + } + else + { + tmpGaunt = this->MGT.Gaunt_Coefficients(gindex0, gindex1, gindexa); + } + const int lm = MGT.get_lm_index(L, m); + + term_a += tmpGaunt * Interp_Vnla * rlya[lm]; + if(job==1) + { + double tt1 = tmpGaunt1 * Interp_Vnla_gr * rlya[lm] / distance10; + double tt2 = tmpGaunt1 * Interp_Vnla; + for (int ir = 0; ir < 3; ir++) { - double tt1 = tmpGaunt1 * Interp_Vnla_gr * rlya[lm] / distance10; - double tt2 = tmpGaunt1 * Interp_Vnla; - - for (int ir = 0; ir < 3; ir++) - { - term_a_gr[ir] += tt1 * unit_vec_dRa[ir] + tt2 * grlya[lm][ir]; - } + term_a_gr[ir] += tt1 * unit_vec_dRa[ir] + tt2 * grlya[lm][ir]; } } } diff --git a/source/src_lcao/CMakeLists.txt b/source/src_lcao/CMakeLists.txt index 84d3ba5ea2f..ec2fab15a75 100644 --- a/source/src_lcao/CMakeLists.txt +++ b/source/src_lcao/CMakeLists.txt @@ -63,4 +63,6 @@ add_library( if(ENABLE_DEEPKS) target_sources(lcao PRIVATE LCAO_descriptor.cpp) + target_sources(lcao PRIVATE LCAO_descriptor_new.cpp) + target_sources(lcao PRIVATE LCAO_descriptor_old.cpp) endif() diff --git a/source/src_lcao/FORCE_STRESS.cpp b/source/src_lcao/FORCE_STRESS.cpp index 295472c0726..8f887b6003b 100644 --- a/source/src_lcao/FORCE_STRESS.cpp +++ b/source/src_lcao/FORCE_STRESS.cpp @@ -294,7 +294,7 @@ void Force_Stress_LCAO::getForceStress( #ifdef __DEEPKS //DeePKS force, caoyu add 2021-06-03 - if (GlobalV::out_descriptor) + if (GlobalV::out_descriptor && GlobalV::NPROC<=1) //not parallelized yet { GlobalC::ld.save_npy_f(fcs, "f_tot.npy"); //Ty/Bohr, F_tot if (GlobalV::deepks_scf) diff --git a/source/src_lcao/FORCE_gamma.cpp b/source/src_lcao/FORCE_gamma.cpp index 4ad95fe5309..e4b321f678d 100644 --- a/source/src_lcao/FORCE_gamma.cpp +++ b/source/src_lcao/FORCE_gamma.cpp @@ -83,11 +83,14 @@ void Force_LCAO_gamma::ftable_gamma ( //=======method 2: snap_psialpha======== GlobalC::ld.cal_gedm(GlobalC::LOC.wfc_dm_2d.dm_gamma[0]); - GlobalC::ld.cal_f_delta_hf(GlobalC::LOC.wfc_dm_2d.dm_gamma[0]); + GlobalC::ld.cal_f_delta_hf_new(GlobalC::LOC.wfc_dm_2d.dm_gamma[0]); //ld.print_F_delta("F_delta_hf.dat"); GlobalC::ld.cal_f_delta_pulay(GlobalC::LOC.wfc_dm_2d.dm_gamma[0]); //ld.print_F_delta("F_delta_pulay.dat"); GlobalC::ld.print_F_delta("F_delta.dat"); +#ifdef __MPI + Parallel_Reduce::reduce_double_all(GlobalC::ld.F_delta.c,GlobalC::ld.F_delta.nr*GlobalC::ld.F_delta.nc); +#endif } #endif diff --git a/source/src_lcao/LCAO_descriptor.cpp b/source/src_lcao/LCAO_descriptor.cpp index 8948b258d09..a227fb267a8 100644 --- a/source/src_lcao/LCAO_descriptor.cpp +++ b/source/src_lcao/LCAO_descriptor.cpp @@ -140,7 +140,6 @@ void LCAO_Descriptor::init( return; } - void LCAO_Descriptor::init_index(void) { delete[] this->alpha_index; @@ -195,7 +194,6 @@ void LCAO_Descriptor::init_index(void) return; } - void LCAO_Descriptor::build_S_descriptor(const bool& calc_deri) { ModuleBase::TITLE("LCAO_Descriptor", "build_S_descriptor"); @@ -303,7 +301,6 @@ void LCAO_Descriptor::build_S_descriptor(const bool& calc_deri) return; } - void LCAO_Descriptor::set_S_mu_alpha( const int &iw1_all, const int &inl, @@ -331,36 +328,15 @@ void LCAO_Descriptor::set_S_mu_alpha( -// compute the full projected density matrix for each atom -// save the matrix for each atom in order to minimize the usage of memory -// --mohan 2021-08-04 -void LCAO_Descriptor::cal_dm_as_descriptor(const ModuleBase::matrix &dm) -{ - ModuleBase::TITLE("LCAO_Descriptor", "cal_proj_dm"); - - for(int it=0; itlmaxd * 2 + 1) * (this->lmaxd * 2 + 1); -#ifdef __MPI if(GlobalV::NPROC>1) { +#ifdef __MPI //This is for first SCF iteration, when density matrix is not available yet if(dm.nr == 0 && dm.nc ==0) { @@ -448,8 +424,9 @@ void LCAO_Descriptor::cal_projected_DM(const ModuleBase::matrix &dm) //step 3 : gather from all ranks GlobalC::ParaD.allsum_deepks(this->inlmax,pdm_size,this->pdm); +#endif } - else + else //serial; or mpi with nproc=1 { //step 1: get dm: the coefficient of wfc, not charge density //now, dm is an input arg of this func, but needed converting to double* @@ -482,38 +459,6 @@ void LCAO_Descriptor::cal_projected_DM(const ModuleBase::matrix &dm) } delete[] tmp_pdm; } -#else - //step 1: get dm: the coefficient of wfc, not charge density - //now, dm is an input arg of this func, but needed converting to double* - this->getdm_double(dm); - - //step 2: get S_alpha_mu and S_nu_beta - double **ss = this->S_mu_alpha; - - //step 3 : multiply: cal ST*DM*S - - //init tmp_pdm* - const int tmp_pdm_size = GlobalV::NLOCAL * (lmaxd*2+1); - double* tmp_pdm = new double[tmp_pdm_size]; - ModuleBase::GlobalFunc::ZEROS(tmp_pdm, tmp_pdm_size); - for (int inl = 0;inl < inlmax;inl++) - { - int nm = 2 * inl_l[inl] + 1; //1,3,5,... - const char t = 'T'; //transpose - const char nt = 'N'; //non transpose - const double alpha = 1; - const double beta = 0; - double *a = this->dm_double; - double *b = ss[inl]; - double *c = tmp_pdm; - dgemm_(&nt, &nt, &GlobalV::NLOCAL, &nm, &GlobalV::NLOCAL, &alpha, a, &GlobalV::NLOCAL, b, &GlobalV::NLOCAL, &beta, c, &GlobalV::NLOCAL); //DM*S - a = ss[inl]; - b = c; - c = this->pdm[inl]; - dgemm_(&t, &nt, &nm, &nm, &GlobalV::NLOCAL, &alpha, a, &GlobalV::NLOCAL, b, &GlobalV::NLOCAL, &beta, c, &nm); //ST*DM*S - } - delete[] tmp_pdm; -#endif ModuleBase::timer::tick("LCAO_Descriptor","cal_projected_DM"); return; @@ -840,237 +785,16 @@ void LCAO_Descriptor::deepks_pre_scf(const string& model_file) delete[] DH_V_delta_x; delete[] DH_V_delta_y; delete[] DH_V_delta_z; - this->DH_V_delta_x = new double[GlobalV::NLOCAL * GlobalV::NLOCAL]; - this->DH_V_delta_y = new double [GlobalV::NLOCAL * GlobalV::NLOCAL]; - this->DH_V_delta_z = new double[GlobalV::NLOCAL * GlobalV::NLOCAL]; - ModuleBase::GlobalFunc::ZEROS(DH_V_delta_x, GlobalV::NLOCAL * GlobalV::NLOCAL); - ModuleBase::GlobalFunc::ZEROS(DH_V_delta_y, GlobalV::NLOCAL * GlobalV::NLOCAL); - ModuleBase::GlobalFunc::ZEROS(DH_V_delta_z, GlobalV::NLOCAL * GlobalV::NLOCAL); - } - return; -} - - -void LCAO_Descriptor::cal_v_delta(const ModuleBase::matrix& dm) -{ - ModuleBase::TITLE("LCAO_Descriptor", "cal_v_delta"); - //1. (dE/dD) (descriptor changes in every scf iter) - this->cal_gedm(dm); - - //2. multiply overlap matrice and sum - double* tmp_v1 = new double[(2 * lmaxd + 1) * GlobalV::NLOCAL]; - double* tmp_v2 = new double[GlobalV::NLOCAL *GlobalV::NLOCAL]; - - ModuleBase::GlobalFunc::ZEROS(this->H_V_delta, GlobalV::NLOCAL * GlobalV::NLOCAL); //init before calculate - - for (int inl = 0;inl < inlmax;inl++) - { - ModuleBase::GlobalFunc::ZEROS(tmp_v1, (2 * lmaxd + 1) * GlobalV::NLOCAL); - ModuleBase::GlobalFunc::ZEROS(tmp_v2, GlobalV::NLOCAL * GlobalV::NLOCAL); - int nm = 2 * inl_l[inl] + 1; //1,3,5,... - const char t = 'T'; //transpose - const char nt = 'N'; //non transpose - const double alpha = 1; - const double beta = 0; - double* a = this->gedm[inl];//[nm][nm] - double* b = S_mu_alpha[inl];//[GlobalV::NLOCAL][nm]--trans->[nm][GlobalV::NLOCAL] - double* c = tmp_v1; - - //2.1 (dE/dD)* - dgemm_(&nt, &t, &nm, &GlobalV::NLOCAL, &nm, &alpha, a, &nm, b, &GlobalV::NLOCAL, &beta, c, &nm); - - //2.2 *(dE/dD)* - a = b; //[GlobalV::NLOCAL][nm] - b = c;//[nm][GlobalV::NLOCAL] - c = tmp_v2;//[GlobalV::NLOCAL][GlobalV::NLOCAL] - dgemm_(&nt, &nt, &GlobalV::NLOCAL, &GlobalV::NLOCAL, &nm, &alpha, a, &GlobalV::NLOCAL, b, &nm, &beta, c, &GlobalV::NLOCAL); - - //3. sum of Inl - for (int i = 0;i < GlobalV::NLOCAL * GlobalV::NLOCAL;++i) - { - this->H_V_delta[i] += c[i]; - } + this->DH_V_delta_x = new double[GlobalC::ParaO.nloc]; + this->DH_V_delta_y = new double [GlobalC::ParaO.nloc]; + this->DH_V_delta_z = new double[GlobalC::ParaO.nloc]; + ModuleBase::GlobalFunc::ZEROS(DH_V_delta_x, GlobalC::ParaO.nloc); + ModuleBase::GlobalFunc::ZEROS(DH_V_delta_y, GlobalC::ParaO.nloc); + ModuleBase::GlobalFunc::ZEROS(DH_V_delta_z, GlobalC::ParaO.nloc); } - delete[] tmp_v1; - delete[] tmp_v2; - - GlobalV::ofs_running << " Finish calculating H_V_delta" << std::endl; return; } -void LCAO_Descriptor::build_v_delta_alpha_new(const bool& calc_deri) -{ - ModuleBase::TITLE("LCAO_Descriptor", "build_v_delta_alpha_new"); - ModuleBase::GlobalFunc::ZEROS(this->H_V_delta,GlobalC::ParaO.nloc); //init before calculate - - const double Rcut_Alpha = GlobalC::ORB.Alpha[0].getRcut(); - //same for all types of atoms - int job; - if(!calc_deri) - { - job=0; - } - else - { - job=1; - } - - for (int T0 = 0; T0 < GlobalC::ucell.ntype; T0++) - { - Atom* atom0 = &GlobalC::ucell.atoms[T0]; - for (int I0 =0; I0< atom0->na; I0++) - { - //======================================================= - //Step1: - //saves , where beta runs over L0,M0 on atom I0 - //and psi runs over atomic basis sets on the current core - //======================================================= - std::vector>> nlm_tot; - - //GlobalC::GridD.Find_atom( atom0->tau[I0] ); - const ModuleBase::Vector3 tau0 = atom0->tau[I0]; - GlobalC::GridD.Find_atom(GlobalC::ucell, atom0->tau[I0] ,T0, I0); - - //outermost loop : all adjacent atoms - nlm_tot.resize(GlobalC::GridD.getAdjacentNum()+1); - - for (int ad=0; ad tau1 = GlobalC::GridD.getAdjacentTau(ad); - const Atom* atom1 = &GlobalC::ucell.atoms[T1]; - const int nw1_tot = atom1->nw*GlobalV::NPOL; - - //middle loop : atomic basis on current processor (either row or column) - nlm_tot[ad].clear(); - - const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0; - if (dist1 > Rcut_Alpha + Rcut_AO1) - { - continue; - } - - for (int iw1=0; iw1> nlm; - //2D, but first dimension is only 1 here - //for force, the right hand side is the gradient - //and the first dimension is then 3 - //inner loop : all projectors (L0,M0) - GlobalC::UOT.snap_psialpha_half( - nlm, job, tau1, T1, - atom1->iw2l[ iw1_0 ], // L1 - atom1->iw2m[ iw1_0 ], // m1 - atom1->iw2n[ iw1_0 ], // N1 - GlobalC::ucell.atoms[T0].tau[I0], T0, I0, this->inl_index); //R0,T0 - - nlm_tot[ad].insert({iw1_all,nlm[0]}); - }//end iw - }//end ad - //======================================================= - //Step2: - //calculate sum_(L0,M0) alpha - //and accumulate the value to Hloc_fixed(i,j) - //======================================================= - - for (int ad1=0; ad1 tau1 = GlobalC::GridD.getAdjacentTau(ad1); - const Atom* atom1 = &GlobalC::ucell.atoms[T1]; - const int nw1_tot = atom1->nw*GlobalV::NPOL; - const double Rcut_AO1 = GlobalC::ORB.Phi[T1].getRcut(); - - for (int ad2=0; ad2 < GlobalC::GridD.getAdjacentNum()+1 ; ad2++) - { - const int T2 = GlobalC::GridD.getType(ad2); - const int I2 = GlobalC::GridD.getNatom(ad2); - const int start2 = GlobalC::ucell.itiaiw2iwt(T2, I2, 0); - const ModuleBase::Vector3 tau2 = GlobalC::GridD.getAdjacentTau(ad2); - const Atom* atom2 = &GlobalC::ucell.atoms[T2]; - const int nw2_tot = atom2->nw*GlobalV::NPOL; - - const double Rcut_AO2 = GlobalC::ORB.Phi[T2].getRcut(); - const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0; - const double dist2 = (tau2-tau0).norm() * GlobalC::ucell.lat0; - - if (dist1 > Rcut_Alpha + Rcut_AO1 - || dist2 > Rcut_Alpha + Rcut_AO2) - { - continue; - } - - for (int iw1=0; iw1 nlm1 = nlm_tot[ad1][iw1_all]; - std::vector nlm2 = nlm_tot[ad2][iw2_all]; - - assert(nlm1.size()==nlm2.size()); - - if(calc_deri) - { - - } - else - { - double nlm=0.0; - int ib = 0; - - for (int L0 = 0; L0 <= GlobalC::ORB.Alpha[0].getLmax();++L0) - { - for (int N0 = 0;N0 < GlobalC::ORB.Alpha[0].getNchi(L0);++N0) - { - const int inl = this->inl_index[T0](I0, L0, N0); - const int nm = 2*L0+1; - for (int m01 = 0;m01 < 2 * L0 + 1;++m01) - { - for (int m02 = 0; m02 < 2 * L0 + 1; ++m02) - { - nlm += this->gedm[inl][m01*nm+m02] * nlm1[ib+m01] * nlm2[ib+m02]; - } - } - ib+=(2*L0+1); - } - } - int index = iw2_local * GlobalC::ParaO.nrow+ iw1_local; //for genelpa - this->H_V_delta[index] += nlm; - - assert(ib==nlm1.size()); - } - }//iw2 - }//iw1 - }//ad2 - }//ad1 - }//end I0 - }//end T0 - - ModuleBase::timer::tick ("LCAO_gen_fixedH","build_Nonlocal_alpha_new"); - return; - -} - //for GAMMA_ONLY, search adjacent atoms from I0 void LCAO_Descriptor::build_v_delta_alpha(const bool& calc_deri) { @@ -1179,6 +903,15 @@ void LCAO_Descriptor::build_v_delta_alpha(const bool& calc_deri) }//end ad1 }//end I0 }//end T0 + + for(int iw1=0;iw1H_V_delta[index], 'L'); } } @@ -1426,112 +1159,6 @@ void LCAO_Descriptor::add_v_delta(void) return; } - -void LCAO_Descriptor::cal_f_delta(const ModuleBase::matrix &dm) -{ - ModuleBase::TITLE("LCAO_Descriptor", "cal_f_delta"); - this->F_delta.zero_out(); - //1. cal gedm - this->cal_gedm(dm); - - //2. cal gdmx - this->init_gdmx(); - this->cal_gdmx(dm); - - //3.multiply and sum for each atom - //3.1 Pulay term - // \sum_{Inl}\sum_{mm'} _{mm'} - //notice: sum of multiplied corresponding element(mm') , not matrix multiplication ! - int iat = 0; //check if the index same as GlobalC::ucell.iw2iat or not !! - for (int it = 0;it < GlobalC::ucell.ntype;++it) - { - for (int ia = 0;ia < GlobalC::ucell.atoms[it].na;++ia) - { - for (int inl = 0;inl < this->inlmax;++inl) - { - int nm = 2 * inl_l[inl] + 1; - for (int m1 = 0;m1 < nm;++m1) - { - for (int m2 = 0; m2 < nm;++m2) - { - this->F_delta(iat, 0) += this->gedm[inl][m1 * nm + m2] * gdmx[iat][inl][m1 * nm + m2]; - this->F_delta(iat, 1) += this->gedm[inl][m1 * nm + m2] * gdmy[iat][inl][m1 * nm + m2]; - this->F_delta(iat, 2) += this->gedm[inl][m1 * nm + m2] * gdmz[iat][inl][m1 * nm + m2]; - } - } - }//end inl - ++iat; - } - } - this->print_F_delta("F_delta_pulay_old.dat"); - this->F_delta.zero_out(); - iat = 0; - for (int it = 0;it < GlobalC::ucell.ntype;++it) - { - for (int ia = 0;ia < GlobalC::ucell.atoms[it].na;++ia) - { - //3.2 HF term - double** ss = this->S_mu_alpha; - double** dsx = this->DS_mu_alpha_x; - double** dsy = this->DS_mu_alpha_y; - double** dsz = this->DS_mu_alpha_z; - for (int mu = 0;mu < GlobalV::NLOCAL;++mu) - { - for (int nu = 0;nu < GlobalV::NLOCAL;++nu) - { - for (int l = 0;l <= GlobalC::ORB.Alpha[0].getLmax();++l) - { - for (int n = 0;n < GlobalC::ORB.Alpha[0].getNchi(l);++n) - { - for (int m1 = 0;m1 < 2 * l + 1;++m1) - { - for (int m2 = 0;m2 < 2 * l + 1;++m2) - { - if (GlobalV::KS_SOLVER == "genelpa" || GlobalV::KS_SOLVER == "scalapack_gvx") - { - this->F_delta(iat, 0) -= 2*dm(mu, nu) * dsx[inl_index[it](ia, l, n)][m1 * GlobalV::NLOCAL + mu] - * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][m2 * GlobalV::NLOCAL + nu]; - this->F_delta(iat, 1) -= 2*dm(mu, nu) * dsy[inl_index[it](ia, l, n)][m1 * GlobalV::NLOCAL + mu] - * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][m2 * GlobalV::NLOCAL + nu]; - this->F_delta(iat, 2) -= 2*dm(mu, nu) * dsz[inl_index[it](ia, l, n)][m1 * GlobalV::NLOCAL + mu] - * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][m2 * GlobalV::NLOCAL + nu]; - } - else - { - this->F_delta(iat, 0) -= 2*dm(mu, nu) * dsx[inl_index[it](ia, l, n)][mu* (2*l+1) + m1] - * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][nu* (2*l+1) + m2]; - this->F_delta(iat, 1) -= 2*dm(mu, nu) * dsy[inl_index[it](ia, l, n)][mu* (2*l+1) + m1] - * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][nu* (2*l+1) + m2]; - this->F_delta(iat, 2) -= 2*dm(mu, nu) * dsz[inl_index[it](ia, l, n)][mu* (2*l+1) + m1] - * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][nu* (2*l+1) + m2]; - } - }//end m2 - }//end m1 - }//end n - }//end l - }//end nu - }//end mu - ++iat; - }//end ia - }//end it - this->print_F_delta("F_delta_hf_old.dat"); - //3.3 Overlap term - //somthing in NN, which not included in Hamiltonian - /* - for (int mu = 0;mu < GlobalV::NLOCAL;++mu) - { - const int iat = GlobalC::ucell.iwt2iat[mu]; - for (int nu = 0;nu < GlobalV::NLOCAL;++nu) - { - this->F_delta(iat, 0) += 2*(this->E_delta - this->e_delta_band)* dm(mu, nu) * GlobalC::LM.DSloc_x[mu * GlobalV::NLOCAL + nu]; - this->F_delta(iat, 1) += 2*(this->E_delta - this->e_delta_band) * dm(mu, nu) * GlobalC::LM.DSloc_y[mu * GlobalV::NLOCAL + nu]; - this->F_delta(iat, 2) += 2*(this->E_delta - this->e_delta_band) * dm(mu, nu) * GlobalC::LM.DSloc_z[mu * GlobalV::NLOCAL + nu]; - } - }*/ - this->del_gdmx(); - return; -} - void LCAO_Descriptor::cal_f_delta_hf(const ModuleBase::matrix& dm) { ModuleBase::TITLE("LCAO_Descriptor", "cal_f_delta_hf"); @@ -1625,7 +1252,7 @@ void LCAO_Descriptor::cal_f_delta_pulay(const ModuleBase::matrix& dm) { ModuleBase::TITLE("LCAO_Descriptor", "cal_f_delta_pulay"); //this->F_delta.zero_out(); - this->build_v_delta_alpha(1); + this->build_v_delta_alpha_new(1); //this->build_v_delta_mu(1); //, if multi-k for (int i = 0;i < GlobalV::NLOCAL;++i) //col, diff { @@ -1637,14 +1264,15 @@ void LCAO_Descriptor::cal_f_delta_pulay(const ModuleBase::matrix& dm) if (mu >= 0 && nu >= 0) { const int index = mu * GlobalC::ParaO.ncol + nu; - this->F_delta(iat, 0) += 2 * dm(mu, nu) * this->DH_V_delta_x[index]; - this->F_delta(iat, 1) += 2 * dm(mu, nu) * this->DH_V_delta_y[index]; - this->F_delta(iat, 2) += 2 * dm(mu, nu) * this->DH_V_delta_z[index]; + this->F_delta(iat, 0) += 2 * dm(nu, mu) * this->DH_V_delta_x[index]; + this->F_delta(iat, 1) += 2 * dm(nu, mu) * this->DH_V_delta_y[index]; + this->F_delta(iat, 2) += 2 * dm(nu, mu) * this->DH_V_delta_z[index]; } } } return; } + void LCAO_Descriptor::cal_descriptor_tensor(void) { ModuleBase::TITLE("LCAO_Descriptor", "cal_descriptor_tensor"); @@ -1707,7 +1335,6 @@ void LCAO_Descriptor::load_model(const string& model_file) return; } - void LCAO_Descriptor::cal_gedm(const ModuleBase::matrix &dm) { //using this->pdm_tensor @@ -2053,9 +1680,9 @@ void LCAO_Descriptor::cal_e_delta_band(const std::vector &dm } } } + Parallel_Reduce::reduce_double_all(this->e_delta_band); return; } -#endif void LCAO_Descriptor::cal_gvx(const ModuleBase::matrix &dm) { @@ -2123,4 +1750,6 @@ void LCAO_Descriptor::cal_gvx(const ModuleBase::matrix &dm) assert(this->gvx_tensor.size(3) == this->des_per_atom); return; -} \ No newline at end of file +} + +#endif \ No newline at end of file diff --git a/source/src_lcao/LCAO_descriptor.h b/source/src_lcao/LCAO_descriptor.h index 50e747125dc..64f4e87e8ad 100644 --- a/source/src_lcao/LCAO_descriptor.h +++ b/source/src_lcao/LCAO_descriptor.h @@ -87,6 +87,7 @@ class LCAO_Descriptor ///compute Hellmann-Feynman term of the force contribution of \f$E_\delta\f$ void cal_f_delta_hf(const ModuleBase::matrix& dm/**< [in] density matrix*/); + void cal_f_delta_hf_new(const ModuleBase::matrix& dm/**< [in] density matrix*/); ///compute Pulay term of the force contribution of \f$E_\delta\f$ void cal_f_delta_pulay(const ModuleBase::matrix& dm/**< [in] density matrix*/); diff --git a/source/src_lcao/LCAO_descriptor_new.cpp b/source/src_lcao/LCAO_descriptor_new.cpp new file mode 100644 index 00000000000..fa438d9c915 --- /dev/null +++ b/source/src_lcao/LCAO_descriptor_new.cpp @@ -0,0 +1,400 @@ +//wenfei add 2021 october +#ifdef __DEEPKS + +#include "LCAO_descriptor.h" +#include "LCAO_matrix.h" +#include "../module_base/lapack_connector.h" +#include "../module_base/intarray.h" +#include "../module_base/complexmatrix.h" +#include "global_fp.h" +#include "../src_pw/global.h" +#include "../src_io/winput.h" + +#include +#include +#include +#include + +void LCAO_Descriptor::build_v_delta_alpha_new(const bool& calc_deri) +{ + ModuleBase::TITLE("LCAO_Descriptor", "build_v_delta_alpha_new"); + ModuleBase::GlobalFunc::ZEROS(this->H_V_delta,GlobalC::ParaO.nloc); //init before calculate + + const double Rcut_Alpha = GlobalC::ORB.Alpha[0].getRcut(); + //same for all types of atoms + int job; + if(!calc_deri) + { + job=0; + } + else + { + job=1; + } + + for (int T0 = 0; T0 < GlobalC::ucell.ntype; T0++) + { + Atom* atom0 = &GlobalC::ucell.atoms[T0]; + for (int I0 =0; I0< atom0->na; I0++) + { + //======================================================= + //Step1: + //saves , where beta runs over L0,M0 on atom I0 + //and psi runs over atomic basis sets on the current core + //======================================================= + std::vector>>> nlm_tot; + + //GlobalC::GridD.Find_atom( atom0->tau[I0] ); + const ModuleBase::Vector3 tau0 = atom0->tau[I0]; + GlobalC::GridD.Find_atom(GlobalC::ucell, atom0->tau[I0] ,T0, I0); + + //outermost loop : all adjacent atoms + nlm_tot.resize(GlobalC::GridD.getAdjacentNum()+1); + + for (int ad=0; ad tau1 = GlobalC::GridD.getAdjacentTau(ad); + const Atom* atom1 = &GlobalC::ucell.atoms[T1]; + const int nw1_tot = atom1->nw*GlobalV::NPOL; + + //middle loop : atomic basis on current processor (either row or column) + nlm_tot[ad].clear(); + + const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0; + + if (dist1 > Rcut_Alpha + Rcut_AO1) + { + continue; + } + + for (int iw1=0; iw1> nlm; + //2D, but first dimension is only 1 here + //for force, the right hand side is the gradient + //and the first dimension is then 3 + //inner loop : all projectors (L0,M0) + GlobalC::UOT.snap_psialpha_half( + nlm, job, tau1, T1, + atom1->iw2l[ iw1_0 ], // L1 + atom1->iw2m[ iw1_0 ], // m1 + atom1->iw2n[ iw1_0 ], // N1 + GlobalC::ucell.atoms[T0].tau[I0], T0, I0, this->inl_index); //R0,T0 + nlm_tot[ad].insert({iw1_all,nlm}); + }//end iw + }//end ad + //======================================================= + //Step2: + //calculate sum_(L0,M0) alpha + //and accumulate the value to Hloc_fixed(i,j) + //======================================================= + + for (int ad1=0; ad1 tau1 = GlobalC::GridD.getAdjacentTau(ad1); + const Atom* atom1 = &GlobalC::ucell.atoms[T1]; + const int nw1_tot = atom1->nw*GlobalV::NPOL; + const double Rcut_AO1 = GlobalC::ORB.Phi[T1].getRcut(); + + for (int ad2=0; ad2 < GlobalC::GridD.getAdjacentNum()+1 ; ad2++) + { + const int T2 = GlobalC::GridD.getType(ad2); + const int I2 = GlobalC::GridD.getNatom(ad2); + const int start2 = GlobalC::ucell.itiaiw2iwt(T2, I2, 0); + const ModuleBase::Vector3 tau2 = GlobalC::GridD.getAdjacentTau(ad2); + const Atom* atom2 = &GlobalC::ucell.atoms[T2]; + const int nw2_tot = atom2->nw*GlobalV::NPOL; + + const double Rcut_AO2 = GlobalC::ORB.Phi[T2].getRcut(); + const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0; + const double dist2 = (tau2-tau0).norm() * GlobalC::ucell.lat0; + + if (dist1 > Rcut_Alpha + Rcut_AO1 + || dist2 > Rcut_Alpha + Rcut_AO2) + { + continue; + } + + for (int iw1=0; iw1 nlm1 = nlm_tot[ad1][iw1_all][0]; + std::vector> nlm2; + nlm2.resize(3); + + for(int dim=0;dim<3;dim++) + { + nlm2[dim] = nlm_tot[ad2][iw2_all][dim+1]; + } + + assert(nlm1.size()==nlm2[0].size()); + int ib=0; + for (int L0 = 0; L0 <= GlobalC::ORB.Alpha[0].getLmax();++L0) + { + for (int N0 = 0;N0 < GlobalC::ORB.Alpha[0].getNchi(L0);++N0) + { + const int inl = this->inl_index[T0](I0, L0, N0); + const int nm = 2*L0+1; + for (int m1 = 0;m1 < 2 * L0 + 1;++m1) + { + for (int m2 = 0; m2 < 2 * L0 + 1; ++m2) + { + for(int dim=0;dim<3;dim++) + { + nlm[dim] += this->gedm[inl][m1*nm+m2]*nlm1[ib+m1]*nlm2[dim][ib+m2]; + } + } + } + ib+=(2*L0+1); + } + } + + assert(ib==nlm1.size()); + + int index = iw1_local * GlobalC::ParaO.ncol+ iw2_local; + this->DH_V_delta_x[index] += nlm[0]; + this->DH_V_delta_y[index] += nlm[1]; + this->DH_V_delta_z[index] += nlm[2]; + + } + else + { + std::vector nlm1 = nlm_tot[ad1][iw1_all][0]; + std::vector nlm2 = nlm_tot[ad2][iw2_all][0]; + + assert(nlm1.size()==nlm2.size()); + double nlm=0.0; + int ib = 0; + + for (int L0 = 0; L0 <= GlobalC::ORB.Alpha[0].getLmax();++L0) + { + for (int N0 = 0;N0 < GlobalC::ORB.Alpha[0].getNchi(L0);++N0) + { + const int inl = this->inl_index[T0](I0, L0, N0); + const int nm = 2*L0+1; + for (int m01 = 0;m01 < nm;++m01) + { + for (int m02 = 0; m02 < nm; ++m02) + { + nlm += this->gedm[inl][m01*nm+m02]*nlm1[ib+m01]*nlm2[ib+m02]; + } + } + ib+=(2*L0+1); + } + } + int index = iw2_local * GlobalC::ParaO.nrow+ iw1_local; //for genelpa + this->H_V_delta[index] += nlm; + + assert(ib==nlm1.size()); + } + }//iw2 + }//iw1 + }//ad2 + }//ad1 + }//end I0 + }//end T0 +/* + for(int iw1=0;iw1F_delta.zero_out(); + + const double Rcut_Alpha = GlobalC::ORB.Alpha[0].getRcut(); + for (int T0 = 0; T0 < GlobalC::ucell.ntype; T0++) + { + Atom* atom0 = &GlobalC::ucell.atoms[T0]; + for (int I0 =0; I0< atom0->na; I0++) + { + int iat = GlobalC::ucell.itia2iat(T0,I0); + const ModuleBase::Vector3 tau0 = atom0->tau[I0]; + GlobalC::GridD.Find_atom(GlobalC::ucell, atom0->tau[I0] ,T0, I0); + + //======================================================= + //Step1: + //saves , where beta runs over L0,M0 on atom I0 + //and psi runs over atomic basis sets on the current core + //======================================================= + std::vector>>> nlm_tot; + + //outermost loop : all adjacent atoms + nlm_tot.resize(GlobalC::GridD.getAdjacentNum()+1); + + for (int ad=0; ad tau1 = GlobalC::GridD.getAdjacentTau(ad); + const Atom* atom1 = &GlobalC::ucell.atoms[T1]; + const int nw1_tot = atom1->nw*GlobalV::NPOL; + + //middle loop : atomic basis on current processor (either row or column) + nlm_tot[ad].clear(); + + const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0; + + if (dist1 > Rcut_Alpha + Rcut_AO1) + { + continue; + } + + for (int iw1=0; iw1> nlm; + //2D, but first dimension is only 1 here + //for force, the right hand side is the gradient + //and the first dimension is then 3 + //inner loop : all projectors (L0,M0) + GlobalC::UOT.snap_psialpha_half( + nlm, 1, tau1, T1, + atom1->iw2l[ iw1_0 ], // L1 + atom1->iw2m[ iw1_0 ], // m1 + atom1->iw2n[ iw1_0 ], // N1 + GlobalC::ucell.atoms[T0].tau[I0], T0, I0, this->inl_index); //R0,T0 + + nlm_tot[ad].insert({iw1_all,nlm}); + }//end iw + }//end ad + + //======================================================= + //Step2: + //calculate sum_(L0,M0) alpha + //and accumulate the value to Hloc_fixed(i,j) + //======================================================= + + for (int ad1=0; ad1 tau1 = GlobalC::GridD.getAdjacentTau(ad1); + const Atom* atom1 = &GlobalC::ucell.atoms[T1]; + const int nw1_tot = atom1->nw*GlobalV::NPOL; + const double Rcut_AO1 = GlobalC::ORB.Phi[T1].getRcut(); + + for (int ad2=0; ad2 < GlobalC::GridD.getAdjacentNum()+1 ; ad2++) + { + const int T2 = GlobalC::GridD.getType(ad2); + const int I2 = GlobalC::GridD.getNatom(ad2); + const int start2 = GlobalC::ucell.itiaiw2iwt(T2, I2, 0); + const ModuleBase::Vector3 tau2 = GlobalC::GridD.getAdjacentTau(ad2); + const Atom* atom2 = &GlobalC::ucell.atoms[T2]; + const int nw2_tot = atom2->nw*GlobalV::NPOL; + + const double Rcut_AO2 = GlobalC::ORB.Phi[T2].getRcut(); + const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0; + const double dist2 = (tau2-tau0).norm() * GlobalC::ucell.lat0; + + if (dist1 > Rcut_Alpha + Rcut_AO1 + || dist2 > Rcut_Alpha + Rcut_AO2) + { + continue; + } + + for (int iw1=0; iw1 nlm1 = nlm_tot[ad1][iw1_all][0]; + std::vector> nlm2; + nlm2.resize(3); + + for(int dim=0;dim<3;dim++) + { + nlm2[dim] = nlm_tot[ad2][iw2_all][dim+1]; + } + + assert(nlm1.size()==nlm2[0].size()); + + int ib=0; + for (int L0 = 0; L0 <= GlobalC::ORB.Alpha[0].getLmax();++L0) + { + for (int N0 = 0;N0 < GlobalC::ORB.Alpha[0].getNchi(L0);++N0) + { + const int inl = this->inl_index[T0](I0, L0, N0); + const int nm = 2*L0+1; + for (int m1 = 0;m1 < nm; ++m1) + { + for (int m2 = 0; m2 < nm; ++m2) + { + for(int dim=0;dim<3;dim++) + { + nlm[dim] += this->gedm[inl][m1*nm+m2]*nlm1[ib+m1]*nlm2[dim][ib+m2]; + } + } + } + ib+=nm; + } + } + assert(ib==nlm1.size()); + + // HF term is minus, only one projector for each atom force. + this->F_delta(iat, 0) -= 2 * dm(iw1_local, iw2_local) * nlm[0]; + this->F_delta(iat, 1) -= 2 * dm(iw1_local, iw2_local) * nlm[1]; + this->F_delta(iat, 2) -= 2 * dm(iw1_local, iw2_local) * nlm[2]; + + }//iw2 + }//iw1 + }//ad2 + }//ad1 + }//end I0 + }//end T0 +} + +#endif \ No newline at end of file diff --git a/source/src_lcao/LCAO_descriptor_old.cpp b/source/src_lcao/LCAO_descriptor_old.cpp new file mode 100644 index 00000000000..8dbbc5f9fe1 --- /dev/null +++ b/source/src_lcao/LCAO_descriptor_old.cpp @@ -0,0 +1,189 @@ +//keep some no longer used deepks subroutines here +#ifdef __DEEPKS + +#include "LCAO_descriptor.h" +#include "LCAO_matrix.h" +#include "../module_base/lapack_connector.h" +#include "../module_base/intarray.h" +#include "../module_base/complexmatrix.h" +#include "global_fp.h" +#include "../src_pw/global.h" +#include "../src_io/winput.h" + +#include +#include +#include +#include + +void LCAO_Descriptor::cal_v_delta(const ModuleBase::matrix& dm) +{ + ModuleBase::TITLE("LCAO_Descriptor", "cal_v_delta"); + //1. (dE/dD) (descriptor changes in every scf iter) + this->cal_gedm(dm); + + //2. multiply overlap matrice and sum + double* tmp_v1 = new double[(2 * lmaxd + 1) * GlobalV::NLOCAL]; + double* tmp_v2 = new double[GlobalV::NLOCAL *GlobalV::NLOCAL]; + + ModuleBase::GlobalFunc::ZEROS(this->H_V_delta, GlobalV::NLOCAL * GlobalV::NLOCAL); //init before calculate + + for (int inl = 0;inl < inlmax;inl++) + { + ModuleBase::GlobalFunc::ZEROS(tmp_v1, (2 * lmaxd + 1) * GlobalV::NLOCAL); + ModuleBase::GlobalFunc::ZEROS(tmp_v2, GlobalV::NLOCAL * GlobalV::NLOCAL); + int nm = 2 * inl_l[inl] + 1; //1,3,5,... + const char t = 'T'; //transpose + const char nt = 'N'; //non transpose + const double alpha = 1; + const double beta = 0; + double* a = this->gedm[inl];//[nm][nm] + double* b = S_mu_alpha[inl];//[GlobalV::NLOCAL][nm]--trans->[nm][GlobalV::NLOCAL] + double* c = tmp_v1; + + //2.1 (dE/dD)* + dgemm_(&nt, &t, &nm, &GlobalV::NLOCAL, &nm, &alpha, a, &nm, b, &GlobalV::NLOCAL, &beta, c, &nm); + + //2.2 *(dE/dD)* + a = b; //[GlobalV::NLOCAL][nm] + b = c;//[nm][GlobalV::NLOCAL] + c = tmp_v2;//[GlobalV::NLOCAL][GlobalV::NLOCAL] + dgemm_(&nt, &nt, &GlobalV::NLOCAL, &GlobalV::NLOCAL, &nm, &alpha, a, &GlobalV::NLOCAL, b, &nm, &beta, c, &GlobalV::NLOCAL); + + //3. sum of Inl + for (int i = 0;i < GlobalV::NLOCAL * GlobalV::NLOCAL;++i) + { + this->H_V_delta[i] += c[i]; + } + } + delete[] tmp_v1; + delete[] tmp_v2; + + GlobalV::ofs_running << " Finish calculating H_V_delta" << std::endl; + return; +} + +// compute the full projected density matrix for each atom +// save the matrix for each atom in order to minimize the usage of memory +// --mohan 2021-08-04 +void LCAO_Descriptor::cal_dm_as_descriptor(const ModuleBase::matrix &dm) +{ + ModuleBase::TITLE("LCAO_Descriptor", "cal_proj_dm"); + + for(int it=0; itF_delta.zero_out(); + //1. cal gedm + this->cal_gedm(dm); + + //2. cal gdmx + this->init_gdmx(); + this->cal_gdmx(dm); + + //3.multiply and sum for each atom + //3.1 Pulay term + // \sum_{Inl}\sum_{mm'} _{mm'} + //notice: sum of multiplied corresponding element(mm') , not matrix multiplication ! + int iat = 0; //check if the index same as GlobalC::ucell.iw2iat or not !! + for (int it = 0;it < GlobalC::ucell.ntype;++it) + { + for (int ia = 0;ia < GlobalC::ucell.atoms[it].na;++ia) + { + for (int inl = 0;inl < this->inlmax;++inl) + { + int nm = 2 * inl_l[inl] + 1; + for (int m1 = 0;m1 < nm;++m1) + { + for (int m2 = 0; m2 < nm;++m2) + { + this->F_delta(iat, 0) += this->gedm[inl][m1 * nm + m2] * gdmx[iat][inl][m1 * nm + m2]; + this->F_delta(iat, 1) += this->gedm[inl][m1 * nm + m2] * gdmy[iat][inl][m1 * nm + m2]; + this->F_delta(iat, 2) += this->gedm[inl][m1 * nm + m2] * gdmz[iat][inl][m1 * nm + m2]; + } + } + }//end inl + ++iat; + } + } + this->print_F_delta("F_delta_pulay_old.dat"); + this->F_delta.zero_out(); + iat = 0; + for (int it = 0;it < GlobalC::ucell.ntype;++it) + { + for (int ia = 0;ia < GlobalC::ucell.atoms[it].na;++ia) + { + //3.2 HF term + double** ss = this->S_mu_alpha; + double** dsx = this->DS_mu_alpha_x; + double** dsy = this->DS_mu_alpha_y; + double** dsz = this->DS_mu_alpha_z; + for (int mu = 0;mu < GlobalV::NLOCAL;++mu) + { + for (int nu = 0;nu < GlobalV::NLOCAL;++nu) + { + for (int l = 0;l <= GlobalC::ORB.Alpha[0].getLmax();++l) + { + for (int n = 0;n < GlobalC::ORB.Alpha[0].getNchi(l);++n) + { + for (int m1 = 0;m1 < 2 * l + 1;++m1) + { + for (int m2 = 0;m2 < 2 * l + 1;++m2) + { + if (GlobalV::KS_SOLVER == "genelpa" || GlobalV::KS_SOLVER == "scalapack_gvx") + { + this->F_delta(iat, 0) -= 2*dm(mu, nu) * dsx[inl_index[it](ia, l, n)][m1 * GlobalV::NLOCAL + mu] + * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][m2 * GlobalV::NLOCAL + nu]; + this->F_delta(iat, 1) -= 2*dm(mu, nu) * dsy[inl_index[it](ia, l, n)][m1 * GlobalV::NLOCAL + mu] + * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][m2 * GlobalV::NLOCAL + nu]; + this->F_delta(iat, 2) -= 2*dm(mu, nu) * dsz[inl_index[it](ia, l, n)][m1 * GlobalV::NLOCAL + mu] + * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][m2 * GlobalV::NLOCAL + nu]; + } + else + { + this->F_delta(iat, 0) -= 2*dm(mu, nu) * dsx[inl_index[it](ia, l, n)][mu* (2*l+1) + m1] + * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][nu* (2*l+1) + m2]; + this->F_delta(iat, 1) -= 2*dm(mu, nu) * dsy[inl_index[it](ia, l, n)][mu* (2*l+1) + m1] + * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][nu* (2*l+1) + m2]; + this->F_delta(iat, 2) -= 2*dm(mu, nu) * dsz[inl_index[it](ia, l, n)][mu* (2*l+1) + m1] + * this->gedm[inl_index[it](ia, l, n)][m1 * (2 * l + 1) + m2] * ss[inl_index[it](ia, l, n)][nu* (2*l+1) + m2]; + } + }//end m2 + }//end m1 + }//end n + }//end l + }//end nu + }//end mu + ++iat; + }//end ia + }//end it + this->print_F_delta("F_delta_hf_old.dat"); + //3.3 Overlap term + //somthing in NN, which not included in Hamiltonian + /* + for (int mu = 0;mu < GlobalV::NLOCAL;++mu) + { + const int iat = GlobalC::ucell.iwt2iat[mu]; + for (int nu = 0;nu < GlobalV::NLOCAL;++nu) + { + this->F_delta(iat, 0) += 2*(this->E_delta - this->e_delta_band)* dm(mu, nu) * GlobalC::LM.DSloc_x[mu * GlobalV::NLOCAL + nu]; + this->F_delta(iat, 1) += 2*(this->E_delta - this->e_delta_band) * dm(mu, nu) * GlobalC::LM.DSloc_y[mu * GlobalV::NLOCAL + nu]; + this->F_delta(iat, 2) += 2*(this->E_delta - this->e_delta_band) * dm(mu, nu) * GlobalC::LM.DSloc_z[mu * GlobalV::NLOCAL + nu]; + } + }*/ + this->del_gdmx(); + return; +} +#endif \ No newline at end of file diff --git a/source/src_lcao/LCAO_hamilt.cpp b/source/src_lcao/LCAO_hamilt.cpp index fb1ca44f54d..9e2ae92ee8d 100644 --- a/source/src_lcao/LCAO_hamilt.cpp +++ b/source/src_lcao/LCAO_hamilt.cpp @@ -115,11 +115,13 @@ void LCAO_Hamilt::calculate_Hgamma( const int &ik ) // Peize Lin add ik 2016- //========method 2======== //ld.cal_gedm(LOC.wfc_dm_2d.dm_gamma[0]); //ld.build_v_delta_alpha(0); - GlobalC::ld.cal_gedm(GlobalC::LOC.wfc_dm_2d.dm_gamma[0]); + if(GlobalV::GAMMA_ONLY_LOCAL) { + GlobalC::ld.cal_gedm(GlobalC::LOC.wfc_dm_2d.dm_gamma[0]); //GlobalC::ld.build_v_delta_alpha(0); GlobalC::ld.build_v_delta_alpha_new(0); + GlobalC::ld.add_v_delta(); } else { @@ -127,7 +129,7 @@ void LCAO_Hamilt::calculate_Hgamma( const int &ik ) // Peize Lin add ik 2016- } //GlobalC::ld.build_v_delta_mu(0); - //GlobalC::ld.add_v_delta(); + } #endif @@ -878,4 +880,4 @@ void LCAO_Hamilt::calculat_HR_dftu_soc_sparse(const int ¤t_spin, const dou void LCAO_Hamilt::destroy_all_HSR_sparse(void) { GlobalC::LM.destroy_HS_R_sparse(); -} \ No newline at end of file +} From 3bca604ca84826122129f4bb66eb34eeeafa0968 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Mon, 1 Nov 2021 13:19:54 +0800 Subject: [PATCH 18/27] deleted warning of angular momentum output in warning.log --- source/module_orbital/ORB_atomic_lm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/module_orbital/ORB_atomic_lm.cpp b/source/module_orbital/ORB_atomic_lm.cpp index c148848ac4c..714872cb8a6 100644 --- a/source/module_orbital/ORB_atomic_lm.cpp +++ b/source/module_orbital/ORB_atomic_lm.cpp @@ -269,8 +269,8 @@ void Numerical_Orbital_Lm::extra_uniform(const double &dr_uniform_in, const bool case 3: ModuleBase::Mathzone_Add1::SplineD2 (ModuleBase::GlobalFunc::VECTOR_TO_PTR(r_radial), ModuleBase::GlobalFunc::VECTOR_TO_PTR(psi), nr, 100000.0, 100000.0, y2); break; case 4: ModuleBase::Mathzone_Add1::SplineD2 (ModuleBase::GlobalFunc::VECTOR_TO_PTR(r_radial), ModuleBase::GlobalFunc::VECTOR_TO_PTR(psi), nr, 0.0, 0.0, y2); break; default: - GlobalV::ofs_warning << " The angular momentum larger than 4 (g orbitals) may be error about eggbox. " << std::endl; - GlobalV::ofs_warning << " Check file " << __FILE__ << " line " << __LINE__ < Date: Tue, 2 Nov 2021 14:56:31 +0800 Subject: [PATCH 19/27] vnl : make it faster for large system --- source/src_lcao/LCAO_gen_fixedH.cpp | 495 +++++++++++++++------------- 1 file changed, 269 insertions(+), 226 deletions(-) diff --git a/source/src_lcao/LCAO_gen_fixedH.cpp b/source/src_lcao/LCAO_gen_fixedH.cpp index fe9f7dcae4a..756ab254a8e 100644 --- a/source/src_lcao/LCAO_gen_fixedH.cpp +++ b/source/src_lcao/LCAO_gen_fixedH.cpp @@ -441,27 +441,36 @@ void LCAO_gen_fixedH::build_Nonlocal_mu_new(const bool &calc_deri) // while beta is in the supercell. // while phi2 is in the supercell. + + //Step 1 : generate + //type of atom; distance; atomic basis; projectors + std::vector>>> nlm_tot; + std::vector>>>> nlm_tot1; + + if(!calc_deri) + { + nlm_tot.resize(GlobalC::ucell.nat); + } + else + { + nlm_tot1.resize(GlobalC::ucell.nat); + } for(int iat=0;iat - //type of atom; distance; atomic basis; projectors - std::map>> nlm_tot; - std::map>>> nlm_tot1; - const double Rcut_Beta = GlobalC::ucell.infoNL.Beta[it].get_rcut_max(); const ModuleBase::Vector3 tau = GlobalC::ucell.atoms[it].tau[ia]; GlobalC::GridD.Find_atom(GlobalC::ucell, tau ,it, ia); if(!calc_deri) { - nlm_tot.clear(); + nlm_tot[iat].clear(); } else { - nlm_tot1.clear(); + nlm_tot1[iat].clear(); } for (int ad=0; ad + //and accumulate the value to Hloc_fixedR(i,j) + //======================================================= + int nnr = 0; + ModuleBase::Vector3 tau1, tau2, dtau; + ModuleBase::Vector3 dtau1, dtau2, tau0; + ModuleBase::Vector3 dtau1_f, dtau2_f; + double distance = 0.0; + double rcut = 0.0; + double rcut1, rcut2; + + // Record_adj RA; + // RA.for_2d(); - //======================================================= - //Step2: - //calculate sum_(L0,M0) beta - //and accumulate the value to Hloc_fixedR(i,j) - //======================================================= - int nnr = 0; - ModuleBase::Vector3 tau1, tau2, dtau; - ModuleBase::Vector3 dtau1, dtau2, tau0; - ModuleBase::Vector3 dtau1_f, dtau2_f; - double distance = 0.0; - double rcut = 0.0; - double rcut1, rcut2; - - // Record_adj RA; - // RA.for_2d(); - - // psi1 - for (int T1 = 0; T1 < GlobalC::ucell.ntype; ++T1) + // psi1 + for (int T1 = 0; T1 < GlobalC::ucell.ntype; ++T1) + { + const Atom* atom1 = &GlobalC::ucell.atoms[T1]; + for (int I1 =0; I1< atom1->na; ++I1) { - const Atom* atom1 = &GlobalC::ucell.atoms[T1]; - for (int I1 =0; I1< atom1->na; ++I1) + //GlobalC::GridD.Find_atom( atom1->tau[I1] ); + GlobalC::GridD.Find_atom(GlobalC::ucell, atom1->tau[I1] ,T1, I1); + const int iat1 = GlobalC::ucell.itia2iat(T1, I1); + const int start1 = GlobalC::ucell.itiaiw2iwt(T1, I1, 0); + tau1 = atom1->tau[I1]; + + // psi2 + for (int ad2=0; ad2tau[I1] ); - GlobalC::GridD.Find_atom(GlobalC::ucell, atom1->tau[I1] ,T1, I1); - const int iat1 = GlobalC::ucell.itia2iat(T1, I1); - const int start1 = GlobalC::ucell.itiaiw2iwt(T1, I1, 0); - tau1 = atom1->tau[I1]; - - // psi2 - for (int ad2=0; ad2= rcut) + + dtau = tau2 - tau1; + distance = dtau.norm2() * pow(GlobalC::ucell.lat0,2); + // this rcut is in order to make nnr consistent + // with other matrix. + rcut = pow(GlobalC::ORB.Phi[T1].getRcut() + GlobalC::ORB.Phi[T2].getRcut(),2); + if(distance < rcut) is_adj = true; + else if(distance >= rcut) + { + for (int ad0 = 0; ad0 < GlobalC::GridD.getAdjacentNum()+1; ++ad0) { - for (int ad0 = 0; ad0 < GlobalC::GridD.getAdjacentNum()+1; ++ad0) - { - const int T0 = GlobalC::GridD.getType(ad0); - //const int I0 = GlobalC::GridD.getNatom(ad0); - //const int T0 = RA.info[iat1][ad0][3]; - //const int I0 = RA.info[iat1][ad0][4]; - //const int iat0 = GlobalC::ucell.itia2iat(T0, I0); - //const int start0 = GlobalC::ucell.itiaiw2iwt(T0, I0, 0); + const int T0 = GlobalC::GridD.getType(ad0); + //const int I0 = GlobalC::GridD.getNatom(ad0); + //const int T0 = RA.info[iat1][ad0][3]; + //const int I0 = RA.info[iat1][ad0][4]; + //const int iat0 = GlobalC::ucell.itia2iat(T0, I0); + //const int start0 = GlobalC::ucell.itiaiw2iwt(T0, I0, 0); - tau0 = GlobalC::GridD.getAdjacentTau(ad0); - dtau1 = tau0 - tau1; - dtau2 = tau0 - tau2; + tau0 = GlobalC::GridD.getAdjacentTau(ad0); + dtau1 = tau0 - tau1; + dtau2 = tau0 - tau2; - const double distance1 = dtau1.norm2() * pow(GlobalC::ucell.lat0,2); - const double distance2 = dtau2.norm2() * pow(GlobalC::ucell.lat0,2); + const double distance1 = dtau1.norm2() * pow(GlobalC::ucell.lat0,2); + const double distance2 = dtau2.norm2() * pow(GlobalC::ucell.lat0,2); - rcut1 = pow(GlobalC::ORB.Phi[T1].getRcut() + GlobalC::ucell.infoNL.Beta[T0].get_rcut_max(),2); - rcut2 = pow(GlobalC::ORB.Phi[T2].getRcut() + GlobalC::ucell.infoNL.Beta[T0].get_rcut_max(),2); + rcut1 = pow(GlobalC::ORB.Phi[T1].getRcut() + GlobalC::ucell.infoNL.Beta[T0].get_rcut_max(),2); + rcut2 = pow(GlobalC::ORB.Phi[T2].getRcut() + GlobalC::ucell.infoNL.Beta[T0].get_rcut_max(),2); - if( distance1 < rcut1 && distance2 < rcut2 ) - { - is_adj = true; - break; - } + if( distance1 < rcut1 && distance2 < rcut2 ) + { + is_adj = true; + break; } } + } - if(is_adj) + if(is_adj) + { + // < psi1 | all projectors | psi2 > + // ----------------------------- enter the nnr increaing zone ------------------------- + for (int ad0=0; ad0 < GlobalC::GridD.getAdjacentNum()+1 ; ++ad0) { - // < psi1 | all projectors | psi2 > - // ----------------------------- enter the nnr increaing zone ------------------------- + const int T0 = GlobalC::GridD.getType(ad0); + const int I0 = GlobalC::GridD.getNatom(ad0); + const int iat = GlobalC::ucell.itia2iat(T0,I0); + + // mohan add 2010-12-19 + if( GlobalC::ucell.infoNL.nproj[T0] == 0) continue; + + //const int I0 = GlobalC::GridD.getNatom(ad0); + //const int start0 = GlobalC::ucell.itiaiw2iwt(T0, I0, 0); + tau0 = GlobalC::GridD.getAdjacentTau(ad0); + + dtau1 = tau0 - tau1; + dtau2 = tau0 - tau2; + const double distance1 = dtau1.norm2() * pow(GlobalC::ucell.lat0,2); + const double distance2 = dtau2.norm2() * pow(GlobalC::ucell.lat0,2); + + // seems a bug here!! mohan 2011-06-17 + rcut1 = pow(GlobalC::ORB.Phi[T1].getRcut() + GlobalC::ucell.infoNL.Beta[T0].get_rcut_max(),2); + rcut2 = pow(GlobalC::ORB.Phi[T2].getRcut() + GlobalC::ucell.infoNL.Beta[T0].get_rcut_max(),2); + + if(distance1 >= rcut1 || distance2 >= rcut2) + { + continue; + } + //const Atom* atom0 = &GlobalC::ucell.atoms[T0]; + const int rx0=GlobalC::GridD.getBox(ad0).x; + const int ry0=GlobalC::GridD.getBox(ad0).y; + const int rz0=GlobalC::GridD.getBox(ad0).z; + key_tuple key1(iat1,-rx0,-ry0,-rz0); + key_tuple key2(iat2,rx2-rx0,ry2-ry0,rz2-rz0); + + std::unordered_map> *nlm_cur1_e; //left hand side, for energy + std::unordered_map>> *nlm_cur1_f; //lhs, for force + std::unordered_map> *nlm_cur2_e; //rhs, for energy + std::unordered_map>> *nlm_cur2_f; //rhs, for force + + if(!calc_deri) + { + nlm_cur1_e = &nlm_tot[iat][key1]; + nlm_cur2_e = &nlm_tot[iat][key2]; + } + else + { + nlm_cur1_f = &nlm_tot1[iat][key1]; + nlm_cur2_f = &nlm_tot1[iat][key2]; + } + + int nnr_inner = 0; + for (int j=0; jnw*GlobalV::NPOL; j++) { const int j0 = j/GlobalV::NPOL;//added by zhengdy-soc @@ -641,177 +701,160 @@ void LCAO_gen_fixedH::build_Nonlocal_mu_new(const bool &calc_deri) const int nu = GlobalC::ParaO.trace_loc_col[iw2_all]; if(nu < 0)continue; - - //(3) run over all projectors in nonlocal pseudopotential. - for (int ad0=0; ad0 < GlobalC::GridD.getAdjacentNum()+1 ; ++ad0) + if(!calc_deri) { - const int T0 = GlobalC::GridD.getType(ad0); - const int I0 = GlobalC::GridD.getNatom(ad0); - if(T0!=it || I0!=ia) continue; - - // mohan add 2010-12-19 - if( GlobalC::ucell.infoNL.nproj[T0] == 0) continue; - - //const int I0 = GlobalC::GridD.getNatom(ad0); - //const int start0 = GlobalC::ucell.itiaiw2iwt(T0, I0, 0); - tau0 = GlobalC::GridD.getAdjacentTau(ad0); - - dtau1 = tau0 - tau1; - dtau2 = tau0 - tau2; - const double distance1 = dtau1.norm2() * pow(GlobalC::ucell.lat0,2); - const double distance2 = dtau2.norm2() * pow(GlobalC::ucell.lat0,2); + std::vector nlm_1=(*nlm_cur1_e)[iw1_all]; + std::vector nlm_2=(*nlm_cur2_e)[iw2_all]; + double nlm_tmp = 0.0; - // seems a bug here!! mohan 2011-06-17 - rcut1 = pow(GlobalC::ORB.Phi[T1].getRcut() + GlobalC::ucell.infoNL.Beta[T0].get_rcut_max(),2); - rcut2 = pow(GlobalC::ORB.Phi[T2].getRcut() + GlobalC::ucell.infoNL.Beta[T0].get_rcut_max(),2); - - if(distance1 < rcut1 && distance2 < rcut2) + const int nproj = GlobalC::ucell.infoNL.nproj[T0]; + int ib = 0; + for (int nb = 0; nb < nproj; nb++) { - //const Atom* atom0 = &GlobalC::ucell.atoms[T0]; - const int rx0=GlobalC::GridD.getBox(ad0).x; - const int ry0=GlobalC::GridD.getBox(ad0).y; - const int rz0=GlobalC::GridD.getBox(ad0).z; - key_tuple key1(iat1,-rx0,-ry0,-rz0); - key_tuple key2(iat2,rx2-rx0,ry2-ry0,rz2-rz0); - - if(!calc_deri) + const int L0 = GlobalC::ucell.infoNL.Beta[T0].Proj[nb].getL(); + for(int m=0;m<2*L0+1;m++) { - std::vector nlm_1=nlm_tot[key1][iw1_all]; - std::vector nlm_2=nlm_tot[key2][iw2_all]; - double nlm_tmp = 0.0; - - const int nproj = GlobalC::ucell.infoNL.nproj[T0]; - int ib = 0; - for (int nb = 0; nb < nproj; nb++) + if(nlm_1[ib]!=0.0 && nlm_2[ib]!=0.0) { - const int L0 = GlobalC::ucell.infoNL.Beta[T0].Proj[nb].getL(); - for(int m=0;m<2*L0+1;m++) - { - if(nlm_1[ib]!=0.0 && nlm_2[ib]!=0.0) - { - nlm_tmp += nlm_1[ib]*nlm_2[ib]*GlobalC::ucell.atoms[T0].dion(nb,nb); - } - ib+=1; - } + nlm_tmp += nlm_1[ib]*nlm_2[ib]*GlobalC::ucell.atoms[T0].dion(nb,nb); } - assert(ib==nlm_1.size()); + ib+=1; + } + } + assert(ib==nlm_1.size()); - if(GlobalV::GAMMA_ONLY_LOCAL) - { - // mohan add 2010-12-20 - if( nlm_tmp!=0.0 ) - { - // GlobalV::ofs_running << std::setw(10) << iw1_all << std::setw(10) - // << iw2_all << std::setw(20) << nlm[0] << std::endl; - GlobalC::LM.set_HSgamma(iw1_all,iw2_all,nlm_tmp,'N');//N stands for nonlocal. - } - } - else - { - if( nlm_tmp!=0.0 ) - { - GlobalC::LM.Hloc_fixedR[nnr] += nlm_tmp; - } - } - }// calc_deri - else // calculate the derivative + if(GlobalV::GAMMA_ONLY_LOCAL) + { + // mohan add 2010-12-20 + if( nlm_tmp!=0.0 ) { + // GlobalV::ofs_running << std::setw(10) << iw1_all << std::setw(10) + // << iw2_all << std::setw(20) << nlm[0] << std::endl; + GlobalC::LM.set_HSgamma(iw1_all,iw2_all,nlm_tmp,'N');//N stands for nonlocal. + } + } + else + { + if( nlm_tmp!=0.0 ) + { + GlobalC::LM.Hloc_fixedR[nnr+nnr_inner] += nlm_tmp; + } + } + }// calc_deri + else // calculate the derivative + { + if(GlobalV::GAMMA_ONLY_LOCAL) + { + double nlm[3]={0,0,0}; - if(GlobalV::GAMMA_ONLY_LOCAL) - { - double nlm[3]={0,0,0}; - - // sum all projectors for one atom. - std::vector nlm_1 = nlm_tot1[key1][iw1_all][0]; - std::vector> nlm_2; - nlm_2.resize(3); - for(int i=0;i<3;i++) - { - nlm_2[i] = nlm_tot1[key2][iw2_all][i+1]; - } + // sum all projectors for one atom. + std::vector nlm_1 = (*nlm_cur1_f)[iw1_all][0]; + std::vector> nlm_2; + nlm_2.resize(3); + for(int i=0;i<3;i++) + { + nlm_2[i] = (*nlm_cur2_f)[iw2_all][i+1]; + } - assert(nlm_1.size()==nlm_2[0].size()); + assert(nlm_1.size()==nlm_2[0].size()); - const int nproj = GlobalC::ucell.infoNL.nproj[T0]; - int ib = 0; - for (int nb = 0; nb < nproj; nb++) - { - const int L0 = GlobalC::ucell.infoNL.Beta[T0].Proj[nb].getL(); - for(int m=0;m<2*L0+1;m++) - { - for(int ir=0;ir<3;ir++) - { - nlm[ir] += nlm_2[ir][ib]*nlm_1[ib]*GlobalC::ucell.atoms[T0].dion(nb,nb); - } - ib+=1; - } - } - assert(ib==nlm_1.size()); - GlobalC::LM.set_force (iw1_all, iw2_all, nlm[0], nlm[1], nlm[2], 'N'); - } - else + const int nproj = GlobalC::ucell.infoNL.nproj[T0]; + int ib = 0; + for (int nb = 0; nb < nproj; nb++) + { + const int L0 = GlobalC::ucell.infoNL.Beta[T0].Proj[nb].getL(); + for(int m=0;m<2*L0+1;m++) { - // mohan change the order on 2011-06-17 - // origin: < psi1 | beta > < beta | dpsi2/dtau > - //now: < psi1/dtau | beta > < beta | psi2 > - double nlm[3]={0,0,0}; - - // sum all projectors for one atom. - std::vector nlm_1 = nlm_tot1[key2][iw2_all][0]; - std::vector> nlm_2; - nlm_2.resize(3); - for(int i=0;i<3;i++) + for(int ir=0;ir<3;ir++) { - nlm_2[i] = nlm_tot1[key1][iw1_all][i+1]; + nlm[ir] += nlm_2[ir][ib]*nlm_1[ib]*GlobalC::ucell.atoms[T0].dion(nb,nb); } + ib+=1; + } + } + assert(ib==nlm_1.size()); + GlobalC::LM.set_force (iw1_all, iw2_all, nlm[0], nlm[1], nlm[2], 'N'); + } + else + { + // mohan change the order on 2011-06-17 + // origin: < psi1 | beta > < beta | dpsi2/dtau > + //now: < psi1/dtau | beta > < beta | psi2 > + double nlm[3]={0,0,0}; + + // sum all projectors for one atom. + std::vector nlm_1 = (*nlm_cur2_f)[iw2_all][0]; + std::vector> nlm_2; + nlm_2.resize(3); + for(int i=0;i<3;i++) + { + nlm_2[i] = (*nlm_cur1_f)[iw1_all][i+1]; + } - assert(nlm_1.size()==nlm_2[0].size()); + assert(nlm_1.size()==nlm_2[0].size()); - const int nproj = GlobalC::ucell.infoNL.nproj[T0]; - int ib = 0; - for (int nb = 0; nb < nproj; nb++) + const int nproj = GlobalC::ucell.infoNL.nproj[T0]; + int ib = 0; + for (int nb = 0; nb < nproj; nb++) + { + const int L0 = GlobalC::ucell.infoNL.Beta[T0].Proj[nb].getL(); + for(int m=0;m<2*L0+1;m++) + { + for(int ir=0;ir<3;ir++) { - const int L0 = GlobalC::ucell.infoNL.Beta[T0].Proj[nb].getL(); - for(int m=0;m<2*L0+1;m++) - { - for(int ir=0;ir<3;ir++) - { - nlm[ir] += nlm_2[ir][ib]*nlm_1[ib]*GlobalC::ucell.atoms[T0].dion(nb,nb); - } - ib+=1; - } + nlm[ir] += nlm_2[ir][ib]*nlm_1[ib]*GlobalC::ucell.atoms[T0].dion(nb,nb); } - assert(ib==nlm_1.size()); - - GlobalC::LM.DHloc_fixedR_x[nnr] += nlm[0]; - GlobalC::LM.DHloc_fixedR_y[nnr] += nlm[1]; - GlobalC::LM.DHloc_fixedR_z[nnr] += nlm[2]; + ib+=1; } - }//!calc_deri - }// distance - } // ad0 - ++nnr; + } + assert(ib==nlm_1.size()); + + GlobalC::LM.DHloc_fixedR_x[nnr+nnr_inner] += nlm[0]; + GlobalC::LM.DHloc_fixedR_y[nnr+nnr_inner] += nlm[1]; + GlobalC::LM.DHloc_fixedR_z[nnr+nnr_inner] += nlm[2]; + } + }//!calc_deri + nnr_inner++; }// k } // j - }// end is_adj - //---------------------------------------------------------------------------------- - } // ad2 - } // I1 - } // T1 + } // ad0 + //outer circle : accumulate nnr + for (int j=0; jnw*GlobalV::NPOL; j++) + { + const int j0 = j/GlobalV::NPOL;//added by zhengdy-soc + const int iw1_all = start1 + j; + const int mu = GlobalC::ParaO.trace_loc_row[iw1_all]; + if(mu < 0)continue; - if(!GlobalV::GAMMA_ONLY_LOCAL) + // fix a serious bug: atom2[T2] -> atom2 + // mohan 2010-12-20 + for (int k=0; knw*GlobalV::NPOL; k++) + { + const int k0 = k/GlobalV::NPOL; + const int iw2_all = start2 + k; + const int nu = GlobalC::ParaO.trace_loc_col[iw2_all]; + if(nu < 0)continue; + + nnr++; + } + } + }// end is_adj + } // ad2 + } // I1 + } // T1 + + if(!GlobalV::GAMMA_ONLY_LOCAL) + { + // std::cout << " nr=" << nnr << std::endl; + // std::cout << " GlobalC::LNNR.nnr=" << GlobalC::LNNR.nnr << std::endl; + // GlobalV::ofs_running << " nr=" << nnr << std::endl; + // GlobalV::ofs_running << " GlobalC::LNNR.nnr=" << GlobalC::LNNR.nnr << std::endl; + if( nnr!=GlobalC::LNNR.nnr) { - // std::cout << " nr=" << nnr << std::endl; - // std::cout << " GlobalC::LNNR.nnr=" << GlobalC::LNNR.nnr << std::endl; - // GlobalV::ofs_running << " nr=" << nnr << std::endl; - // GlobalV::ofs_running << " GlobalC::LNNR.nnr=" << GlobalC::LNNR.nnr << std::endl; - if( nnr!=GlobalC::LNNR.nnr) - { - ModuleBase::WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_mu_new","nnr!=GlobalC::LNNR.nnr"); - } + ModuleBase::WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_mu_new","nnr!=GlobalC::LNNR.nnr"); } - }//iat + } // std::cout << " build_Nonlocal_mu done" << std::endl; From f1c842a7a5a6d6ad8f36311d9fd6b99c6f2a20c3 Mon Sep 17 00:00:00 2001 From: wenfei-li Date: Tue, 2 Nov 2021 15:00:01 +0800 Subject: [PATCH 20/27] change test.yml to include beta branch --- .github/workflows/test.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 60c26bbbd7c..7bcc4d36559 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,9 +4,7 @@ on: pull_request_target: branches: - develop - - reconstruction - - newelpa - + - ABACUS_2.2.0_beta jobs: start-runner: From 995e47aa2549fbc3c9ada75a80e27de2a9c70ed2 Mon Sep 17 00:00:00 2001 From: linpz Date: Wed, 3 Nov 2021 16:31:50 +0800 Subject: [PATCH 21/27] 1. add class Diag_Scalapack_gvx --- source/Makefile.Objects | 1 + source/src_pdiag/diag_scalapack_gvx.cpp | 246 ++++++++++++++++++++++++ source/src_pdiag/diag_scalapack_gvx.h | 59 ++++++ source/src_pdiag/pdiag_double.cpp | 102 +--------- source/src_pdiag/pdiag_double.h | 2 + 5 files changed, 314 insertions(+), 96 deletions(-) create mode 100644 source/src_pdiag/diag_scalapack_gvx.cpp create mode 100644 source/src_pdiag/diag_scalapack_gvx.h diff --git a/source/Makefile.Objects b/source/Makefile.Objects index e7dbb91317b..7bbc7945e86 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -176,6 +176,7 @@ global_fp.o \ pdiag_double.o \ pdiag_basic.o \ pdiag_common.o \ +diag_scalapack_gvx.o \ subgrid_oper.o \ conv_coulomb_pot.o \ conv_coulomb_pot_k.o \ diff --git a/source/src_pdiag/diag_scalapack_gvx.cpp b/source/src_pdiag/diag_scalapack_gvx.cpp new file mode 100644 index 00000000000..16c86ccaa4e --- /dev/null +++ b/source/src_pdiag/diag_scalapack_gvx.cpp @@ -0,0 +1,246 @@ +//===================== +// AUTHOR : Peize Lin +// DATE : 2021-11-02 +//===================== + +#include "diag_scalapack_gvx.h" +#include "module_base/global_variable.h" +#include "module_base/scalapack_connector.h" +#include "module_base/global_function.h" +#include + +std::pair> Diag_Scalapack_gvx::pdsygvx_once( + const int*const desc, + const int ncol, + const int nrow, + const double*const h_mat, + const double*const s_mat, + double*const ekb, + ModuleBase::matrix &wfc_2d) const +{ + ModuleBase::matrix h_tmp(ncol, nrow, false); + memcpy( h_tmp.c, h_mat, sizeof(double)*ncol*nrow ); + ModuleBase::matrix s_tmp(ncol, nrow, false); + memcpy( s_tmp.c, s_mat, sizeof(double)*ncol*nrow ); + wfc_2d.create(ncol, nrow, false); + + const char jobz='V', range='I', uplo='U'; + const int itype=1, il=1, iu=GlobalV::NBANDS, one=1; + int M=0, NZ=0, lwork=-1, liwork=-1, info=0; + const double abstol=0, orfac=-1; + std::vector work(1,0); + std::vector iwork(1,0); + std::vector ifail(GlobalV::NLOCAL,0); + std::vector iclustr(2*GlobalV::DSIZE); + std::vector gap(GlobalV::DSIZE); + + pdsygvx_(&itype, &jobz, &range, &uplo, + &GlobalV::NLOCAL, h_tmp.c, &one, &one, desc, s_tmp.c, &one, &one, desc, + NULL, NULL, &il, &iu, &abstol, + &M, &NZ, ekb, &orfac, wfc_2d.c, &one, &one, desc, + work.data(), &lwork, iwork.data(), &liwork, ifail.data(), iclustr.data(), gap.data(), &info); + if (info) + throw std::runtime_error("info = "+ModuleBase::GlobalFunc::TO_STRING(info)+".\n"+ModuleBase::GlobalFunc::TO_STRING(__FILE__)+" line "+ModuleBase::GlobalFunc::TO_STRING(__LINE__)); + + GlobalV::ofs_running<<"lwork="<> Diag_Scalapack_gvx::pzhegvx_once( + const int*const desc, + const int ncol, + const int nrow, + const std::complex*const h_mat, + const std::complex*const s_mat, + double*const ekb, + ModuleBase::ComplexMatrix &wfc_2d) const +{ + ModuleBase::ComplexMatrix h_tmp(ncol, nrow, false); + memcpy( h_tmp.c, h_mat, sizeof(std::complex)*ncol*nrow ); + ModuleBase::ComplexMatrix s_tmp(ncol, nrow, false); + memcpy( s_tmp.c, s_mat, sizeof(std::complex)*ncol*nrow ); + wfc_2d.create(ncol, nrow, false); + + const char jobz='V', range='I', uplo='U'; + const int itype=1, il=1, iu=GlobalV::NBANDS, one=1; + int M=0, NZ=0, lwork=-1, lrwork=-1, liwork=-1, info=0; + const double abstol=0, orfac=-1; + std::vector> work(1,0); + std::vector rwork(1,0); + std::vector iwork(1,0); + std::vector ifail(GlobalV::NLOCAL,0); + std::vector iclustr(2*GlobalV::DSIZE); + std::vector gap(GlobalV::DSIZE); + + pzhegvx_(&itype, &jobz, &range, &uplo, + &GlobalV::NLOCAL, h_tmp.c, &one, &one, desc, s_tmp.c, &one, &one, desc, + NULL, NULL, &il, &iu, &abstol, + &M, &NZ, ekb, &orfac, wfc_2d.c, &one, &one, desc, + work.data(), &lwork, rwork.data(), &lrwork, + iwork.data(), &liwork, ifail.data(), iclustr.data(), gap.data(), &info); + if (info) + throw std::runtime_error("info="+ModuleBase::GlobalFunc::TO_STRING(info)+". "+ModuleBase::GlobalFunc::TO_STRING(__FILE__)+" line "+ModuleBase::GlobalFunc::TO_STRING(__LINE__)); + + GlobalV::ofs_running<<"lwork="<degeneracy_max * GlobalV::NLOCAL; + rwork.resize(lrwork,0); + liwork = iwork[0]; + iwork.resize(liwork,0); + + pzhegvx_(&itype, &jobz, &range, &uplo, + &GlobalV::NLOCAL, h_tmp.c, &one, &one, desc, s_tmp.c, &one, &one, desc, + NULL, NULL, &il, &iu, &abstol, + &M, &NZ, ekb, &orfac, wfc_2d.c, &one, &one, desc, + work.data(), &lwork, + rwork.data(), &lrwork, + iwork.data(), &liwork, + ifail.data(), iclustr.data(), gap.data(), &info); + GlobalV::ofs_running<<"M="<{}); + else if(info<0) + return std::make_pair(info, std::vector{}); + else if(info%2) + return std::make_pair(info, ifail); + else if(info/2%2) + return std::make_pair(info, iclustr); + else if(info/4%2) + return std::make_pair(info, std::vector{M,NZ}); + else if(info/16%2) + return std::make_pair(info, ifail); + else + throw std::runtime_error("info = "+ModuleBase::GlobalFunc::TO_STRING(info)+".\n"+ModuleBase::GlobalFunc::TO_STRING(__FILE__)+" line "+ModuleBase::GlobalFunc::TO_STRING(__LINE__)); +} + +void Diag_Scalapack_gvx::pdsygvx_diag( + const int*const desc, + const int ncol, + const int nrow, + const double*const h_mat, + const double*const s_mat, + double*const ekb, + ModuleBase::matrix &wfc_2d) +{ + while(true) + { + const std::pair> info_vec = pdsygvx_once(desc, ncol, nrow, h_mat, s_mat, ekb, wfc_2d); + post_processing(info_vec.first, info_vec.second); + if(info_vec.first==0) + break; + } +} + + + +void Diag_Scalapack_gvx::pzhegvx_diag( + const int*const desc, + const int ncol, + const int nrow, + const std::complex*const h_mat, + const std::complex*const s_mat, + double*const ekb, + ModuleBase::ComplexMatrix &wfc_2d) +{ + while(true) + { + const std::pair> info_vec = pzhegvx_once(desc, ncol, nrow, h_mat, s_mat, ekb, wfc_2d); + post_processing(info_vec.first, info_vec.second); + if(info_vec.first==0) + break; + } +} + + + +void Diag_Scalapack_gvx::post_processing(const int info, const std::vector &vec) +{ + const std::string str_info = "info = "+ModuleBase::GlobalFunc::TO_STRING(info)+".\n"; + const std::string str_FILE = ModuleBase::GlobalFunc::TO_STRING(__FILE__)+" line "+ModuleBase::GlobalFunc::TO_STRING(__LINE__)+".\n"; + const std::string str_info_FILE = str_info + str_FILE; + + if(info==0) + { + return; + } + else if(info<0) + { + const int info_negative = -info; + const std::string str_index = (info_negative>100) + ? ModuleBase::GlobalFunc::TO_STRING(info_negative/100)+"-th argument "+ModuleBase::GlobalFunc::TO_STRING(info_negative%100)+"-entry is illegal.\n" + : ModuleBase::GlobalFunc::TO_STRING(info_negative)+"-th argument is illegal.\n"; + throw std::runtime_error(str_info_FILE + str_index); + } + else if(info%2) + { + std::string str_ifail = "ifail = "; + for(const int i : vec) + str_ifail += ModuleBase::GlobalFunc::TO_STRING(i) + " "; + throw std::runtime_error(str_info_FILE + str_ifail); + } + else if(info/2%2) + { + int degeneracy_need = 0; + for(int irank=0; irankdegeneracy_max)+".\n"; + if(degeneracy_need <= this->degeneracy_max) + { + throw std::runtime_error(str_info_FILE + str_need + str_saved); + } + else + { + GlobalV::ofs_running<degeneracy_max = degeneracy_need; + return; + } + } + else if(info/4%2) + { + const std::string str_M = "M = "+ModuleBase::GlobalFunc::TO_STRING(vec[0])+".\n"; + const std::string str_NZ = "NZ = "+ModuleBase::GlobalFunc::TO_STRING(vec[1])+".\n"; + const std::string str_NBANDS = "GlobalV::NBANDS = "+ModuleBase::GlobalFunc::TO_STRING(GlobalV::NBANDS)+".\n"; + throw std::runtime_error(str_info_FILE + str_M + str_NZ + str_NBANDS); + } + else if(info/16%2) + { + const std::string str_npos = "not positive definite = "+ModuleBase::GlobalFunc::TO_STRING(vec[0])+".\n"; + throw std::runtime_error(str_info_FILE + str_npos); + } + else + { + throw std::runtime_error(str_info_FILE); + } +} diff --git a/source/src_pdiag/diag_scalapack_gvx.h b/source/src_pdiag/diag_scalapack_gvx.h new file mode 100644 index 00000000000..58c84dc838f --- /dev/null +++ b/source/src_pdiag/diag_scalapack_gvx.h @@ -0,0 +1,59 @@ +//===================== +// AUTHOR : Peize Lin +// DATE : 2021-11-02 +//===================== + +#ifndef DIAG_SCALAPACK_GVX +#define DIAG_SCALAPACK_GVX + +#include "module_base/matrix.h" +#include "module_base/complexmatrix.h" +#include +#include +#include + +class Diag_Scalapack_gvx +{ + +public: + void pdsygvx_diag( + const int*const desc, + const int ncol, + const int nrow, + const double*const h_mat, + const double*const s_mat, + double*const ekb, + ModuleBase::matrix &wfc_2d); + void pzhegvx_diag( + const int*const desc, + const int ncol, + const int nrow, + const std::complex*const h_mat, + const std::complex*const s_mat, + double*const ekb, + ModuleBase::ComplexMatrix &wfc_2d); + + std::pair> pdsygvx_once( + const int*const desc, + const int ncol, + const int nrow, + const double*const h_mat, + const double*const s_mat, + double*const ekb, + ModuleBase::matrix &wfc_2d) const; + std::pair> pzhegvx_once( + const int*const desc, + const int ncol, + const int nrow, + const std::complex*const h_mat, + const std::complex*const s_mat, + double*const ekb, + ModuleBase::ComplexMatrix &wfc_2d) const; + +private: + int degeneracy_max = 12; // For reorthogonalized memory. 12 followes siesta. + + void post_processing(const int info, const std::vector &vec); +}; + +#endif \ No newline at end of file diff --git a/source/src_pdiag/pdiag_double.cpp b/source/src_pdiag/pdiag_double.cpp index 53425e918ea..84946e736b2 100644 --- a/source/src_pdiag/pdiag_double.cpp +++ b/source/src_pdiag/pdiag_double.cpp @@ -857,54 +857,8 @@ void Pdiag_Double::diago_double_begin( } else if(GlobalV::KS_SOLVER=="scalapack_gvx") { - ModuleBase::matrix h_tmp(this->ncol, this->nrow, false); - memcpy( h_tmp.c, h_mat, sizeof(double)*this->ncol*this->nrow ); - ModuleBase::matrix s_tmp(this->ncol, this->nrow, false); - memcpy( s_tmp.c, s_mat, sizeof(double)*this->ncol*this->nrow ); - wfc_2d.create(this->ncol, this->nrow, false); - - const char jobz='V', range='I', uplo='U'; - const int itype=1, il=1, iu=GlobalV::NBANDS, one=1; - int M=0, NZ=0, lwork=-1, liwork=-1, info=0; - const double abstol=0, orfac=-1; - std::vector work(1,0); - std::vector iwork(1,0); - std::vector ifail(GlobalV::NLOCAL,0); - std::vector iclustr(2*GlobalV::DSIZE); - std::vector gap(GlobalV::DSIZE); + diag_scalapack_gvx.pdsygvx_diag(this->desc, this->ncol, this->nrow, h_mat, s_mat, ekb, wfc_2d); // Peize Lin add 2021.11.02 - pdsygvx_(&itype, &jobz, &range, &uplo, - &GlobalV::NLOCAL, h_tmp.c, &one, &one, desc, s_tmp.c, &one, &one, desc, - NULL, NULL, &il, &iu, &abstol, - &M, &NZ, ekb, &orfac, wfc_2d.c, &one, &one, desc, - work.data(), &lwork, iwork.data(), &liwork, ifail.data(), iclustr.data(), gap.data(), &info); - - GlobalV::ofs_running<<"lwork="<ncol, this->nrow, false); - memcpy( h_tmp.c, ch_mat, sizeof(std::complex)*this->ncol*this->nrow ); - ModuleBase::ComplexMatrix s_tmp(this->ncol, this->nrow, false); - memcpy( s_tmp.c, cs_mat, sizeof(std::complex)*this->ncol*this->nrow ); - wfc_2d.create(this->ncol, this->nrow, false); - - const char jobz='V', range='I', uplo='U'; - const int itype=1, il=1, iu=GlobalV::NBANDS, one=1; - int M=0, NZ=0, lwork=-1, lrwork=-1, liwork=-1, info=0; - const double abstol=0, orfac=-1; - std::vector> work(1,0); - std::vector rwork(1,0); - std::vector iwork(1,0); - std::vector ifail(GlobalV::NLOCAL,0); - std::vector iclustr(2*GlobalV::DSIZE); - std::vector gap(GlobalV::DSIZE); - - pzhegvx_(&itype, &jobz, &range, &uplo, - &GlobalV::NLOCAL, h_tmp.c, &one, &one, desc, s_tmp.c, &one, &one, desc, - NULL, NULL, &il, &iu, &abstol, - &M, &NZ, ekb, &orfac, wfc_2d.c, &one, &one, desc, - work.data(), &lwork, rwork.data(), &lrwork, - iwork.data(), &liwork, ifail.data(), iclustr.data(), gap.data(), &info); - if (info) - throw std::runtime_error("info=" + ModuleBase::GlobalFunc::TO_STRING(info) + ". " + ModuleBase::GlobalFunc::TO_STRING(__FILE__) + " line " + ModuleBase::GlobalFunc::TO_STRING(__LINE__)); - GlobalV::ofs_running<<"lwork="<desc, this->ncol, this->nrow, ch_mat, cs_mat, ekb, wfc_2d); // Peize Lin add 2021.11.02 // if(INPUT.new_dm==0) // throw std::domain_error("INPUT.new_dm must be 1. "+ModuleBase::GlobalFunc::TO_STRING(__FILE__)+" line "+ModuleBase::GlobalFunc::TO_STRING(__LINE__)); @@ -1205,9 +1114,10 @@ void Pdiag_Double::diago_complex_begin( //change eigenvector matrix from block-cycle distribute matrix to column-divided distribute matrix ModuleBase::timer::tick("Diago_LCAO_Matrix","gath_eig_complex"); + int info; long maxnloc; // maximum number of elements in local matrix - MPI_Reduce(&nloc, &maxnloc, 1, MPI_LONG, MPI_MAX, 0, comm_2D); - MPI_Bcast(&maxnloc, 1, MPI_LONG, 0, comm_2D); + info=MPI_Reduce(&nloc, &maxnloc, 1, MPI_LONG, MPI_MAX, 0, comm_2D); + info=MPI_Bcast(&maxnloc, 1, MPI_LONG, 0, comm_2D); std::complex *work=new std::complex[maxnloc]; // work/buffer matrix int naroc[2]; // maximum number of row or column @@ -1217,7 +1127,7 @@ void Pdiag_Double::diago_complex_begin( { const int coord[2]={iprow, ipcol}; int src_rank; - MPI_Cart_rank(comm_2D, coord, &src_rank); + info=MPI_Cart_rank(comm_2D, coord, &src_rank); if(myid==src_rank) { LapackConnector::copy(nloc, wfc_2d.c, inc, work, inc); diff --git a/source/src_pdiag/pdiag_double.h b/source/src_pdiag/pdiag_double.h index 729be8c105b..41aed0bc911 100644 --- a/source/src_pdiag/pdiag_double.h +++ b/source/src_pdiag/pdiag_double.h @@ -3,6 +3,7 @@ #include "../src_pw/tools.h" #include "pdiag_basic.h" +#include "diag_scalapack_gvx.h" class Pdiag_Double : public Pdiag_Basic { @@ -51,6 +52,7 @@ class Pdiag_Double : public Pdiag_Basic int dim0; int dim1; + Diag_Scalapack_gvx diag_scalapack_gvx; // Peize Lin add 2021.11.02 }; #endif From d0ac093d366d175c4adc237c749b718cbd395925 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Wed, 3 Nov 2021 19:46:54 +0800 Subject: [PATCH 22/27] fixed cmake for scalapack_gvx update --- source/src_pdiag/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/source/src_pdiag/CMakeLists.txt b/source/src_pdiag/CMakeLists.txt index 203adf13d2f..18244f7adbf 100644 --- a/source/src_pdiag/CMakeLists.txt +++ b/source/src_pdiag/CMakeLists.txt @@ -21,4 +21,5 @@ add_library( pzst2g.cpp pzsteiz.cpp pzt2s.cpp + diag_scalapack_gvx.cpp ) From ce645a259ca67771ef55e7cfe81c2e6a9c097498 Mon Sep 17 00:00:00 2001 From: LiuXiaohui Date: Thu, 4 Nov 2021 09:42:18 +0800 Subject: [PATCH 23/27] OpenMP optimization for local potential and charge density under k-points line --- source/src_lcao/gint_k_rho.cpp | 29 ++++++++++++++++++++ source/src_lcao/gint_k_vl.cpp | 49 +++++++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/source/src_lcao/gint_k_rho.cpp b/source/src_lcao/gint_k_rho.cpp index d1f95123f2b..b16b5d457f7 100644 --- a/source/src_lcao/gint_k_rho.cpp +++ b/source/src_lcao/gint_k_rho.cpp @@ -7,6 +7,15 @@ #include "../src_pw/global.h" #include "../module_base/blas_connector.h" #include "global_fp.h" // mohan add 2021-01-30 +#include + +#ifdef __OPENMP +#include +#endif + +#ifdef __MKL +#include +#endif inline void setVindex(const int ncyz, const int ibx, const int jby, const int kbz, int* vindex) { @@ -395,6 +404,15 @@ void Gint_k::cal_rho_k(void) ModuleBase::TITLE("Gint_k","cal_rho_k"); ModuleBase::timer::tick("Gint_k","cal_rho_k"); +#ifdef __MKL + const int mkl_threads = mkl_get_max_threads(); + mkl_set_num_threads(1); +#endif + +#ifdef __OPENMP + #pragma omp parallel + { +#endif const double delta_r = GlobalC::ORB.dr_uniform; // it is an uniform grid to save orbital values, so the delta_r is a constant. const int max_size = GlobalC::GridT.max_atom; @@ -453,6 +471,10 @@ void Gint_k::cal_rho_k(void) const int ncyz = GlobalC::pw.ncy*GlobalC::pw.nczp; const int nbyz = nby*nbz; + +#ifdef __OPENMP + #pragma omp for +#endif for(int i=0; i +#ifdef __OPENMP +#include +#endif + +#ifdef __MKL +#include +#endif + inline int find_offset(const int size, const int grid_index, const int ibx, const int jby, const int kbz, const int bx, const int by, const int bz, @@ -351,6 +359,18 @@ void Gint_k::cal_vlocal_k(const double *vrs1, const Grid_Technique &GridT, const ModuleBase::timer::tick("Gint_k","vlocal"); +#ifdef __MKL + const int mkl_threads = mkl_get_max_threads(); + mkl_set_num_threads(1); +#endif + +#ifdef __OPENMP + #pragma omp parallel + { + double* pvpR_reduced_thread; + pvpR_reduced_thread = new double[GlobalC::LNNR.nnrg]; + ModuleBase::GlobalFunc::ZEROS(pvpR_reduced_thread, GlobalC::LNNR.nnrg); +#endif // it's a uniform grid to save orbital values, so the delta_r is a constant. double delta_r = GlobalC::ORB.dr_uniform; // possible max atom number in real space grid. @@ -415,6 +435,9 @@ void Gint_k::cal_vlocal_k(const double *vrs1, const Grid_Technique &GridT, const double* vldr3 = new double[bxyz]; ModuleBase::GlobalFunc::ZEROS(vldr3, bxyz); +#ifdef __OPENMP + #pragma omp for +#endif for(int i=0; ireduced) { +#ifdef __OPENMP + cal_pvpR_reduced(size, LD_pool, grid_index, + ibx, jby, kbz, + block_size, at, block_index, block_iw, + vldr3, psir_ylm, psir_vlbr3, + distance, cal_flag, pvpR_reduced_thread); +#else cal_pvpR_reduced(size, LD_pool, grid_index, ibx, jby, kbz, block_size, at, block_index, block_iw, vldr3, psir_ylm, psir_vlbr3, distance, cal_flag, this->pvpR_reduced[spin]); +#endif } else { @@ -472,6 +503,15 @@ void Gint_k::cal_vlocal_k(const double *vrs1, const Grid_Technique &GridT, const }// int j } // int i +#ifdef __OPENMP + #pragma omp critical(cal_vl_k) + for(int innrg=0; innrg Date: Thu, 4 Nov 2021 09:58:30 +0800 Subject: [PATCH 24/27] delete one useless line in file src_lcao/gint_k_rho.cpp --- source/src_lcao/gint_k_rho.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/source/src_lcao/gint_k_rho.cpp b/source/src_lcao/gint_k_rho.cpp index b16b5d457f7..db29ef65c83 100644 --- a/source/src_lcao/gint_k_rho.cpp +++ b/source/src_lcao/gint_k_rho.cpp @@ -7,7 +7,6 @@ #include "../src_pw/global.h" #include "../module_base/blas_connector.h" #include "global_fp.h" // mohan add 2021-01-30 -#include #ifdef __OPENMP #include From cb0acb3114ec3f1bafffd040214822c7d26a7941 Mon Sep 17 00:00:00 2001 From: LiuXiaohui Date: Thu, 4 Nov 2021 11:14:50 +0800 Subject: [PATCH 25/27] change Macro: __OPENMP --> _OPENMP --- source/src_lcao/gint_k_rho.cpp | 8 ++++---- source/src_lcao/gint_k_vl.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/source/src_lcao/gint_k_rho.cpp b/source/src_lcao/gint_k_rho.cpp index db29ef65c83..f67e4700356 100644 --- a/source/src_lcao/gint_k_rho.cpp +++ b/source/src_lcao/gint_k_rho.cpp @@ -8,7 +8,7 @@ #include "../module_base/blas_connector.h" #include "global_fp.h" // mohan add 2021-01-30 -#ifdef __OPENMP +#ifdef _OPENMP #include #endif @@ -408,7 +408,7 @@ void Gint_k::cal_rho_k(void) mkl_set_num_threads(1); #endif -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp parallel { #endif @@ -471,7 +471,7 @@ void Gint_k::cal_rho_k(void) const int ncyz = GlobalC::pw.ncy*GlobalC::pw.nczp; const int nbyz = nby*nbz; -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp for #endif for(int i=0; i -#ifdef __OPENMP +#ifdef _OPENMP #include #endif @@ -364,7 +364,7 @@ void Gint_k::cal_vlocal_k(const double *vrs1, const Grid_Technique &GridT, const mkl_set_num_threads(1); #endif -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp parallel { double* pvpR_reduced_thread; @@ -435,7 +435,7 @@ void Gint_k::cal_vlocal_k(const double *vrs1, const Grid_Technique &GridT, const double* vldr3 = new double[bxyz]; ModuleBase::GlobalFunc::ZEROS(vldr3, bxyz); -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp for #endif for(int i=0; ireduced) { -#ifdef __OPENMP +#ifdef _OPENMP cal_pvpR_reduced(size, LD_pool, grid_index, ibx, jby, kbz, block_size, at, block_index, block_iw, @@ -503,7 +503,7 @@ void Gint_k::cal_vlocal_k(const double *vrs1, const Grid_Technique &GridT, const }// int j } // int i -#ifdef __OPENMP +#ifdef _OPENMP #pragma omp critical(cal_vl_k) for(int innrg=0; innrg Date: Thu, 4 Nov 2021 12:01:40 +0800 Subject: [PATCH 26/27] update cmake file for openmp --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e74b923b6c7..4f1f46c92d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,7 @@ if(USE_OPENMP) find_package(OpenMP REQUIRED) target_link_libraries(${ABACUS_BIN_NAME} OpenMP::OpenMP_CXX) add_compile_options(${OpenMP_CXX_FLAGS}) - add_compile_definitions(__OPENMP) + #add_compile_definitions(_OPENMP) endif() set(CMAKE_CXX_STANDARD 11) From 15c780bc438ac1a19ba2f87c2122cd231900382d Mon Sep 17 00:00:00 2001 From: dyzheng Date: Fri, 5 Nov 2021 14:14:07 +0800 Subject: [PATCH 27/27] modified test.yml and hosted_test.yml --- .github/workflows/hosted_test.yml | 2 +- .github/workflows/test.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/hosted_test.yml b/.github/workflows/hosted_test.yml index 990ffbcd4b1..7b4fc13a46e 100644 --- a/.github/workflows/hosted_test.yml +++ b/.github/workflows/hosted_test.yml @@ -11,7 +11,7 @@ jobs: build_args: "" name: "Build with GNU compilers" - tag: gnu - build_args: "-DENABLE_LIBXC=1 -DENABLE_DEEPKS=1" + build_args: "-DENABLE_LIBXC=1" name: "Build with GNU compilers and extra components" - tag: intel build_args: "" diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7bcc4d36559..30ec8f26196 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,7 +4,7 @@ on: pull_request_target: branches: - develop - - ABACUS_2.2.0_beta + - update_MD jobs: start-runner: