From cda0e5d483e57eeb7a69b7a5d99a91a968f5e6ce Mon Sep 17 00:00:00 2001 From: dyzheng Date: Wed, 31 Jul 2024 13:09:54 +0800 Subject: [PATCH 1/6] Fix: noncollinear-spin force and stress with PW --- .../module_xc/xc_functional_gradcorr.cpp | 2 +- .../hamilt_pwdft/forces_nl.cpp | 2 +- .../hamilt_pwdft/fs_nonlocal_tools.cpp | 138 +++++++++++++----- .../hamilt_pwdft/kernels/force_op.cpp | 102 +++++++++++++ .../hamilt_pwdft/kernels/force_op.h | 23 +++ .../hamilt_pwdft/kernels/stress_op.cpp | 76 ++++++++++ .../hamilt_pwdft/kernels/stress_op.h | 22 +++ .../hamilt_pwdft/stress_func_kin.cpp | 16 +- .../hamilt_pwdft/stress_func_loc.cpp | 4 +- .../hamilt_pwdft/stress_func_nl.cpp | 2 +- 10 files changed, 343 insertions(+), 44 deletions(-) diff --git a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp index 4d94f68ec4..d1082c809b 100644 --- a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp +++ b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp @@ -186,7 +186,7 @@ void XC_Functional::gradcorr(double &etxc, double &vtxc, ModuleBase::matrix &v, } gdr2 = new ModuleBase::Vector3[rhopw->nrxx]; - h2 = new ModuleBase::Vector3[rhopw->nrxx]; + if(!is_stress) h2 = new ModuleBase::Vector3[rhopw->nrxx]; XC_Functional::grad_rho( rhogsum1 , gdr1, rhopw, ucell->tpiba); XC_Functional::grad_rho( rhogsum2 , gdr2, rhopw, ucell->tpiba); diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp index 04caefd1bf..92733745b4 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp @@ -45,7 +45,7 @@ void Forces::cal_force_nl(ModuleBase::matrix& forcenl, break; } } - const int npm = ucell_in.get_npol() * nbands_occ; + const int npm = nbands_occ; // calculate becp = for all beta functions nl_tools.cal_becp(ik, npm); for (int ipol = 0; ipol < 3; ipol++) diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index f6773b3ddb..e189f937d9 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -162,9 +162,12 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) { ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); + const int npol = this->ucell_->get_npol(); + const int size_becp = this->nbands * npol * this->nkb; + const int size_becp_act = npm * npol * this->nkb; if (this->becp == nullptr) { - resmem_complex_op()(this->ctx, becp, this->nbands * this->nkb); + resmem_complex_op()(this->ctx, becp, size_becp); } // prepare math tools @@ -248,11 +251,12 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) } const char transa = 'C'; const char transb = 'N'; + int npm_npol = npm * npol; gemm_op()(this->ctx, transa, transb, nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, ppcell_vkb, @@ -267,15 +271,15 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) if (this->device == base_device::GpuDevice) { std::complex* h_becp = nullptr; - resmem_complex_h_op()(this->cpu_ctx, h_becp, this->nbands * nkb); - syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, this->nbands * nkb); - Parallel_Reduce::reduce_pool(h_becp, this->nbands * nkb); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, this->nbands * nkb); + resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act); + syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, size_becp_act); + Parallel_Reduce::reduce_pool(h_becp, size_becp_act); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, size_becp_act); delmem_complex_h_op()(this->cpu_ctx, h_becp); } else { - Parallel_Reduce::reduce_pool(becp, this->nbands * this->nkb); + Parallel_Reduce::reduce_pool(becp, size_becp_act); } ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); } @@ -286,9 +290,12 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i { ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s"); + const int npol = this->ucell_->get_npol(); + const int size_becp = this->nbands * npol * this->nkb; + const int npm_npol = npm * npol; if (this->dbecp == nullptr) { - resmem_complex_op()(this->ctx, dbecp, this->nbands * this->nkb); + resmem_complex_op()(this->ctx, dbecp, size_becp); } // prepare math tools @@ -400,7 +407,7 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i transa, transb, nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, ppcell_vkb, @@ -411,6 +418,8 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i dbecp, nkb); // calculate stress for target (ipol, jpol) + if(npol == 1) + { const int current_spin = this->kv_->isk[ik]; cal_stress_nl_op()(this->ctx, nondiagonal, @@ -434,20 +443,48 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i becp, dbecp, stress); + } + else + { + cal_stress_nl_op()(this->ctx, + nondiagonal, + ipol, + jpol, + nkb, + npm, + this->ntype, + this->nbands, + ik, + this->nlpp_->deeq_nc.getBound2(), + this->nlpp_->deeq_nc.getBound3(), + this->nlpp_->deeq_nc.getBound4(), + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + this->nlpp_->template get_deeq_nc_data(), + becp, + dbecp, + stress); + } ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s"); } template void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) { - ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s"); + ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f"); + const int npol = this->ucell_->get_npol(); + const int npm_npol = npm * npol; + const int size_becp = this->nbands * npol * this->nkb; if (this->dbecp == nullptr) { - resmem_complex_op()(this->ctx, dbecp, 3 * this->nbands * this->nkb); + resmem_complex_op()(this->ctx, dbecp, 3 * size_becp); } - std::complex* dbecp_ptr = this->dbecp + ipol * this->nbands * this->nkb; + std::complex* dbecp_ptr = this->dbecp + ipol * size_becp; const std::complex* vkb_ptr = this->ppcell_vkb; std::complex* vkb_deri_ptr = this->ppcell_vkb; @@ -480,7 +517,7 @@ void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) transa, transb, this->nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, vkb_deri_ptr, @@ -490,7 +527,6 @@ void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) &ModuleBase::ZERO, dbecp_ptr, nkb); - this->revert_vkb(npw, ipol); this->pre_ik_f = ik; ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f"); @@ -633,29 +669,57 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force const int current_spin = this->kv_->isk[ik]; const int force_nc = 3; // calculate the force - cal_force_nl_op()(this->ctx, - nondiagonal, - npm, - this->nbands, - this->ntype, - current_spin, - this->nlpp_->deeq.getBound2(), - this->nlpp_->deeq.getBound3(), - this->nlpp_->deeq.getBound4(), - force_nc, - this->nbands, - ik, - nkb, - atom_nh, - atom_na, - this->ucell_->tpiba, - d_wg, - d_ekb, - qq_nt, - deeq, - becp, - dbecp, - force); + if(this->ucell_->get_npol() == 1) + { + cal_force_nl_op()(this->ctx, + nondiagonal, + npm, + this->nbands, + this->ntype, + current_spin, + this->nlpp_->deeq.getBound2(), + this->nlpp_->deeq.getBound3(), + this->nlpp_->deeq.getBound4(), + force_nc, + this->nbands, + ik, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg, + d_ekb, + qq_nt, + deeq, + becp, + dbecp, + force); + } + else + { + cal_force_nl_op()(this->ctx, + nondiagonal, + npm, + this->nbands, + this->ntype, + this->nlpp_->deeq_nc.getBound2(), + this->nlpp_->deeq_nc.getBound3(), + this->nlpp_->deeq_nc.getBound4(), + force_nc, + this->nbands, + ik, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg, + d_ekb, + qq_nt, + this->nlpp_->template get_deeq_nc_data(), + becp, + dbecp, + force); + } } // template instantiation diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 1584c5c681..041bfc3b19 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -145,6 +145,108 @@ struct cal_force_nl_op } // end it #ifdef _OPENMP } +#endif + }; + + void operator()(const base_device::DEVICE_CPU* ctx, + const bool& nondiagonal, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) + { +#ifdef _OPENMP +#pragma omp parallel + { +#endif + int iat0 = 0; + int sum0 = 0; + for (int it = 0; it < ntype; it++) + { + const int Nprojs = atom_nh[it]; +#ifdef _OPENMP +#pragma omp for collapse(2) +#endif + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ib = 0; ib < nbands_occ; ib++) + { + const int ib2 = ib*2; + FPTYPE local_force[3] = {0, 0, 0}; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + int iat = iat0 + ia; + int sum = sum0 + ia * Nprojs; + for (int ip = 0; ip < Nprojs; ip++) + { + const int inkb = sum + ip; + // out<<"\n ps = "< ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + std::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + std::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + std::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + std::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + + for (int ipol = 0; ipol < 3; ipol++) + { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const std::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const std::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const std::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const std::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + + local_force[ipol] -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + } + } +#ifdef _OPENMP + if (omp_get_num_threads() > 1) + { + for (int ipol = 0; ipol < 3; ipol++) + { +#pragma omp atomic + force[iat * forcenl_nc + ipol] += local_force[ipol]; + } + } + else +#endif + { + for (int ipol = 0; ipol < 3; ipol++) + { + force[iat * forcenl_nc + ipol] += local_force[ipol]; + } + } + } + } // end ia + iat0 += atom_na[it]; + sum0 += atom_na[it] * Nprojs; + } // end it +#ifdef _OPENMP + } #endif } }; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index b04b769df8..b9fb84344a 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -92,6 +92,29 @@ struct cal_force_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* force); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_CPU* ctx, + const bool& nondiagonal, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); }; #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index 2f551db297..e58cfb3e7b 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -135,6 +135,82 @@ struct cal_stress_nl_op } // end it #ifdef _OPENMP } +#endif + stress[ipol * 3 + jpol] += local_stress; + }; + + void operator()(const base_device::DEVICE_CPU* ctx, + const bool& nondiagonal, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) + { + FPTYPE local_stress = 0; +#ifdef _OPENMP +#pragma omp parallel reduction(+ : local_stress) + { +#endif + int iat = 0, sum = 0; + for (int it = 0; it < ntype; it++) + { + const int Nprojs = atom_nh[it]; +#ifdef _OPENMP +#pragma omp for collapse(4) +#endif + for (int ib = 0; ib < nbands_occ; ib++) + { + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ip1 = 0; ip1 < Nprojs; ip1++) + { + for (int ip2 = 0; ip2 < Nprojs; ip2++) + { + if (!nondiagonal && ip1 != ip2) + { + continue; + } + const int ib2 = ib*2; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + std::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + std::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + std::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + std::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + std::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const int inkb1 = sum + ia * Nprojs + ip1; + const int inkb2 = sum + ia * Nprojs + ip2; + // out<<"\n ps = "< dbb0 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real(); + const std::complex dbb1 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real(); + const std::complex dbb2 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real(); + const std::complex dbb3 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real(); + local_stress -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + } // end ip + } // ia + } + sum += atom_na[it] * Nprojs; + iat += atom_na[it]; + } // end it +#ifdef _OPENMP + } #endif stress[ipol * 3 + jpol] += local_stress; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index b4dfc35ff8..a2852c00c9 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -103,6 +103,28 @@ struct cal_stress_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* stress); + // interface for nspin=4 only + void operator()(const Device* ctx, + const bool& nondiagonal, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); }; template diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp index 7a97be57d7..79d2d1ec79 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp @@ -39,6 +39,7 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, FPTYPE tpiba = ModuleBase::TWO_PI / GlobalC::ucell.lat0; FPTYPE twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI); FPTYPE* kfac = new FPTYPE[npwx]; + const int npol = GlobalC::ucell.get_npol(); for (int ik = 0; ik < p_kv->get_nks(); ik++) { @@ -71,8 +72,7 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, { for (int ibnd = 0; ibnd < GlobalV::NBANDS; ibnd++) { - if (std::fabs(wg(ik, ibnd)) < ModuleBase::threshold_wg * wg(ik, 0)) - continue; + if (wg(ik, ibnd) == 0.0) continue; const std::complex* ppsi = nullptr; ppsi = &(psi_in[0](ik, ibnd, 0)); @@ -85,6 +85,18 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real())); } + if(npol == 2) + { + ppsi = &(psi_in[0](ik, ibnd, npwx)); +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : sum) +#endif + for (int i = 0; i < npw; i++) + { + sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] + * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real())); + } + } s_kin[l][m] += sum; } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp index be64b4e573..b3e5eab6ba 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp @@ -50,11 +50,11 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, aux[ir] = std::complex(chr->rho[0][ir], 0.0 ); } } - for (int is = 1; is < nspin_rho; is++) + if(nspin_rho == 2) { for (int ir = irb; ir < ir_end; ++ir) { // accumulate aux - aux[ir] += std::complex(chr->rho[is][ir], 0.0 ); + aux[ir] += std::complex(chr->rho[1][ir], 0.0 ); } } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp index b243d1ff5d..1ff91afe6c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp @@ -48,7 +48,7 @@ void Stress_Func::stress_nl(ModuleBase::matrix& sigma, break; } } - const int npm = ucell_in.get_npol() * nbands_occ; + const int npm = nbands_occ; // calculate becp = for all beta functions nl_tools.cal_becp(ik, npm); From 8937146fc459069241fef148c134494ff785608a Mon Sep 17 00:00:00 2001 From: dyzheng Date: Tue, 6 Aug 2024 07:14:26 +0000 Subject: [PATCH 2/6] Fix: bug of pw force&stress calculation --- source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index e189f937d9..c01aae9e96 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -107,12 +107,6 @@ void FS_Nonlocal_tools::allocate_memory(const ModuleBase::matrix this->atom_na = h_atom_na.data(); this->ppcell_vkb = this->nlpp_->vkb.c; } - - // prepare the memory of the becp and dbecp: - // becp: - // dbecp: - resmem_complex_op()(this->ctx, becp, this->nbands * nkb, "Stress::becp"); - resmem_complex_op()(this->ctx, dbecp, 6 * this->nbands * nkb, "Stress::dbecp"); } template From 65a93f7534695bf783d94746bd1994fce5545ba8 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Wed, 7 Aug 2024 06:40:54 +0000 Subject: [PATCH 3/6] fix: soc error and cuda kernels --- .../hamilt_pwdft/fs_nonlocal_tools.cpp | 2 - .../hamilt_pwdft/kernels/cuda/force_op.cu | 103 +++++++++++++ .../hamilt_pwdft/kernels/cuda/stress_op.cu | 135 ++++++++++++++---- .../hamilt_pwdft/kernels/force_op.cpp | 3 - .../hamilt_pwdft/kernels/force_op.h | 23 ++- .../hamilt_pwdft/kernels/stress_op.cpp | 13 +- .../hamilt_pwdft/kernels/stress_op.h | 22 ++- 7 files changed, 258 insertions(+), 43 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index c01aae9e96..f81f5d0880 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -441,7 +441,6 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i else { cal_stress_nl_op()(this->ctx, - nondiagonal, ipol, jpol, nkb, @@ -692,7 +691,6 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force else { cal_force_nl_op()(this->ctx, - nondiagonal, npm, this->nbands, this->ntype, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index f8df7b3029..fb690754c9 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -170,6 +170,109 @@ void cal_force_nl_op::operator()(const base_dev cudaCheckOnDebug(); } +template +__global__ void cal_force_nl( + const int wg_nc, + const int ntype, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int forcenl_nc, + const int nbands, + const int ik, + const int nkb, + const int *atom_nh, + const int *atom_na, + const FPTYPE tpiba, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *force) +{ + const int ib = blockIdx.x / ntype; + const int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + int Nprojs = atom_nh[it]; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + for (int ia = 0; ia < atom_na[it]; ia++) { + for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + const int inkb = sum + ip; + for (int ip2 = 0; ip2 < Nprojs; ip2++) + { + // Effective values of the D-eS coefficients + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + const thrust::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + const int jnkb = sum + ip2; + for (int ipol = 0; ipol < 3; ipol++) { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const std::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const std::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const std::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const std::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + } + iat += 1; + sum += Nprojs; + } +} + +// interface for nspin=4 only +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + cal_force_nl<<>>( + wg_nc, ntype, + deeq_2, deeq_3, deeq_4, + forcenl_nc, nbands, ik, nkb, + atom_nh, atom_na, + tpiba, + d_wg, d_ekb, qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force);// array of data + + cudaCheckOnDebug(); +} + template __global__ void saveVkbValues_( const int *gcar_zero_ptrs, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index 769d6351e9..c379fc1082 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -240,6 +240,114 @@ void cal_stress_nl_op::operator()(const base_de cudaCheckOnDebug(); } +template +__global__ void cal_stress_nl( + const int ipol, + const int jpol, + const int nkb, + const int ntype, + const int wg_nc, + const int ik, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; + const int Nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { + int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + FPTYPE ps = deeq_nc[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] + - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + const int inkb1 = sum + ip1; + const int inkb2 = sum + ip2; + //out<<"\n ps = "< dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const thrust::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + ++iat; + sum+=Nprojs; + }//ia + __syncwarp(); + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress + ipol * 3 + jpol, stress_var); + } +} + +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + cal_stress_nl<<>>( + ipol, + jpol, + nkb, + ntype, + wg_nc, + ik, + deeq_2, + deeq_3, + deeq_4, + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + cudaCheckOnDebug(); +} + template void cal_stress_mgga_op::operator()( const int& spin, @@ -638,33 +746,6 @@ void cal_stress_drhoc_aux_op::operator()( return ; } -// template -// void prepare_vkb_deri_ptr_op::operator()( -// const base_device::DEVICE_GPU* ctx, -// int nbeta, double* nhtol, int nhtol_nc, int npw, int it, -// int ipol, int jpol, -// std::complex*vkb_out, std::complex** vkb_ptrs, -// FPTYPE* ylm_in, FPTYPE** ylm_ptrs, -// FPTYPE* ylm_deri_in, FPTYPE** ylm_deri_ptr1s, FPTYPE** ylm_deri_ptr2s, -// FPTYPE* vq_in, FPTYPE** vq_ptrs, -// FPTYPE* vq_deri_in, FPTYPE** vq_deri_ptrs -// ) -// { -// const int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; -// dim3 gridsize(block,nbeta); - - -// prepare_vkb_deri_ptr<<<1,1>>>( -// nbeta, nhtol, nhtol_nc, npw, it, ipol, jpol, -// reinterpret_cast*>(vkb_out), -// reinterpret_cast*>(vkb_ptrs), -// ylm_in, ylm_ptrs, ylm_deri_in, ylm_deri_ptr1s, ylm_deri_ptr2s, -// vq_in, vq_ptrs, vq_deri_in, vq_deri_ptrs -// ); - -// return ; -// } - template <> void pointer_array_malloc::operator()( diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 041bfc3b19..d152b25a7d 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -149,7 +149,6 @@ struct cal_force_nl_op }; void operator()(const base_device::DEVICE_CPU* ctx, - const bool& nondiagonal, const int& nbands_occ, const int& wg_nc, const int& ntype, @@ -199,8 +198,6 @@ struct cal_force_nl_op // out<<"\n ps = "< ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; const int jnkb = sum + ip2; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index b9fb84344a..f729280ef6 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -94,7 +94,6 @@ struct cal_force_nl_op FPTYPE* force); // interface for nspin=4 only void operator()(const base_device::DEVICE_CPU* ctx, - const bool& nondiagonal, const int& nbands_occ, const int& wg_nc, const int& ntype, @@ -159,6 +158,28 @@ struct cal_force_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* force); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); }; /** diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index e58cfb3e7b..b951714e00 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -140,7 +140,6 @@ struct cal_stress_nl_op }; void operator()(const base_device::DEVICE_CPU* ctx, - const bool& nondiagonal, const int& ipol, const int& jpol, const int& nkb, @@ -181,10 +180,6 @@ struct cal_stress_nl_op { for (int ip2 = 0; ip2 < Nprojs; ip2++) { - if (!nondiagonal && ip1 != ip2) - { - continue; - } const int ib2 = ib*2; FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; @@ -197,10 +192,10 @@ struct cal_stress_nl_op const int inkb2 = sum + ia * Nprojs + ip2; // out<<"\n ps = "< dbb0 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real(); - const std::complex dbb1 = (conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real(); - const std::complex dbb2 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]).real(); - const std::complex dbb3 = (conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]).real(); + const std::complex dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const std::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const std::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const std::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; local_stress -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); } } // end ip diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index a2852c00c9..177761a95d 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -105,7 +105,6 @@ struct cal_stress_nl_op FPTYPE* stress); // interface for nspin=4 only void operator()(const Device* ctx, - const bool& nondiagonal, const int& ipol, const int& jpol, const int& nkb, @@ -259,6 +258,27 @@ struct cal_stress_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* stress); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); }; // cpu version first, gpu version later From 5b629550d1e9495ea915e31ffcdfa5a4d63e2cf5 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Wed, 7 Aug 2024 08:30:37 +0000 Subject: [PATCH 4/6] Fix: CUDA and ROCM --- .../hamilt_pwdft/kernels/cuda/force_op.cu | 11 +- .../hamilt_pwdft/kernels/cuda/stress_op.cu | 2 - .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 104 +++++++++++++++++ .../kernels/rocm/stress_op.hip.cu | 106 ++++++++++++++++++ 4 files changed, 216 insertions(+), 7 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index fb690754c9..e23e0ccbc3 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -193,6 +193,7 @@ __global__ void cal_force_nl( FPTYPE *force) { const int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; const int it = blockIdx.x % ntype; int iat = 0, sum = 0; @@ -217,14 +218,13 @@ __global__ void cal_force_nl( const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; - const int jnkb = sum + ip2; for (int ipol = 0; ipol < 3; ipol++) { const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; const int index1 = ib2 * nkb + jnkb; - const std::complex dbb0 = conj(dbecp[index0]) * becp[index1]; - const std::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; - const std::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; - const std::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + const thrust::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const thrust::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const thrust::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const thrust::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); atomicAdd(force + iat * forcenl_nc + ipol, tmp); } @@ -236,6 +236,7 @@ __global__ void cal_force_nl( } // interface for nspin=4 only +template void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, const int& nbands_occ, const int& wg_nc, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index c379fc1082..98918c6523 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -282,8 +282,6 @@ __global__ void cal_stress_nl( const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; - FPTYPE ps = deeq_nc[((spin * deeq_2 + iat) * deeq_3 + ip1) * deeq_4 + ip2] - - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; const int inkb1 = sum + ip1; const int inkb2 = sum + ip2; //out<<"\n ps = "<::operator()(const base_dev hipCheckOnDebug(); } +template +__global__ void cal_force_nl( + const int wg_nc, + const int ntype, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int forcenl_nc, + const int nbands, + const int ik, + const int nkb, + const int *atom_nh, + const int *atom_na, + const FPTYPE tpiba, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *force) +{ + const int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + int Nprojs = atom_nh[it]; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + for (int ia = 0; ia < atom_na[it]; ia++) { + for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + const int inkb = sum + ip; + for (int ip2 = 0; ip2 < Nprojs; ip2++) + { + // Effective values of the D-eS coefficients + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + const thrust::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + for (int ipol = 0; ipol < 3; ipol++) { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const thrust::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const thrust::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const thrust::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const thrust::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + } + iat += 1; + sum += Nprojs; + } +} + +// interface for nspin=4 only +template +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, + wg_nc, ntype, + deeq_2, deeq_3, deeq_4, + forcenl_nc, nbands, ik, nkb, + atom_nh, atom_na, + tpiba, + d_wg, d_ekb, qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force);// array of data + + hipCheckOnDebug(); +} + template __global__ void saveVkbValues_( const int *gcar_zero_ptrs, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index d7db3b3eea..1afba3a5b1 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -238,6 +238,112 @@ void cal_stress_nl_op::operator()(const base_de hipCheckOnDebug(); } +template +__global__ void cal_stress_nl( + const int ipol, + const int jpol, + const int nkb, + const int ntype, + const int wg_nc, + const int ik, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; + const int Nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { + int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const int inkb1 = sum + ip1; + const int inkb2 = sum + ip2; + //out<<"\n ps = "< dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const thrust::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + ++iat; + sum+=Nprojs; + }//ia + __syncwarp(); + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress + ipol * 3 + jpol, stress_var); + } +} + +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, + ipol, + jpol, + nkb, + ntype, + wg_nc, + ik, + deeq_2, + deeq_3, + deeq_4, + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + hipCheckOnDebug(); +} + template void cal_stress_mgga_op::operator()( const int& spin, From 32ddfaf7e48b37a8f74ce7c83ef0fd404231d7f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci-lite[bot]" <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com> Date: Wed, 7 Aug 2024 09:12:02 +0000 Subject: [PATCH 5/6] [pre-commit.ci lite] apply automatic fixes --- source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp | 6 ++++-- source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp index 79d2d1ec79..f05e64458a 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp @@ -29,8 +29,9 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, int npwx=0; for (int ik = 0; ik < p_kv->get_nks(); ik++) { - if (npwx < p_kv->ngk[ik]) + if (npwx < p_kv->ngk[ik]) { npwx = p_kv->ngk[ik]; +} } gk[0]= new FPTYPE[npwx]; @@ -72,7 +73,8 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, { for (int ibnd = 0; ibnd < GlobalV::NBANDS; ibnd++) { - if (wg(ik, ibnd) == 0.0) continue; + if (wg(ik, ibnd) == 0.0) { continue; +} const std::complex* ppsi = nullptr; ppsi = &(psi_in[0](ik, ibnd, 0)); diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp index b3e5eab6ba..cb306fceeb 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp @@ -22,7 +22,8 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, const int nspin_rho = (GlobalV::NSPIN == 2) ? 2 : 1; - if (PARAM.inp.gamma_only && is_pw) fact=2.0; + if (PARAM.inp.gamma_only && is_pw) { fact=2.0; +} @@ -70,12 +71,13 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, { for (int ig=0; ignpw; ig++) { - if (rho_basis->ig_gge0 == ig) + if (rho_basis->ig_gge0 == ig) { evloc += GlobalC::ppcell.vloc(it, rho_basis->ig2igg[ig]) * (p_sf->strucFac(it, ig) * conj(aux[ig])).real(); - else + } else { evloc += GlobalC::ppcell.vloc(it, rho_basis->ig2igg[ig]) * (p_sf->strucFac(it, ig) * conj(aux[ig]) * fact).real(); +} } } } From 34d349545d4324aef955f57ee91c6d0460c278f0 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Mon, 26 Aug 2024 16:15:35 +0800 Subject: [PATCH 6/6] Fix: nspin=4 error on DCU/GPU --- .../hamilt_pwdft/kernels/cuda/nonlocal_op.cu | 8 ++++---- .../hamilt_pwdft/kernels/cuda/stress_op.cu | 14 +++++++++----- .../hamilt_pwdft/kernels/force_op.cpp | 14 +++++++------- .../hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu | 8 ++++---- 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu index 275fbfe136..89a74f3c5f 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu @@ -54,8 +54,8 @@ __global__ void nonlocal_pw( thrust::complex* ps, const thrust::complex* becp) { - const int ii = blockIdx.x / l2; - const int jj = blockIdx.x % l2; + const int ii = blockIdx.x * 2 / l2; + const int jj = blockIdx.x * 2 % l2; for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) { thrust::complex res1(0.0, 0.0); thrust::complex res2(0.0, 0.0); @@ -121,7 +121,7 @@ void hamilt::nonlocal_pw_op::operator()(const b { // denghui implement 20221109 // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - nonlocal_pw<<>>( + nonlocal_pw<<>>( l1, l2, l3, // loop size sum, iat, nkb, // control params deeq_x, deeq_y, deeq_z, @@ -138,4 +138,4 @@ void hamilt::nonlocal_pw_op::operator()(const b template struct nonlocal_pw_op; template struct nonlocal_pw_op; -} // namespace hamilt \ No newline at end of file +} // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index 98918c6523..f451abd26c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -261,22 +261,26 @@ __global__ void cal_stress_nl( const thrust::complex *dbecp, FPTYPE *stress) { - int ib = blockIdx.x / ntype; + const int ib = blockIdx.x / ntype; // index of loop-nbands const int ib2 = ib * 2; - int it = blockIdx.x % ntype; + const int it = blockIdx.x % ntype; // index of loop-ntype - int iat = 0, sum = 0; + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index for (int ii = 0; ii < it; ii++) { iat += atom_na[ii]; sum += atom_na[ii] * atom_nh[ii]; } - FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; + FPTYPE stress_var = 0; + const FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; + const FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; const int Nprojs = atom_nh[it]; for (int ia = 0; ia < atom_na[it]; ia++) { for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { - int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + const int ip1 = ii / Nprojs; + const int ip2 = ii % Nprojs; const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index d152b25a7d..90a4641123 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -178,7 +178,7 @@ struct cal_force_nl_op int sum0 = 0; for (int it = 0; it < ntype; it++) { - const int Nprojs = atom_nh[it]; + const int nprojs = atom_nh[it]; #ifdef _OPENMP #pragma omp for collapse(2) #endif @@ -191,12 +191,12 @@ struct cal_force_nl_op FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; int iat = iat0 + ia; - int sum = sum0 + ia * Nprojs; - for (int ip = 0; ip < Nprojs; ip++) + int sum = sum0 + ia * nprojs; + for (int ip = 0; ip < nprojs; ip++) { const int inkb = sum + ip; // out<<"\n ps = "< ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; @@ -223,7 +223,7 @@ struct cal_force_nl_op #ifdef _OPENMP if (omp_get_num_threads() > 1) { - for (int ipol = 0; ipol < 3; ipol++) + for (int ipol = 0; ipol < 3; ++ipol) { #pragma omp atomic force[iat * forcenl_nc + ipol] += local_force[ipol]; @@ -232,7 +232,7 @@ struct cal_force_nl_op else #endif { - for (int ipol = 0; ipol < 3; ipol++) + for (int ipol = 0; ipol < 3; ++ipol) { force[iat * forcenl_nc + ipol] += local_force[ipol]; } @@ -240,7 +240,7 @@ struct cal_force_nl_op } } // end ia iat0 += atom_na[it]; - sum0 += atom_na[it] * Nprojs; + sum0 += atom_na[it] * nprojs; } // end it #ifdef _OPENMP } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu index 93df9fe58d..9a43f64ea6 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu @@ -54,8 +54,8 @@ __global__ void nonlocal_pw( thrust::complex* ps, const thrust::complex* becp) { - const int ii = blockIdx.x / l2; - const int jj = blockIdx.x % l2; + const int ii = blockIdx.x * 2 / l2; + const int jj = blockIdx.x * 2 % l2; for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) { thrust::complex res1(0.0, 0.0); thrust::complex res2(0.0, 0.0); @@ -122,7 +122,7 @@ void hamilt::nonlocal_pw_op::operator()(const b { // denghui implement 20221109 // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw), dim3(l1 * l2), dim3(THREADS_PER_BLOCK), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw), dim3(l1 * l2 / 2), dim3(THREADS_PER_BLOCK), 0, 0, l1, l2, l3, // loop size sum, iat, nkb, // control params deeq_x, deeq_y, deeq_z, @@ -140,4 +140,4 @@ void hamilt::nonlocal_pw_op::operator()(const b namespace hamilt{ template struct nonlocal_pw_op; template struct nonlocal_pw_op; -} \ No newline at end of file +}