diff --git a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp index 4d94f68ec4..d1082c809b 100644 --- a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp +++ b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp @@ -186,7 +186,7 @@ void XC_Functional::gradcorr(double &etxc, double &vtxc, ModuleBase::matrix &v, } gdr2 = new ModuleBase::Vector3[rhopw->nrxx]; - h2 = new ModuleBase::Vector3[rhopw->nrxx]; + if(!is_stress) h2 = new ModuleBase::Vector3[rhopw->nrxx]; XC_Functional::grad_rho( rhogsum1 , gdr1, rhopw, ucell->tpiba); XC_Functional::grad_rho( rhogsum2 , gdr2, rhopw, ucell->tpiba); diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp index 04caefd1bf..92733745b4 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp @@ -45,7 +45,7 @@ void Forces::cal_force_nl(ModuleBase::matrix& forcenl, break; } } - const int npm = ucell_in.get_npol() * nbands_occ; + const int npm = nbands_occ; // calculate becp = for all beta functions nl_tools.cal_becp(ik, npm); for (int ipol = 0; ipol < 3; ipol++) diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index f6773b3ddb..f81f5d0880 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -107,12 +107,6 @@ void FS_Nonlocal_tools::allocate_memory(const ModuleBase::matrix this->atom_na = h_atom_na.data(); this->ppcell_vkb = this->nlpp_->vkb.c; } - - // prepare the memory of the becp and dbecp: - // becp: - // dbecp: - resmem_complex_op()(this->ctx, becp, this->nbands * nkb, "Stress::becp"); - resmem_complex_op()(this->ctx, dbecp, 6 * this->nbands * nkb, "Stress::dbecp"); } template @@ -162,9 +156,12 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) { ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); + const int npol = this->ucell_->get_npol(); + const int size_becp = this->nbands * npol * this->nkb; + const int size_becp_act = npm * npol * this->nkb; if (this->becp == nullptr) { - resmem_complex_op()(this->ctx, becp, this->nbands * this->nkb); + resmem_complex_op()(this->ctx, becp, size_becp); } // prepare math tools @@ -248,11 +245,12 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) } const char transa = 'C'; const char transb = 'N'; + int npm_npol = npm * npol; gemm_op()(this->ctx, transa, transb, nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, ppcell_vkb, @@ -267,15 +265,15 @@ void FS_Nonlocal_tools::cal_becp(int ik, int npm) if (this->device == base_device::GpuDevice) { std::complex* h_becp = nullptr; - resmem_complex_h_op()(this->cpu_ctx, h_becp, this->nbands * nkb); - syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, this->nbands * nkb); - Parallel_Reduce::reduce_pool(h_becp, this->nbands * nkb); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, this->nbands * nkb); + resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act); + syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, size_becp_act); + Parallel_Reduce::reduce_pool(h_becp, size_becp_act); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, size_becp_act); delmem_complex_h_op()(this->cpu_ctx, h_becp); } else { - Parallel_Reduce::reduce_pool(becp, this->nbands * this->nkb); + Parallel_Reduce::reduce_pool(becp, size_becp_act); } ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp"); } @@ -286,9 +284,12 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i { ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s"); + const int npol = this->ucell_->get_npol(); + const int size_becp = this->nbands * npol * this->nkb; + const int npm_npol = npm * npol; if (this->dbecp == nullptr) { - resmem_complex_op()(this->ctx, dbecp, this->nbands * this->nkb); + resmem_complex_op()(this->ctx, dbecp, size_becp); } // prepare math tools @@ -400,7 +401,7 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i transa, transb, nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, ppcell_vkb, @@ -411,6 +412,8 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i dbecp, nkb); // calculate stress for target (ipol, jpol) + if(npol == 1) + { const int current_spin = this->kv_->isk[ik]; cal_stress_nl_op()(this->ctx, nondiagonal, @@ -434,20 +437,47 @@ void FS_Nonlocal_tools::cal_dbecp_s(int ik, int npm, int ipol, i becp, dbecp, stress); + } + else + { + cal_stress_nl_op()(this->ctx, + ipol, + jpol, + nkb, + npm, + this->ntype, + this->nbands, + ik, + this->nlpp_->deeq_nc.getBound2(), + this->nlpp_->deeq_nc.getBound3(), + this->nlpp_->deeq_nc.getBound4(), + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + this->nlpp_->template get_deeq_nc_data(), + becp, + dbecp, + stress); + } ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s"); } template void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) { - ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s"); + ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f"); ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f"); + const int npol = this->ucell_->get_npol(); + const int npm_npol = npm * npol; + const int size_becp = this->nbands * npol * this->nkb; if (this->dbecp == nullptr) { - resmem_complex_op()(this->ctx, dbecp, 3 * this->nbands * this->nkb); + resmem_complex_op()(this->ctx, dbecp, 3 * size_becp); } - std::complex* dbecp_ptr = this->dbecp + ipol * this->nbands * this->nkb; + std::complex* dbecp_ptr = this->dbecp + ipol * size_becp; const std::complex* vkb_ptr = this->ppcell_vkb; std::complex* vkb_deri_ptr = this->ppcell_vkb; @@ -480,7 +510,7 @@ void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) transa, transb, this->nkb, - npm, + npm_npol, npw, &ModuleBase::ONE, vkb_deri_ptr, @@ -490,7 +520,6 @@ void FS_Nonlocal_tools::cal_dbecp_f(int ik, int npm, int ipol) &ModuleBase::ZERO, dbecp_ptr, nkb); - this->revert_vkb(npw, ipol); this->pre_ik_f = ik; ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f"); @@ -633,29 +662,56 @@ void FS_Nonlocal_tools::cal_force(int ik, int npm, FPTYPE* force const int current_spin = this->kv_->isk[ik]; const int force_nc = 3; // calculate the force - cal_force_nl_op()(this->ctx, - nondiagonal, - npm, - this->nbands, - this->ntype, - current_spin, - this->nlpp_->deeq.getBound2(), - this->nlpp_->deeq.getBound3(), - this->nlpp_->deeq.getBound4(), - force_nc, - this->nbands, - ik, - nkb, - atom_nh, - atom_na, - this->ucell_->tpiba, - d_wg, - d_ekb, - qq_nt, - deeq, - becp, - dbecp, - force); + if(this->ucell_->get_npol() == 1) + { + cal_force_nl_op()(this->ctx, + nondiagonal, + npm, + this->nbands, + this->ntype, + current_spin, + this->nlpp_->deeq.getBound2(), + this->nlpp_->deeq.getBound3(), + this->nlpp_->deeq.getBound4(), + force_nc, + this->nbands, + ik, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg, + d_ekb, + qq_nt, + deeq, + becp, + dbecp, + force); + } + else + { + cal_force_nl_op()(this->ctx, + npm, + this->nbands, + this->ntype, + this->nlpp_->deeq_nc.getBound2(), + this->nlpp_->deeq_nc.getBound3(), + this->nlpp_->deeq_nc.getBound4(), + force_nc, + this->nbands, + ik, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg, + d_ekb, + qq_nt, + this->nlpp_->template get_deeq_nc_data(), + becp, + dbecp, + force); + } } // template instantiation diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index f8df7b3029..e23e0ccbc3 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -170,6 +170,110 @@ void cal_force_nl_op::operator()(const base_dev cudaCheckOnDebug(); } +template +__global__ void cal_force_nl( + const int wg_nc, + const int ntype, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int forcenl_nc, + const int nbands, + const int ik, + const int nkb, + const int *atom_nh, + const int *atom_na, + const FPTYPE tpiba, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *force) +{ + const int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + int Nprojs = atom_nh[it]; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + for (int ia = 0; ia < atom_na[it]; ia++) { + for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + const int inkb = sum + ip; + for (int ip2 = 0; ip2 < Nprojs; ip2++) + { + // Effective values of the D-eS coefficients + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + const thrust::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + for (int ipol = 0; ipol < 3; ipol++) { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const thrust::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const thrust::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const thrust::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const thrust::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + } + iat += 1; + sum += Nprojs; + } +} + +// interface for nspin=4 only +template +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + cal_force_nl<<>>( + wg_nc, ntype, + deeq_2, deeq_3, deeq_4, + forcenl_nc, nbands, ik, nkb, + atom_nh, atom_na, + tpiba, + d_wg, d_ekb, qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force);// array of data + + cudaCheckOnDebug(); +} + template __global__ void saveVkbValues_( const int *gcar_zero_ptrs, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu index 275fbfe136..89a74f3c5f 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu @@ -54,8 +54,8 @@ __global__ void nonlocal_pw( thrust::complex* ps, const thrust::complex* becp) { - const int ii = blockIdx.x / l2; - const int jj = blockIdx.x % l2; + const int ii = blockIdx.x * 2 / l2; + const int jj = blockIdx.x * 2 % l2; for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) { thrust::complex res1(0.0, 0.0); thrust::complex res2(0.0, 0.0); @@ -121,7 +121,7 @@ void hamilt::nonlocal_pw_op::operator()(const b { // denghui implement 20221109 // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - nonlocal_pw<<>>( + nonlocal_pw<<>>( l1, l2, l3, // loop size sum, iat, nkb, // control params deeq_x, deeq_y, deeq_z, @@ -138,4 +138,4 @@ void hamilt::nonlocal_pw_op::operator()(const b template struct nonlocal_pw_op; template struct nonlocal_pw_op; -} // namespace hamilt \ No newline at end of file +} // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index e965446d77..4f134e0042 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -244,6 +244,116 @@ void cal_stress_nl_op::operator()(const base_de cudaCheckOnDebug(); } +template +__global__ void cal_stress_nl( + const int ipol, + const int jpol, + const int nkb, + const int ntype, + const int wg_nc, + const int ik, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + const int ib = blockIdx.x / ntype; // index of loop-nbands + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; // index of loop-ntype + + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + FPTYPE stress_var = 0; + const FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; + const FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + const int Nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { + const int ip1 = ii / Nprojs; + const int ip2 = ii % Nprojs; + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const int inkb1 = sum + ip1; + const int inkb2 = sum + ip2; + //out<<"\n ps = "< dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const thrust::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + ++iat; + sum+=Nprojs; + }//ia + __syncwarp(); + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress + ipol * 3 + jpol, stress_var); + } +} + +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + cal_stress_nl<<>>( + ipol, + jpol, + nkb, + ntype, + wg_nc, + ik, + deeq_2, + deeq_3, + deeq_4, + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + cudaCheckOnDebug(); +} + template void cal_stress_mgga_op::operator()( const int& spin, @@ -710,34 +820,6 @@ void cal_force_npw_op::operator()( } -// template -// void prepare_vkb_deri_ptr_op::operator()( -// const base_device::DEVICE_GPU* ctx, -// int nbeta, double* nhtol, int nhtol_nc, int npw, int it, -// int ipol, int jpol, -// std::complex*vkb_out, std::complex** vkb_ptrs, -// FPTYPE* ylm_in, FPTYPE** ylm_ptrs, -// FPTYPE* ylm_deri_in, FPTYPE** ylm_deri_ptr1s, FPTYPE** ylm_deri_ptr2s, -// FPTYPE* vq_in, FPTYPE** vq_ptrs, -// FPTYPE* vq_deri_in, FPTYPE** vq_deri_ptrs -// ) -// { -// const int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; -// dim3 gridsize(block,nbeta); - - -// prepare_vkb_deri_ptr<<<1,1>>>( -// nbeta, nhtol, nhtol_nc, npw, it, ipol, jpol, -// reinterpret_cast*>(vkb_out), -// reinterpret_cast*>(vkb_ptrs), -// ylm_in, ylm_ptrs, ylm_deri_in, ylm_deri_ptr1s, ylm_deri_ptr2s, -// vq_in, vq_ptrs, vq_deri_in, vq_deri_ptrs -// ); - -// return ; -// } - - template <> void pointer_array_malloc::operator()( void **ptr, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 1584c5c681..90a4641123 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -145,6 +145,105 @@ struct cal_force_nl_op } // end it #ifdef _OPENMP } +#endif + }; + + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) + { +#ifdef _OPENMP +#pragma omp parallel + { +#endif + int iat0 = 0; + int sum0 = 0; + for (int it = 0; it < ntype; it++) + { + const int nprojs = atom_nh[it]; +#ifdef _OPENMP +#pragma omp for collapse(2) +#endif + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ib = 0; ib < nbands_occ; ib++) + { + const int ib2 = ib*2; + FPTYPE local_force[3] = {0, 0, 0}; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + int iat = iat0 + ia; + int sum = sum0 + ia * nprojs; + for (int ip = 0; ip < nprojs; ip++) + { + const int inkb = sum + ip; + // out<<"\n ps = "< ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + std::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + std::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + std::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + std::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + + for (int ipol = 0; ipol < 3; ipol++) + { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const std::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const std::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const std::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const std::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + + local_force[ipol] -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + } + } +#ifdef _OPENMP + if (omp_get_num_threads() > 1) + { + for (int ipol = 0; ipol < 3; ++ipol) + { +#pragma omp atomic + force[iat * forcenl_nc + ipol] += local_force[ipol]; + } + } + else +#endif + { + for (int ipol = 0; ipol < 3; ++ipol) + { + force[iat * forcenl_nc + ipol] += local_force[ipol]; + } + } + } + } // end ia + iat0 += atom_na[it]; + sum0 += atom_na[it] * nprojs; + } // end it +#ifdef _OPENMP + } #endif } }; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index b04b769df8..f729280ef6 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -92,6 +92,28 @@ struct cal_force_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* force); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); }; #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM @@ -136,6 +158,28 @@ struct cal_force_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* force); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); }; /** diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index fa03fa2a5d..786a6b8ebf 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -166,6 +166,110 @@ void cal_force_nl_op::operator()(const base_dev hipCheckOnDebug(); } +template +__global__ void cal_force_nl( + const int wg_nc, + const int ntype, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int forcenl_nc, + const int nbands, + const int ik, + const int nkb, + const int *atom_nh, + const int *atom_na, + const FPTYPE tpiba, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *force) +{ + const int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + int Nprojs = atom_nh[it]; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + for (int ia = 0; ia < atom_na[it]; ia++) { + for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) { + const int inkb = sum + ip; + for (int ip2 = 0; ip2 < Nprojs; ip2++) + { + // Effective values of the D-eS coefficients + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2]; + const int jnkb = sum + ip2; + const thrust::complex ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq; + + for (int ipol = 0; ipol < 3; ipol++) { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const thrust::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const thrust::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const thrust::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const thrust::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + } + iat += 1; + sum += Nprojs; + } +} + +// interface for nspin=4 only +template +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, + wg_nc, ntype, + deeq_2, deeq_3, deeq_4, + forcenl_nc, nbands, ik, nkb, + atom_nh, atom_na, + tpiba, + d_wg, d_ekb, qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force);// array of data + + hipCheckOnDebug(); +} + template __global__ void saveVkbValues_( const int *gcar_zero_ptrs, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu index 93df9fe58d..9a43f64ea6 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu @@ -54,8 +54,8 @@ __global__ void nonlocal_pw( thrust::complex* ps, const thrust::complex* becp) { - const int ii = blockIdx.x / l2; - const int jj = blockIdx.x % l2; + const int ii = blockIdx.x * 2 / l2; + const int jj = blockIdx.x * 2 % l2; for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) { thrust::complex res1(0.0, 0.0); thrust::complex res2(0.0, 0.0); @@ -122,7 +122,7 @@ void hamilt::nonlocal_pw_op::operator()(const b { // denghui implement 20221109 // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw), dim3(l1 * l2), dim3(THREADS_PER_BLOCK), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw), dim3(l1 * l2 / 2), dim3(THREADS_PER_BLOCK), 0, 0, l1, l2, l3, // loop size sum, iat, nkb, // control params deeq_x, deeq_y, deeq_z, @@ -140,4 +140,4 @@ void hamilt::nonlocal_pw_op::operator()(const b namespace hamilt{ template struct nonlocal_pw_op; template struct nonlocal_pw_op; -} \ No newline at end of file +} diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index 17cfae404a..2108d6feb8 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -238,6 +238,112 @@ void cal_stress_nl_op::operator()(const base_de hipCheckOnDebug(); } +template +__global__ void cal_stress_nl( + const int ipol, + const int jpol, + const int nkb, + const int ntype, + const int wg_nc, + const int ik, + const int deeq_2, + const int deeq_3, + const int deeq_4, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const thrust::complex *deeq_nc, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + int ib = blockIdx.x / ntype; + const int ib2 = ib * 2; + int it = blockIdx.x % ntype; + + int iat = 0, sum = 0; + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib]; + const int Nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) { + int ip1 = ii / Nprojs, ip2 = ii % Nprojs; + const thrust::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + const thrust::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const thrust::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + const thrust::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const int inkb1 = sum + ip1; + const int inkb2 = sum + ip2; + //out<<"\n ps = "< dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const thrust::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const thrust::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + ++iat; + sum+=Nprojs; + }//ia + __syncwarp(); + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress + ipol * 3 + jpol, stress_var); + } +} + +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_nl), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, + ipol, + jpol, + nkb, + ntype, + wg_nc, + ik, + deeq_2, + deeq_3, + deeq_4, + atom_nh, + atom_na, + d_wg, + d_ekb, + qq_nt, + reinterpret_cast*>(deeq_nc), + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + hipCheckOnDebug(); +} + template void cal_stress_mgga_op::operator()( const int& spin, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index c3a8008e04..142ec6e4a8 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -135,6 +135,77 @@ struct cal_stress_nl_op } // end it #ifdef _OPENMP } +#endif + stress[ipol * 3 + jpol] += local_stress; + }; + + void operator()(const base_device::DEVICE_CPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) + { + FPTYPE local_stress = 0; +#ifdef _OPENMP +#pragma omp parallel reduction(+ : local_stress) + { +#endif + int iat = 0, sum = 0; + for (int it = 0; it < ntype; it++) + { + const int Nprojs = atom_nh[it]; +#ifdef _OPENMP +#pragma omp for collapse(4) +#endif + for (int ib = 0; ib < nbands_occ; ib++) + { + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ip1 = 0; ip1 < Nprojs; ip1++) + { + for (int ip2 = 0; ip2 < Nprojs; ip2++) + { + const int ib2 = ib*2; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0; + FPTYPE ekb_now = d_ekb[ik * wg_nc + ib]; + std::complex ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2]; + std::complex ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + std::complex ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + std::complex ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2]; + std::complex ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq; + const int inkb1 = sum + ia * Nprojs + ip1; + const int inkb2 = sum + ia * Nprojs + ip2; + // out<<"\n ps = "< dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const std::complex dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + const std::complex dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2]; + const std::complex dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2]; + local_stress -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real(); + } + } // end ip + } // ia + } + sum += atom_na[it] * Nprojs; + iat += atom_na[it]; + } // end it +#ifdef _OPENMP + } #endif stress[ipol * 3 + jpol] += local_stress; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index 62e373f1ff..672fd0800c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -103,6 +103,27 @@ struct cal_stress_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* stress); + // interface for nspin=4 only + void operator()(const Device* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); }; template @@ -250,6 +271,27 @@ struct cal_stress_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* stress); + // interface for nspin=4 only + void operator()(const base_device::DEVICE_GPU* ctx, + const int& ipol, + const int& jpol, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int& deeq_2, + const int& deeq_3, + const int& deeq_4, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* d_ekb, + const FPTYPE* qq_nt, + const std::complex* deeq_nc, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); }; // cpu version first, gpu version later diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp index 7a97be57d7..f05e64458a 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp @@ -29,8 +29,9 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, int npwx=0; for (int ik = 0; ik < p_kv->get_nks(); ik++) { - if (npwx < p_kv->ngk[ik]) + if (npwx < p_kv->ngk[ik]) { npwx = p_kv->ngk[ik]; +} } gk[0]= new FPTYPE[npwx]; @@ -39,6 +40,7 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, FPTYPE tpiba = ModuleBase::TWO_PI / GlobalC::ucell.lat0; FPTYPE twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI); FPTYPE* kfac = new FPTYPE[npwx]; + const int npol = GlobalC::ucell.get_npol(); for (int ik = 0; ik < p_kv->get_nks(); ik++) { @@ -71,8 +73,8 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, { for (int ibnd = 0; ibnd < GlobalV::NBANDS; ibnd++) { - if (std::fabs(wg(ik, ibnd)) < ModuleBase::threshold_wg * wg(ik, 0)) - continue; + if (wg(ik, ibnd) == 0.0) { continue; +} const std::complex* ppsi = nullptr; ppsi = &(psi_in[0](ik, ibnd, 0)); @@ -85,6 +87,18 @@ void Stress_Func::stress_kin(ModuleBase::matrix& sigma, sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real())); } + if(npol == 2) + { + ppsi = &(psi_in[0](ik, ibnd, npwx)); +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : sum) +#endif + for (int i = 0; i < npw; i++) + { + sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i] + * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real())); + } + } s_kin[l][m] += sum; } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp index be64b4e573..cb306fceeb 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp @@ -22,7 +22,8 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, const int nspin_rho = (GlobalV::NSPIN == 2) ? 2 : 1; - if (PARAM.inp.gamma_only && is_pw) fact=2.0; + if (PARAM.inp.gamma_only && is_pw) { fact=2.0; +} @@ -50,11 +51,11 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, aux[ir] = std::complex(chr->rho[0][ir], 0.0 ); } } - for (int is = 1; is < nspin_rho; is++) + if(nspin_rho == 2) { for (int ir = irb; ir < ir_end; ++ir) { // accumulate aux - aux[ir] += std::complex(chr->rho[is][ir], 0.0 ); + aux[ir] += std::complex(chr->rho[1][ir], 0.0 ); } } } @@ -70,12 +71,13 @@ void Stress_Func::stress_loc(ModuleBase::matrix& sigma, { for (int ig=0; ignpw; ig++) { - if (rho_basis->ig_gge0 == ig) + if (rho_basis->ig_gge0 == ig) { evloc += GlobalC::ppcell.vloc(it, rho_basis->ig2igg[ig]) * (p_sf->strucFac(it, ig) * conj(aux[ig])).real(); - else + } else { evloc += GlobalC::ppcell.vloc(it, rho_basis->ig2igg[ig]) * (p_sf->strucFac(it, ig) * conj(aux[ig]) * fact).real(); +} } } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp index b243d1ff5d..1ff91afe6c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp @@ -48,7 +48,7 @@ void Stress_Func::stress_nl(ModuleBase::matrix& sigma, break; } } - const int npm = ucell_in.get_npol() * nbands_occ; + const int npm = nbands_occ; // calculate becp = for all beta functions nl_tools.cal_becp(ik, npm);