Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ void XC_Functional::gradcorr(double &etxc, double &vtxc, ModuleBase::matrix &v,
}

gdr2 = new ModuleBase::Vector3<double>[rhopw->nrxx];
h2 = new ModuleBase::Vector3<double>[rhopw->nrxx];
if(!is_stress) h2 = new ModuleBase::Vector3<double>[rhopw->nrxx];

XC_Functional::grad_rho( rhogsum1 , gdr1, rhopw, ucell->tpiba);
XC_Functional::grad_rho( rhogsum2 , gdr2, rhopw, ucell->tpiba);
Expand Down
2 changes: 1 addition & 1 deletion source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ void Forces<FPTYPE, Device>::cal_force_nl(ModuleBase::matrix& forcenl,
break;
}
}
const int npm = ucell_in.get_npol() * nbands_occ;
const int npm = nbands_occ;
// calculate becp = <psi|beta> for all beta functions
nl_tools.cal_becp(ik, npm);
for (int ipol = 0; ipol < 3; ipol++)
Expand Down
142 changes: 99 additions & 43 deletions source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix
this->atom_na = h_atom_na.data();
this->ppcell_vkb = this->nlpp_->vkb.c;
}

// prepare the memory of the becp and dbecp:
// becp: <Beta(nkb,npw)|psi(nbnd,npw)>
// dbecp: <dBeta(nkb,npw)/dG|psi(nbnd,npw)>
resmem_complex_op()(this->ctx, becp, this->nbands * nkb, "Stress::becp");
resmem_complex_op()(this->ctx, dbecp, 6 * this->nbands * nkb, "Stress::dbecp");
}

template <typename FPTYPE, typename Device>
Expand Down Expand Up @@ -162,9 +156,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
{
ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp");
ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
const int npol = this->ucell_->get_npol();
const int size_becp = this->nbands * npol * this->nkb;
const int size_becp_act = npm * npol * this->nkb;
if (this->becp == nullptr)
{
resmem_complex_op()(this->ctx, becp, this->nbands * this->nkb);
resmem_complex_op()(this->ctx, becp, size_becp);
}

// prepare math tools
Expand Down Expand Up @@ -248,11 +245,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
}
const char transa = 'C';
const char transb = 'N';
int npm_npol = npm * npol;
gemm_op()(this->ctx,
transa,
transb,
nkb,
npm,
npm_npol,
npw,
&ModuleBase::ONE,
ppcell_vkb,
Expand All @@ -267,15 +265,15 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
if (this->device == base_device::GpuDevice)
{
std::complex<FPTYPE>* h_becp = nullptr;
resmem_complex_h_op()(this->cpu_ctx, h_becp, this->nbands * nkb);
syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, this->nbands * nkb);
Parallel_Reduce::reduce_pool(h_becp, this->nbands * nkb);
syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, this->nbands * nkb);
resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act);
syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, size_becp_act);
Parallel_Reduce::reduce_pool(h_becp, size_becp_act);
syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, size_becp_act);
delmem_complex_h_op()(this->cpu_ctx, h_becp);
}
else
{
Parallel_Reduce::reduce_pool(becp, this->nbands * this->nkb);
Parallel_Reduce::reduce_pool(becp, size_becp_act);
}
ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
}
Expand All @@ -286,9 +284,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
{
ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s");
ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s");
const int npol = this->ucell_->get_npol();
const int size_becp = this->nbands * npol * this->nkb;
const int npm_npol = npm * npol;
if (this->dbecp == nullptr)
{
resmem_complex_op()(this->ctx, dbecp, this->nbands * this->nkb);
resmem_complex_op()(this->ctx, dbecp, size_becp);
}

// prepare math tools
Expand Down Expand Up @@ -400,7 +401,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
transa,
transb,
nkb,
npm,
npm_npol,
npw,
&ModuleBase::ONE,
ppcell_vkb,
Expand All @@ -411,6 +412,8 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
dbecp,
nkb);
// calculate stress for target (ipol, jpol)
if(npol == 1)
{
const int current_spin = this->kv_->isk[ik];
cal_stress_nl_op()(this->ctx,
nondiagonal,
Expand All @@ -434,20 +437,47 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
becp,
dbecp,
stress);
}
else
{
cal_stress_nl_op()(this->ctx,
ipol,
jpol,
nkb,
npm,
this->ntype,
this->nbands,
ik,
this->nlpp_->deeq_nc.getBound2(),
this->nlpp_->deeq_nc.getBound3(),
this->nlpp_->deeq_nc.getBound4(),
atom_nh,
atom_na,
d_wg,
d_ekb,
qq_nt,
this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
becp,
dbecp,
stress);
}
ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s");
}

template <typename FPTYPE, typename Device>
void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
{
ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s");
ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f");
ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f");
const int npol = this->ucell_->get_npol();
const int npm_npol = npm * npol;
const int size_becp = this->nbands * npol * this->nkb;
if (this->dbecp == nullptr)
{
resmem_complex_op()(this->ctx, dbecp, 3 * this->nbands * this->nkb);
resmem_complex_op()(this->ctx, dbecp, 3 * size_becp);
}

std::complex<FPTYPE>* dbecp_ptr = this->dbecp + ipol * this->nbands * this->nkb;
std::complex<FPTYPE>* dbecp_ptr = this->dbecp + ipol * size_becp;
const std::complex<FPTYPE>* vkb_ptr = this->ppcell_vkb;
std::complex<FPTYPE>* vkb_deri_ptr = this->ppcell_vkb;

Expand Down Expand Up @@ -480,7 +510,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
transa,
transb,
this->nkb,
npm,
npm_npol,
npw,
&ModuleBase::ONE,
vkb_deri_ptr,
Expand All @@ -490,7 +520,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
&ModuleBase::ZERO,
dbecp_ptr,
nkb);

this->revert_vkb(npw, ipol);
this->pre_ik_f = ik;
ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f");
Expand Down Expand Up @@ -633,29 +662,56 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force
const int current_spin = this->kv_->isk[ik];
const int force_nc = 3;
// calculate the force
cal_force_nl_op<FPTYPE, Device>()(this->ctx,
nondiagonal,
npm,
this->nbands,
this->ntype,
current_spin,
this->nlpp_->deeq.getBound2(),
this->nlpp_->deeq.getBound3(),
this->nlpp_->deeq.getBound4(),
force_nc,
this->nbands,
ik,
nkb,
atom_nh,
atom_na,
this->ucell_->tpiba,
d_wg,
d_ekb,
qq_nt,
deeq,
becp,
dbecp,
force);
if(this->ucell_->get_npol() == 1)
{
cal_force_nl_op<FPTYPE, Device>()(this->ctx,
nondiagonal,
npm,
this->nbands,
this->ntype,
current_spin,
this->nlpp_->deeq.getBound2(),
this->nlpp_->deeq.getBound3(),
this->nlpp_->deeq.getBound4(),
force_nc,
this->nbands,
ik,
nkb,
atom_nh,
atom_na,
this->ucell_->tpiba,
d_wg,
d_ekb,
qq_nt,
deeq,
becp,
dbecp,
force);
}
else
{
cal_force_nl_op<FPTYPE, Device>()(this->ctx,
npm,
this->nbands,
this->ntype,
this->nlpp_->deeq_nc.getBound2(),
this->nlpp_->deeq_nc.getBound3(),
this->nlpp_->deeq_nc.getBound4(),
force_nc,
this->nbands,
ik,
nkb,
atom_nh,
atom_na,
this->ucell_->tpiba,
d_wg,
d_ekb,
qq_nt,
this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
becp,
dbecp,
force);
}
}

// template instantiation
Expand Down
104 changes: 104 additions & 0 deletions source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,110 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
cudaCheckOnDebug();
}

template <typename FPTYPE>
__global__ void cal_force_nl(
const int wg_nc,
const int ntype,
const int deeq_2,
const int deeq_3,
const int deeq_4,
const int forcenl_nc,
const int nbands,
const int ik,
const int nkb,
const int *atom_nh,
const int *atom_na,
const FPTYPE tpiba,
const FPTYPE *d_wg,
const FPTYPE* d_ekb,
const FPTYPE* qq_nt,
const thrust::complex<FPTYPE> *deeq_nc,
const thrust::complex<FPTYPE> *becp,
const thrust::complex<FPTYPE> *dbecp,
FPTYPE *force)
{
const int ib = blockIdx.x / ntype;
const int ib2 = ib * 2;
const int it = blockIdx.x % ntype;

int iat = 0, sum = 0;
for (int ii = 0; ii < it; ii++) {
iat += atom_na[ii];
sum += atom_na[ii] * atom_nh[ii];
}

int Nprojs = atom_nh[it];
FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
for (int ia = 0; ia < atom_na[it]; ia++) {
for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) {
const int inkb = sum + ip;
for (int ip2 = 0; ip2 < Nprojs; ip2++)
{
// Effective values of the D-eS coefficients
const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
const int jnkb = sum + ip2;
const thrust::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;

for (int ipol = 0; ipol < 3; ipol++) {
const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
const int index1 = ib2 * nkb + jnkb;
const thrust::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
const thrust::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
const thrust::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
const thrust::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
atomicAdd(force + iat * forcenl_nc + ipol, tmp);
}
}
}
iat += 1;
sum += Nprojs;
}
}

// interface for nspin=4 only
template <typename FPTYPE>
void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
const int& nbands_occ,
const int& wg_nc,
const int& ntype,
const int& deeq_2,
const int& deeq_3,
const int& deeq_4,
const int& forcenl_nc,
const int& nbands,
const int& ik,
const int& nkb,
const int* atom_nh,
const int* atom_na,
const FPTYPE& tpiba,
const FPTYPE* d_wg,
const FPTYPE* d_ekb,
const FPTYPE* qq_nt,
const std::complex<FPTYPE>* deeq_nc,
const std::complex<FPTYPE>* becp,
const std::complex<FPTYPE>* dbecp,
FPTYPE* force)
{
cal_force_nl<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
wg_nc, ntype,
deeq_2, deeq_3, deeq_4,
forcenl_nc, nbands, ik, nkb,
atom_nh, atom_na,
tpiba,
d_wg, d_ekb, qq_nt,
reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
force);// array of data

cudaCheckOnDebug();
}

template <typename FPTYPE>
__global__ void saveVkbValues_(
const int *gcar_zero_ptrs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ __global__ void nonlocal_pw(
thrust::complex<FPTYPE>* ps,
const thrust::complex<FPTYPE>* becp)
{
const int ii = blockIdx.x / l2;
const int jj = blockIdx.x % l2;
const int ii = blockIdx.x * 2 / l2;
const int jj = blockIdx.x * 2 % l2;
for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) {
thrust::complex<FPTYPE> res1(0.0, 0.0);
thrust::complex<FPTYPE> res2(0.0, 0.0);
Expand Down Expand Up @@ -121,7 +121,7 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
{
// denghui implement 20221109
// <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
nonlocal_pw<FPTYPE><<<l1 * l2, THREADS_PER_BLOCK>>>(
nonlocal_pw<FPTYPE><<<l1 * l2 / 2, THREADS_PER_BLOCK>>>(
l1, l2, l3, // loop size
sum, iat, nkb, // control params
deeq_x, deeq_y, deeq_z,
Expand All @@ -138,4 +138,4 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
template struct nonlocal_pw_op<float, base_device::DEVICE_GPU>;
template struct nonlocal_pw_op<double, base_device::DEVICE_GPU>;

} // namespace hamilt
} // namespace hamilt
Loading