diff --git a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
index 4d94f68ec4..d1082c809b 100644
--- a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
+++ b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
@@ -186,7 +186,7 @@ void XC_Functional::gradcorr(double &etxc, double &vtxc, ModuleBase::matrix &v,
 		}
 
 		gdr2 = new ModuleBase::Vector3<double>[rhopw->nrxx];
-		h2 = new ModuleBase::Vector3<double>[rhopw->nrxx];
+		if(!is_stress) h2 = new ModuleBase::Vector3<double>[rhopw->nrxx];
 
 		XC_Functional::grad_rho( rhogsum1 , gdr1, rhopw, ucell->tpiba);
 		XC_Functional::grad_rho( rhogsum2 , gdr2, rhopw, ucell->tpiba);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
index 04caefd1bf..92733745b4 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
@@ -45,7 +45,7 @@ void Forces<FPTYPE, Device>::cal_force_nl(ModuleBase::matrix& forcenl,
                 break;
             }
         }
-        const int npm = ucell_in.get_npol() * nbands_occ;
+        const int npm = nbands_occ;
         // calculate becp = <psi|beta> for all beta functions
         nl_tools.cal_becp(ik, npm);
         for (int ipol = 0; ipol < 3; ipol++)
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
index f6773b3ddb..f81f5d0880 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
@@ -107,12 +107,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix
         this->atom_na = h_atom_na.data();
         this->ppcell_vkb = this->nlpp_->vkb.c;
     }
-
-    // prepare the memory of the becp and dbecp:
-    // becp: <Beta(nkb,npw)|psi(nbnd,npw)>
-    // dbecp: <dBeta(nkb,npw)/dG|psi(nbnd,npw)>
-    resmem_complex_op()(this->ctx, becp, this->nbands * nkb, "Stress::becp");
-    resmem_complex_op()(this->ctx, dbecp, 6 * this->nbands * nkb, "Stress::dbecp");
 }
 
 template <typename FPTYPE, typename Device>
@@ -162,9 +156,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
 {
     ModuleBase::TITLE("FS_Nonlocal_tools", "cal_becp");
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
+    const int npol = this->ucell_->get_npol();
+    const int size_becp = this->nbands * npol * this->nkb;
+    const int size_becp_act = npm * npol * this->nkb;
     if (this->becp == nullptr)
     {
-        resmem_complex_op()(this->ctx, becp, this->nbands * this->nkb);
+        resmem_complex_op()(this->ctx, becp, size_becp);
     }
 
     // prepare math tools
@@ -248,11 +245,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
     }
     const char transa = 'C';
     const char transb = 'N';
+    int npm_npol = npm * npol;
     gemm_op()(this->ctx,
               transa,
               transb,
               nkb,
-              npm,
+              npm_npol,
               npw,
               &ModuleBase::ONE,
               ppcell_vkb,
@@ -267,15 +265,15 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_becp(int ik, int npm)
     if (this->device == base_device::GpuDevice)
     {
         std::complex<FPTYPE>* h_becp = nullptr;
-        resmem_complex_h_op()(this->cpu_ctx, h_becp, this->nbands * nkb);
-        syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, this->nbands * nkb);
-        Parallel_Reduce::reduce_pool(h_becp, this->nbands * nkb);
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, this->nbands * nkb);
+        resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act);
+        syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp, size_becp_act);
+        Parallel_Reduce::reduce_pool(h_becp, size_becp_act);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp, h_becp, size_becp_act);
         delmem_complex_h_op()(this->cpu_ctx, h_becp);
     }
     else
     {
-        Parallel_Reduce::reduce_pool(becp, this->nbands * this->nkb);
+        Parallel_Reduce::reduce_pool(becp, size_becp_act);
     }
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_becp");
 }
@@ -286,9 +284,12 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
 {
     ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s");
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s");
+    const int npol = this->ucell_->get_npol();
+    const int size_becp = this->nbands * npol * this->nkb;
+    const int npm_npol = npm * npol;
     if (this->dbecp == nullptr)
     {
-        resmem_complex_op()(this->ctx, dbecp, this->nbands * this->nkb);
+        resmem_complex_op()(this->ctx, dbecp, size_becp);
     }
 
     // prepare math tools
@@ -400,7 +401,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
               transa,
               transb,
               nkb,
-              npm,
+              npm_npol,
               npw,
               &ModuleBase::ONE,
               ppcell_vkb,
@@ -411,6 +412,8 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
               dbecp,
               nkb);
     // calculate stress for target (ipol, jpol)
+    if(npol == 1)
+    {
     const int current_spin = this->kv_->isk[ik];
     cal_stress_nl_op()(this->ctx,
                        nondiagonal,
@@ -434,20 +437,47 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
                        becp,
                        dbecp,
                        stress);
+    }
+    else
+    {
+        cal_stress_nl_op()(this->ctx,
+                           ipol,
+                           jpol,
+                           nkb,
+                           npm,
+                           this->ntype,
+                           this->nbands,
+                           ik,
+                           this->nlpp_->deeq_nc.getBound2(),
+                           this->nlpp_->deeq_nc.getBound3(),
+                           this->nlpp_->deeq_nc.getBound4(),
+                           atom_nh,
+                           atom_na,
+                           d_wg,
+                           d_ekb,
+                           qq_nt,
+                           this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
+                           becp,
+                           dbecp,
+                           stress);
+    }
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_s");
 }
 
 template <typename FPTYPE, typename Device>
 void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
 {
-    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_s");
+    ModuleBase::TITLE("FS_Nonlocal_tools", "cal_dbecp_f");
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f");
+    const int npol = this->ucell_->get_npol();
+    const int npm_npol = npm * npol;
+    const int size_becp = this->nbands * npol * this->nkb;
     if (this->dbecp == nullptr)
     {
-        resmem_complex_op()(this->ctx, dbecp, 3 * this->nbands * this->nkb);
+        resmem_complex_op()(this->ctx, dbecp, 3 * size_becp);
     }
 
-    std::complex<FPTYPE>* dbecp_ptr = this->dbecp + ipol * this->nbands * this->nkb;
+    std::complex<FPTYPE>* dbecp_ptr = this->dbecp + ipol * size_becp;
     const std::complex<FPTYPE>* vkb_ptr = this->ppcell_vkb;
     std::complex<FPTYPE>* vkb_deri_ptr = this->ppcell_vkb;
 
@@ -480,7 +510,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
               transa,
               transb,
               this->nkb,
-              npm,
+              npm_npol,
               npw,
               &ModuleBase::ONE,
               vkb_deri_ptr,
@@ -490,7 +520,6 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
               &ModuleBase::ZERO,
               dbecp_ptr,
               nkb);
-
     this->revert_vkb(npw, ipol);
     this->pre_ik_f = ik;
     ModuleBase::timer::tick("FS_Nonlocal_tools", "cal_dbecp_f");
@@ -633,29 +662,56 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_force(int ik, int npm, FPTYPE* force
     const int current_spin = this->kv_->isk[ik];
     const int force_nc = 3;
     // calculate the force
-    cal_force_nl_op<FPTYPE, Device>()(this->ctx,
-                                      nondiagonal,
-                                      npm,
-                                      this->nbands,
-                                      this->ntype,
-                                      current_spin,
-                                      this->nlpp_->deeq.getBound2(),
-                                      this->nlpp_->deeq.getBound3(),
-                                      this->nlpp_->deeq.getBound4(),
-                                      force_nc,
-                                      this->nbands,
-                                      ik,
-                                      nkb,
-                                      atom_nh,
-                                      atom_na,
-                                      this->ucell_->tpiba,
-                                      d_wg,
-                                      d_ekb,
-                                      qq_nt,
-                                      deeq,
-                                      becp,
-                                      dbecp,
-                                      force);
+    if(this->ucell_->get_npol() == 1)
+    {
+        cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+                                        nondiagonal,
+                                        npm,
+                                        this->nbands,
+                                        this->ntype,
+                                        current_spin,
+                                        this->nlpp_->deeq.getBound2(),
+                                        this->nlpp_->deeq.getBound3(),
+                                        this->nlpp_->deeq.getBound4(),
+                                        force_nc,
+                                        this->nbands,
+                                        ik,
+                                        nkb,
+                                        atom_nh,
+                                        atom_na,
+                                        this->ucell_->tpiba,
+                                        d_wg,
+                                        d_ekb,
+                                        qq_nt,
+                                        deeq,
+                                        becp,
+                                        dbecp,
+                                        force);
+    }
+    else
+    {
+        cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+                                          npm,
+                                          this->nbands,
+                                          this->ntype,
+                                          this->nlpp_->deeq_nc.getBound2(),
+                                          this->nlpp_->deeq_nc.getBound3(),
+                                          this->nlpp_->deeq_nc.getBound4(),
+                                          force_nc,
+                                          this->nbands,
+                                          ik,
+                                          nkb,
+                                          atom_nh,
+                                          atom_na,
+                                          this->ucell_->tpiba,
+                                          d_wg,
+                                          d_ekb,
+                                          qq_nt,
+                                          this->nlpp_->template get_deeq_nc_data<FPTYPE>(),
+                                          becp,
+                                          dbecp,
+                                          force);
+    }
 }
 
 // template instantiation
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index f8df7b3029..e23e0ccbc3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -170,6 +170,110 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     cudaCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_force_nl(
+        const int wg_nc,
+        const int ntype,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int forcenl_nc,
+        const int nbands,
+        const int ik,
+        const int nkb,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE tpiba,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *force)
+{
+    const int ib = blockIdx.x / ntype;
+    const int ib2 = ib * 2;
+    const int it = blockIdx.x % ntype;
+
+    int iat = 0, sum = 0;
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    int Nprojs = atom_nh[it];
+    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+    for (int ia = 0; ia < atom_na[it]; ia++) {
+        for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) {
+            const int inkb = sum + ip;
+            for (int ip2 = 0; ip2 < Nprojs; ip2++) 
+            {
+                // Effective values of the D-eS coefficients
+                const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                const int jnkb = sum + ip2;
+                const thrust::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+
+                for (int ipol = 0; ipol < 3; ipol++) {
+                    const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                    const int index1 = ib2 * nkb + jnkb;
+                    const thrust::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                    const thrust::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+                    const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                    atomicAdd(force + iat * forcenl_nc + ipol, tmp);
+                }
+            }
+        }
+        iat += 1;
+        sum += Nprojs;
+    }
+}
+
+// interface for nspin=4 only
+template <typename FPTYPE>
+void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force)
+{
+    cal_force_nl<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
+            wg_nc, ntype,
+            deeq_2, deeq_3, deeq_4,
+            forcenl_nc, nbands, ik, nkb,
+            atom_nh, atom_na,
+            tpiba,
+            d_wg, d_ekb, qq_nt, 
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+            force);// array of data
+
+    cudaCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void saveVkbValues_(
     const int *gcar_zero_ptrs, 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu
index 275fbfe136..89a74f3c5f 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/nonlocal_op.cu
@@ -54,8 +54,8 @@ __global__ void nonlocal_pw(
     thrust::complex<FPTYPE>* ps,
     const thrust::complex<FPTYPE>* becp)
 {
-  const int ii = blockIdx.x / l2;
-  const int jj = blockIdx.x % l2;
+  const int ii = blockIdx.x * 2 / l2;
+  const int jj = blockIdx.x * 2 % l2;
   for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) {
     thrust::complex<FPTYPE> res1(0.0, 0.0);
     thrust::complex<FPTYPE> res2(0.0, 0.0);
@@ -121,7 +121,7 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 {
   // denghui implement 20221109
   // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-  nonlocal_pw<FPTYPE><<<l1 * l2, THREADS_PER_BLOCK>>>(
+  nonlocal_pw<FPTYPE><<<l1 * l2 / 2, THREADS_PER_BLOCK>>>(
     l1, l2, l3, // loop size
     sum, iat, nkb,   // control params
     deeq_x, deeq_y, deeq_z,
@@ -138,4 +138,4 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 template struct nonlocal_pw_op<float, base_device::DEVICE_GPU>;
 template struct nonlocal_pw_op<double, base_device::DEVICE_GPU>;
 
-}  // namespace hamilt
\ No newline at end of file
+}  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index e965446d77..4f134e0042 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -244,6 +244,116 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
     cudaCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_stress_nl(
+        const int ipol,
+        const int jpol,
+        const int nkb,
+        const int ntype,
+        const int wg_nc,
+        const int ik,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *stress)
+{
+    const int ib = blockIdx.x / ntype; // index of loop-nbands
+    const int ib2  = ib * 2;
+    const int it = blockIdx.x % ntype; // index of loop-ntype
+
+    int iat = 0; // calculate the begin of atomic index
+    int sum = 0; // calculate the begin of atomic-orbital index
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    FPTYPE stress_var = 0;
+    const FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0;
+    const FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+    const int Nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) {
+            const int ip1 = ii / Nprojs;
+	        const int ip2 = ii % Nprojs;
+            const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            const int inkb1 = sum + ip1;
+            const int inkb2 = sum + ip2;
+            //out<<"\n ps = "<<ps;
+            const thrust::complex<FPTYPE> dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+        }
+        ++iat;
+        sum+=Nprojs;
+    }//ia
+    __syncwarp();
+    warp_reduce(stress_var);
+    if (threadIdx.x % WARP_SIZE == 0) {
+        atomicAdd(stress + ipol * 3 + jpol, stress_var);
+    }
+}
+
+template <typename FPTYPE>
+void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                                                                   const int& ipol,
+                                                                   const int& jpol,
+                                                                   const int& nkb,
+                                                                   const int& nbands_occ,
+                                                                   const int& ntype,
+                                                                   const int& wg_nc,
+                                                                   const int& ik,
+                                                                   const int& deeq_2,
+                                                                   const int& deeq_3,
+                                                                   const int& deeq_4,
+                                                                   const int* atom_nh,
+                                                                   const int* atom_na,
+                                                                   const FPTYPE* d_wg,
+                                                                   const FPTYPE* d_ekb,
+                                                                   const FPTYPE* qq_nt,
+                                                                   const std::complex<FPTYPE>* deeq_nc,
+                                                                   const std::complex<FPTYPE>* becp,
+                                                                   const std::complex<FPTYPE>* dbecp,
+                                                                   FPTYPE* stress)
+{
+     cal_stress_nl<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
+             ipol,
+             jpol,
+             nkb,
+             ntype,
+             wg_nc,
+             ik,
+             deeq_2,
+             deeq_3,
+             deeq_4,
+             atom_nh,
+             atom_na,
+             d_wg,
+             d_ekb,
+             qq_nt,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+             stress);// array of data
+
+    cudaCheckOnDebug();
+}
+
 template <typename T, typename Device>
 void cal_stress_mgga_op<T, Device>::operator()(
     const int& spin,
@@ -710,34 +820,6 @@ void cal_force_npw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
 }
 
 
-// template <typename FPTYPE>
-// void prepare_vkb_deri_ptr_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-//         const base_device::DEVICE_GPU* ctx,
-//         int nbeta, double* nhtol, int nhtol_nc, int npw, int it,
-//         int ipol, int jpol,
-//         std::complex<FPTYPE>*vkb_out, std::complex<FPTYPE>** vkb_ptrs,
-//         FPTYPE* ylm_in, FPTYPE** ylm_ptrs,
-//         FPTYPE* ylm_deri_in, FPTYPE** ylm_deri_ptr1s, FPTYPE** ylm_deri_ptr2s,
-//         FPTYPE* vq_in, FPTYPE** vq_ptrs,
-//         FPTYPE* vq_deri_in, FPTYPE** vq_deri_ptrs
-//     )
-// {
-//     const int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-//     dim3 gridsize(block,nbeta);
-
-
-//     prepare_vkb_deri_ptr<FPTYPE><<<1,1>>>(
-//         nbeta, nhtol, nhtol_nc, npw, it, ipol, jpol,
-//         reinterpret_cast<const thrust::complex<FPTYPE>*>(vkb_out), 
-//         reinterpret_cast<const thrust::complex<FPTYPE>*>(vkb_ptrs), 
-//         ylm_in, ylm_ptrs, ylm_deri_in, ylm_deri_ptr1s, ylm_deri_ptr2s,
-//         vq_in, vq_ptrs, vq_deri_in, vq_deri_ptrs
-//     );
-
-//     return ;
-// }
-
-
 template <>
 void pointer_array_malloc<base_device::DEVICE_GPU>::operator()(
     void **ptr,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 1584c5c681..90a4641123 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -145,6 +145,105 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
             } // end it
 #ifdef _OPENMP
         }
+#endif
+    };
+
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force)
+    {
+#ifdef _OPENMP
+#pragma omp parallel
+        {
+#endif
+            int iat0 = 0;
+            int sum0 = 0;
+            for (int it = 0; it < ntype; it++)
+            {
+                const int nprojs = atom_nh[it];
+#ifdef _OPENMP
+#pragma omp for collapse(2)
+#endif
+                for (int ia = 0; ia < atom_na[it]; ia++)
+                {
+                    for (int ib = 0; ib < nbands_occ; ib++)
+                    {
+                        const int ib2 = ib*2;
+                        FPTYPE local_force[3] = {0, 0, 0};
+                        FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+                        FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+                        int iat = iat0 + ia;
+                        int sum = sum0 + ia * nprojs;
+                        for (int ip = 0; ip < nprojs; ip++)
+                        {
+                            const int inkb = sum + ip;
+                            // out<<"\n ps = "<<ps;
+                            for (int ip2 = 0; ip2 < nprojs; ip2++)
+                            {
+                                // Effective values of the D-eS coefficients
+                                std::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                                const int jnkb = sum + ip2;
+                                std::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                                std::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                                
+
+                                for (int ipol = 0; ipol < 3; ipol++)
+                                {
+                                    const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                                    const int index1 = ib2 * nkb + jnkb;
+                                    const std::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                                    const std::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                                    const std::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                                    const std::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+
+                                    local_force[ipol] -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                                }
+                            }
+                        }
+#ifdef _OPENMP
+                        if (omp_get_num_threads() > 1)
+                        {
+                            for (int ipol = 0; ipol < 3; ++ipol)
+                            {
+#pragma omp atomic
+                                force[iat * forcenl_nc + ipol] += local_force[ipol];
+                            }
+                        }
+                        else
+#endif
+                        {
+                            for (int ipol = 0; ipol < 3; ++ipol)
+                            {
+                                force[iat * forcenl_nc + ipol] += local_force[ipol];
+                            }
+                        }
+                    }
+                } // end ia
+                iat0 += atom_na[it];
+                sum0 += atom_na[it] * nprojs;
+            } // end it
+#ifdef _OPENMP
+        }
 #endif
     }
 };
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index b04b769df8..f729280ef6 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -92,6 +92,28 @@ struct cal_force_nl_op
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* force);
+    // interface for nspin=4 only
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force);
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
@@ -136,6 +158,28 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* force);
+    // interface for nspin=4 only
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force);
 };
 
 /**
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index fa03fa2a5d..786a6b8ebf 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -166,6 +166,110 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     hipCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_force_nl(
+        const int wg_nc,
+        const int ntype,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int forcenl_nc,
+        const int nbands,
+        const int ik,
+        const int nkb,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE tpiba,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *force)
+{
+    const int ib = blockIdx.x / ntype;
+    const int ib2 = ib * 2;
+    const int it = blockIdx.x % ntype;
+
+    int iat = 0, sum = 0;
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    int Nprojs = atom_nh[it];
+    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+    FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+    for (int ia = 0; ia < atom_na[it]; ia++) {
+        for (int ip = threadIdx.x; ip < Nprojs; ip += blockDim.x) {
+            const int inkb = sum + ip;
+            for (int ip2 = 0; ip2 < Nprojs; ip2++) 
+            {
+                // Effective values of the D-eS coefficients
+                const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip * deeq_4 + ip2];
+                const int jnkb = sum + ip2;
+                const thrust::complex<FPTYPE> ps0 = deeq_nc[((0 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+                const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2];
+                const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat) * deeq_3 + ip) * deeq_4 + ip2] + ps_qq;
+
+                for (int ipol = 0; ipol < 3; ipol++) {
+                    const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                    const int index1 = ib2 * nkb + jnkb;
+                    const thrust::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                    const thrust::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                    const thrust::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+                    const FPTYPE tmp = - fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                    atomicAdd(force + iat * forcenl_nc + ipol, tmp);
+                }
+            }
+        }
+        iat += 1;
+        sum += Nprojs;
+    }
+}
+
+// interface for nspin=4 only
+template <typename FPTYPE>
+void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force)
+{
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_nl<FPTYPE>), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0,
+            wg_nc, ntype,
+            deeq_2, deeq_3, deeq_4,
+            forcenl_nc, nbands, ik, nkb,
+            atom_nh, atom_na,
+            tpiba,
+            d_wg, d_ekb, qq_nt, 
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+            reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+            force);// array of data
+
+    hipCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void saveVkbValues_(
     const int *gcar_zero_ptrs, 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu
index 93df9fe58d..9a43f64ea6 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/nonlocal_op.hip.cu
@@ -54,8 +54,8 @@ __global__ void nonlocal_pw(
     thrust::complex<FPTYPE>* ps,
     const thrust::complex<FPTYPE>* becp)
 {
-  const int ii = blockIdx.x / l2;
-  const int jj = blockIdx.x % l2;
+  const int ii = blockIdx.x * 2 / l2;
+  const int jj = blockIdx.x * 2 % l2;
   for (int kk = threadIdx.x; kk < l3; kk += blockDim.x) {
     thrust::complex<FPTYPE> res1(0.0, 0.0);
     thrust::complex<FPTYPE> res2(0.0, 0.0);
@@ -122,7 +122,7 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 {
   // denghui implement 20221109
   // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw<FPTYPE>), dim3(l1 * l2), dim3(THREADS_PER_BLOCK), 0, 0,
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(nonlocal_pw<FPTYPE>), dim3(l1 * l2 / 2), dim3(THREADS_PER_BLOCK), 0, 0,
     l1, l2, l3, // loop size
     sum, iat, nkb,   // control params
     deeq_x, deeq_y, deeq_z,
@@ -140,4 +140,4 @@ void hamilt::nonlocal_pw_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
 namespace hamilt{
 template struct nonlocal_pw_op<float, base_device::DEVICE_GPU>;
 template struct nonlocal_pw_op<double, base_device::DEVICE_GPU>;
-}
\ No newline at end of file
+}
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
index 17cfae404a..2108d6feb8 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -238,6 +238,112 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
     hipCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void cal_stress_nl(
+        const int ipol,
+        const int jpol,
+        const int nkb,
+        const int ntype,
+        const int wg_nc,
+        const int ik,
+        const int deeq_2,
+        const int deeq_3,
+        const int deeq_4,
+        const int *atom_nh,
+        const int *atom_na,
+        const FPTYPE *d_wg,
+        const FPTYPE* d_ekb,
+        const FPTYPE* qq_nt,
+        const thrust::complex<FPTYPE> *deeq_nc,
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *stress)
+{
+    int ib = blockIdx.x / ntype;
+    const int ib2  = ib * 2;
+    int it = blockIdx.x % ntype;
+
+    int iat = 0, sum = 0;
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    FPTYPE stress_var = 0, fac = d_wg[ik * wg_nc + ib] * 1.0, ekb_now = d_ekb[ik * wg_nc + ib];
+    const int Nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        for (int ii = threadIdx.x; ii < Nprojs * Nprojs; ii += blockDim.x) {
+            int ip1 = ii / Nprojs, ip2 = ii % Nprojs;
+            const thrust::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            const thrust::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+            const thrust::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+            const int inkb1 = sum + ip1;
+            const int inkb2 = sum + ip2;
+            //out<<"\n ps = "<<ps;
+            const thrust::complex<FPTYPE> dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+            const thrust::complex<FPTYPE> dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+            stress_var -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+        }
+        ++iat;
+        sum+=Nprojs;
+    }//ia
+    __syncwarp();
+    warp_reduce(stress_var);
+    if (threadIdx.x % WARP_SIZE == 0) {
+        atomicAdd(stress + ipol * 3 + jpol, stress_var);
+    }
+}
+
+template <typename FPTYPE>
+void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                                                                   const int& ipol,
+                                                                   const int& jpol,
+                                                                   const int& nkb,
+                                                                   const int& nbands_occ,
+                                                                   const int& ntype,
+                                                                   const int& wg_nc,
+                                                                   const int& ik,
+                                                                   const int& deeq_2,
+                                                                   const int& deeq_3,
+                                                                   const int& deeq_4,
+                                                                   const int* atom_nh,
+                                                                   const int* atom_na,
+                                                                   const FPTYPE* d_wg,
+                                                                   const FPTYPE* d_ekb,
+                                                                   const FPTYPE* qq_nt,
+                                                                   const std::complex<FPTYPE>* deeq_nc,
+                                                                   const std::complex<FPTYPE>* becp,
+                                                                   const std::complex<FPTYPE>* dbecp,
+                                                                   FPTYPE* stress)
+{
+     hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_nl<FPTYPE>), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0,
+             ipol,
+             jpol,
+             nkb,
+             ntype,
+             wg_nc,
+             ik,
+             deeq_2,
+             deeq_3,
+             deeq_4,
+             atom_nh,
+             atom_na,
+             d_wg,
+             d_ekb,
+             qq_nt,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(deeq_nc),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+             stress);// array of data
+
+    hipCheckOnDebug();
+}
+
 template <typename T, typename Device>
 void cal_stress_mgga_op<T, Device>::operator()(
     const int& spin,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index c3a8008e04..142ec6e4a8 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -135,6 +135,77 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
             } // end it
 #ifdef _OPENMP
         }
+#endif
+        stress[ipol * 3 + jpol] += local_stress;
+    };
+
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& ipol,
+                    const int& jpol,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress)
+    {
+        FPTYPE local_stress = 0;
+#ifdef _OPENMP
+#pragma omp parallel reduction(+ : local_stress)
+        {
+#endif
+            int iat = 0, sum = 0;
+            for (int it = 0; it < ntype; it++)
+            {
+                const int Nprojs = atom_nh[it];
+#ifdef _OPENMP
+#pragma omp for collapse(4)
+#endif
+                for (int ib = 0; ib < nbands_occ; ib++)
+                {
+                    for (int ia = 0; ia < atom_na[it]; ia++)
+                    {
+                        for (int ip1 = 0; ip1 < Nprojs; ip1++)
+                        {
+                            for (int ip2 = 0; ip2 < Nprojs; ip2++)
+                            {
+                                const int ib2 = ib*2;
+                                FPTYPE fac = d_wg[ik * wg_nc + ib] * 1.0;
+                                FPTYPE ekb_now = d_ekb[ik * wg_nc + ib];
+                                std::complex<FPTYPE> ps_qq = - ekb_now * qq_nt[it * deeq_3 * deeq_4 + ip1 * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps0 = deeq_nc[((iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+                                std::complex<FPTYPE> ps1 = deeq_nc[((1 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps2 = deeq_nc[((2 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2];
+                                std::complex<FPTYPE> ps3 = deeq_nc[((3 * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
+                                const int inkb1 = sum + ia * Nprojs + ip1;
+                                const int inkb2 = sum + ia * Nprojs + ip2;
+                                // out<<"\n ps = "<<ps;
+
+                                const std::complex<FPTYPE> dbb0 = conj(dbecp[ib2 * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+                                const std::complex<FPTYPE> dbb1 = conj(dbecp[ib2 * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+                                const std::complex<FPTYPE> dbb2 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[ib2 * nkb + inkb2];
+                                const std::complex<FPTYPE> dbb3 = conj(dbecp[(ib2+1) * nkb + inkb1]) * becp[(ib2+1) * nkb + inkb2];
+                                local_stress -= fac * (ps0 * dbb0 + ps1 * dbb1 + ps2 * dbb2 + ps3 * dbb3).real();
+                            }
+                        } // end ip
+                    }     // ia
+                }
+                sum += atom_na[it] * Nprojs;
+                iat += atom_na[it];
+            } // end it
+#ifdef _OPENMP
+        }
 #endif
         stress[ipol * 3 + jpol] += local_stress;
     }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index 62e373f1ff..672fd0800c 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -103,6 +103,27 @@ struct cal_stress_nl_op
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* stress);
+    // interface for nspin=4 only
+    void operator()(const Device* ctx,
+                    const int& ipol,
+                    const int& jpol,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress);
 };
 
 template <typename T, typename Device>
@@ -250,6 +271,27 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* stress);
+    // interface for nspin=4 only
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& ipol,
+                    const int& jpol,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int& deeq_2,
+                    const int& deeq_3,
+                    const int& deeq_4,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* d_ekb,
+                    const FPTYPE* qq_nt,
+                    const std::complex<FPTYPE>* deeq_nc,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress);
 };
 
 // cpu version first, gpu version later
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
index 7a97be57d7..f05e64458a 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_kin.cpp
@@ -29,8 +29,9 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
 	int npwx=0;
     for (int ik = 0; ik < p_kv->get_nks(); ik++)
     {
-        if (npwx < p_kv->ngk[ik])
+        if (npwx < p_kv->ngk[ik]) {
             npwx = p_kv->ngk[ik];
+}
     }
 
     gk[0]= new FPTYPE[npwx];
@@ -39,6 +40,7 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
     FPTYPE tpiba = ModuleBase::TWO_PI / GlobalC::ucell.lat0;
     FPTYPE twobysqrtpi = 2.0 / std::sqrt(ModuleBase::PI);
     FPTYPE* kfac = new FPTYPE[npwx];
+    const int npol = GlobalC::ucell.get_npol();
 
     for (int ik = 0; ik < p_kv->get_nks(); ik++)
     {
@@ -71,8 +73,8 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
             {
                 for (int ibnd = 0; ibnd < GlobalV::NBANDS; ibnd++)
                 {
-                    if (std::fabs(wg(ik, ibnd)) < ModuleBase::threshold_wg * wg(ik, 0))
-                        continue;
+                    if (wg(ik, ibnd) == 0.0) { continue;
+}
                     const std::complex<FPTYPE>* ppsi = nullptr;
                     ppsi = &(psi_in[0](ik, ibnd, 0));
 
@@ -85,6 +87,18 @@ void Stress_Func<FPTYPE, Device>::stress_kin(ModuleBase::matrix& sigma,
                         sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i]
                                * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real()));
                     }
+                    if(npol == 2)
+                    {
+                        ppsi = &(psi_in[0](ik, ibnd, npwx));
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : sum)
+#endif
+                        for (int i = 0; i < npw; i++)
+                        {
+                            sum += wg(ik, ibnd) * gk[l][i] * gk[m][i] * kfac[i]
+                                   * (FPTYPE((conj(ppsi[i]) * ppsi[i]).real()));
+                        }
+                    }
                     s_kin[l][m] += sum;
                 }
             }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
index be64b4e573..cb306fceeb 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
@@ -22,7 +22,8 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
 
 	const int nspin_rho = (GlobalV::NSPIN == 2) ? 2 : 1;
 
-	if (PARAM.inp.gamma_only && is_pw) fact=2.0;
+	if (PARAM.inp.gamma_only && is_pw) { fact=2.0;
+}
 
     
 
@@ -50,11 +51,11 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
 				aux[ir] = std::complex<FPTYPE>(chr->rho[0][ir], 0.0 );
 			}
 		}
-		for (int is = 1; is < nspin_rho; is++)
+		if(nspin_rho == 2)
 		{
 			for (int ir = irb; ir < ir_end; ++ir)
 			{ // accumulate aux
-				aux[ir] += std::complex<FPTYPE>(chr->rho[is][ir], 0.0 );
+				aux[ir] += std::complex<FPTYPE>(chr->rho[1][ir], 0.0 );
 			}
 		}
  	}
@@ -70,12 +71,13 @@ void Stress_Func<FPTYPE, Device>::stress_loc(ModuleBase::matrix& sigma,
 		{
 			for (int ig=0; ig<rho_basis->npw; ig++)
 			{
-                if (rho_basis->ig_gge0 == ig)
+                if (rho_basis->ig_gge0 == ig) {
                     evloc += GlobalC::ppcell.vloc(it, rho_basis->ig2igg[ig])
                              * (p_sf->strucFac(it, ig) * conj(aux[ig])).real();
-                else
+                } else {
                     evloc += GlobalC::ppcell.vloc(it, rho_basis->ig2igg[ig])
                              * (p_sf->strucFac(it, ig) * conj(aux[ig]) * fact).real();
+}
             }
 		}
     }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
index b243d1ff5d..1ff91afe6c 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
@@ -48,7 +48,7 @@ void Stress_Func<FPTYPE, Device>::stress_nl(ModuleBase::matrix& sigma,
                 break;
             }
         }
-        const int npm = ucell_in.get_npol() * nbands_occ;
+        const int npm = nbands_occ;
 
         // calculate becp = <psi|beta> for all beta functions
         nl_tools.cal_becp(ik, npm);