pass integer zero to memset

`memset`, `cudaMemset`, and `hipMemset` only accept an integer as the fill value; that value is interpreted as a single byte and written to every byte of the destination range. Passing a floating-point literal such as `0.0` is therefore meaningless (it is merely implicitly converted to an integer), whether the compiler optimizes it or not. References: https://www.cplusplus.com/reference/cstring/memset/ https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gf7338650f7683c51ee26aadc6973c63a https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP_API/Memory-Management.html#hipmemset
deepmodeling · Feb 22, 2022 · 90995b6 · 90995b6
1 parent d9a4a86
commit 90995b6
Show file tree

Hide file tree

Showing 14 changed files with 60 additions and 60 deletions.
diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu
@@ -539,9 +539,9 @@ void prod_env_mat_a_gpu_cuda(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 4;
-  DPErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
-  DPErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  DPErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(cudaMemset(em_deriv, 0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
+  DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * nloc * nnei * 3));
 
   format_nbor_list_gpu_cuda(
       nlist, 
@@ -578,9 +578,9 @@ void prod_env_mat_r_gpu_cuda(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 1;
-  DPErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
-  DPErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  DPErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(cudaMemset(em_deriv, 0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
+  DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * nloc * nnei * 3));
 
   format_nbor_list_gpu_cuda(
       nlist, 

diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu
@@ -109,7 +109,7 @@ void prod_force_a_gpu_cuda(
   const int ndescrpt = nnei * 4;
   DPErrcheck(cudaMemset(
       force, 
-      0.0, sizeof(FPTYPE) * nall * 3));
+      0, sizeof(FPTYPE) * nall * 3));
 
   force_deriv_wrt_center_atom<FPTYPE, TPB> <<<nloc, TPB>>>(
       force, 
@@ -141,7 +141,7 @@ void prod_force_r_gpu_cuda(
   const int ndescrpt = nnei * 1;
   DPErrcheck(cudaMemset(
       force, 
-      0.0, sizeof(FPTYPE) * nall * 3));
+      0, sizeof(FPTYPE) * nall * 3));
 
   force_deriv_wrt_center_atom<FPTYPE, TPB> <<<nloc, TPB>>>(
       force, 

diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/cuda/prod_force_grad.cu
@@ -89,7 +89,7 @@ void prod_force_grad_a_gpu_cuda(
     const int ndescrpt = nnei * 4;
     DPErrcheck(cudaMemset(
         grad_net, 
-        0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+        0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int nblock = (ndescrpt + TPB - 1) / TPB;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(TPB, 1);
@@ -122,7 +122,7 @@ void prod_force_grad_r_gpu_cuda(
     const int ndescrpt = nnei * 1;
     DPErrcheck(cudaMemset(
         grad_net, 
-        0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+        0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int nblock = (ndescrpt + TPB - 1) / TPB;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(TPB, 1);

diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu
@@ -116,10 +116,10 @@ void prod_virial_a_gpu_cuda(
 {
   DPErrcheck(cudaMemset(
       virial, 
-      0.0, sizeof(FPTYPE) * 9));
+      0, sizeof(FPTYPE) * 9));
   DPErrcheck(cudaMemset(
       atom_virial, 
-      0.0, sizeof(FPTYPE) * 9 * nall));
+      0, sizeof(FPTYPE) * 9 * nall));
 
   const int LEN = 16;
   int nblock = (nnei + LEN - 1) / LEN;
@@ -153,10 +153,10 @@ void prod_virial_r_gpu_cuda(
 {
   DPErrcheck(cudaMemset(
       virial, 
-      0.0, sizeof(FPTYPE) * 9));
+      0, sizeof(FPTYPE) * 9));
   DPErrcheck(cudaMemset(
       atom_virial, 
-      0.0, sizeof(FPTYPE) * 9 * nall));
+      0, sizeof(FPTYPE) * 9 * nall));
 
   const int LEN = 16;
   int nblock = (nnei + LEN - 1) / LEN;

diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/cuda/prod_virial_grad.cu
@@ -100,7 +100,7 @@ void prod_virial_grad_a_gpu_cuda(
     const int ndescrpt = nnei * 4;
     DPErrcheck(cudaMemset(
         grad_net, 
-        0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+        0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int LEN = 128;
     const int nblock = (nloc + LEN -1) / LEN;
     dim3 block_grid(nblock, nnei);
@@ -125,7 +125,7 @@ void prod_virial_grad_r_gpu_cuda(
     const int ndescrpt = nnei;
     DPErrcheck(cudaMemset(
         grad_net, 
-        0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+        0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int LEN = 128;
     const int nblock = (nloc + LEN -1) / LEN;
     dim3 block_grid(nblock, nnei);

diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu
@@ -648,10 +648,10 @@ void tabulate_fusion_se_a_grad_gpu_cuda(
   if (nloc <= 0) {return;}
   DPErrcheck(cudaMemset(
       dy_dem_x,
-      0.0, sizeof(FPTYPE) * nloc * nnei));
+      0, sizeof(FPTYPE) * nloc * nnei));
   DPErrcheck(cudaMemset(
       dy_dem,
-      0.0, sizeof(FPTYPE) * nloc * nnei * 4));
+      0, sizeof(FPTYPE) * nloc * nnei * 4));
 
   tabulate_fusion_se_a_grad_fifth_order_polynomial<FPTYPE, MM, KK> <<<nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size>>>(
       dy_dem_x, dy_dem,
@@ -676,7 +676,7 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(
   if (nloc <= 0) {return;}
   DPErrcheck(cudaMemset(
     dz_dy,
-    0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
+    0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
   tabulate_fusion_se_a_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK> <<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>(
       dz_dy,
       table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size);
@@ -721,10 +721,10 @@ void tabulate_fusion_se_t_grad_gpu_cuda(
   if (nloc <= 0) {return;}
   DPErrcheck(cudaMemset(
       dy_dem_x,
-      0.0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
+      0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
   DPErrcheck(cudaMemset(
       dy_dem,
-      0.0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
+      0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
 
   tabulate_fusion_se_t_grad_fifth_order_polynomial<FPTYPE, MM, KK> <<<nloc, KK * WARP_SIZE, sizeof(FPTYPE) * last_layer_size>>>(
       dy_dem_x, dy_dem,
@@ -750,7 +750,7 @@ void tabulate_fusion_se_t_grad_grad_gpu_cuda(
   if (nloc <= 0) {return;}
   DPErrcheck(cudaMemset(
     dz_dy,
-    0.0, sizeof(FPTYPE) * nloc * last_layer_size));
+    0, sizeof(FPTYPE) * nloc * last_layer_size));
 
   tabulate_fusion_se_t_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK> <<<nloc, last_layer_size>>>(
       dz_dy,
@@ -791,7 +791,7 @@ void tabulate_fusion_se_r_grad_gpu_cuda(
   if (nloc <= 0) {return;}
   DPErrcheck(cudaMemset(
       dy_dem,
-      0.0, sizeof(FPTYPE) * nloc * nnei));
+      0, sizeof(FPTYPE) * nloc * nnei));
 
   tabulate_fusion_se_r_grad_fifth_order_polynomial<FPTYPE, MM, KK> <<<nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size>>>(
       dy_dem,
@@ -814,7 +814,7 @@ void tabulate_fusion_se_r_grad_grad_gpu_cuda(
   if (nloc <= 0) {return;}
   DPErrcheck(cudaMemset(
     dz_dy,
-    0.0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
+    0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
   tabulate_fusion_se_r_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK> <<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>(
       dz_dy,
       table, em, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size);

diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc
@@ -34,7 +34,7 @@ prod_force_a_cpu(
 {
   const int ndescrpt = 4 * nnei;
 
-  memset(force, 0.0, sizeof(FPTYPE) * nall * 3);
+  memset(force, 0, sizeof(FPTYPE) * nall * 3);
   // compute force of a frame
   #pragma omp parallel
   for (int i_idx = 0; i_idx < nloc; ++i_idx) {

diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu
@@ -537,9 +537,9 @@ void prod_env_mat_a_gpu_rocm(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 4;
-  DPErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
-  DPErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  DPErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(hipMemset(em_deriv, 0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
+  DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * nloc * nnei * 3));
 
   format_nbor_list_gpu_rocm(
       nlist, 
@@ -576,9 +576,9 @@ void prod_env_mat_r_gpu_rocm(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 1;
-  DPErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
-  DPErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  DPErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(hipMemset(em_deriv, 0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
+  DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * nloc * nnei * 3));
 
   format_nbor_list_gpu_rocm(
       nlist, 

diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu
@@ -109,7 +109,7 @@ namespace deepmd {
     const int ndescrpt = nnei * 4;
     DPErrcheck(hipMemset(
         force, 
-        0.0, sizeof(FPTYPE) * nall * 3));
+        0, sizeof(FPTYPE) * nall * 3));
 
     hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom<FPTYPE, TPB>), nloc, TPB, 0, 0, 
         force, 
@@ -141,7 +141,7 @@ namespace deepmd {
     const int ndescrpt = nnei * 1;
     DPErrcheck(hipMemset(
         force, 
-        0.0, sizeof(FPTYPE) * nall * 3));
+        0, sizeof(FPTYPE) * nall * 3));
 
     hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom<FPTYPE, TPB>), nloc, TPB, 0, 0, 
         force, 

diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu
@@ -89,7 +89,7 @@ void prod_force_grad_a_gpu_rocm(
     const int ndescrpt = nnei * 4;
     DPErrcheck(hipMemset(
         grad_net, 
-        0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+        0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int nblock = (ndescrpt + TPB - 1) / TPB;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(TPB, 1);
@@ -121,7 +121,7 @@ void prod_force_grad_r_gpu_rocm(
     const int ndescrpt = nnei * 1;
     DPErrcheck(hipMemset(
         grad_net, 
-        0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+        0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int nblock = (ndescrpt + TPB - 1) / TPB;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(TPB, 1);

diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu
@@ -113,10 +113,10 @@ void prod_virial_a_gpu_rocm(
 {
     DPErrcheck(hipMemset(
         virial, 
-        0.0, sizeof(FPTYPE) * 9));
+        0, sizeof(FPTYPE) * 9));
     DPErrcheck(hipMemset(
       atom_virial, 
-      0.0, sizeof(FPTYPE) * 9 * nall));
+      0, sizeof(FPTYPE) * 9 * nall));
 
   const int LEN = 16;
   int nblock = (nnei + LEN -1) / LEN;
@@ -150,10 +150,10 @@ void prod_virial_r_gpu_rocm(
 {
     DPErrcheck(hipMemset(
         virial, 
-        0.0, sizeof(FPTYPE) * 9));
+        0, sizeof(FPTYPE) * 9));
     DPErrcheck(hipMemset(
       atom_virial, 
-      0.0, sizeof(FPTYPE) * 9 * nall));
+      0, sizeof(FPTYPE) * 9 * nall));
 
   const int LEN = 16;
   int nblock = (nnei + LEN -1) / LEN;

diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu
@@ -100,7 +100,7 @@ void prod_virial_grad_a_gpu_rocm(
     const int ndescrpt = nnei * 4;
     DPErrcheck(hipMemset(
         grad_net, 
-        0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+        0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int LEN = 128;
     const int nblock = (nloc + LEN -1) / LEN;
     dim3 block_grid(nblock, nnei);
@@ -125,7 +125,7 @@ void prod_virial_grad_r_gpu_rocm(
     const int ndescrpt = nnei;
     DPErrcheck(hipMemset(
         grad_net, 
-        0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+        0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int LEN = 128;
     const int nblock = (nloc + LEN -1) / LEN;
     dim3 block_grid(nblock, nnei);

diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu
@@ -637,10 +637,10 @@ void tabulate_fusion_se_a_grad_gpu_rocm(
   if(nloc <= 0) {return;}
   DPErrcheck(hipMemset(
       dy_dem_x,
-      0.0, sizeof(FPTYPE) * nloc * nnei));
+      0, sizeof(FPTYPE) * nloc * nnei));
   DPErrcheck(hipMemset(
       dy_dem,
-      0.0, sizeof(FPTYPE) * nloc * nnei * 4));
+      0, sizeof(FPTYPE) * nloc * nnei * 4));
 
   hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_a_grad_fifth_order_polynomial<FPTYPE, MM, KK>), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, 
       dy_dem_x, dy_dem,
@@ -665,7 +665,7 @@ void tabulate_fusion_se_a_grad_grad_gpu_rocm(
   if(nloc <= 0) {return;}
   DPErrcheck(hipMemset(
     dz_dy,
-    0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
+    0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
   hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_a_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, 
     dz_dy,
     table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size);
@@ -710,10 +710,10 @@ void tabulate_fusion_se_t_grad_gpu_rocm(
   if(nloc <= 0) {return;}
   DPErrcheck(hipMemset(
       dy_dem_x,
-      0.0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
+      0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
   DPErrcheck(hipMemset(
       dy_dem,
-      0.0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
+      0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
 
   hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_t_grad_fifth_order_polynomial<FPTYPE, MM, KK>), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * last_layer_size, 0, 
       dy_dem_x, dy_dem,
@@ -739,7 +739,7 @@ void tabulate_fusion_se_t_grad_grad_gpu_rocm(
   if(nloc <= 0) {return;}
   DPErrcheck(hipMemset(
     dz_dy,
-    0.0, sizeof(FPTYPE) * nloc * last_layer_size));
+    0, sizeof(FPTYPE) * nloc * last_layer_size));
   hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_t_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>), nloc, last_layer_size, 0, 0, 
     dz_dy,
     table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size);
@@ -779,7 +779,7 @@ void tabulate_fusion_se_r_grad_gpu_rocm(
   if(nloc <= 0) {return;}
   DPErrcheck(hipMemset(
       dy_dem,
-      0.0, sizeof(FPTYPE) * nloc * nnei));
+      0, sizeof(FPTYPE) * nloc * nnei));
 
   hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_r_grad_fifth_order_polynomial<FPTYPE, MM, KK>), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, 
       dy_dem,
@@ -802,7 +802,7 @@ void tabulate_fusion_se_r_grad_grad_gpu_rocm(
   if(nloc <= 0) {return;}
   DPErrcheck(hipMemset(
     dz_dy,
-    0.0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
+    0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
   hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_r_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, 
     dz_dy,
     table, em, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size);