cp2k · oschuett · May 16, 2023 · May 15, 2023 · May 15, 2023 · May 15, 2023
diff --git a/src/grid/Makefile b/src/grid/Makefile
@@ -5,7 +5,7 @@ all: grid_miniapp.x grid_unittest.x
 clean:
 	rm -fv *.o */*.o *.x ../offload/*.o
 
-CFLAGS := -fopenmp -g -O3 -march=native -Wall -Wextra
+CFLAGS := -fopenmp -g -O3 -march=native -Wall -Wextra -Wno-vla-parameter
 NVFLAGS := -g -O3 -lineinfo -arch sm_70 -Wno-deprecated-gpu-targets -Xcompiler "$(CFLAGS)" -D__OFFLOAD_CUDA
 LIBS := -lm -lblas
 

diff --git a/src/grid/README.md b/src/grid/README.md
@@ -36,7 +36,7 @@ Beware that MPI ranks can overwrite each other's files.
 The resulting .task files are human readable and diffable:
 
 ```task-file
-#Grid collocate task v9
+#Grid task v10
 orthorhombic 1
 border_mask 0
 func 100
@@ -49,18 +49,18 @@ For more information see [grid_replay.c](grid_replay.c).
 
 ## MiniApp
 
-The `grid_collocate_miniapp.x` binary allows to run individual .task files.
+The `grid_miniapp.x` binary allows running individual .task files.
 By default `grid_ref_collocate_pgf_product` is called. When the `--batch` flag
 is set then `grid_collocate_task_list` is called instead.
 
 ```shell
 $ cd cp2k/src/grid
 $ make
-$ ./grid_collocate_miniapp.x
-Usage: grid_base_ref_miniapp.x [--batch <cycles-per-block>] <cycles> <task-file>
+$ ./grid_miniapp.x
+Usage: grid_miniapp.x [--integrate] [--batch <cycles-per-block>] <cycles> <task-file>
 
-$ ./grid_collocate_miniapp.x --batch 10 100 sample_tasks/collocate_ortho_density_l2200.task
-Task: sample_tasks/collocate_ortho_density_l2200.task                     Batched: yes   Cycles: 1.000000e+02   Max value: 1.579830e+02   Max diff: 1.705303e-13   Time: 1.884854e-03 sec
+$ ./grid_miniapp.x --batch 10 100 ./sample_tasks/ortho_density_l2200.task
+Task: ./sample_tasks/ortho_density_l2200.task                   Collocate Batched   Cycles: 1.000000e+02   Max value: 1.579830e+02   Max rel diff: 7.435177e-11   Time: 1.438550e-04 sec
 ```
 
 ## Unit Test
@@ -105,3 +105,71 @@ Task: ../../src/grid/sample_tasks/general_overflow.task         Collocate Batche
 
 All tests have passed :-)
 ```
+
+## CUDA Register Usage
+
+When modifying the CUDA kernels, keep an eye on the register usage:
+
+```shell
+$ cd cp2k/src/grid
+$ make
+$ cuobjdump --dump-resource-usage ./gpu/grid_gpu_collocate.o
+
+Fatbin elf code:
+================
+arch = sm_70
+code version = [1,7]
+host = linux
+compile_size = 64bit
+identifier = grid_gpu_collocate.cu
+
+Resource usage:
+ Common:
+  GLOBAL:328 CONSTANT[3]:18848
+ Function _Z24collocate_kernel_anyfunc13kernel_params:
+  REG:147 STACK:0 SHARED:416 LOCAL:0 CONSTANT[0]:600 CONSTANT[2]:320 TEXTURE:0 SURFACE:0 SAMPLER:0
+ Function _Z24collocate_kernel_density13kernel_params:
+  REG:71 STACK:0 SHARED:416 LOCAL:0 CONSTANT[0]:600 CONSTANT[2]:104 TEXTURE:0 SURFACE:0 SAMPLER:0
+
+Fatbin ptx code:
+================
+arch = sm_70
+code version = [7,8]
+host = linux
+compile_size = 64bit
+compressed
+identifier = grid_gpu_collocate.cu
+ptxasOptions =  --generate-line-info
+
+$ cuobjdump --dump-resource-usage ./gpu/grid_gpu_integrate.o
+
+Fatbin elf code:
+================
+arch = sm_70
+code version = [1,7]
+host = linux
+compile_size = 64bit
+identifier = grid_gpu_integrate.cu
+
+Resource usage:
+ Common:
+  GLOBAL:328 CONSTANT[3]:18848
+ Function _Z25grid_integrate_tau_forces13kernel_params:
+  REG:135 STACK:64 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:208 TEXTURE:0 SURFACE:0 SAMPLER:0
+ Function _Z29grid_integrate_density_forces13kernel_params:
+  REG:130 STACK:16 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:208 TEXTURE:0 SURFACE:0 SAMPLER:0
+ Function _Z18grid_integrate_tau13kernel_params:
+  REG:88 STACK:0 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:208 TEXTURE:0 SURFACE:0 SAMPLER:0
+ Function _Z22grid_integrate_density13kernel_params:
+  REG:78 STACK:0 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:208 TEXTURE:0 SURFACE:0 SAMPLER:0
+
+Fatbin ptx code:
+================
+arch = sm_70
+code version = [7,8]
+host = linux
+compile_size = 64bit
+compressed
+identifier = grid_gpu_integrate.cu
+ptxasOptions =  --generate-line-info
+```
diff --git a/src/grid/common/grid_common.h b/src/grid/common/grid_common.h
@@ -71,12 +71,13 @@ GRID_HOST_DEVICE static inline double fac(const int i) {
  * \author Ole Schuett
  ******************************************************************************/
 GRID_HOST_DEVICE static inline int ncoset(const int l) {
-  static const int table[] = {1,  // l=0
+  static const int table[] = {0,  // l=-1, useful for computing loop bounds
+                              1,  // l=0
                               4,  // l=1
                               10, // l=2 ...
                               20,  35,  56,  84,  120, 165, 220,  286,
                               364, 455, 560, 680, 816, 969, 1140, 1330};
-  return table[l];
+  return table[l + 1];
 }
 
 /*******************************************************************************