Merge pull request #341 from denghuilu/devel

fix bug of nbor sorting
deepmodeling · Feb 1, 2021 · 64eaa2e · 64eaa2e
2 parents 71d4568 + 9a39df9
commit 64eaa2e
Show file tree

Hide file tree

Showing 5 changed files with 88 additions and 291 deletions.
diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h
@@ -169,7 +169,7 @@ void compute_descriptor_se_a_cpu (
 }
 
 template<typename FPTYPE>
-void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
+void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
     // set & normalize coord
     std::vector<FPTYPE> d_coord3(nall * 3);
     for (int ii = 0; ii < nall; ++ii) {
@@ -235,8 +235,8 @@ void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * i
 
 #if GOOGLE_CUDA
 template<typename FPTYPE>
-void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
-    DescrptSeAGPUExecuteFunctor<FPTYPE>()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number);
+void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
+    DescrptSeAGPUExecuteFunctor<FPTYPE>()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size);
 }
 #endif // GOOGLE_CUDA
 // ******************************************************************************
@@ -432,7 +432,7 @@ void compute_descriptor_se_r_cpu (
 }
 
 template<typename FPTYPE>
-void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
+void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
     // set & normalize coord
     std::vector<FPTYPE> d_coord3(nall * 3);
     for (int ii = 0; ii < nall; ++ii) {
@@ -498,8 +498,8 @@ void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * i
 
 #if GOOGLE_CUDA
 template<typename FPTYPE>
-void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
-    DescrptSeRGPUExecuteFunctor<FPTYPE>()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number);
+void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
+    DescrptSeRGPUExecuteFunctor<FPTYPE>()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size);
 }
 #endif // GOOGLE_CUDA
 // ******************************************************************************

diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu
@@ -84,9 +84,9 @@ __global__ void format_nlist_fill_a_se_a(const FPTYPE * coord,
                             const float rcut,
                             int_64 * key,
                             int * i_idx,
-                            const int MAGIC_NUMBER)
+                            const int MAX_NBOR_SIZE)
 {   
-    // <<<nloc, MAGIC_NUMBER>>>
+    // <<<nloc, MAX_NBOR_SIZE>>>
     const unsigned int idx = blockIdx.x;
     const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
 
@@ -98,7 +98,7 @@ __global__ void format_nlist_fill_a_se_a(const FPTYPE * coord,
     const int * nei_idx = jlist + jrange[i_idx[idx]];
     // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize);
 
-    int_64 * key_in = key + idx * MAGIC_NUMBER;
+    int_64 * key_in = key + idx * MAX_NBOR_SIZE;
 
     FPTYPE diff[3];
     const int & j_idx = nei_idx[idy];
@@ -121,7 +121,7 @@ __global__ void format_nlist_fill_b_se_a(int * nlist,
                             const int * sec_a,
                             const int sec_a_size,
                             int * nei_iter_dev,
-                            const int MAGIC_NUMBER)
+                            const int MAX_NBOR_SIZE)
 { 
 
     const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
@@ -132,13 +132,13 @@ __global__ void format_nlist_fill_b_se_a(int * nlist,
 
     int * row_nlist = nlist + idy * nlist_size;
     int * nei_iter = nei_iter_dev + idy * sec_a_size;
-    int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER;
+    int_64 * key_out = key + nloc * MAX_NBOR_SIZE + idy * MAX_NBOR_SIZE;
 
     for (int ii = 0; ii < sec_a_size; ii++) {
         nei_iter[ii] = sec_a[ii];
     }
 
-    for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) {
+    for (unsigned int kk = 0; key_out[kk] != key_out[MAX_NBOR_SIZE - 1]; kk++) {
         const int & nei_type = key_out[kk] / 1E15;
         if (nei_iter[nei_type] < sec_a[nei_type + 1]) {
             row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000;
@@ -228,73 +228,6 @@ __global__ void compute_descriptor_se_a (FPTYPE* descript,
     }
 }
 
-template<typename FPTYPE>
-void format_nbor_list_256 (
-    const FPTYPE* coord,
-    const int* type,
-    const int* jrange,
-    const int* jlist,
-    const int& nloc,       
-    const float& rcut_r, 
-    int * i_idx, 
-    int_64 * key
-) 
-{   
-    const int LEN = 256;
-    const int MAGIC_NUMBER = 256;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
-    dim3 block_grid(nloc, nblock);
-    dim3 thread_grid(1, LEN);
-    format_nlist_fill_a_se_a
-    <<<block_grid, thread_grid>>> (
-        coord,
-        type,
-        jrange,
-        jlist,
-        rcut_r,
-        key,
-        i_idx,
-        MAGIC_NUMBER
-    );
-    const int ITEMS_PER_THREAD = 4;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
-    // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
-}
-
-template<typename FPTYPE>
-void format_nbor_list_512 (
-    const FPTYPE* coord,
-    const int* type,
-    const int* jrange,
-    const int* jlist,
-    const int& nloc,       
-    const float& rcut_r, 
-    int * i_idx, 
-    int_64 * key
-) 
-{   
-    const int LEN = 256;
-    const int MAGIC_NUMBER = 512;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
-    dim3 block_grid(nloc, nblock);
-    dim3 thread_grid(1, LEN);
-    format_nlist_fill_a_se_a
-    <<<block_grid, thread_grid>>> (
-        coord,
-        type,
-        jrange,
-        jlist,
-        rcut_r,
-        key,
-        i_idx,
-        MAGIC_NUMBER
-    );
-    const int ITEMS_PER_THREAD = 4;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
-    // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
-}
 
 template<typename FPTYPE>
 void format_nbor_list_1024 (
@@ -309,8 +242,8 @@ void format_nbor_list_1024 (
 ) 
 {   
     const int LEN = 256;
-    const int MAGIC_NUMBER = 1024;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    const int MAX_NBOR_SIZE = 1024;
+    const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(1, LEN);
     format_nlist_fill_a_se_a
@@ -322,12 +255,12 @@ void format_nbor_list_1024 (
         rcut_r,
         key,
         i_idx,
-        MAGIC_NUMBER
+        MAX_NBOR_SIZE
     );
     const int ITEMS_PER_THREAD = 8;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
     // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAX_NBOR_SIZE);
 }
 
 template<typename FPTYPE>
@@ -343,8 +276,8 @@ void format_nbor_list_2048 (
 ) 
 {   
     const int LEN = 256;
-    const int MAGIC_NUMBER = 2048;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    const int MAX_NBOR_SIZE = 2048;
+    const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(1, LEN);
     format_nlist_fill_a_se_a
@@ -356,12 +289,12 @@ void format_nbor_list_2048 (
         rcut_r,
         key,
         i_idx,
-        MAGIC_NUMBER
+        MAX_NBOR_SIZE
     );
     const int ITEMS_PER_THREAD = 8;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
     // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAX_NBOR_SIZE);
 }
 
 template<typename FPTYPE>
@@ -377,8 +310,8 @@ void format_nbor_list_4096 (
 ) 
 {   
     const int LEN = 256;
-    const int MAGIC_NUMBER = 4096;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    const int MAX_NBOR_SIZE = 4096;
+    const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(1, LEN);
     format_nlist_fill_a_se_a
@@ -390,16 +323,16 @@ void format_nbor_list_4096 (
         rcut_r,
         key,
         i_idx,
-        MAGIC_NUMBER
+        MAX_NBOR_SIZE
     );
     const int ITEMS_PER_THREAD = 16;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
     // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAX_NBOR_SIZE);
 }
 
 template <typename FPTYPE>
-void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) {
+void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
     const int LEN = 256;
     int nblock = (nloc + LEN -1) / LEN;
     int * sec_a_dev = array_int;
@@ -409,7 +342,7 @@ void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
 
     cudaError_t res = cudaSuccess;
     res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res);    
-    res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res);
+    res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * max_nbor_size); cudaErrcheck(res);
     res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res);
     res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res);
     res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res);
@@ -419,29 +352,7 @@ void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
         // cudaProfilerStart();
         get_i_idx_se_a<<<nblock, LEN>>> (nloc, ilist, i_idx);
 
-        if (nnei <= 256) {
-            format_nbor_list_256 (
-                coord,
-                type,
-                jrange,
-                jlist,
-                nloc,       
-                rcut_r, 
-                i_idx, 
-                key
-            ); 
-        } else if (nnei <= 512) {
-            format_nbor_list_512 (
-                coord,
-                type,
-                jrange,
-                jlist,
-                nloc,       
-                rcut_r, 
-                i_idx, 
-                key
-            ); 
-        } else if (nnei <= 1024) {
+        if (max_nbor_size <= 1024) {
             format_nbor_list_1024 (
                 coord,
                 type,
@@ -452,7 +363,7 @@ void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
                 i_idx, 
                 key
             ); 
-        } else if (nnei <= 2048) {
+        } else if (max_nbor_size <= 2048) {
             format_nbor_list_2048 (
                 coord,
                 type,
@@ -463,7 +374,7 @@ void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
                 i_idx, 
                 key
             ); 
-        } else if (nnei <= 4096) {
+        } else if (max_nbor_size <= 4096) {
             format_nbor_list_4096 (
                 coord,
                 type,
@@ -486,7 +397,7 @@ void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
                             sec_a_dev,
                             sec_a.size(),
                             nei_iter,
-                            MAGIC_NUMBER
+                            max_nbor_size
         );
     }