Commit

Make clusterizer kernels independent of the grid size (#588)
VinInn authored and fwyzard committed Dec 29, 2020
1 parent ddf8b5a commit e149c26
Showing 3 changed files with 304 additions and 322 deletions.
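
The change below replaces the kernel's early exit (one block per module; blocks beyond `moduleStart[0]` returned immediately) with a grid-stride loop over modules, so the same kernel is correct for any grid size. A minimal sketch of the idiom, reduced to a toy per-module reduction; `sumPerModule`, `moduleOf` and the other names are illustrative only, not CMSSW code:

// Toy illustration of the block-level grid-stride idiom adopted by this commit.
// Each block handles module m = blockIdx.x, blockIdx.x + gridDim.x, ..., so a
// launch with 1 block or with nModules blocks produces identical results.
__global__ void sumPerModule(
    int const* values, int const* moduleOf, int nValues, int* sums, int nModules) {
  for (int m = blockIdx.x; m < nModules; m += gridDim.x) {
    for (int i = threadIdx.x; i < nValues; i += blockDim.x)
      if (moduleOf[i] == m)
        atomicAdd(&sums[m], values[i]);  // sums must be zeroed by the caller
    // the module-loop trip count is uniform across the block, so this is safe;
    // it keeps the block in step before per-module shared state is reused
    __syncthreads();
  }
}

// Equivalent launches (same sums; only occupancy differs):
//   sumPerModule<<<nModules, 256>>>(v, mod, n, sums, nModules);
//   sumPerModule<<<64, 256>>>(v, mod, n, sums, nModules);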
173 changes: 87 additions & 86 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
@@ -19,105 +19,106 @@ namespace gpuClustering {
       uint32_t const* __restrict__ moduleId,  // module id of each module
       int32_t* __restrict__ clusterId,        // modified: cluster id of each pixel
       uint32_t numElements) {
-    if (blockIdx.x >= moduleStart[0])
-      return;
-
-    auto firstPixel = moduleStart[1 + blockIdx.x];
-    auto thisModuleId = id[firstPixel];
-    assert(thisModuleId < MaxNumModules);
-    assert(thisModuleId == moduleId[blockIdx.x]);
+    __shared__ int32_t charge[MaxNumClustersPerModules];
+    __shared__ uint8_t ok[MaxNumClustersPerModules];
+    __shared__ uint16_t newclusId[MaxNumClustersPerModules];
 
-    auto nclus = nClustersInModule[thisModuleId];
-    if (nclus == 0)
-      return;
+    auto firstModule = blockIdx.x;
+    auto endModule = moduleStart[0];
+    for (auto module = firstModule; module < endModule; module += gridDim.x) {
+      auto firstPixel = moduleStart[1 + module];
+      auto thisModuleId = id[firstPixel];
+      assert(thisModuleId < MaxNumModules);
+      assert(thisModuleId == moduleId[module]);
+
+      auto nclus = nClustersInModule[thisModuleId];
+      if (nclus == 0)
+        continue;
 
-    if (threadIdx.x == 0 && nclus > MaxNumClustersPerModules)
-      printf("Warning too many clusters in module %d in block %d: %d > %d\n",
-             thisModuleId,
-             blockIdx.x,
-             nclus,
-             MaxNumClustersPerModules);
+      if (threadIdx.x == 0 && nclus > MaxNumClustersPerModules)
+        printf("Warning too many clusters in module %d in block %d: %d > %d\n",
+               thisModuleId,
+               blockIdx.x,
+               nclus,
+               MaxNumClustersPerModules);
 
-    auto first = firstPixel + threadIdx.x;
+      auto first = firstPixel + threadIdx.x;
 
-    if (nclus > MaxNumClustersPerModules) {
-      // remove excess FIXME find a way to cut charge first....
-      for (auto i = first; i < numElements; i += blockDim.x) {
-        if (id[i] == InvId)
-          continue;  // not valid
-        if (id[i] != thisModuleId)
-          break;  // end of module
-        if (clusterId[i] >= MaxNumClustersPerModules) {
-          id[i] = InvId;
-          clusterId[i] = InvId;
-        }
-      }
-      nclus = MaxNumClustersPerModules;
-    }
+      if (nclus > MaxNumClustersPerModules) {
+        // remove excess FIXME find a way to cut charge first....
+        for (auto i = first; i < numElements; i += blockDim.x) {
+          if (id[i] == InvId)
+            continue;  // not valid
+          if (id[i] != thisModuleId)
+            break;  // end of module
+          if (clusterId[i] >= MaxNumClustersPerModules) {
+            id[i] = InvId;
+            clusterId[i] = InvId;
+          }
+        }
+        nclus = MaxNumClustersPerModules;
+      }
 
 #ifdef GPU_DEBUG
-    if (thisModuleId % 100 == 1)
-      if (threadIdx.x == 0)
-        printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
+      if (thisModuleId % 100 == 1)
+        if (threadIdx.x == 0)
+          printf("start cluster charge cut for module %d in block %d\n", thisModuleId, blockIdx.x);
 #endif
 
-    __shared__ int32_t charge[MaxNumClustersPerModules];
-    __shared__ uint8_t ok[MaxNumClustersPerModules];
-    __shared__ uint16_t newclusId[MaxNumClustersPerModules];
-
-    assert(nclus <= MaxNumClustersPerModules);
-    for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
-      charge[i] = 0;
-    }
-    __syncthreads();
+      assert(nclus <= MaxNumClustersPerModules);
+      for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
+        charge[i] = 0;
+      }
+      __syncthreads();
 
-    for (auto i = first; i < numElements; i += blockDim.x) {
-      if (id[i] == InvId)
-        continue;  // not valid
-      if (id[i] != thisModuleId)
-        break;  // end of module
-      atomicAdd(&charge[clusterId[i]], adc[i]);
-    }
-    __syncthreads();
+      for (auto i = first; i < numElements; i += blockDim.x) {
+        if (id[i] == InvId)
+          continue;  // not valid
+        if (id[i] != thisModuleId)
+          break;  // end of module
+        atomicAdd(&charge[clusterId[i]], adc[i]);
+      }
+      __syncthreads();
 
-    auto chargeCut = thisModuleId < 96 ? 2000 : 4000;  // move in constants (calib?)
-    for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
-      newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0;
-    }
+      auto chargeCut = thisModuleId < 96 ? 2000 : 4000;  // move in constants (calib?)
+      for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
+        newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0;
+      }
 
-    __syncthreads();
+      __syncthreads();
 
-    // renumber
-    __shared__ uint16_t ws[32];
-    cms::cuda::blockPrefixScan(newclusId, nclus, ws);
+      // renumber
+      __shared__ uint16_t ws[32];
+      cms::cuda::blockPrefixScan(newclusId, nclus, ws);
 
-    assert(nclus >= newclusId[nclus - 1]);
+      assert(nclus >= newclusId[nclus - 1]);
 
-    if (nclus == newclusId[nclus - 1])
-      return;
+      if (nclus == newclusId[nclus - 1])
+        continue;
 
-    nClustersInModule[thisModuleId] = newclusId[nclus - 1];
-    __syncthreads();
+      nClustersInModule[thisModuleId] = newclusId[nclus - 1];
+      __syncthreads();
 
-    // mark bad cluster again
-    for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
-      if (0 == ok[i])
-        newclusId[i] = InvId + 1;
-    }
-    __syncthreads();
+      // mark bad cluster again
+      for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
+        if (0 == ok[i])
+          newclusId[i] = InvId + 1;
+      }
+      __syncthreads();
 
-    // reassign id
-    for (auto i = first; i < numElements; i += blockDim.x) {
-      if (id[i] == InvId)
-        continue;  // not valid
-      if (id[i] != thisModuleId)
-        break;  // end of module
-      clusterId[i] = newclusId[clusterId[i]] - 1;
-      if (clusterId[i] == InvId)
-        id[i] = InvId;
-    }
+      // reassign id
+      for (auto i = first; i < numElements; i += blockDim.x) {
+        if (id[i] == InvId)
+          continue;  // not valid
+        if (id[i] != thisModuleId)
+          break;  // end of module
+        clusterId[i] = newclusId[clusterId[i]] - 1;
+        if (clusterId[i] == InvId)
+          id[i] = InvId;
+      }
 
-    //done
+      //done
+    }  // loop on modules
   }
 
 }  // namespace gpuClustering
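
The renumbering in the kernel above is a stream compaction: `ok[i]` flags clusters passing the charge cut, `cms::cuda::blockPrefixScan` turns those flags into each surviving cluster's 1-based new id, and failed clusters are then sent to `InvId + 1` so that `newclusId[clusterId[i]] - 1 == InvId` invalidates their pixels. A sequential model of the same bookkeeping (plain C++; `InvId = 9999` is an assumption, taken from gpuClusteringConstants.h rather than from this diff):

#include <cstdint>
#include <vector>

// Sequential model of the in-kernel renumbering, for illustration only.
// Assumption: InvId = 9999, as defined in gpuClusteringConstants.h.
constexpr uint16_t InvId = 9999;

uint32_t renumberClusters(std::vector<int32_t> const& charge,
                          int32_t chargeCut,
                          std::vector<uint16_t>& newclusId) {
  auto nclus = static_cast<uint32_t>(charge.size());
  newclusId.assign(nclus, 0);
  uint16_t nGood = 0;
  for (uint32_t i = 0; i < nclus; ++i) {
    bool ok = charge[i] > chargeCut;  // same cut as in the kernel
    if (ok)
      ++nGood;                        // inclusive prefix scan of the pass flags
    newclusId[i] = ok ? nGood : uint16_t(InvId + 1);  // "mark bad cluster again"
  }
  // a pixel in old cluster c is reassigned to newclusId[c] - 1; for a failed
  // cluster that value is InvId, so the pixel is invalidated
  return nGood;  // the value written back to nClustersInModule
}

In the kernel the scan is done cooperatively by the whole block, with `ws[32]` as a per-warp workspace, but the mapping it produces is exactly this one.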
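The `// move in constants (calib?)` note in the diff flags the hard-coded thresholds. A trivial way to factor them out; reading module ids below 96 as the innermost barrel layer (hence the lower cut) is an assumption about the detector numbering, not something this diff states:

// Hypothetical helper, not part of the commit.
constexpr int32_t clusterChargeCut(uint32_t moduleId) {
  return moduleId < 96 ? 2000 : 4000;  // thresholds, assumed to be in electrons
}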

0 comments on commit e149c26
