rename cuda_config/hip_config to config

ginkgo-project · Nov 11, 2019 · aa76ecd · aa76ecd
1 parent a5c63fe
commit aa76ecd
Show file tree

Hide file tree

Showing 24 changed files with 186 additions and 205 deletions.
diff --git a/common/components/atomic.hpp.inc b/common/components/atomic.hpp.inc
@@ -114,7 +114,8 @@ GKO_BIND_ATOMIC_ADD(unsigned long long int);
 GKO_BIND_ATOMIC_ADD(float);
 
 
-#if !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC)
+#if !defined(__HIPCC__) || \
+    (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC)
 
 
 #if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \
@@ -142,7 +143,8 @@ GKO_BIND_ATOMIC_ADD(__half2);
 #endif
 
 
-#endif  // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_HCC)
+#endif  // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) &&
+        // GINKGO_HIP_PLATFORM_HCC)
 
 
 #undef GKO_BIND_ATOMIC_ADD
diff --git a/cuda/base/config.hpp b/cuda/base/config.hpp
@@ -39,7 +39,7 @@ namespace kernels {
 namespace cuda {
 
 
-struct cuda_config {
+struct config {
     /**
      * The number of threads within a CUDA warp.
      */

diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
@@ -180,7 +180,7 @@ void CudaExecutor::set_gpu_property()
         GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute(
             &num_multiprocessor_, cudaDevAttrMultiProcessorCount, device_id_));
         num_warps_per_sm_ = convert_sm_ver_to_cores(major_, minor_) /
-                            kernels::cuda::cuda_config::warp_size;
+                            kernels::cuda::config::warp_size;
     }
 }
 

diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh
@@ -459,8 +459,8 @@ __device__ __forceinline__ auto tiled_partition(const Group &g)
 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-notes
 template <size_type Size, typename Group>
 __device__ __forceinline__ gko::xstd::enable_if_t<
-    (Size <= kernels::cuda::cuda_config::warp_size) && (Size > 0) &&
-        (kernels::cuda::cuda_config::warp_size % Size == 0),
+    (Size <= kernels::cuda::config::warp_size) && (Size > 0) &&
+        (kernels::cuda::config::warp_size % Size == 0),
     thread_block_tile<Size>>
 tiled_partition(const Group &)
 {

diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh
@@ -48,7 +48,7 @@ namespace csr {
  * @internal
  *
  * @note assumes that block dimensions are in "standard format":
- *       (subwarp_size, cuda_config::warp_size / subwarp_size, z)
+ *       (subwarp_size, config::warp_size / subwarp_size, z)
  */
 template <
     int max_block_size, int warps_per_block, typename Group, typename ValueType,
@@ -64,7 +64,7 @@ __device__ __forceinline__ void extract_transposed_diag_blocks(
     ValueType *__restrict__ workspace)
 {
     const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-    const auto warp = group::tiled_partition<cuda_config::warp_size>(group);
+    const auto warp = group::tiled_partition<config::warp_size>(group);
     auto bid = static_cast<size_type>(blockIdx.x) * warps_per_block *
                    processed_blocks +
                threadIdx.z * processed_blocks;
@@ -90,7 +90,7 @@ __device__ __forceinline__ void extract_transposed_diag_blocks(
             const auto rstart = row_ptrs[row] + tid;
             const auto rend = row_ptrs[row + 1];
             // use the entire warp to ensure coalesced memory access
-            for (auto j = rstart; j < rend; j += cuda_config::warp_size) {
+            for (auto j = rstart; j < rend; j += config::warp_size) {
                 const auto col = col_idxs[j] - bstart;
                 if (col >= bsize) {
                     break;

diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh
@@ -92,22 +92,21 @@ namespace host_kernel {
  * It calculates the number of warps used in Coo Spmv depending on the GPU
  * architecture and the number of stored elements.
  */
-template <size_type subwarp_size = cuda_config::warp_size>
+template <size_type subwarp_size = config::warp_size>
 __host__ size_type calculate_nwarps(std::shared_ptr<const CudaExecutor> exec,
                                     const size_type nnz)
 {
     size_type warps_per_sm =
-        exec->get_num_warps_per_sm() * cuda_config::warp_size / subwarp_size;
+        exec->get_num_warps_per_sm() * config::warp_size / subwarp_size;
     size_type nwarps_in_cuda = exec->get_num_multiprocessor() * warps_per_sm;
     size_type multiple = 8;
     if (nnz >= 2000000) {
         multiple = 128;
     } else if (nnz >= 200000) {
         multiple = 32;
     }
-    return std::min(
-        multiple * nwarps_in_cuda,
-        static_cast<size_type>(ceildiv(nnz, cuda_config::warp_size)));
+    return std::min(multiple * nwarps_in_cuda,
+                    static_cast<size_type>(ceildiv(nnz, config::warp_size)));
 }
 
 

diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh
@@ -134,14 +134,14 @@ __device__ void reduce(const Group &__restrict__ group,
     const auto local_id = group.thread_rank();
 
 #pragma unroll
-    for (int k = group.size() / 2; k >= cuda_config::warp_size; k /= 2) {
+    for (int k = group.size() / 2; k >= config::warp_size; k /= 2) {
         group.sync();
         if (local_id < k) {
             data[local_id] = reduce_op(data[local_id], data[local_id + k]);
         }
     }
 
-    const auto warp = group::tiled_partition<cuda_config::warp_size>(group);
+    const auto warp = group::tiled_partition<config::warp_size>(group);
     const auto warp_id = group.thread_rank() / warp.size();
     if (warp_id > 0) {
         return;

diff --git a/cuda/components/thread_ids.cuh b/cuda/components/thread_ids.cuh
@@ -89,8 +89,8 @@ __device__ __forceinline__ size_type get_block_id()
  *         to
  *
  * @note Assumes that block dimensions are in standard format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)`
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *         config::warp_size)`
  */
 __device__ __forceinline__ size_type get_local_warp_id()
 {
@@ -110,13 +110,13 @@ __device__ __forceinline__ size_type get_local_warp_id()
  *         belongs to
  *
  * @note Assumes that block dimensions are in standard format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)`
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *         config::warp_size)`
  */
 template <int subwarp_size>
 __device__ __forceinline__ size_type get_local_subwarp_id()
 {
-    constexpr auto subwarps_per_warp = cuda_config::warp_size / subwarp_size;
+    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
     return get_local_warp_id() * subwarps_per_warp + threadIdx.y;
 }
 
@@ -132,8 +132,8 @@ __device__ __forceinline__ size_type get_local_subwarp_id()
  * @return the local ID of the thread (relative to the block)
  *
  * @note Assumes that block dimensions are in standard format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)`
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *         config::warp_size)`
  */
 template <int subwarp_size>
 __device__ __forceinline__ size_type get_local_thread_id()
@@ -153,8 +153,8 @@ __device__ __forceinline__ size_type get_local_thread_id()
  *
  * @note Assumes that block dimensions and grid dimensions are in standard
  *       format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)` and
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *         config::warp_size)` and
  *       `(block_group_size, first_grid_dimension, second grid_dimension)`,
  *       respectively.
  */
@@ -176,15 +176,15 @@ __device__ __forceinline__ size_type get_warp_id()
  *
  * @note Assumes that block dimensions and grid dimensions are in standard
  *       format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)` and
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *         config::warp_size)` and
  *       `(block_group_size, first_grid_dimension, second grid_dimension)`,
  *       respectively.
  */
 template <int subwarp_size, int warps_per_block>
 __device__ __forceinline__ size_type get_subwarp_id()
 {
-    constexpr auto subwarps_per_warp = cuda_config::warp_size / subwarp_size;
+    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
     return get_warp_id<warps_per_block>() * subwarps_per_warp + threadIdx.y;
 }
 
@@ -200,8 +200,8 @@ __device__ __forceinline__ size_type get_subwarp_id()
  *
  * @note Assumes that block dimensions and grid dimensions are in standard
  *       format:
- *       `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
- *         cuda_config::warp_size)` and
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *         config::warp_size)` and
  *       `(block_group_size, first_grid_dimension, second grid_dimension)`,
  *       respectively.
  */

diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu
@@ -70,8 +70,7 @@ namespace coo {
 
 constexpr int default_block_size = 512;
 constexpr int warps_in_block = 4;
-constexpr int spmv_block_size = warps_in_block * cuda_config::warp_size;
-using device_config = cuda_config;
+constexpr int spmv_block_size = warps_in_block * config::warp_size;
 
 
 #include "common/matrix/coo_kernels.hpp.inc"
@@ -113,23 +112,23 @@ void spmv2(std::shared_ptr<const CudaExecutor> exec,
 {
     const auto nnz = a->get_num_stored_elements();
     const auto b_ncols = b->get_size()[1];
-    const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1);
+    const dim3 coo_block(config::warp_size, warps_in_block, 1);
     const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);
 
     if (nwarps > 0) {
         if (b_ncols < 4) {
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
-            int num_lines = ceildiv(nnz, nwarps * cuda_config::warp_size);
+            int num_lines = ceildiv(nnz, nwarps * config::warp_size);
             abstract_spmv<<<coo_grid, coo_block>>>(
                 nnz, num_lines, as_cuda_type(a->get_const_values()),
                 a->get_const_col_idxs(), as_cuda_type(a->get_const_row_idxs()),
                 as_cuda_type(b->get_const_values()), b->get_stride(),
                 as_cuda_type(c->get_values()), c->get_stride());
         } else {
-            int num_elems = ceildiv(nnz, nwarps * cuda_config::warp_size) *
-                            cuda_config::warp_size;
+            int num_elems =
+                ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
-                                ceildiv(b_ncols, cuda_config::warp_size));
+                                ceildiv(b_ncols, config::warp_size));
             abstract_spmm<<<coo_grid, coo_block>>>(
                 nnz, num_elems, as_cuda_type(a->get_const_values()),
                 a->get_const_col_idxs(), as_cuda_type(a->get_const_row_idxs()),
@@ -151,12 +150,12 @@ void advanced_spmv2(std::shared_ptr<const CudaExecutor> exec,
 {
     const auto nnz = a->get_num_stored_elements();
     const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);
-    const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1);
+    const dim3 coo_block(config::warp_size, warps_in_block, 1);
     const auto b_ncols = b->get_size()[1];
 
     if (nwarps > 0) {
         if (b_ncols < 4) {
-            int num_lines = ceildiv(nnz, nwarps * cuda_config::warp_size);
+            int num_lines = ceildiv(nnz, nwarps * config::warp_size);
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
             abstract_spmv<<<coo_grid, coo_block>>>(
                 nnz, num_lines, as_cuda_type(alpha->get_const_values()),
@@ -165,10 +164,10 @@ void advanced_spmv2(std::shared_ptr<const CudaExecutor> exec,
                 as_cuda_type(b->get_const_values()), b->get_stride(),
                 as_cuda_type(c->get_values()), c->get_stride());
         } else {
-            int num_elems = ceildiv(nnz, nwarps * cuda_config::warp_size) *
-                            cuda_config::warp_size;
+            int num_elems =
+                ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
             const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
-                                ceildiv(b_ncols, cuda_config::warp_size));
+                                ceildiv(b_ncols, config::warp_size));
             abstract_spmm<<<coo_grid, coo_block>>>(
                 nnz, num_elems, as_cuda_type(alpha->get_const_values()),
                 as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
@@ -226,9 +225,8 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
 
     const auto nnz = source->get_num_stored_elements();
 
-    const dim3 block_size(cuda_config::warp_size,
-                          cuda_config::max_block_size / cuda_config::warp_size,
-                          1);
+    const dim3 block_size(config::warp_size,
+                          config::max_block_size / config::warp_size, 1);
     const dim3 init_grid_dim(ceildiv(stride, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(