Skip to content

Commit

Permalink
rename cuda_config/hip_config to config
Browse files Browse the repository at this point in the history
  • Loading branch information
yhmtsai committed Nov 11, 2019
1 parent a5c63fe commit aa76ecd
Show file tree
Hide file tree
Showing 24 changed files with 186 additions and 205 deletions.
6 changes: 4 additions & 2 deletions common/components/atomic.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ GKO_BIND_ATOMIC_ADD(unsigned long long int);
GKO_BIND_ATOMIC_ADD(float);


#if !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC)
#if !defined(__HIPCC__) || \
(defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC)


#if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \
Expand Down Expand Up @@ -142,7 +143,8 @@ GKO_BIND_ATOMIC_ADD(__half2);
#endif


#endif // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_HCC)
#endif // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) &&
// GINKGO_HIP_PLATFORM_HCC)


#undef GKO_BIND_ATOMIC_ADD
2 changes: 1 addition & 1 deletion cuda/base/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ namespace kernels {
namespace cuda {


struct cuda_config {
struct config {
/**
* The number of threads within a CUDA warp.
*/
Expand Down
2 changes: 1 addition & 1 deletion cuda/base/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ void CudaExecutor::set_gpu_property()
GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute(
&num_multiprocessor_, cudaDevAttrMultiProcessorCount, device_id_));
num_warps_per_sm_ = convert_sm_ver_to_cores(major_, minor_) /
kernels::cuda::cuda_config::warp_size;
kernels::cuda::config::warp_size;
}
}

Expand Down
4 changes: 2 additions & 2 deletions cuda/components/cooperative_groups.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -459,8 +459,8 @@ __device__ __forceinline__ auto tiled_partition(const Group &g)
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-notes
template <size_type Size, typename Group>
__device__ __forceinline__ gko::xstd::enable_if_t<
(Size <= kernels::cuda::cuda_config::warp_size) && (Size > 0) &&
(kernels::cuda::cuda_config::warp_size % Size == 0),
(Size <= kernels::cuda::config::warp_size) && (Size > 0) &&
(kernels::cuda::config::warp_size % Size == 0),
thread_block_tile<Size>>
tiled_partition(const Group &)
{
Expand Down
6 changes: 3 additions & 3 deletions cuda/components/diagonal_block_manipulation.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ namespace csr {
* @internal
*
* @note assumes that block dimensions are in "standard format":
* (subwarp_size, cuda_config::warp_size / subwarp_size, z)
* (subwarp_size, config::warp_size / subwarp_size, z)
*/
template <
int max_block_size, int warps_per_block, typename Group, typename ValueType,
Expand All @@ -64,7 +64,7 @@ __device__ __forceinline__ void extract_transposed_diag_blocks(
ValueType *__restrict__ workspace)
{
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const auto warp = group::tiled_partition<cuda_config::warp_size>(group);
const auto warp = group::tiled_partition<config::warp_size>(group);
auto bid = static_cast<size_type>(blockIdx.x) * warps_per_block *
processed_blocks +
threadIdx.z * processed_blocks;
Expand All @@ -90,7 +90,7 @@ __device__ __forceinline__ void extract_transposed_diag_blocks(
const auto rstart = row_ptrs[row] + tid;
const auto rend = row_ptrs[row + 1];
// use the entire warp to ensure coalesced memory access
for (auto j = rstart; j < rend; j += cuda_config::warp_size) {
for (auto j = rstart; j < rend; j += config::warp_size) {
const auto col = col_idxs[j] - bstart;
if (col >= bsize) {
break;
Expand Down
9 changes: 4 additions & 5 deletions cuda/components/format_conversion.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -92,22 +92,21 @@ namespace host_kernel {
* It calculates the number of warps used in Coo Spmv depending on the GPU
* architecture and the number of stored elements.
*/
template <size_type subwarp_size = cuda_config::warp_size>
template <size_type subwarp_size = config::warp_size>
__host__ size_type calculate_nwarps(std::shared_ptr<const CudaExecutor> exec,
const size_type nnz)
{
size_type warps_per_sm =
exec->get_num_warps_per_sm() * cuda_config::warp_size / subwarp_size;
exec->get_num_warps_per_sm() * config::warp_size / subwarp_size;
size_type nwarps_in_cuda = exec->get_num_multiprocessor() * warps_per_sm;
size_type multiple = 8;
if (nnz >= 2000000) {
multiple = 128;
} else if (nnz >= 200000) {
multiple = 32;
}
return std::min(
multiple * nwarps_in_cuda,
static_cast<size_type>(ceildiv(nnz, cuda_config::warp_size)));
return std::min(multiple * nwarps_in_cuda,
static_cast<size_type>(ceildiv(nnz, config::warp_size)));
}


Expand Down
4 changes: 2 additions & 2 deletions cuda/components/reduction.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -134,14 +134,14 @@ __device__ void reduce(const Group &__restrict__ group,
const auto local_id = group.thread_rank();

#pragma unroll
for (int k = group.size() / 2; k >= cuda_config::warp_size; k /= 2) {
for (int k = group.size() / 2; k >= config::warp_size; k /= 2) {
group.sync();
if (local_id < k) {
data[local_id] = reduce_op(data[local_id], data[local_id + k]);
}
}

const auto warp = group::tiled_partition<cuda_config::warp_size>(group);
const auto warp = group::tiled_partition<config::warp_size>(group);
const auto warp_id = group.thread_rank() / warp.size();
if (warp_id > 0) {
return;
Expand Down
28 changes: 14 additions & 14 deletions cuda/components/thread_ids.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ __device__ __forceinline__ size_type get_block_id()
* to
*
* @note Assumes that block dimensions are in standard format:
* `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
* cuda_config::warp_size)`
* `(subwarp_size, config::warp_size / subwarp_size, block_size /
* config::warp_size)`
*/
__device__ __forceinline__ size_type get_local_warp_id()
{
Expand All @@ -110,13 +110,13 @@ __device__ __forceinline__ size_type get_local_warp_id()
* belongs to
*
* @note Assumes that block dimensions are in standard format:
* `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
* cuda_config::warp_size)`
* `(subwarp_size, config::warp_size / subwarp_size, block_size /
* config::warp_size)`
*/
template <int subwarp_size>
__device__ __forceinline__ size_type get_local_subwarp_id()
{
constexpr auto subwarps_per_warp = cuda_config::warp_size / subwarp_size;
constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
return get_local_warp_id() * subwarps_per_warp + threadIdx.y;
}

Expand All @@ -132,8 +132,8 @@ __device__ __forceinline__ size_type get_local_subwarp_id()
* @return the local ID of the thread (relative to the block)
*
* @note Assumes that block dimensions are in standard format:
* `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
* cuda_config::warp_size)`
* `(subwarp_size, config::warp_size / subwarp_size, block_size /
* config::warp_size)`
*/
template <int subwarp_size>
__device__ __forceinline__ size_type get_local_thread_id()
Expand All @@ -153,8 +153,8 @@ __device__ __forceinline__ size_type get_local_thread_id()
*
* @note Assumes that block dimensions and grid dimensions are in standard
* format:
* `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
* cuda_config::warp_size)` and
* `(subwarp_size, config::warp_size / subwarp_size, block_size /
* config::warp_size)` and
* `(block_group_size, first_grid_dimension, second grid_dimension)`,
* respectively.
*/
Expand All @@ -176,15 +176,15 @@ __device__ __forceinline__ size_type get_warp_id()
*
* @note Assumes that block dimensions and grid dimensions are in standard
* format:
* `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
* cuda_config::warp_size)` and
* `(subwarp_size, config::warp_size / subwarp_size, block_size /
* config::warp_size)` and
* `(block_group_size, first_grid_dimension, second grid_dimension)`,
* respectively.
*/
template <int subwarp_size, int warps_per_block>
__device__ __forceinline__ size_type get_subwarp_id()
{
constexpr auto subwarps_per_warp = cuda_config::warp_size / subwarp_size;
constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
return get_warp_id<warps_per_block>() * subwarps_per_warp + threadIdx.y;
}

Expand All @@ -200,8 +200,8 @@ __device__ __forceinline__ size_type get_subwarp_id()
*
* @note Assumes that block dimensions and grid dimensions are in standard
* format:
* `(subwarp_size, cuda_config::warp_size / subwarp_size, block_size /
* cuda_config::warp_size)` and
* `(subwarp_size, config::warp_size / subwarp_size, block_size /
* config::warp_size)` and
* `(block_group_size, first_grid_dimension, second grid_dimension)`,
* respectively.
*/
Expand Down
28 changes: 13 additions & 15 deletions cuda/matrix/coo_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ namespace coo {

constexpr int default_block_size = 512;
constexpr int warps_in_block = 4;
constexpr int spmv_block_size = warps_in_block * cuda_config::warp_size;
using device_config = cuda_config;
constexpr int spmv_block_size = warps_in_block * config::warp_size;


#include "common/matrix/coo_kernels.hpp.inc"
Expand Down Expand Up @@ -113,23 +112,23 @@ void spmv2(std::shared_ptr<const CudaExecutor> exec,
{
const auto nnz = a->get_num_stored_elements();
const auto b_ncols = b->get_size()[1];
const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1);
const dim3 coo_block(config::warp_size, warps_in_block, 1);
const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);

if (nwarps > 0) {
if (b_ncols < 4) {
const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
int num_lines = ceildiv(nnz, nwarps * cuda_config::warp_size);
int num_lines = ceildiv(nnz, nwarps * config::warp_size);
abstract_spmv<<<coo_grid, coo_block>>>(
nnz, num_lines, as_cuda_type(a->get_const_values()),
a->get_const_col_idxs(), as_cuda_type(a->get_const_row_idxs()),
as_cuda_type(b->get_const_values()), b->get_stride(),
as_cuda_type(c->get_values()), c->get_stride());
} else {
int num_elems = ceildiv(nnz, nwarps * cuda_config::warp_size) *
cuda_config::warp_size;
int num_elems =
ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
ceildiv(b_ncols, cuda_config::warp_size));
ceildiv(b_ncols, config::warp_size));
abstract_spmm<<<coo_grid, coo_block>>>(
nnz, num_elems, as_cuda_type(a->get_const_values()),
a->get_const_col_idxs(), as_cuda_type(a->get_const_row_idxs()),
Expand All @@ -151,12 +150,12 @@ void advanced_spmv2(std::shared_ptr<const CudaExecutor> exec,
{
const auto nnz = a->get_num_stored_elements();
const auto nwarps = host_kernel::calculate_nwarps(exec, nnz);
const dim3 coo_block(cuda_config::warp_size, warps_in_block, 1);
const dim3 coo_block(config::warp_size, warps_in_block, 1);
const auto b_ncols = b->get_size()[1];

if (nwarps > 0) {
if (b_ncols < 4) {
int num_lines = ceildiv(nnz, nwarps * cuda_config::warp_size);
int num_lines = ceildiv(nnz, nwarps * config::warp_size);
const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols);
abstract_spmv<<<coo_grid, coo_block>>>(
nnz, num_lines, as_cuda_type(alpha->get_const_values()),
Expand All @@ -165,10 +164,10 @@ void advanced_spmv2(std::shared_ptr<const CudaExecutor> exec,
as_cuda_type(b->get_const_values()), b->get_stride(),
as_cuda_type(c->get_values()), c->get_stride());
} else {
int num_elems = ceildiv(nnz, nwarps * cuda_config::warp_size) *
cuda_config::warp_size;
int num_elems =
ceildiv(nnz, nwarps * config::warp_size) * config::warp_size;
const dim3 coo_grid(ceildiv(nwarps, warps_in_block),
ceildiv(b_ncols, cuda_config::warp_size));
ceildiv(b_ncols, config::warp_size));
abstract_spmm<<<coo_grid, coo_block>>>(
nnz, num_elems, as_cuda_type(alpha->get_const_values()),
as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
Expand Down Expand Up @@ -226,9 +225,8 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,

const auto nnz = source->get_num_stored_elements();

const dim3 block_size(cuda_config::warp_size,
cuda_config::max_block_size / cuda_config::warp_size,
1);
const dim3 block_size(config::warp_size,
config::max_block_size / config::warp_size, 1);
const dim3 init_grid_dim(ceildiv(stride, block_size.x),
ceildiv(num_rows, block_size.y), 1);
kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
Expand Down
Loading

0 comments on commit aa76ecd

Please sign in to comment.