Skip to content

Commit

Permalink
Merge pull request #3520 from leofang/cub_block_c_in_f_out
Browse files Browse the repository at this point in the history
Small fixes for CUB block reduction kernels
  • Loading branch information
asi1024 committed Jul 22, 2020
2 parents 23f5634 + 06bfd25 commit e028d74
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 31 deletions.
2 changes: 1 addition & 1 deletion cupy/core/_cub_reduction.pxd
Expand Up @@ -7,4 +7,4 @@ cdef bint _try_to_call_cub_reduction(
stream, optimize_context, tuple key,
map_expr, reduce_expr, post_map_expr, reduce_type, type_map,
tuple reduce_axis, tuple out_axis, const shape_t& out_shape,
ndarray ret)
ndarray ret) except *
28 changes: 9 additions & 19 deletions cupy/core/_cub_reduction.pyx
Expand Up @@ -15,6 +15,7 @@ import math
import string
from cupy import _environment
from cupy.core._kernel import _get_param_info
from cupy.cuda import driver
from cupy import util


Expand Down Expand Up @@ -279,18 +280,6 @@ cpdef inline tuple _can_use_cub_block_reduction(
if axis_permutes_cub != tuple(range(in_arr.ndim)):
return None

# To support generic reductions, note that some NumPy casting rules
# are not applicable in the C++ space (unless we tweak the type
# definitions). To circumvent this, we fall back to the old kernel.
# TODO(leofang): can we relax this?
if in_arr.dtype.kind != out_arr.dtype.kind:
# cannot cast complex to anything else
if in_arr.dtype.kind == 'c':
return None
# cannot cast float16 to complex
if in_arr.dtype.char == 'e' and out_arr.dtype.kind == 'c':
return None

# full-reduction of N-D array: need to invoke the kernel twice
cdef bint full_reduction = True if len(out_axis) == 0 else False

Expand Down Expand Up @@ -370,13 +359,13 @@ cdef _scalar.CScalar _cub_convert_to_c_scalar(
return _scalar.CScalar.from_int32(value)


cdef inline _cub_two_pass_launch(
cdef inline void _cub_two_pass_launch(
str name, Py_ssize_t block_size, Py_ssize_t segment_size,
Py_ssize_t items_per_thread, str reduce_type, tuple params,
list in_args, list out_args,
str identity, str pre_map_expr, str reduce_expr, str post_map_expr,
_kernel._TypeMap type_map, str input_expr, str output_expr,
str preamble, tuple options, stream):
str preamble, tuple options, stream) except *:
'''
Notes:
1. Two-pass reduction: the first pass distributes an even share over
Expand Down Expand Up @@ -472,11 +461,11 @@ cdef inline _cub_two_pass_launch(
func.linear_launch(gridx, inout_args, 0, blockx, stream)


cdef inline _launch_cub(
cdef inline void _launch_cub(
self, out_block_num, block_size, block_stride,
in_args, out_args, in_shape, out_shape, type_map,
map_expr, reduce_expr, post_map_expr, reduce_type,
stream, params, cub_params):
stream, params, cub_params) except *:
cdef bint full_reduction
cdef Py_ssize_t contiguous_size, items_per_thread
cdef function.Function func
Expand Down Expand Up @@ -537,21 +526,22 @@ def _get_cub_optimized_params(
post_map_expr, reduce_type, stream, params, cub_params)

def suggest_func(trial):
    """Optuna suggestion callback for CUB block-reduction kernel tuning.

    Args:
        trial: an Optuna ``Trial`` object used to sample launch parameters.

    Returns:
        tuple: ``(block_size, items_per_thread)`` candidate launch
        parameters for the CUB reduction kernel.
    """
    # Sample the block size as a power of two: 2**5 (32) .. 2**10 (1024).
    # NOTE: the duplicated stale line sampling in [5, 9] (a diff artifact
    # that shadowed this assignment) has been removed; only the updated
    # range [5, 10] is kept.
    block_size_log = trial.suggest_int('block_size_log', 5, 10)
    block_size = 2 ** block_size_log
    # Even item counts per thread in [2, 32].
    items_per_thread = trial.suggest_int(
        'items_per_thread', 2, 32, step=2)

    # Expose the resolved block size so the caller can read it back via
    # best.user_attrs['block_size'] after optimization.
    trial.set_user_attr('block_size', block_size)
    return block_size, items_per_thread

# CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES is a possible error
optimize_impl = optimize_config.optimize_impl
best = optimize_impl(
optimize_config, target_func, suggest_func,
default_best={
'block_size_log': default_block_size_log,
'items_per_thread': default_items_per_thread,
})
}, ignore_error=(driver.CUDADriverError,))

return best.params['items_per_thread'], best.user_attrs['block_size']

Expand All @@ -561,7 +551,7 @@ cdef bint _try_to_call_cub_reduction(
stream, optimize_context, tuple key,
map_expr, reduce_expr, post_map_expr, reduce_type, type_map,
tuple reduce_axis, tuple out_axis, const shape_t& out_shape,
ndarray ret):
ndarray ret) except *:
"""Try to use cub.
Updates `ret` and returns a boolean value whether cub is used.
Expand Down
45 changes: 44 additions & 1 deletion cupy/core/include/cupy/complex/complex.h
Expand Up @@ -75,12 +75,29 @@ struct complex {

/* --- Constructors --- */

/*! Construct a complex number with an imaginary part of 0.
*
* \param re The real part of the number.
*/
inline __host__ __device__ complex(const T& re);

/*! Construct a complex number from its real and imaginary parts.
*
* \param re The real part of the number.
* \param im The imaginary part of the number.
*/
inline __host__ __device__ complex(const T& re = T(), const T& im = T());
inline __host__ __device__ complex(const T& re, const T& im);

/*! Default construct a complex number.
*/
inline __host__ __device__ complex();

/*! This copy constructor copies from a \p complex with a type that is
* convertible to this \p complex's \c value_type.
*
* \param z The \p complex to copy from.
*/
inline __host__ __device__ complex(const complex<T>& z);

/*! This copy constructor copies from a \p complex with a type that
* is convertible to this \p complex \c value_type.
Expand All @@ -92,6 +109,32 @@ struct complex {
template <typename X>
inline __host__ __device__ complex(const complex<X>& z);

/* --- Assignment Operators --- */

/*! Assign `re` to the real part of this \p complex and set the imaginary part
* to 0.
*
* \param re The real part of the number.
*/
inline __host__ __device__ complex& operator=(const T& re);

/*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
* \p complex respectively.
*
* \param z The \p complex to copy from.
*/
inline __host__ __device__ complex& operator=(const complex<T>& z);

/*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
* \p complex respectively.
*
* \param z The \p complex to copy from.
*
* \tparam U is convertible to \c value_type.
*/
template <typename U>
inline __host__ __device__ complex& operator=(const complex<U>& z);

/* --- Compound Assignment Operators --- */

/*! Adds a \p complex to this \p complex and
Expand Down
42 changes: 41 additions & 1 deletion cupy/core/include/cupy/complex/complex_inl.h
Expand Up @@ -22,14 +22,30 @@
namespace thrust {

/* --- Constructors --- */
// TODO(leofang): support more kinds of constructors from upstream
// Construct from a real value; the imaginary part is value-initialized.
template <typename T>
inline __host__ __device__ complex<T>::complex(const T& re) {
  this->real(re);
  this->imag(T());
}

// Construct from explicit real and imaginary parts.
template <typename T>
inline __host__ __device__ complex<T>::complex(const T& re, const T& im) {
  this->real(re);
  this->imag(im);
}

// Default-construct: both components are value-initialized (zero for
// arithmetic T).
template <typename T>
inline __host__ __device__ complex<T>::complex() {
  this->real(T());
  this->imag(T());
}

// Same-type copy constructor: duplicate both components of z.
template <typename T>
inline __host__ __device__ complex<T>::complex(const complex<T>& z) {
  this->real(z.real());
  this->imag(z.imag());
}

template <typename T>
template <typename X>
inline __host__ __device__ complex<T>::complex(const complex<X>& z) {
Expand All @@ -39,6 +55,30 @@ inline __host__ __device__ complex<T>::complex(const complex<X>& z) {
imag(T(z.imag()));
}

/* --- Assignment Operators --- */

// Assign a real value; the imaginary part is reset to a value-initialized T.
template <typename T>
inline __host__ __device__ complex<T>& complex<T>::operator=(const T& re) {
  this->real(re);
  this->imag(T());
  return *this;  // enable assignment chaining
}

// Same-type copy assignment: copy both components from z.
template <typename T>
inline __host__ __device__ complex<T>& complex<T>::operator=(
    const complex<T>& z) {
  this->real(z.real());
  this->imag(z.imag());
  return *this;  // enable assignment chaining
}

// Converting copy assignment: U must be convertible to value_type T;
// each component is explicitly converted before being stored.
template <typename T>
template <typename U>
inline __host__ __device__ complex<T>& complex<T>::operator=(
    const complex<U>& z) {
  this->real(T(z.real()));
  this->imag(T(z.imag()));
  return *this;  // enable assignment chaining
}

/* --- Compound Assignment Operators --- */
// TODO(leofang): support operators with argument of type T, see upstream

Expand Down
15 changes: 10 additions & 5 deletions tests/cupy_tests/core_tests/test_cub_reduction.py
Expand Up @@ -6,6 +6,7 @@
from cupy import _environment
from cupy import testing
from cupy.core import _accelerator
from cupy.core import _cub_reduction
from cupy.cuda import memory


Expand Down Expand Up @@ -130,13 +131,17 @@ def test_can_use_accelerator_set_unset(self):

a = cupy.random.random((10, 10))
# this is the only function we can mock; the rest is cdef'd
func = ''.join(('cupy.core._cub_reduction.',
'_SimpleCubReductionKernel_get_cached_function'))
with testing.AssertFunctionIsCalled(func):
func_name = ''.join(('cupy.core._cub_reduction.',
'_SimpleCubReductionKernel_get_cached_function'))
func = _cub_reduction._SimpleCubReductionKernel_get_cached_function
with testing.AssertFunctionIsCalled(
func_name, wraps=func, times_called=2): # two passes
a.sum()
with testing.AssertFunctionIsCalled(func):
with testing.AssertFunctionIsCalled(
func_name, wraps=func, times_called=1): # one pass
a.sum(axis=1)
with testing.AssertFunctionIsCalled(func, times_called=0):
with testing.AssertFunctionIsCalled(
func_name, wraps=func, times_called=0): # not used
a.sum(axis=0)

_accelerator.set_routine_accelerators(old_routine_accelerators)
8 changes: 4 additions & 4 deletions tests/cupy_tests/core_tests/test_raw.py
Expand Up @@ -558,7 +558,7 @@ def test_cuFloatComplex(self):

ker = mod.get_function('test_mulf')
ker((grid,), (block,), (a, b, out))
assert (out == a * b).all()
assert cupy.allclose(out, a * b)

ker = mod.get_function('test_divf')
ker((grid,), (block,), (a, b, out))
Expand All @@ -574,7 +574,7 @@ def test_cuFloatComplex(self):

ker = mod.get_function('test_fmaf')
ker((grid,), (block,), (a, b, c, out))
assert (out == a * b + c).all()
assert cupy.allclose(out, a * b + c)

ker = mod.get_function('test_makef')
ker((grid,), (block,), (out,))
Expand Down Expand Up @@ -620,7 +620,7 @@ def test_cuDoubleComplex(self):

ker = mod.get_function('test_mul')
ker((grid,), (block,), (a, b, out))
assert (out == a * b).all()
assert cupy.allclose(out, a * b)

ker = mod.get_function('test_div')
ker((grid,), (block,), (a, b, out))
Expand All @@ -636,7 +636,7 @@ def test_cuDoubleComplex(self):

ker = mod.get_function('test_fma')
ker((grid,), (block,), (a, b, c, out))
assert (out == a * b + c).all()
assert cupy.allclose(out, a * b + c)

ker = mod.get_function('test_make')
ker((grid,), (block,), (out,))
Expand Down

0 comments on commit e028d74

Please sign in to comment.