Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Small fixes for CUB block reduction kernels #3520

Merged
merged 1 commit into from Jul 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion cupy/core/_cub_reduction.pxd
Expand Up @@ -7,4 +7,4 @@ cdef bint _try_to_call_cub_reduction(
stream, optimize_context, tuple key,
map_expr, reduce_expr, post_map_expr, reduce_type, type_map,
tuple reduce_axis, tuple out_axis, const shape_t& out_shape,
ndarray ret)
ndarray ret) except *
28 changes: 9 additions & 19 deletions cupy/core/_cub_reduction.pyx
Expand Up @@ -15,6 +15,7 @@ import math
import string
from cupy import _environment
from cupy.core._kernel import _get_param_info
from cupy.cuda import driver
from cupy import util


Expand Down Expand Up @@ -279,18 +280,6 @@ cpdef inline tuple _can_use_cub_block_reduction(
if axis_permutes_cub != tuple(range(in_arr.ndim)):
return None

# To support generic reductions, note that some NumPy casting rules
# are not applicable in the C++ space (unless we tweak the type
# definitions). To circumvent this, we fall back to the old kernel.
# TODO(leofang): can we relax this?
if in_arr.dtype.kind != out_arr.dtype.kind:
# cannot cast complex to anything else
if in_arr.dtype.kind == 'c':
return None
# cannot cast float16 to complex
if in_arr.dtype.char == 'e' and out_arr.dtype.kind == 'c':
return None

# full-reduction of N-D array: need to invoke the kernel twice
cdef bint full_reduction = True if len(out_axis) == 0 else False

Expand Down Expand Up @@ -370,13 +359,13 @@ cdef _scalar.CScalar _cub_convert_to_c_scalar(
return _scalar.CScalar.from_int32(value)


cdef inline _cub_two_pass_launch(
cdef inline void _cub_two_pass_launch(
str name, Py_ssize_t block_size, Py_ssize_t segment_size,
Py_ssize_t items_per_thread, str reduce_type, tuple params,
list in_args, list out_args,
str identity, str pre_map_expr, str reduce_expr, str post_map_expr,
_kernel._TypeMap type_map, str input_expr, str output_expr,
str preamble, tuple options, stream):
str preamble, tuple options, stream) except *:
'''
Notes:
1. Two-pass reduction: the first pass distributes an even share over
Expand Down Expand Up @@ -472,11 +461,11 @@ cdef inline _cub_two_pass_launch(
func.linear_launch(gridx, inout_args, 0, blockx, stream)


cdef inline _launch_cub(
cdef inline void _launch_cub(
self, out_block_num, block_size, block_stride,
in_args, out_args, in_shape, out_shape, type_map,
map_expr, reduce_expr, post_map_expr, reduce_type,
stream, params, cub_params):
stream, params, cub_params) except *:
cdef bint full_reduction
cdef Py_ssize_t contiguous_size, items_per_thread
cdef function.Function func
Expand Down Expand Up @@ -537,21 +526,22 @@ def _get_cub_optimized_params(
post_map_expr, reduce_type, stream, params, cub_params)

def suggest_func(trial):
    """Optuna objective helper: propose a (block_size, items_per_thread) pair.

    The block size is sampled as a power of two (2**5 .. 2**10) because
    CUDA block sizes for the CUB reduction kernels are powers of two.

    Args:
        trial: an Optuna ``Trial`` object used to sample the parameters.

    Returns:
        tuple: ``(block_size, items_per_thread)`` where ``block_size`` is a
        power of two in [32, 1024] and ``items_per_thread`` is an even
        integer in [2, 32].
    """
    # Sample the exponent rather than the block size itself so the search
    # space is uniform over powers of two.
    block_size_log = trial.suggest_int('block_size_log', 5, 10)
    block_size = 2 ** block_size_log
    # Even values only (step=2) in [2, 32].
    items_per_thread = trial.suggest_int(
        'items_per_thread', 2, 32, step=2)

    # Expose the derived block size to the caller via trial user attrs.
    trial.set_user_attr('block_size', block_size)
    return block_size, items_per_thread

# CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES is a possible error
optimize_impl = optimize_config.optimize_impl
best = optimize_impl(
optimize_config, target_func, suggest_func,
default_best={
'block_size_log': default_block_size_log,
'items_per_thread': default_items_per_thread,
})
}, ignore_error=(driver.CUDADriverError,))
Copy link
Member Author

@leofang leofang Jul 1, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wish we had a better way to filter out the particular error... CUDADriverError seems too general, although I can't think of other possibilities for how it can be thrown without being out of resources.


return best.params['items_per_thread'], best.user_attrs['block_size']

Expand All @@ -561,7 +551,7 @@ cdef bint _try_to_call_cub_reduction(
stream, optimize_context, tuple key,
map_expr, reduce_expr, post_map_expr, reduce_type, type_map,
tuple reduce_axis, tuple out_axis, const shape_t& out_shape,
ndarray ret):
ndarray ret) except *:
"""Try to use cub.

Updates `ret` and returns a boolean value whether cub is used.
Expand Down
45 changes: 44 additions & 1 deletion cupy/core/include/cupy/complex/complex.h
Expand Up @@ -75,12 +75,29 @@ struct complex {

/* --- Constructors --- */

/*! Construct a complex number with an imaginary part of 0.
*
* \param re The real part of the number.
*/
inline __host__ __device__ complex(const T& re);

/*! Construct a complex number from its real and imaginary parts.
*
* \param re The real part of the number.
* \param im The imaginary part of the number.
*/
inline __host__ __device__ complex(const T& re = T(), const T& im = T());
inline __host__ __device__ complex(const T& re, const T& im);

/*! Default construct a complex number.
*/
inline __host__ __device__ complex();

/*! This copy constructor copies from a \p complex with a type that is
* convertible to this \p complex's \c value_type.
*
* \param z The \p complex to copy from.
*/
inline __host__ __device__ complex(const complex<T>& z);

/*! This copy constructor copies from a \p complex with a type that
* is convertible to this \p complex \c value_type.
Expand All @@ -92,6 +109,32 @@ struct complex {
template <typename X>
inline __host__ __device__ complex(const complex<X>& z);

/* --- Assignment Operators --- */

/*! Assign `re` to the real part of this \p complex and set the imaginary part
* to 0.
*
* \param re The real part of the number.
*/
inline __host__ __device__ complex& operator=(const T& re);

/*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
* \p complex respectively.
*
* \param z The \p complex to copy from.
*/
inline __host__ __device__ complex& operator=(const complex<T>& z);

/*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
* \p complex respectively.
*
* \param z The \p complex to copy from.
*
* \tparam U is convertible to \c value_type.
*/
template <typename U>
inline __host__ __device__ complex& operator=(const complex<U>& z);

/* --- Compound Assignment Operators --- */

/*! Adds a \p complex to this \p complex and
Expand Down
42 changes: 41 additions & 1 deletion cupy/core/include/cupy/complex/complex_inl.h
Expand Up @@ -22,14 +22,30 @@
namespace thrust {

/* --- Constructors --- */
// TODO(leofang): support more kinds of constructors from upstream
// Construct from a real value only; the imaginary part is value-initialized.
template <typename T>
inline __host__ __device__ complex<T>::complex(const T& re) {
  this->real(re);
  this->imag(T());
}

// Construct from explicit real and imaginary parts.
template <typename T>
inline __host__ __device__ complex<T>::complex(const T& re, const T& im) {
  this->real(re);
  this->imag(im);
}

// Default-construct: both parts are value-initialized (zero for numeric T).
template <typename T>
inline __host__ __device__ complex<T>::complex() {
  this->real(T());
  this->imag(T());
}

// Same-type copy constructor.
template <typename T>
inline __host__ __device__ complex<T>::complex(const complex<T>& z) {
  this->real(z.real());
  this->imag(z.imag());
}

template <typename T>
template <typename X>
inline __host__ __device__ complex<T>::complex(const complex<X>& z) {
Expand All @@ -39,6 +55,30 @@ inline __host__ __device__ complex<T>::complex(const complex<X>& z) {
imag(T(z.imag()));
}

/* --- Assignment Operators --- */

// Assign from a real scalar: the real part takes `re` and the imaginary
// part is reset to a value-initialized T (zero).
template <typename T>
inline __host__ __device__ complex<T>& complex<T>::operator=(const T& re) {
  this->real(re);
  this->imag(T());
  return *this;
}

// Same-type copy assignment.
template <typename T>
inline __host__ __device__ complex<T>& complex<T>::operator=(const complex<T>& z) {
  this->real(z.real());
  this->imag(z.imag());
  return *this;
}

// Converting copy assignment from complex<U>; both parts are converted to T.
template <typename T>
template <typename U>
inline __host__ __device__ complex<T>& complex<T>::operator=(const complex<U>& z) {
  this->real(T(z.real()));
  this->imag(T(z.imag()));
  return *this;
}

/* --- Compound Assignment Operators --- */
// TODO(leofang): support operators with argument of type T, see upstream

Expand Down
15 changes: 10 additions & 5 deletions tests/cupy_tests/core_tests/test_cub_reduction.py
Expand Up @@ -6,6 +6,7 @@
from cupy import _environment
from cupy import testing
from cupy.core import _accelerator
from cupy.core import _cub_reduction
from cupy.cuda import memory


Expand Down Expand Up @@ -130,13 +131,17 @@ def test_can_use_accelerator_set_unset(self):

a = cupy.random.random((10, 10))
# this is the only function we can mock; the rest is cdef'd
func = ''.join(('cupy.core._cub_reduction.',
'_SimpleCubReductionKernel_get_cached_function'))
with testing.AssertFunctionIsCalled(func):
func_name = ''.join(('cupy.core._cub_reduction.',
'_SimpleCubReductionKernel_get_cached_function'))
func = _cub_reduction._SimpleCubReductionKernel_get_cached_function
with testing.AssertFunctionIsCalled(
func_name, wraps=func, times_called=2): # two passes
a.sum()
with testing.AssertFunctionIsCalled(func):
with testing.AssertFunctionIsCalled(
func_name, wraps=func, times_called=1): # one pass
a.sum(axis=1)
with testing.AssertFunctionIsCalled(func, times_called=0):
with testing.AssertFunctionIsCalled(
func_name, wraps=func, times_called=0): # not used
a.sum(axis=0)

_accelerator.set_routine_accelerators(old_routine_accelerators)
8 changes: 4 additions & 4 deletions tests/cupy_tests/core_tests/test_raw.py
Expand Up @@ -558,7 +558,7 @@ def test_cuFloatComplex(self):

ker = mod.get_function('test_mulf')
ker((grid,), (block,), (a, b, out))
assert (out == a * b).all()
assert cupy.allclose(out, a * b)

ker = mod.get_function('test_divf')
ker((grid,), (block,), (a, b, out))
Expand All @@ -574,7 +574,7 @@ def test_cuFloatComplex(self):

ker = mod.get_function('test_fmaf')
ker((grid,), (block,), (a, b, c, out))
assert (out == a * b + c).all()
assert cupy.allclose(out, a * b + c)

ker = mod.get_function('test_makef')
ker((grid,), (block,), (out,))
Expand Down Expand Up @@ -620,7 +620,7 @@ def test_cuDoubleComplex(self):

ker = mod.get_function('test_mul')
ker((grid,), (block,), (a, b, out))
assert (out == a * b).all()
assert cupy.allclose(out, a * b)

ker = mod.get_function('test_div')
ker((grid,), (block,), (a, b, out))
Expand All @@ -636,7 +636,7 @@ def test_cuDoubleComplex(self):

ker = mod.get_function('test_fma')
ker((grid,), (block,), (a, b, c, out))
assert (out == a * b + c).all()
assert cupy.allclose(out, a * b + c)

ker = mod.get_function('test_make')
ker((grid,), (block,), (out,))
Expand Down