From 06bfd2549cfc0d83af85ffa971995dfa3be73097 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Jun 2020 22:56:45 -0400 Subject: [PATCH] Small fixes for CUB block reduction kernels 1. Remove all of the type constraints 2. Add a possible exception for optimizer 3. Allow compiler exceptions to propagate upward 4. Make complex (almost) obey the rule of three (to fix fp16 -> complex conversion) 5. Fix tests --- cupy/core/_cub_reduction.pxd | 2 +- cupy/core/_cub_reduction.pyx | 28 ++++-------- cupy/core/include/cupy/complex/complex.h | 45 ++++++++++++++++++- cupy/core/include/cupy/complex/complex_inl.h | 42 ++++++++++++++++- .../core_tests/test_cub_reduction.py | 15 ++++--- tests/cupy_tests/core_tests/test_raw.py | 8 ++-- 6 files changed, 109 insertions(+), 31 deletions(-) diff --git a/cupy/core/_cub_reduction.pxd b/cupy/core/_cub_reduction.pxd index b10b1be724f..9145ef044fb 100644 --- a/cupy/core/_cub_reduction.pxd +++ b/cupy/core/_cub_reduction.pxd @@ -7,4 +7,4 @@ cdef bint _try_to_call_cub_reduction( stream, optimize_context, tuple key, map_expr, reduce_expr, post_map_expr, reduce_type, type_map, tuple reduce_axis, tuple out_axis, const shape_t& out_shape, - ndarray ret) + ndarray ret) except * diff --git a/cupy/core/_cub_reduction.pyx b/cupy/core/_cub_reduction.pyx index 119f42a9b88..8518ef011d7 100644 --- a/cupy/core/_cub_reduction.pyx +++ b/cupy/core/_cub_reduction.pyx @@ -15,6 +15,7 @@ import math import string from cupy import _environment from cupy.core._kernel import _get_param_info +from cupy.cuda import driver from cupy import util @@ -279,18 +280,6 @@ cpdef inline tuple _can_use_cub_block_reduction( if axis_permutes_cub != tuple(range(in_arr.ndim)): return None - # To support generic reductions, note that some NumPy casting rules - # are not applicable in the C++ space (unless we tweak the type - # definitions). To circumvent this, we fall back to the old kernel. - # TODO(leofang): can we relax this? - if in_arr.dtype.kind != out_arr.dtype.kind: - # cannot cast complex to anything else - if in_arr.dtype.kind == 'c': - return None - # cannot cast float16 to complex - if in_arr.dtype.char == 'e' and out_arr.dtype.kind == 'c': - return None - # full-reduction of N-D array: need to invoke the kernel twice cdef bint full_reduction = True if len(out_axis) == 0 else False @@ -370,13 +359,13 @@ cdef _scalar.CScalar _cub_convert_to_c_scalar( return _scalar.CScalar.from_int32(value) -cdef inline _cub_two_pass_launch( +cdef inline void _cub_two_pass_launch( str name, Py_ssize_t block_size, Py_ssize_t segment_size, Py_ssize_t items_per_thread, str reduce_type, tuple params, list in_args, list out_args, str identity, str pre_map_expr, str reduce_expr, str post_map_expr, _kernel._TypeMap type_map, str input_expr, str output_expr, - str preamble, tuple options, stream): + str preamble, tuple options, stream) except *: ''' Notes: 1. Two-pass reduction: the first pass distributes an even share over @@ -472,11 +461,11 @@ cdef inline _cub_two_pass_launch( func.linear_launch(gridx, inout_args, 0, blockx, stream) -cdef inline _launch_cub( +cdef inline void _launch_cub( self, out_block_num, block_size, block_stride, in_args, out_args, in_shape, out_shape, type_map, map_expr, reduce_expr, post_map_expr, reduce_type, - stream, params, cub_params): + stream, params, cub_params) except *: cdef bint full_reduction cdef Py_ssize_t contiguous_size, items_per_thread cdef function.Function func @@ -537,7 +526,7 @@ def _get_cub_optimized_params( post_map_expr, reduce_type, stream, params, cub_params) def suggest_func(trial): - block_size_log = trial.suggest_int('block_size_log', 5, 9) + block_size_log = trial.suggest_int('block_size_log', 5, 10) block_size = 2 ** block_size_log items_per_thread = trial.suggest_int( 'items_per_thread', 2, 32, step=2) @@ -545,13 +534,14 @@ def _get_cub_optimized_params( trial.set_user_attr('block_size', block_size) return block_size, items_per_thread + # CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES is a possible error optimize_impl = optimize_config.optimize_impl best = optimize_impl( optimize_config, target_func, suggest_func, default_best={ 'block_size_log': default_block_size_log, 'items_per_thread': default_items_per_thread, - }) + }, ignore_error=(driver.CUDADriverError,)) return best.params['items_per_thread'], best.user_attrs['block_size'] @@ -561,7 +551,7 @@ cdef bint _try_to_call_cub_reduction( stream, optimize_context, tuple key, map_expr, reduce_expr, post_map_expr, reduce_type, type_map, tuple reduce_axis, tuple out_axis, const shape_t& out_shape, - ndarray ret): + ndarray ret) except *: """Try to use cub. Updates `ret` and returns a boolean value whether cub is used. diff --git a/cupy/core/include/cupy/complex/complex.h b/cupy/core/include/cupy/complex/complex.h index 4219f33f0b5..61e5f4712fe 100644 --- a/cupy/core/include/cupy/complex/complex.h +++ b/cupy/core/include/cupy/complex/complex.h @@ -75,12 +75,29 @@ struct complex { /* --- Constructors --- */ + /*! Construct a complex number with an imaginary part of 0. + * + * \param re The real part of the number. + */ + inline __host__ __device__ complex(const T& re); + /*! Construct a complex number from its real and imaginary parts. * * \param re The real part of the number. * \param im The imaginary part of the number. */ - inline __host__ __device__ complex(const T& re = T(), const T& im = T()); + inline __host__ __device__ complex(const T& re, const T& im); + + /*! Default construct a complex number. + */ + inline __host__ __device__ complex(); + + /*! This copy constructor copies from a \p complex with a type that is + * convertible to this \p complex's \c value_type. + * + * \param z The \p complex to copy from. + */ + inline __host__ __device__ complex(const complex& z); /*! This copy constructor copies from a \p complex with a type that * is convertible to this \p complex \c value_type. @@ -92,6 +109,32 @@ struct complex { template inline __host__ __device__ complex(const complex& z); + /* --- Assignment Operators --- */ + + /*! Assign `re` to the real part of this \p complex and set the imaginary part + * to 0. + * + * \param re The real part of the number. + */ + inline __host__ __device__ complex& operator=(const T& re); + + /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this + * \p complex respectively. + * + * \param z The \p complex to copy from. + */ + inline __host__ __device__ complex& operator=(const complex& z); + + /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this + * \p complex respectively. + * + * \param z The \p complex to copy from. + * + * \tparam U is convertible to \c value_type. + */ + template + inline __host__ __device__ complex& operator=(const complex& z); + /* --- Compound Assignment Operators --- */ /*! Adds a \p complex to this \p complex and diff --git a/cupy/core/include/cupy/complex/complex_inl.h b/cupy/core/include/cupy/complex/complex_inl.h index b31c05d07c5..678dc7dd2a2 100644 --- a/cupy/core/include/cupy/complex/complex_inl.h +++ b/cupy/core/include/cupy/complex/complex_inl.h @@ -22,7 +22,11 @@ namespace thrust { /* --- Constructors --- */ -// TODO(leofang): support more kinds of constructors from upstream +template +inline __host__ __device__ complex::complex(const T& re) { + real(re); + imag(T()); +} template inline __host__ __device__ complex::complex(const T& re, const T& im) { @@ -30,6 +34,18 @@ inline __host__ __device__ complex::complex(const T& re, const T& im) { imag(im); } +template +inline __host__ __device__ complex::complex() { + real(T()); + imag(T()); +} + +template +inline __host__ __device__ complex::complex(const complex& z) { + real(z.real()); + imag(z.imag()); +} + template template inline __host__ __device__ complex::complex(const complex& z) { @@ -39,6 +55,30 @@ inline __host__ __device__ complex::complex(const complex& z) { imag(T(z.imag())); } +/* --- Assignment Operators --- */ + +template +inline __host__ __device__ complex& complex::operator=(const T& re) { + real(re); + imag(T()); + return *this; +} + +template +inline __host__ __device__ complex& complex::operator=(const complex& z) { + real(z.real()); + imag(z.imag()); + return *this; +} + +template +template +inline __host__ __device__ complex& complex::operator=(const complex& z) { + real(T(z.real())); + imag(T(z.imag())); + return *this; +} + /* --- Compound Assignment Operators --- */ // TODO(leofang): support operators with argument of type T, see upstream diff --git a/tests/cupy_tests/core_tests/test_cub_reduction.py b/tests/cupy_tests/core_tests/test_cub_reduction.py index a45978ba459..8b303227d6d 100644 --- a/tests/cupy_tests/core_tests/test_cub_reduction.py +++ b/tests/cupy_tests/core_tests/test_cub_reduction.py @@ -6,6 +6,7 @@ from cupy import _environment from cupy import testing from cupy.core import _accelerator +from cupy.core import _cub_reduction from cupy.cuda import memory @@ -130,13 +131,17 @@ def test_can_use_accelerator_set_unset(self): a = cupy.random.random((10, 10)) # this is the only function we can mock; the rest is cdef'd - func = ''.join(('cupy.core._cub_reduction.', - '_SimpleCubReductionKernel_get_cached_function')) - with testing.AssertFunctionIsCalled(func): + func_name = ''.join(('cupy.core._cub_reduction.', + '_SimpleCubReductionKernel_get_cached_function')) + func = _cub_reduction._SimpleCubReductionKernel_get_cached_function + with testing.AssertFunctionIsCalled( + func_name, wraps=func, times_called=2): # two passes a.sum() - with testing.AssertFunctionIsCalled(func): + with testing.AssertFunctionIsCalled( + func_name, wraps=func, times_called=1): # one pass a.sum(axis=1) - with testing.AssertFunctionIsCalled(func, times_called=0): + with testing.AssertFunctionIsCalled( + func_name, wraps=func, times_called=0): # not used a.sum(axis=0) _accelerator.set_routine_accelerators(old_routine_accelerators) diff --git a/tests/cupy_tests/core_tests/test_raw.py b/tests/cupy_tests/core_tests/test_raw.py index 67eaf215caa..18140a5b7de 100644 --- a/tests/cupy_tests/core_tests/test_raw.py +++ b/tests/cupy_tests/core_tests/test_raw.py @@ -558,7 +558,7 @@ def test_cuFloatComplex(self): ker = mod.get_function('test_mulf') ker((grid,), (block,), (a, b, out)) - assert (out == a * b).all() + assert cupy.allclose(out, a * b) ker = mod.get_function('test_divf') ker((grid,), (block,), (a, b, out)) @@ -574,7 +574,7 @@ def test_cuFloatComplex(self): ker = mod.get_function('test_fmaf') ker((grid,), (block,), (a, b, c, out)) - assert (out == a * b + c).all() + assert cupy.allclose(out, a * b + c) ker = mod.get_function('test_makef') ker((grid,), (block,), (out,)) @@ -620,7 +620,7 @@ def test_cuDoubleComplex(self): ker = mod.get_function('test_mul') ker((grid,), (block,), (a, b, out)) - assert (out == a * b).all() + assert cupy.allclose(out, a * b) ker = mod.get_function('test_div') ker((grid,), (block,), (a, b, out)) @@ -636,7 +636,7 @@ def test_cuDoubleComplex(self): ker = mod.get_function('test_fma') ker((grid,), (block,), (a, b, c, out)) - assert (out == a * b + c).all() + assert cupy.allclose(out, a * b + c) ker = mod.get_function('test_make') ker((grid,), (block,), (out,))