diff --git a/cupy/_statistics/order.py b/cupy/_statistics/order.py index 109c34fac02..8195f230057 100644 --- a/cupy/_statistics/order.py +++ b/cupy/_statistics/order.py @@ -26,6 +26,10 @@ def amin(a, axis=None, out=None, keepdims=False): Returns: cupy.ndarray: The minimum of ``a``, along the axis if specified. + .. note:: + When the cuTENSOR accelerator is used, the result may differ from + NumPy's for reduction axes that contain one or more NaN elements. + .. seealso:: :func:`numpy.amin` """ @@ -59,6 +63,10 @@ def amax(a, axis=None, out=None, keepdims=False): Returns: cupy.ndarray: The maximum of ``a``, along the axis if specified. + .. note:: + When the cuTENSOR accelerator is used, the result may differ from + NumPy's for reduction axes that contain one or more NaN elements. + .. seealso:: :func:`numpy.amax` """ @@ -156,6 +164,10 @@ def ptp(a, axis=None, out=None, keepdims=False): Returns: cupy.ndarray: The minimum of ``a``, along the axis if specified. + .. note:: + When the cuTENSOR accelerator is used, the result may differ from + NumPy's for reduction axes that contain one or more NaN elements. + .. 
seealso:: :func:`numpy.amin` """ diff --git a/cupy/core/_routines_math.pyx b/cupy/core/_routines_math.pyx index 48f2a363070..823ba2b53d0 100644 --- a/cupy/core/_routines_math.pyx +++ b/cupy/core/_routines_math.pyx @@ -88,12 +88,16 @@ cdef ndarray _ndarray_imag_setter(ndarray self, value): cdef ndarray _ndarray_prod(ndarray self, axis, dtype, out, keepdims): for accelerator in _accelerator._routine_accelerators: + result = None if accelerator == _accelerator.ACCELERATOR_CUB: # result will be None if the reduction is not compatible with CUB result = cub.cub_reduction( self, cub.CUPY_CUB_PROD, axis, dtype, out, keepdims) - if result is not None: - return result + if accelerator == _accelerator.ACCELERATOR_CUTENSOR: + result = cutensor._try_reduction_routine( + self, axis, dtype, out, keepdims, cuda_cutensor.OP_MUL, 1, 0) + if result is not None: + return result if dtype is None: return _prod_auto_dtype(self, axis, dtype, out, keepdims) else: diff --git a/cupy/core/_routines_statistics.pyx b/cupy/core/_routines_statistics.pyx index d563fba0d2e..2e9e60862ff 100644 --- a/cupy/core/_routines_statistics.pyx +++ b/cupy/core/_routines_statistics.pyx @@ -18,26 +18,47 @@ if not cupy.cuda.runtime.is_hip: else: cub = None +if cupy.cuda.cutensor.available: + import cupy_backends.cuda.libs.cutensor as cuda_cutensor + from cupy import cutensor +else: + cuda_cutensor = None + cutensor = None + cdef ndarray _ndarray_max(ndarray self, axis, out, dtype, keepdims): for accelerator in _accelerator._routine_accelerators: + result = None if accelerator == _accelerator.ACCELERATOR_CUB: # result will be None if the reduction is not compatible with CUB result = cub.cub_reduction( self, cub.CUPY_CUB_MAX, axis, dtype, out, keepdims) - if result is not None: - return result + if accelerator == _accelerator.ACCELERATOR_CUTENSOR: + if self.dtype.kind == 'c' or dtype in ('F', 'D'): + # Complex dtype is not supported + continue + result = cutensor._try_reduction_routine( + self, axis, dtype, out, 
keepdims, cuda_cutensor.OP_MAX, 1, 0) + if result is not None: + return result return _amax(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims) cdef ndarray _ndarray_min(ndarray self, axis, out, dtype, keepdims): for accelerator in _accelerator._routine_accelerators: + result = None if accelerator == _accelerator.ACCELERATOR_CUB: # result will be None if the reduction is not compatible with CUB result = cub.cub_reduction( self, cub.CUPY_CUB_MIN, axis, out, dtype, keepdims) - if result is not None: - return result + if accelerator == _accelerator.ACCELERATOR_CUTENSOR: + if self.dtype.kind == 'c' or dtype in ('F', 'D'): + # Complex dtype is not supported + continue + result = cutensor._try_reduction_routine( + self, axis, dtype, out, keepdims, cuda_cutensor.OP_MIN, 1, 0) + if result is not None: + return result return _amin(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims) @@ -51,6 +72,16 @@ cdef ndarray _ndarray_ptp(ndarray self, axis, out, keepdims): result -= cub.cub_reduction( self, cub.CUPY_CUB_MIN, axis, None, None, keepdims) return result + if accelerator == _accelerator.ACCELERATOR_CUTENSOR: + if self.dtype.kind == 'c': + # Complex dtype is not supported + continue + maxv = cutensor._try_reduction_routine( + self, axis, None, out, keepdims, cuda_cutensor.OP_MAX, 1, 0) + if maxv is None: + continue + return cutensor._try_reduction_routine( + self, axis, None, maxv, keepdims, cuda_cutensor.OP_MIN, -1, 1) result = _amax(self, axis=axis, out=out, keepdims=keepdims) result -= _amin(self, axis=axis, out=None, keepdims=keepdims) @@ -82,6 +113,8 @@ cdef ndarray _ndarray_argmin(ndarray self, axis, out, dtype, keepdims): cdef ndarray _ndarray_mean(ndarray self, axis, dtype, out, keepdims): + cdef Py_ssize_t n + dtype_sum = dtype_out = dtype if dtype is None: if self.dtype.kind in 'iub': @@ -103,6 +136,17 @@ cdef ndarray _ndarray_mean(ndarray self, axis, dtype, out, keepdims): n = self.size // result.size cupy.true_divide(result, n, out=result, 
casting='unsafe') break + if accelerator == _accelerator.ACCELERATOR_CUTENSOR: + reduce_axis, _ = _reduction._get_axis(axis, self._shape.size()) + n = 1 + for i in reduce_axis: + n *= self._shape[i] + n = max(n, 1) + result = cutensor._try_reduction_routine( + self, axis, dtype_sum, out, keepdims, + cuda_cutensor.OP_ADD, 1.0 / n, 0) + if result is not None: + break else: result = _mean( self, axis=axis, dtype=dtype_sum, out=out, keepdims=keepdims) diff --git a/cupy/cutensor.pyx b/cupy/cutensor.pyx index df6021550f8..6bc0f7333ec 100644 --- a/cupy/cutensor.pyx +++ b/cupy/cutensor.pyx @@ -729,6 +729,8 @@ def _try_reduction_routine( if dtype != x.dtype: return None + if x.ndim == 0: + return None if x.size == 0: return None if not x._c_contiguous: diff --git a/tests/cupy_tests/core_tests/test_ndarray_reduction.py b/tests/cupy_tests/core_tests/test_ndarray_reduction.py index 22172fbcc91..940d80f671f 100644 --- a/tests/cupy_tests/core_tests/test_ndarray_reduction.py +++ b/tests/cupy_tests/core_tests/test_ndarray_reduction.py @@ -1,9 +1,10 @@ import unittest import numpy +import pytest import cupy -from cupy.core import _accelerator +import cupy.core._accelerator as _acc from cupy import testing @@ -61,6 +62,8 @@ def test_max_multiple_axes_keepdims(self, xp, dtype): @testing.for_float_dtypes() @testing.numpy_cupy_allclose() def test_max_nan(self, xp, dtype): + if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators(): + pytest.skip() a = xp.array([float('nan'), 1, -1], dtype) return a.max() @@ -127,6 +130,8 @@ def test_min_multiple_axes_keepdims(self, xp, dtype): @testing.for_float_dtypes() @testing.numpy_cupy_allclose() def test_min_nan(self, xp, dtype): + if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators(): + pytest.skip() a = xp.array([float('nan'), 1, -1], dtype) return a.min() @@ -197,6 +202,8 @@ def test_ptp_multiple_axes_keepdims(self, xp, dtype): @testing.for_float_dtypes() @testing.numpy_cupy_allclose() def test_ptp_nan(self, xp, dtype): + if 
_acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators(): + pytest.skip() a = xp.array([float('nan'), 1, -1], dtype) return a.ptp() @@ -223,11 +230,11 @@ def test_ptp_nan_imag(self, xp, dtype): class TestCubReduction(unittest.TestCase): def setUp(self): - self.old_accelerators = _accelerator.get_routine_accelerators() - _accelerator.set_routine_accelerators(['cub']) + self.old_accelerators = _acc.get_routine_accelerators() + _acc.set_routine_accelerators(['cub']) def tearDown(self): - _accelerator.set_routine_accelerators(self.old_accelerators) + _acc.set_routine_accelerators(self.old_accelerators) @testing.for_contiguous_axes() @testing.for_all_dtypes(no_bool=True, no_float16=True) diff --git a/tests/cupy_tests/statistics_tests/test_order.py b/tests/cupy_tests/statistics_tests/test_order.py index b9b90486cb7..32404c41bb0 100644 --- a/tests/cupy_tests/statistics_tests/test_order.py +++ b/tests/cupy_tests/statistics_tests/test_order.py @@ -5,6 +5,7 @@ import pytest import cupy +import cupy.core._accelerator as _acc from cupy import testing @@ -236,11 +237,15 @@ def test_ptp_axis2(self, xp, dtype): @testing.for_float_dtypes() @testing.numpy_cupy_allclose() def test_ptp_nan(self, xp, dtype): + if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators(): + pytest.skip() a = xp.array([float('nan'), 1, -1], dtype) return xp.ptp(a) @testing.for_float_dtypes() @testing.numpy_cupy_allclose() def test_ptp_all_nan(self, xp, dtype): + if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators(): + pytest.skip() a = xp.array([float('nan'), float('nan')], dtype) return xp.ptp(a)