Skip to content

Commit

Permalink
Merge pull request #3765 from asi1024/use-cutensor
Browse files Browse the repository at this point in the history
Use cuTENSOR in `cupy.prod`, `cupy.max`, `cupy.min`, `cupy.ptp` and `cupy.mean`
  • Loading branch information
kmaehashi committed Sep 30, 2020
2 parents 624e669 + 28ba922 commit 34cb08d
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 10 deletions.
12 changes: 12 additions & 0 deletions cupy/_statistics/order.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ def amin(a, axis=None, out=None, keepdims=False):
Returns:
cupy.ndarray: The minimum of ``a``, along the axis if specified.
.. note::
When cuTENSOR accelerator is used, the output value might be collapsed
for reduction axes that have one or more NaN elements.
.. seealso:: :func:`numpy.amin`
"""
Expand Down Expand Up @@ -59,6 +63,10 @@ def amax(a, axis=None, out=None, keepdims=False):
Returns:
cupy.ndarray: The maximum of ``a``, along the axis if specified.
.. note::
When cuTENSOR accelerator is used, the output value might be collapsed
for reduction axes that have one or more NaN elements.
.. seealso:: :func:`numpy.amax`
"""
Expand Down Expand Up @@ -156,6 +164,10 @@ def ptp(a, axis=None, out=None, keepdims=False):
Returns:
    cupy.ndarray: The range (maximum - minimum) of ``a``, along the axis if specified.
.. note::
   When cuTENSOR accelerator is used, the output value might be collapsed
   for reduction axes that have one or more NaN elements.
.. seealso:: :func:`numpy.ptp`
"""
Expand Down
8 changes: 6 additions & 2 deletions cupy/core/_routines_math.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,16 @@ cdef ndarray _ndarray_imag_setter(ndarray self, value):

cdef ndarray _ndarray_prod(ndarray self, axis, dtype, out, keepdims):
for accelerator in _accelerator._routine_accelerators:
result = None
if accelerator == _accelerator.ACCELERATOR_CUB:
# result will be None if the reduction is not compatible with CUB
result = cub.cub_reduction(
self, cub.CUPY_CUB_PROD, axis, dtype, out, keepdims)
if result is not None:
return result
if accelerator == _accelerator.ACCELERATOR_CUTENSOR:
result = cutensor._try_reduction_routine(
self, axis, dtype, out, keepdims, cuda_cutensor.OP_MUL, 1, 0)
if result is not None:
return result
if dtype is None:
return _prod_auto_dtype(self, axis, dtype, out, keepdims)
else:
Expand Down
52 changes: 48 additions & 4 deletions cupy/core/_routines_statistics.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,47 @@ if not cupy.cuda.runtime.is_hip:
else:
cub = None

if cupy.cuda.cutensor.available:
import cupy_backends.cuda.libs.cutensor as cuda_cutensor
from cupy import cutensor
else:
cuda_cutensor = None
cutensor = None


cdef ndarray _ndarray_max(ndarray self, axis, out, dtype, keepdims):
    """Compute the maximum of ``self``, trying enabled accelerators first.

    Each configured routine accelerator is given a chance to perform the
    reduction; the generic ``_amax`` kernel is the fallback when none of
    them produces a result.
    """
    for acc in _accelerator._routine_accelerators:
        candidate = None
        if acc == _accelerator.ACCELERATOR_CUB:
            # cub_reduction returns None when the reduction is not
            # compatible with CUB.
            candidate = cub.cub_reduction(
                self, cub.CUPY_CUB_MAX, axis, dtype, out, keepdims)
        elif acc == _accelerator.ACCELERATOR_CUTENSOR:
            if self.dtype.kind == 'c' or dtype in ('F', 'D'):
                # Complex dtype is not supported by the cuTENSOR path.
                continue
            candidate = cutensor._try_reduction_routine(
                self, axis, dtype, out, keepdims, cuda_cutensor.OP_MAX, 1, 0)
        if candidate is not None:
            return candidate
    # No accelerator handled the reduction; use the generic kernel.
    return _amax(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)


cdef ndarray _ndarray_min(ndarray self, axis, out, dtype, keepdims):
    """Compute the minimum of ``self``, trying enabled accelerators first.

    Falls back to the generic ``_amin`` reduction kernel when no
    accelerator produces a result.
    """
    for accelerator in _accelerator._routine_accelerators:
        result = None
        if accelerator == _accelerator.ACCELERATOR_CUB:
            # result will be None if the reduction is not compatible with CUB
            # NOTE(fix): arguments are (axis, dtype, out) in that order,
            # matching the cub_reduction calls in _ndarray_max and
            # _ndarray_prod; the previous call swapped ``out`` and ``dtype``.
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_MIN, axis, dtype, out, keepdims)
            if result is not None:
                return result
        if accelerator == _accelerator.ACCELERATOR_CUTENSOR:
            if self.dtype.kind == 'c' or dtype in ('F', 'D'):
                # Complex dtype is not supported by the cuTENSOR path.
                continue
            result = cutensor._try_reduction_routine(
                self, axis, dtype, out, keepdims, cuda_cutensor.OP_MIN, 1, 0)
            if result is not None:
                return result
    return _amin(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)


Expand All @@ -51,6 +72,16 @@ cdef ndarray _ndarray_ptp(ndarray self, axis, out, keepdims):
result -= cub.cub_reduction(
self, cub.CUPY_CUB_MIN, axis, None, None, keepdims)
return result
if accelerator == _accelerator.ACCELERATOR_CUTENSOR:
if self.dtype.kind == 'c':
# Complex dtype is not supported
continue
maxv = cutensor._try_reduction_routine(
self, axis, None, out, keepdims, cuda_cutensor.OP_MAX, 1, 0)
if maxv is None:
continue
return cutensor._try_reduction_routine(
self, axis, None, maxv, keepdims, cuda_cutensor.OP_MIN, -1, 1)

result = _amax(self, axis=axis, out=out, keepdims=keepdims)
result -= _amin(self, axis=axis, out=None, keepdims=keepdims)
Expand Down Expand Up @@ -82,6 +113,8 @@ cdef ndarray _ndarray_argmin(ndarray self, axis, out, dtype, keepdims):


cdef ndarray _ndarray_mean(ndarray self, axis, dtype, out, keepdims):
cdef Py_ssize_t n

dtype_sum = dtype_out = dtype
if dtype is None:
if self.dtype.kind in 'iub':
Expand All @@ -103,6 +136,17 @@ cdef ndarray _ndarray_mean(ndarray self, axis, dtype, out, keepdims):
n = self.size // result.size
cupy.true_divide(result, n, out=result, casting='unsafe')
break
if accelerator == _accelerator.ACCELERATOR_CUTENSOR:
reduce_axis, _ = _reduction._get_axis(axis, self._shape.size())
n = 1
for i in reduce_axis:
n *= self._shape[i]
n = max(n, 1)
result = cutensor._try_reduction_routine(
self, axis, dtype_sum, out, keepdims,
cuda_cutensor.OP_ADD, 1.0 / n, 0)
if result is not None:
break
else:
result = _mean(
self, axis=axis, dtype=dtype_sum, out=out, keepdims=keepdims)
Expand Down
2 changes: 2 additions & 0 deletions cupy/cutensor.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,8 @@ def _try_reduction_routine(
if dtype != x.dtype:
return None

if x.ndim == 0:
return None
if x.size == 0:
return None
if not x._c_contiguous:
Expand Down
15 changes: 11 additions & 4 deletions tests/cupy_tests/core_tests/test_ndarray_reduction.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import unittest

import numpy
import pytest

import cupy
from cupy.core import _accelerator
import cupy.core._accelerator as _acc
from cupy import testing


Expand Down Expand Up @@ -61,6 +62,8 @@ def test_max_multiple_axes_keepdims(self, xp, dtype):
@testing.for_float_dtypes()
@testing.numpy_cupy_allclose()
def test_max_nan(self, xp, dtype):
    """max over an array containing NaN must match NumPy's NaN propagation."""
    # cuTENSOR reductions may collapse NaN elements (see docs note), so
    # this comparison against NumPy is skipped under that accelerator.
    if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators():
        pytest.skip()
    data = xp.array([float('nan'), 1, -1], dtype)
    return data.max()

Expand Down Expand Up @@ -127,6 +130,8 @@ def test_min_multiple_axes_keepdims(self, xp, dtype):
@testing.for_float_dtypes()
@testing.numpy_cupy_allclose()
def test_min_nan(self, xp, dtype):
    """min over an array containing NaN must match NumPy's NaN propagation."""
    # cuTENSOR reductions may collapse NaN elements (see docs note), so
    # this comparison against NumPy is skipped under that accelerator.
    if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators():
        pytest.skip()
    data = xp.array([float('nan'), 1, -1], dtype)
    return data.min()

Expand Down Expand Up @@ -197,6 +202,8 @@ def test_ptp_multiple_axes_keepdims(self, xp, dtype):
@testing.for_float_dtypes()
@testing.numpy_cupy_allclose()
def test_ptp_nan(self, xp, dtype):
    """ptp over an array containing NaN must match NumPy's NaN propagation."""
    # cuTENSOR reductions may collapse NaN elements (see docs note), so
    # this comparison against NumPy is skipped under that accelerator.
    if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators():
        pytest.skip()
    values = xp.array([float('nan'), 1, -1], dtype)
    return values.ptp()

Expand All @@ -223,11 +230,11 @@ def test_ptp_nan_imag(self, xp, dtype):
class TestCubReduction(unittest.TestCase):

def setUp(self):
    """Force the CUB accelerator for the duration of each test.

    The scraped diff merged the pre- and post-rename statements, leaving
    the snapshot/set pair duplicated; keep only the ``_acc`` version.
    """
    # Snapshot the current accelerators so tearDown can restore them.
    self.old_accelerators = _acc.get_routine_accelerators()
    _acc.set_routine_accelerators(['cub'])

def tearDown(self):
    """Restore the accelerator configuration captured in setUp.

    The scraped diff merged the pre- and post-rename statements; keep
    only the ``_acc`` version.
    """
    _acc.set_routine_accelerators(self.old_accelerators)

@testing.for_contiguous_axes()
@testing.for_all_dtypes(no_bool=True, no_float16=True)
Expand Down
5 changes: 5 additions & 0 deletions tests/cupy_tests/statistics_tests/test_order.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest

import cupy
import cupy.core._accelerator as _acc
from cupy import testing


Expand Down Expand Up @@ -236,11 +237,15 @@ def test_ptp_axis2(self, xp, dtype):
@testing.for_float_dtypes()
@testing.numpy_cupy_allclose()
def test_ptp_nan(self, xp, dtype):
    """xp.ptp with a NaN element must match NumPy's NaN propagation."""
    # cuTENSOR reductions may collapse NaN elements (see docs note), so
    # this comparison against NumPy is skipped under that accelerator.
    if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators():
        pytest.skip()
    values = xp.array([float('nan'), 1, -1], dtype)
    return xp.ptp(values)

@testing.for_float_dtypes()
@testing.numpy_cupy_allclose()
def test_ptp_all_nan(self, xp, dtype):
    """xp.ptp over an all-NaN array must match NumPy's NaN propagation."""
    # cuTENSOR reductions may collapse NaN elements (see docs note), so
    # this comparison against NumPy is skipped under that accelerator.
    if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators():
        pytest.skip()
    values = xp.array([float('nan'), float('nan')], dtype)
    return xp.ptp(values)

0 comments on commit 34cb08d

Please sign in to comment.