Use cuTENSOR in cupy.sum #2939

Merged: 4 commits, Jul 27, 2020
Changes from 3 commits
16 changes: 14 additions & 2 deletions cupy/core/_routines_math.pyx
@@ -23,6 +23,13 @@ if not cupy.cuda.runtime.is_hip:
else:
    cub = None

if cupy.cuda.cutensor_enabled:
    import cupy_backends.cuda.libs.cutensor as cuda_cutensor

Member: Is this needed?

Member: Yes, we want to move everything to cupy_backends for these libs.

Member: Ah OK.

    from cupy import cutensor
else:
    cuda_cutensor = None
    cutensor = None


# ndarray members

@@ -95,12 +102,17 @@ cdef ndarray _ndarray_prod(ndarray self, axis, dtype, out, keepdims):

cdef ndarray _ndarray_sum(ndarray self, axis, dtype, out, keepdims):
    for accelerator in _accelerator._routine_accelerators:
        result = None
        if accelerator == _accelerator.ACCELERATOR_CUB:
            # result will be None if the reduction is not compatible with CUB
            result = cub.cub_reduction(
                self, cub.CUPY_CUB_SUM, axis, dtype, out, keepdims)
            if result is not None:
                return result
        if accelerator == _accelerator.ACCELERATOR_CUTENSOR:

Member: I think you need to check if cutensor is None?

Member: Does _accelerator allow setting ACCELERATOR_CUTENSOR if cutensor is not available? If that's the case, then just checking this is fine.

Member Author (asi1024, Jul 22, 2020): I don't want the routines to fall back silently to CuPy's default reduction in such cases. I will fix _set_{routine/reduction}_accelerator in another PR.

Member: By "in such cases" did you mean the library is absent but the user still requests to use it? If so, I think the current implementation makes sense!

            result = cutensor._try_reduction_routine(
                self, axis, dtype, out, keepdims, cuda_cutensor.OP_ADD, 1, 0)
            if result is not None:
                return result

    if dtype is None:
        return _sum_auto_dtype(self, axis, dtype, out, keepdims)
    else:
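
As a usage note for the dispatch loop above: here is a minimal sketch of how the cuTENSOR path could be exercised, assuming the accelerator list is configured through cupy.core._accelerator.set_routine_accelerators (the setter the author mentions revisiting in another PR); the exact configuration API may differ between CuPy versions.

```python
# Hedged sketch, not part of this PR: enable the cuTENSOR-backed reduction
# for cupy.sum, assuming set_routine_accelerators is available and accepts
# the string names 'cutensor' and 'cub'.
import cupy
from cupy.core import _accelerator

# Try cuTENSOR first, then CUB; when _try_reduction_routine (or the CUB
# helper) returns None, _ndarray_sum falls through to the default kernel.
_accelerator.set_routine_accelerators(['cutensor', 'cub'])

x = cupy.random.random((256, 512)).astype(cupy.float32)
y = x.sum(axis=1)  # goes through the dispatch loop shown above
```
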
64 changes: 64 additions & 0 deletions cupy/cutensor.py
@@ -5,6 +5,7 @@
from cupy_backends.cuda.api import runtime
from cupy.cuda import cutensor
from cupy.cuda import device
from cupy.core import _reduction

_handles = {}
_tensor_descriptors = {}
@@ -496,3 +497,66 @@ def reduction(alpha, A, desc_A, mode_A, beta, C, desc_C, mode_C,
        out.data.ptr, desc_C, mode_C.data,
        reduce_op, cutensor_dtype, ws.data.ptr, ws_size)
    return out


_cutensor_dtypes = [
    # TODO(asi1024): Support float16
    # numpy.float16,
    numpy.float32,
    numpy.float64,
    numpy.complex64,
    numpy.complex128,
]


def _try_reduction_routine(x, axis, dtype, out, keepdims, op, alpha, beta):
    if dtype is None:
        dtype = x.dtype

    if dtype not in _cutensor_dtypes:
        return None
    if dtype != x.dtype:
        return None

    if x.size == 0:
        return None
    if not x._c_contiguous:
        # TODO(asi1024): Support also for F-contiguous array
        return None

    in_arg = x

    reduce_axis, out_axis = _reduction._get_axis(axis, x.ndim)
    out_shape = _reduction._get_out_shape(
        x.shape, reduce_axis, out_axis, keepdims)
    if out is None:
        out = cupy.empty(out_shape, dtype)
    elif out.shape != out_shape:
        # TODO(asi1024): Support broadcast
        return None
    elif out.dtype != dtype:
        return None
    elif not out._c_contiguous:
        # TODO(asi1024): Support also for F-contiguous array
        return None

    if keepdims:
        out_arg = out.reshape(
            _reduction._get_out_shape(x.shape, reduce_axis, out_axis, False))
    else:
        out_arg = out

    # TODO(asi1024): Remove temporary fix
    in_arg._set_contiguous_strides(in_arg.itemsize, True)
    out_arg._set_contiguous_strides(out_arg.itemsize, True)

    desc_in = create_tensor_descriptor(in_arg)
    desc_out = create_tensor_descriptor(out_arg)
    mode_in = list(range(in_arg.ndim))
    mode_out = [axis for axis in mode_in if (axis not in reduce_axis)]

    reduction(
        alpha, in_arg, desc_in, mode_in, beta, out_arg, desc_out, mode_out,
        op, dtype)

    return out
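
To make the mode bookkeeping at the end of _try_reduction_routine concrete, here is a small NumPy-only illustration (no cuTENSOR required; reduce_axis and out_axis are recomputed by hand rather than via _reduction._get_axis) for a sum over axis 1 of a (2, 3, 4) array.

```python
import numpy

# Hedged illustration of the mode bookkeeping above, using plain NumPy.
x = numpy.arange(24, dtype=numpy.float32).reshape(2, 3, 4)
axis = 1

reduce_axis = (axis,)  # axes collapsed by the reduction
out_axis = tuple(a for a in range(x.ndim) if a not in reduce_axis)  # (0, 2)

# Each dimension gets a mode label; the output keeps only the labels of the
# surviving axes, mirroring how mode_in / mode_out are built above.
mode_in = list(range(x.ndim))                            # [0, 1, 2]
mode_out = [a for a in mode_in if a not in reduce_axis]  # [0, 2]

out_shape = tuple(x.shape[a] for a in out_axis)          # (2, 4)
assert numpy.sum(x, axis=axis).shape == out_shape
```
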