From 06bfd2549cfc0d83af85ffa971995dfa3be73097 Mon Sep 17 00:00:00 2001
From: Leo Fang <leofang@bnl.gov>
Date: Tue, 30 Jun 2020 22:56:45 -0400
Subject: [PATCH] Small fixes for CUB block reduction kernels

1. Remove all of the type constraints
2. Add a possible exception for optimizer
3. Allow compiler exceptions to propagate upward
4. Make complex<T> (almost) obey the rule of three (to fix fp16 -> complex conversion)
5. Fix tests
---
 cupy/core/_cub_reduction.pxd                  |  2 +-
 cupy/core/_cub_reduction.pyx                  | 28 ++++--------
 cupy/core/include/cupy/complex/complex.h      | 45 ++++++++++++++++++-
 cupy/core/include/cupy/complex/complex_inl.h  | 42 ++++++++++++++++-
 .../core_tests/test_cub_reduction.py          | 15 ++++---
 tests/cupy_tests/core_tests/test_raw.py       |  8 ++--
 6 files changed, 109 insertions(+), 31 deletions(-)
diff --git a/cupy/core/_cub_reduction.pxd b/cupy/core/_cub_reduction.pxd
index b10b1be724f..9145ef044fb 100644
--- a/cupy/core/_cub_reduction.pxd
+++ b/cupy/core/_cub_reduction.pxd
@@ -7,4 +7,4 @@ cdef bint _try_to_call_cub_reduction(
     stream, optimize_context, tuple key,
     map_expr, reduce_expr, post_map_expr, reduce_type, type_map,
     tuple reduce_axis, tuple out_axis, const shape_t& out_shape,
-    ndarray ret)
+    ndarray ret) except *
diff --git a/cupy/core/_cub_reduction.pyx b/cupy/core/_cub_reduction.pyx
index 119f42a9b88..8518ef011d7 100644
--- a/cupy/core/_cub_reduction.pyx
+++ b/cupy/core/_cub_reduction.pyx
@@ -15,6 +15,7 @@ import math
 import string
 from cupy import _environment
 from cupy.core._kernel import _get_param_info
+from cupy.cuda import driver
 from cupy import util
 
 
@@ -279,18 +280,6 @@ cpdef inline tuple _can_use_cub_block_reduction(
     if axis_permutes_cub != tuple(range(in_arr.ndim)):
         return None
 
-    # To support generic reductions, note that some NumPy casting rules
-    # are not applicable in the C++ space (unless we tweak the type
-    # definitions). To circumvent this, we fall back to the old kernel.
-    # TODO(leofang): can we relax this?
-    if in_arr.dtype.kind != out_arr.dtype.kind:
-        # cannot cast complex to anything else
-        if in_arr.dtype.kind == 'c':
-            return None
-        # cannot cast float16 to complex
-        if in_arr.dtype.char == 'e' and out_arr.dtype.kind == 'c':
-            return None
-
     # full-reduction of N-D array: need to invoke the kernel twice
     cdef bint full_reduction = True if len(out_axis) == 0 else False
 
@@ -370,13 +359,13 @@ cdef _scalar.CScalar _cub_convert_to_c_scalar(
         return _scalar.CScalar.from_int32(value)
 
 
-cdef inline _cub_two_pass_launch(
+cdef inline void _cub_two_pass_launch(
         str name, Py_ssize_t block_size, Py_ssize_t segment_size,
         Py_ssize_t items_per_thread, str reduce_type, tuple params,
         list in_args, list out_args,
         str identity, str pre_map_expr, str reduce_expr, str post_map_expr,
         _kernel._TypeMap type_map, str input_expr, str output_expr,
-        str preamble, tuple options, stream):
+        str preamble, tuple options, stream) except *:
     '''
     Notes:
     1. Two-pass reduction: the first pass distributes an even share over
@@ -472,11 +461,11 @@ cdef inline _cub_two_pass_launch(
     func.linear_launch(gridx, inout_args, 0, blockx, stream)
 
 
-cdef inline _launch_cub(
+cdef inline void _launch_cub(
         self, out_block_num, block_size, block_stride,
         in_args, out_args, in_shape, out_shape, type_map,
         map_expr, reduce_expr, post_map_expr, reduce_type,
-        stream, params, cub_params):
+        stream, params, cub_params) except *:
     cdef bint full_reduction
     cdef Py_ssize_t contiguous_size, items_per_thread
     cdef function.Function func
@@ -537,7 +526,7 @@ def _get_cub_optimized_params(
             post_map_expr, reduce_type, stream, params, cub_params)
 
     def suggest_func(trial):
-        block_size_log = trial.suggest_int('block_size_log', 5, 9)
+        block_size_log = trial.suggest_int('block_size_log', 5, 10)
         block_size = 2 ** block_size_log
         items_per_thread = trial.suggest_int(
             'items_per_thread', 2, 32, step=2)
@@ -545,13 +534,14 @@ def _get_cub_optimized_params(
         trial.set_user_attr('block_size', block_size)
         return block_size, items_per_thread
 
+    # CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES is a possible error
     optimize_impl = optimize_config.optimize_impl
     best = optimize_impl(
         optimize_config, target_func, suggest_func,
         default_best={
             'block_size_log': default_block_size_log,
             'items_per_thread': default_items_per_thread,
-        })
+        }, ignore_error=(driver.CUDADriverError,))
 
     return best.params['items_per_thread'], best.user_attrs['block_size']
 
@@ -561,7 +551,7 @@ cdef bint _try_to_call_cub_reduction(
         stream, optimize_context, tuple key,
         map_expr, reduce_expr, post_map_expr, reduce_type, type_map,
         tuple reduce_axis, tuple out_axis, const shape_t& out_shape,
-        ndarray ret):
+        ndarray ret) except *:
     """Try to use cub.
 
     Updates `ret` and returns a boolean value whether cub is used.
diff --git a/cupy/core/include/cupy/complex/complex.h b/cupy/core/include/cupy/complex/complex.h
index 4219f33f0b5..61e5f4712fe 100644
--- a/cupy/core/include/cupy/complex/complex.h
+++ b/cupy/core/include/cupy/complex/complex.h
@@ -75,12 +75,29 @@ struct complex {
 
   /* --- Constructors --- */
 
+  /*! Construct a complex number with an imaginary part of 0.
+   *
+   *  \param re The real part of the number.
+   */
+  inline __host__ __device__ complex(const T& re);
+
   /*! Construct a complex number from its real and imaginary parts.
    *
    *  \param re The real part of the number.
    *  \param im The imaginary part of the number.
    */
-  inline __host__ __device__ complex(const T& re = T(), const T& im = T());
+  inline __host__ __device__ complex(const T& re, const T& im);
+
+  /*! Default construct a complex number.
+   */
+  inline __host__ __device__ complex();
+
+  /*! This copy constructor copies from a \p complex with a type that is
+   *  convertible to this \p complex's \c value_type.
+   *
+   *  \param z The \p complex to copy from.
+   */
+  inline __host__ __device__ complex(const complex<T>& z);
 
   /*! This copy constructor copies from a \p complex with a type that
    *  is convertible to this \p complex \c value_type.
@@ -92,6 +109,32 @@ struct complex {
   template <typename X>
   inline __host__ __device__ complex(const complex<X>& z);
 
+  /* --- Assignment Operators --- */
+
+  /*! Assign `re` to the real part of this \p complex and set the imaginary part
+   *  to 0.
+   *
+   *  \param re The real part of the number.
+   */
+  inline __host__ __device__ complex& operator=(const T& re);
+
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
+   *
+   *  \param z The \p complex to copy from.
+   */
+  inline __host__ __device__ complex& operator=(const complex<T>& z);
+
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
+   *
+   *  \param z The \p complex to copy from.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  inline __host__ __device__ complex& operator=(const complex<U>& z);
+
   /* --- Compound Assignment Operators --- */
 
   /*! Adds a \p complex to this \p complex and
diff --git a/cupy/core/include/cupy/complex/complex_inl.h b/cupy/core/include/cupy/complex/complex_inl.h
index b31c05d07c5..678dc7dd2a2 100644
--- a/cupy/core/include/cupy/complex/complex_inl.h
+++ b/cupy/core/include/cupy/complex/complex_inl.h
@@ -22,7 +22,11 @@
 namespace thrust {
 
 /* --- Constructors --- */
-// TODO(leofang): support more kinds of constructors from upstream
+template <typename T>
+inline __host__ __device__ complex<T>::complex(const T& re) {
+  real(re);
+  imag(T());
+}
 
 template <typename T>
 inline __host__ __device__ complex<T>::complex(const T& re, const T& im) {
@@ -30,6 +34,18 @@ inline __host__ __device__ complex<T>::complex(const T& re, const T& im) {
   imag(im);
 }
 
+template <typename T>
+inline __host__ __device__ complex<T>::complex() {
+  real(T());
+  imag(T());
+}
+
+template <typename T>
+inline __host__ __device__ complex<T>::complex(const complex<T>& z) {
+  real(z.real());
+  imag(z.imag());
+}
+
 template <typename T>
 template <typename X>
 inline __host__ __device__ complex<T>::complex(const complex<X>& z) {
@@ -39,6 +55,30 @@ inline __host__ __device__ complex<T>::complex(const complex<X>& z) {
   imag(T(z.imag()));
 }
 
+/* --- Assignment Operators --- */
+
+template <typename T>
+inline __host__ __device__ complex<T>& complex<T>::operator=(const T& re) {
+  real(re);
+  imag(T());
+  return *this;
+}
+
+template <typename T>
+inline __host__ __device__ complex<T>& complex<T>::operator=(const complex<T>& z) {
+  real(z.real());
+  imag(z.imag());
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+inline __host__ __device__ complex<T>& complex<T>::operator=(const complex<U>& z) {
+  real(T(z.real()));
+  imag(T(z.imag()));
+  return *this;
+}
+
 /* --- Compound Assignment Operators --- */
 // TODO(leofang): support operators with argument of type T, see upstream
 
diff --git a/tests/cupy_tests/core_tests/test_cub_reduction.py b/tests/cupy_tests/core_tests/test_cub_reduction.py
index a45978ba459..8b303227d6d 100644
--- a/tests/cupy_tests/core_tests/test_cub_reduction.py
+++ b/tests/cupy_tests/core_tests/test_cub_reduction.py
@@ -6,6 +6,7 @@
 from cupy import _environment
 from cupy import testing
 from cupy.core import _accelerator
+from cupy.core import _cub_reduction
 from cupy.cuda import memory
 
 
@@ -130,13 +131,17 @@ def test_can_use_accelerator_set_unset(self):
 
         a = cupy.random.random((10, 10))
         # this is the only function we can mock; the rest is cdef'd
-        func = ''.join(('cupy.core._cub_reduction.',
-                        '_SimpleCubReductionKernel_get_cached_function'))
-        with testing.AssertFunctionIsCalled(func):
+        func_name = ''.join(('cupy.core._cub_reduction.',
+                             '_SimpleCubReductionKernel_get_cached_function'))
+        func = _cub_reduction._SimpleCubReductionKernel_get_cached_function
+        with testing.AssertFunctionIsCalled(
+                func_name, wraps=func, times_called=2):  # two passes
             a.sum()
-        with testing.AssertFunctionIsCalled(func):
+        with testing.AssertFunctionIsCalled(
+                func_name, wraps=func, times_called=1):  # one pass
             a.sum(axis=1)
-        with testing.AssertFunctionIsCalled(func, times_called=0):
+        with testing.AssertFunctionIsCalled(
+                func_name, wraps=func, times_called=0):  # not used
             a.sum(axis=0)
 
         _accelerator.set_routine_accelerators(old_routine_accelerators)
diff --git a/tests/cupy_tests/core_tests/test_raw.py b/tests/cupy_tests/core_tests/test_raw.py
index 67eaf215caa..18140a5b7de 100644
--- a/tests/cupy_tests/core_tests/test_raw.py
+++ b/tests/cupy_tests/core_tests/test_raw.py
@@ -558,7 +558,7 @@ def test_cuFloatComplex(self):
 
         ker = mod.get_function('test_mulf')
         ker((grid,), (block,), (a, b, out))
-        assert (out == a * b).all()
+        assert cupy.allclose(out, a * b)
 
         ker = mod.get_function('test_divf')
         ker((grid,), (block,), (a, b, out))
@@ -574,7 +574,7 @@ def test_cuFloatComplex(self):
 
         ker = mod.get_function('test_fmaf')
         ker((grid,), (block,), (a, b, c, out))
-        assert (out == a * b + c).all()
+        assert cupy.allclose(out, a * b + c)
 
         ker = mod.get_function('test_makef')
         ker((grid,), (block,), (out,))
@@ -620,7 +620,7 @@ def test_cuDoubleComplex(self):
 
         ker = mod.get_function('test_mul')
         ker((grid,), (block,), (a, b, out))
-        assert (out == a * b).all()
+        assert cupy.allclose(out, a * b)
 
         ker = mod.get_function('test_div')
         ker((grid,), (block,), (a, b, out))
@@ -636,7 +636,7 @@ def test_cuDoubleComplex(self):
 
         ker = mod.get_function('test_fma')
         ker((grid,), (block,), (a, b, c, out))
-        assert (out == a * b + c).all()
+        assert cupy.allclose(out, a * b + c)
 
         ker = mod.get_function('test_make')
         ker((grid,), (block,), (out,))