diff --git a/cupy/cuda/stream.pyx b/cupy/cuda/stream.pyx
index 882e0164c73..567ef4c723a 100644
--- a/cupy/cuda/stream.pyx
+++ b/cupy/cuda/stream.pyx
@@ -276,12 +276,16 @@ class Stream(BaseStream):
         cdef intptr_t current_ptr
         if is_shutting_down():
             return
+        tls = _ThreadLocal.get()
         if self.ptr:
-            tls = _ThreadLocal.get()
             current_ptr = tls.get_current_stream_ptr()
             if self.ptr == current_ptr:
                 tls.set_current_stream(self.null)
             runtime.streamDestroy(self.ptr)
+        else:
+            current_stream = tls.get_current_stream()
+            if current_stream == self:
+                tls.set_current_stream(self.null)
         # Note that we can not release memory pool of the stream held in CPU
         # because the memory would still be used in kernels executed in GPU.
diff --git a/cupy/testing/__init__.py b/cupy/testing/__init__.py
index 060bf08d008..9f27c59c30c 100644
--- a/cupy/testing/__init__.py
+++ b/cupy/testing/__init__.py
@@ -20,6 +20,7 @@
 from cupy.testing.helper import for_all_dtypes_combination  # NOQA
 from cupy.testing.helper import for_CF_orders  # NOQA
 from cupy.testing.helper import for_complex_dtypes  # NOQA
+from cupy.testing.helper import for_contiguous_axes  # NOQA
 from cupy.testing.helper import for_dtypes  # NOQA
 from cupy.testing.helper import for_dtypes_combination  # NOQA
 from cupy.testing.helper import for_float_dtypes  # NOQA
diff --git a/cupy/testing/helper.py b/cupy/testing/helper.py
index 06d7067f02e..714609e6625 100644
--- a/cupy/testing/helper.py
+++ b/cupy/testing/helper.py
@@ -1013,6 +1013,45 @@ def for_CF_orders(name='order'):
     return for_orders([None, 'C', 'F', 'c', 'f'], name)
 
 
+def for_contiguous_axes(name='axis'):
+    '''Decorator for parametrizing tests with all possible contiguous axes.
+
+    Args:
+        name(str): Argument name to which the specified axes are passed.
+
+    .. note::
+        1. Adapted from tests/cupy_tests/fft_tests/test_fft.py.
+        2. Example: for ``shape = (1, 2, 3)``, the tested axes are
+           ``[(2,), (1, 2), (0, 1, 2)]`` for the C order, and
+           ``[(0,), (0, 1), (0, 1, 2)]`` for the F order.
+    '''
+    def decorator(impl):
+        @functools.wraps(impl)
+        def test_func(self, *args, **kw):
+            ndim = len(self.shape)
+            order = self.order
+            for i in range(ndim):
+                a = ()
+                if order in ('c', 'C'):
+                    for j in range(ndim-1, i-1, -1):
+                        a = (j,) + a
+                elif order in ('f', 'F'):
+                    for j in range(0, i+1):
+                        a = a + (j,)
+                else:
+                    raise ValueError('Please specify the array order.')
+                try:
+                    print(order, ', testing', a)
+                    kw[name] = a
+                    impl(self, *args, **kw)
+                except Exception:
+                    print(name, 'is', a, ', ndim is', ndim, ', shape is',
+                          self.shape, ', order is', order)
+                    raise
+        return test_func
+    return decorator
+
+
 def with_requires(*requirements):
     """Run a test case only when given requirements are satisfied.
diff --git a/cupyx/__init__.py b/cupyx/__init__.py
index 2bb3285b6b4..985b2d8d7f6 100644
--- a/cupyx/__init__.py
+++ b/cupyx/__init__.py
@@ -8,6 +8,7 @@
 from cupyx import linalg  # NOQA
 from cupyx import time  # NOQA
 from cupyx import scipy  # NOQA
+from cupyx import optimizing  # NOQA
 
 from cupyx._ufunc_config import errstate  # NOQA
 from cupyx._ufunc_config import geterr  # NOQA
diff --git a/cupyx/optimizing/_optimize.py b/cupyx/optimizing/_optimize.py
index af14cc6ef39..912a968bf29 100644
--- a/cupyx/optimizing/_optimize.py
+++ b/cupyx/optimizing/_optimize.py
@@ -1,7 +1,13 @@
 import contextlib
 import math
 
-import optuna
+
+try:
+    import optuna
+    _optuna_available = True
+except ImportError:
+    _optuna_available = False
+
 from cupy.core import _optimize_config
 from cupyx import time
 
@@ -68,6 +74,11 @@ def optimize(*, key=None, **config_dict):
         Optuna (https://optuna.org) installation is required.
         Currently it works for reduction operations only.
     """
+    if not _optuna_available:
+        raise RuntimeError(
+            'Optuna is required to run optimization. '
+            'See https://optuna.org/ for the installation instructions.')
+
     old_context = _optimize_config.get_current_context()
     context = _optimize_config.get_new_context(key, _optimize, config_dict)
     _optimize_config.set_current_context(context)
diff --git a/tests/cupy_tests/core_tests/test_ndarray_reduction.py b/tests/cupy_tests/core_tests/test_ndarray_reduction.py
index 3b941f4b149..fb750ffcdc2 100644
--- a/tests/cupy_tests/core_tests/test_ndarray_reduction.py
+++ b/tests/cupy_tests/core_tests/test_ndarray_reduction.py
@@ -1,5 +1,6 @@
 import unittest
 
+import cupy
 from cupy import testing
 
 
@@ -207,3 +208,36 @@ def test_ptp_nan_real(self, xp, dtype):
     def test_ptp_nan_imag(self, xp, dtype):
         a = xp.array([float('nan')*1.j, 1.j, -1.j], dtype)
         return a.ptp()
+
+
+# This class compares CUB results against NumPy's
+@testing.parameterize(*testing.product({
+    'shape': [(10,), (10, 20), (10, 20, 30), (10, 20, 30, 40)],
+    'order': ('C', 'F'),
+}))
+@testing.gpu
+@unittest.skipIf(cupy.cuda.cub_enabled is False, 'The CUB module is not built')
+class TestCUBreduction(unittest.TestCase):
+    @testing.for_contiguous_axes()
+    @testing.for_dtypes('bhilBHILfdFD')
+    @testing.numpy_cupy_allclose(rtol=1E-5)
+    def test_cub_min(self, xp, dtype, axis):
+        assert cupy.cuda.cub_enabled
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if self.order in ('c', 'C'):
+            a = xp.ascontiguousarray(a)
+        elif self.order in ('f', 'F'):
+            a = xp.asfortranarray(a)
+        return a.min(axis=axis)
+
+    @testing.for_contiguous_axes()
+    @testing.for_dtypes('bhilBHILfdFD')
+    @testing.numpy_cupy_allclose(rtol=1E-5)
+    def test_cub_max(self, xp, dtype, axis):
+        assert cupy.cuda.cub_enabled
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if self.order in ('c', 'C'):
+            a = xp.ascontiguousarray(a)
+        elif self.order in ('f', 'F'):
+            a = xp.asfortranarray(a)
+        return a.max(axis=axis)
diff --git a/tests/cupy_tests/cuda_tests/test_stream.py b/tests/cupy_tests/cuda_tests/test_stream.py
index 4885ea1d7d6..521e04dc054 100644
--- a/tests/cupy_tests/cuda_tests/test_stream.py
+++ b/tests/cupy_tests/cuda_tests/test_stream.py
@@ -19,9 +19,8 @@ def test_eq(self):
         self.assertEqual(null1, null2)
         self.assertNotEqual(null2, null3)
 
-    @attr.gpu
-    def test_del(self):
-        stream = cuda.Stream().use()
+    def check_del(self, null):
+        stream = cuda.Stream(null=null).use()
         stream_ptr = stream.ptr
         x = from_data.array([1, 2, 3])
         del stream
@@ -31,6 +30,14 @@ def test_del(self):
         del stream_ptr
         del x
 
+    @attr.gpu
+    def test_del(self):
+        self.check_del(null=False)
+
+    @attr.gpu
+    def test_del_null(self):
+        self.check_del(null=True)
+
     @attr.gpu
     def test_get_and_add_callback(self):
         N = 100
diff --git a/tests/cupy_tests/math_tests/test_sumprod.py b/tests/cupy_tests/math_tests/test_sumprod.py
index ca38546f39c..c586f7bab08 100644
--- a/tests/cupy_tests/math_tests/test_sumprod.py
+++ b/tests/cupy_tests/math_tests/test_sumprod.py
@@ -192,6 +192,77 @@ def test_prod_dtype(self, xp, src_dtype, dst_dtype):
         return a.prod(dtype=dst_dtype)
 
 
+# This class compares CUB results against NumPy's
+@testing.parameterize(*testing.product({
+    'shape': [(10,), (10, 20), (10, 20, 30), (10, 20, 30, 40)],
+    'order': ('C', 'F'),
+}))
+@testing.gpu
+@unittest.skipIf(cupy.cuda.cub_enabled is False, 'The CUB module is not built')
+class TestCUBreduction(unittest.TestCase):
+    @testing.for_contiguous_axes()
+    # sum supports fewer dtypes; float16 is skipped as it is less accurate
+    @testing.for_dtypes('lLfdFD')
+    @testing.numpy_cupy_allclose(rtol=1E-5)
+    def test_cub_sum(self, xp, dtype, axis):
+        assert cupy.cuda.cub_enabled
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if self.order in ('c', 'C'):
+            a = xp.ascontiguousarray(a)
+        elif self.order in ('f', 'F'):
+            a = xp.asfortranarray(a)
+        return a.sum(axis=axis)
+
+    @testing.for_contiguous_axes()
+    # prod supports fewer dtypes; float16 is skipped as it is less accurate
+    @testing.for_dtypes('lLfdFD')
+    @testing.numpy_cupy_allclose(rtol=1E-5)
+    def test_cub_prod(self, xp, dtype, axis):
+        assert cupy.cuda.cub_enabled
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if self.order in ('c', 'C'):
+            a = xp.ascontiguousarray(a)
+        elif self.order in ('f', 'F'):
+            a = xp.asfortranarray(a)
+        return a.prod(axis=axis)
+
+    # TODO(leofang): test axis after support is added
+    # float16 is skipped as it is less accurate
+    @testing.for_dtypes('bhilBHILfdFD')
+    @testing.numpy_cupy_allclose(rtol=1E-4)
+    def test_cub_cumsum(self, xp, dtype):
+        assert cupy.cuda.cub_enabled
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if self.order in ('c', 'C'):
+            a = xp.ascontiguousarray(a)
+        elif self.order in ('f', 'F'):
+            a = xp.asfortranarray(a)
+        return a.cumsum()
+
+    # TODO(leofang): test axis after support is added
+    # float16 is skipped as it is less accurate
+    @testing.for_dtypes('bhilBHILfdFD')
+    @testing.numpy_cupy_allclose(rtol=1E-4)
+    def test_cub_cumprod(self, xp, dtype):
+        assert cupy.cuda.cub_enabled
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if self.order in ('c', 'C'):
+            a = xp.ascontiguousarray(a)
+        elif self.order in ('f', 'F'):
+            a = xp.asfortranarray(a)
+        result = a.cumprod()
+        # When testing cumprod against complex arrays, the gotcha is that
+        # CuPy may produce Inf at the position where NumPy starts to give
+        # NaN, so assert_allclose would raise an error when it examines
+        # the positions of the NaNs. Since this is both algorithm and
+        # architecture dependent, we have no control over the behavior
+        # and can only work around it by manually converting Inf to NaN.
+        if dtype in (numpy.complex64, numpy.complex128):
+            pos = xp.where(xp.isinf(result))
+            result[pos] = xp.nan + 1j * xp.nan
+        return result
+
+
 @testing.parameterize(
     *testing.product({
         'shape': [(2, 3, 4), (20, 30, 40)],
diff --git a/tests/cupy_tests/sorting_tests/test_search.py b/tests/cupy_tests/sorting_tests/test_search.py
index 8235f0afdc0..87420de87dc 100644
--- a/tests/cupy_tests/sorting_tests/test_search.py
+++ b/tests/cupy_tests/sorting_tests/test_search.py
@@ -159,6 +159,38 @@ def test_argmin_zero_size_axis1(self, xp, dtype):
         return a.argmin(axis=1)
 
 
+# This class compares CUB results against NumPy's
+# TODO(leofang): test axis after support is added
+@testing.parameterize(*testing.product({
+    'shape': [(10,), (10, 20), (10, 20, 30), (10, 20, 30, 40)],
+    'order': ('C', 'F'),
+}))
+@testing.gpu
+@unittest.skipIf(cupy.cuda.cub_enabled is False, 'The CUB module is not built')
+class TestCUBreduction(unittest.TestCase):
+    @testing.for_dtypes('bhilBHILefdFD')
+    @testing.numpy_cupy_allclose(rtol=1E-5)
+    def test_cub_argmin(self, xp, dtype):
+        assert cupy.cuda.cub_enabled
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if self.order == 'C':
+            a = xp.ascontiguousarray(a)
+        else:
+            a = xp.asfortranarray(a)
+        return a.argmin()
+
+    @testing.for_dtypes('bhilBHILefdFD')
+    @testing.numpy_cupy_allclose(rtol=1E-5)
+    def test_cub_argmax(self, xp, dtype):
+        assert cupy.cuda.cub_enabled
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if self.order == 'C':
+            a = xp.ascontiguousarray(a)
+        else:
+            a = xp.asfortranarray(a)
+        return a.argmax()
+
+
 @testing.gpu
 @testing.parameterize(*testing.product({
     'func': ['argmin', 'argmax'],
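
For context on the `stream.pyx` hunk: the old destructor consulted the thread-local state only when `self.ptr` was non-zero, so deleting a `Stream(null=True)` (whose `ptr` is 0) while it was current left the thread-local current stream dangling. The sketch below is not part of the patch; it illustrates the intended behavior after the fix, mirroring the new `check_del` test, and assumes a GPU is available:

```python
from cupy import cuda

# Stream(null=True) wraps the null stream, so stream.ptr == 0 and the old
# destructor skipped its thread-local bookkeeping entirely.
stream = cuda.Stream(null=True).use()
assert cuda.get_current_stream() == stream
del stream
# With the new `else` branch, the current stream is reset to Stream.null
# instead of being left pointing at a destroyed object.
assert cuda.get_current_stream() == cuda.Stream.null
```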
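The new `for_contiguous_axes` helper reads `self.shape` and `self.order` from the test case, so it only composes with classes decorated by `testing.parameterize` that provide both attributes. A minimal usage sketch follows; the class name and dtype list are illustrative, everything else mirrors the tests added in this patch:

```python
import unittest

from cupy import testing


@testing.parameterize(*testing.product({
    'shape': [(10, 20, 30)],
    'order': ('C', 'F'),
}))
@testing.gpu
class TestContiguousAxesExample(unittest.TestCase):  # illustrative name
    @testing.for_contiguous_axes()  # 'C' order: (2,), (1, 2), (0, 1, 2)
    @testing.for_dtypes('fd')       # illustrative dtype subset
    @testing.numpy_cupy_allclose(rtol=1e-5)
    def test_min(self, xp, dtype, axis):
        a = testing.shaped_random(self.shape, xp, dtype)
        return a.min(axis=axis)
```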
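With the guarded import in `_optimize.py`, `import cupyx` now succeeds even when Optuna is absent, and the failure is deferred to the moment `optimize()` is actually entered. A sketch of what callers see; the fallback pattern is illustrative, not prescribed by the patch:

```python
import cupy
import cupyx.optimizing

x = cupy.arange(6, dtype=cupy.float32).reshape(2, 3)
try:
    # Entering the context tunes the reduction kernel via Optuna.
    with cupyx.optimizing.optimize():
        y = x.sum()
except RuntimeError:
    # Raised by this patch when Optuna is not installed.
    y = x.sum()
```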
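The Inf-to-NaN masking in `test_cub_cumprod` can be seen in isolation with plain NumPy; the input below is made up to force an overflow in a complex cumulative product:

```python
import numpy

# Overflowing complex products yield some mix of Inf and NaN, and exactly
# where each appears is algorithm and architecture dependent.  Masking
# every Inf position with NaN makes the two results comparable.
a = numpy.full(4, 1e200 + 0j, dtype=numpy.complex128)
result = a.cumprod()
pos = numpy.where(numpy.isinf(result))
result[pos] = numpy.nan + 1j * numpy.nan
```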