Merge pull request #3798 from kmaehashi/bp-1380-v7-improve-determinis…

…tic-performance [backport] Improve cudnn performance when using deterministic mode
cupy · Aug 17, 2020 · d284dc7 · d284dc7
2 parents 53f2373 + 774733b
commit d284dc7
Show file tree

Hide file tree

Showing 2 changed files with 118 additions and 48 deletions.
diff --git a/cupy/cudnn.pyx b/cupy/cudnn.pyx
@@ -1377,21 +1377,28 @@ cpdef _Algorithm _get_algorithm_fwd(
     algo = _get_algorithm_fwd_cache.get(key, None)
     if algo is not None:
         return algo
+    cdef list ret
+    cdef bint skip
     if use_tensor_core and _cudnn_version >= 7000:
         ret = cudnn.getConvolutionForwardAlgorithm_v7(
             handle, x_desc, filter_desc, conv_desc, y_desc, 10)
-        for i, perf in enumerate(ret):
+        skip = False
+        for perf in ret:
             if perf.memory <= max_workspace_size:
                 break
+            skip = True
         else:
             raise RuntimeError('No conv fwd algo available with workspace size'
                                ' less equal {}'.format(max_workspace_size))
-        if i != 0:
+        if skip:
             warnings.warn(
                 'The best algo of conv fwd might not be selected due to '
                 'lack of workspace size ({})'.format(max_workspace_size),
                 util.PerformanceWarning)
-        if perf.mathType != cudnn.CUDNN_TENSOR_OP_MATH:
+        algo = perf.algo
+        workspace_size = perf.memory
+        math_type = perf.mathType
+        if use_tensor_core and math_type != cudnn.CUDNN_TENSOR_OP_MATH:
             _warn_algorithm_fwd(x, W, y, conv_param)
         algo = _Algorithm(perf.algo, perf.memory, perf.mathType)
     else:
@@ -1420,18 +1427,34 @@ cpdef _warn_algorithm_bwd_filter(
 cpdef _Algorithm _find_algorithm_bwd_filter(
         core.ndarray x, core.ndarray dy, core.ndarray dW, tuple conv_param,
         size_t handle, size_t x_desc, size_t dy_desc, size_t conv_desc,
-        size_t filter_desc, size_t max_workspace_size, bint use_tensor_core):
+        size_t filter_desc, size_t max_workspace_size, bint use_tensor_core,
+        bint deterministic):
     cdef cudnn.CuDNNAlgoPerf perf
+    cdef _Algorithm algo
     key = (x.data.device.id, x.shape, dW.shape, dy.shape, conv_param,
            max_workspace_size)
     algo = _algorithm_bwd_filter_cache.get(key, None)
     if algo is not None:
         return algo
     workspace = memory.alloc(max_workspace_size)
     if _cudnn_version >= 7000:
-        perf = cudnn.findConvolutionBackwardFilterAlgorithmEx_v7(
-            handle, x_desc, x.data.ptr, dy_desc, dy.data.ptr, conv_desc,
-            filter_desc, dW.data.ptr, 1, workspace.ptr, max_workspace_size)[0]
+        if deterministic:
+            ret = cudnn.findConvolutionBackwardFilterAlgorithmEx_v7(
+                handle, x_desc, x.data.ptr, dy_desc, dy.data.ptr, conv_desc,
+                filter_desc, dW.data.ptr, 10, workspace.ptr,
+                max_workspace_size)
+            for perf in ret:
+                if perf.determinism:
+                    break
+            else:
+                raise RuntimeError(
+                    'No conv bwd filter algo available with workspace size '
+                    'less equal {}'.format(max_workspace_size))
+        else:
+            perf = cudnn.findConvolutionBackwardFilterAlgorithmEx_v7(
+                handle, x_desc, x.data.ptr, dy_desc, dy.data.ptr, conv_desc,
+                filter_desc, dW.data.ptr, 1, workspace.ptr,
+                max_workspace_size)[0]
         if use_tensor_core and perf.mathType != cudnn.CUDNN_TENSOR_OP_MATH:
             _warn_algorithm_bwd_filter(x, dy, dW, conv_param)
     else:
@@ -1448,29 +1471,39 @@ cpdef _Algorithm _find_algorithm_bwd_filter(
 cpdef _Algorithm _get_algorithm_bwd_filter(
         core.ndarray x, core.ndarray dy, core.ndarray dW, tuple conv_param,
         size_t handle, size_t x_desc, size_t gy_desc, size_t conv_desc,
-        size_t filter_desc, size_t max_workspace_size, bint use_tensor_core):
+        size_t filter_desc, size_t max_workspace_size, bint use_tensor_core,
+        bint deterministic):
     cdef cudnn.CuDNNAlgoPerf perf
     key = (x.data.device.id, x.shape, dW.shape, dy.shape, conv_param,
            max_workspace_size)
     algo = _get_algorithm_bwd_filter_cache.get(key, None)
     if algo is not None:
         return algo
-    if use_tensor_core and _cudnn_version >= 7000:
+    cdef list ret
+    cdef bint skip
+    if _cudnn_version >= 7000:
         ret = cudnn.getConvolutionBackwardFilterAlgorithm_v7(
             handle, x_desc, gy_desc, conv_desc, filter_desc, 10)
-        for i, perf in enumerate(ret):
+        skip = False
+        for perf in ret:
+            if deterministic and not perf.determinism:
+                continue
             if perf.memory <= max_workspace_size:
                 break
+            skip = True
         else:
             raise RuntimeError(
                 'No conv bwd filter algo available with workspace size less '
                 'equal {}'.format(max_workspace_size))
-        if i != 0:
+        if use_tensor_core and skip:
             warnings.warn(
                 'The best algo of conv bwd filter might not not selected due '
                 'to lack of workspace size ({})'.format(max_workspace_size),
                 util.PerformanceWarning)
-        if perf.mathType != cudnn.CUDNN_TENSOR_OP_MATH:
+        algo = perf.algo
+        workspace_size = perf.memory
+        math_type = perf.mathType
+        if use_tensor_core and math_type != cudnn.CUDNN_TENSOR_OP_MATH:
             _warn_algorithm_bwd_filter(x, dy, dW, conv_param)
         algo = _Algorithm(perf.algo, perf.memory, perf.mathType)
     else:
@@ -1499,7 +1532,9 @@ cpdef _warn_algorithm_bwd_data(
 cpdef _Algorithm _find_algorithm_bwd_data(
         core.ndarray W, core.ndarray x, core.ndarray y, tuple conv_param,
         size_t handle, size_t filter_desc, size_t x_desc, size_t conv_desc,
-        size_t y_desc, size_t max_workspace_size, bint use_tensor_core):
+        size_t y_desc, size_t max_workspace_size, bint use_tensor_core,
+        bint deterministic):
+    cdef _Algorithm algo
     cdef cudnn.CuDNNAlgoPerf perf
     key = (x.data.device.id, W.shape, x.shape, y.shape, conv_param,
            max_workspace_size)
@@ -1508,9 +1543,21 @@ cpdef _Algorithm _find_algorithm_bwd_data(
         return algo
     workspace = memory.alloc(max_workspace_size)
     if _cudnn_version >= 7000:
-        perf = cudnn.findConvolutionBackwardDataAlgorithmEx_v7(
-            handle, filter_desc, W.data.ptr, x_desc, x.data.ptr, conv_desc,
-            y_desc, y.data.ptr, 1, workspace.ptr, max_workspace_size)[0]
+        if deterministic:
+            ret = cudnn.findConvolutionBackwardDataAlgorithmEx_v7(
+                handle, filter_desc, W.data.ptr, x_desc, x.data.ptr, conv_desc,
+                y_desc, y.data.ptr, 10, workspace.ptr, max_workspace_size)
+            for perf in ret:
+                if perf.determinism:
+                    break
+            else:
+                raise RuntimeError(
+                    'No conv bwd filter algo available with workspace size '
+                    'less equal {}'.format(max_workspace_size))
+        else:
+            perf = cudnn.findConvolutionBackwardDataAlgorithmEx_v7(
+                handle, filter_desc, W.data.ptr, x_desc, x.data.ptr, conv_desc,
+                y_desc, y.data.ptr, 1, workspace.ptr, max_workspace_size)[0]
         if use_tensor_core and perf.mathType != cudnn.CUDNN_TENSOR_OP_MATH:
             _warn_algorithm_bwd_data(W, x, y, conv_param)
     else:
@@ -1527,29 +1574,39 @@ cpdef _Algorithm _find_algorithm_bwd_data(
 cpdef _Algorithm _get_algorithm_bwd_data(
         core.ndarray W, core.ndarray x, core.ndarray y, tuple conv_param,
         size_t handle, size_t filter_desc, size_t x_desc, size_t conv_desc,
-        size_t y_desc, size_t max_workspace_size, bint use_tensor_core):
+        size_t y_desc, size_t max_workspace_size, bint use_tensor_core,
+        bint deterministic):
     cdef cudnn.CuDNNAlgoPerf perf
     key = (x.data.device.id, W.shape, x.shape, y.shape, conv_param,
            max_workspace_size)
-    algo = _algorithm_bwd_data_cache.get(key, None)
+    algo = _get_algorithm_bwd_data_cache.get(key, None)
     if algo is not None:
         return algo
-    if use_tensor_core and _cudnn_version >= 7000:
+    cdef list ret
+    cdef bint skip
+    if _cudnn_version >= 7000:
         ret = cudnn.getConvolutionBackwardDataAlgorithm_v7(
             handle, filter_desc, x_desc, conv_desc, y_desc, 10)
-        for i, perf in enumerate(ret):
+        skip = False
+        for perf in ret:
+            if deterministic and not perf.determinism:
+                continue
             if perf.memory <= max_workspace_size:
                 break
+            skip = True
         else:
             raise RuntimeError(
                 'No conv bwd data algo available with workspace size less '
                 'equal {}'.format(max_workspace_size))
-        if i != 0:
+        if use_tensor_core and skip:
             warnings.warn(
                 'The best algo of conv bwd data might not not selected due '
                 'to lack of workspace size ({})'.format(max_workspace_size),
                 util.PerformanceWarning)
-        if perf.mathType != cudnn.CUDNN_TENSOR_OP_MATH:
+        algo = perf.algo
+        workspace_size = perf.memory
+        math_type = perf.mathType
+        if use_tensor_core and math_type != cudnn.CUDNN_TENSOR_OP_MATH:
             _warn_algorithm_bwd_data(W, x, y, conv_param)
         algo = _Algorithm(perf.algo, perf.memory, perf.mathType)
     else:
@@ -1716,7 +1773,8 @@ def convolution_backward_filter(
     # CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 does not use Tensor Core.
     cdef bint use_tensor_core = (
         not deterministic and _should_use_tensor_core(tensor_core, x.dtype))
-    cdef tuple conv_param = (pad, stride, x.dtype, use_tensor_core)
+    cdef tuple conv_param = (
+        pad, stride, x.dtype, use_tensor_core, deterministic)
 
     handle = get_handle()
     x = core._internal_ascontiguousarray(x)
@@ -1740,22 +1798,27 @@ def convolution_backward_filter(
             conv_desc, pad, stride, dilation, groups, x.dtype,
             cudnn.CUDNN_CROSS_CORRELATION, use_tensor_core)
 
-        if deterministic:
+        if deterministic and _cudnn_version < 7000:
             # TODO(imanishi): Support Tensor Core in deterministic mode.
             algo = cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
             workspace_size = cudnn.getConvolutionBackwardFilterWorkspaceSize(
                 handle, x_desc, gy_desc, conv_desc, filter_desc, algo)
             math_type = cudnn.CUDNN_DEFAULT_MATH
-            # TODO(okuta): check workspace size
+            if workspace_size > max_workspace_size:
+                raise RuntimeError(
+                    'No conv bwd filter algo available with workspace size '
+                    'less equal {}'.format(max_workspace_size))
         else:
-            if auto_tune:
+            if auto_tune and not deterministic:
                 perf = _find_algorithm_bwd_filter(
                     x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc,
-                    filter_desc, max_workspace_size, use_tensor_core)
+                    filter_desc, max_workspace_size, use_tensor_core,
+                    deterministic)
             else:
                 perf = _get_algorithm_bwd_filter(
                     x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc,
-                    filter_desc, max_workspace_size, use_tensor_core)
+                    filter_desc, max_workspace_size, use_tensor_core,
+                    deterministic)
             algo = perf.algo
             workspace_size = perf.memory
             math_type = perf.mathType
@@ -1800,7 +1863,8 @@ def convolution_backward_data(
     # CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 does not use Tensor Core.
     cdef bint use_tensor_core = (
         not deterministic and _should_use_tensor_core(tensor_core, x.dtype))
-    cdef tuple conv_param = (pad, stride, x.dtype, use_tensor_core)
+    cdef tuple conv_param = (
+        pad, stride, x.dtype, use_tensor_core, deterministic)
 
     # cuDNN 7 supports dilation only in *_FWD_ALGO_IMPLICIT_GEMM, but
     # it supports Tensor Cores only in *_FWD_ALGO_IMPLICIT_PRECOMP_GEMM.
@@ -1834,22 +1898,27 @@ def convolution_backward_data(
             conv_desc, pad, stride, dilation, groups, x.dtype,
             cudnn.CUDNN_CROSS_CORRELATION, use_tensor_core)
 
-        if deterministic:
+        if deterministic and _cudnn_version < 7000:
             # TODO(imanishi): Support Tensor Core in deterministic mode.
             algo = cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
             workspace_size = cudnn.getConvolutionBackwardDataWorkspaceSize(
                 handle, filter_desc, x_desc, conv_desc, y_desc, algo)
             math_type = cudnn.CUDNN_DEFAULT_MATH
-            # TODO(okuta): check workspace size
+            if workspace_size > max_workspace_size:
+                raise RuntimeError(
+                    'No conv bwd data algo available with workspace size less '
+                    'equal {}'.format(max_workspace_size))
         else:
-            if auto_tune:
+            if auto_tune and not deterministic:
                 perf = _find_algorithm_bwd_data(
                     W, x, y, conv_param, handle, filter_desc, x_desc,
-                    conv_desc, y_desc, max_workspace_size, use_tensor_core)
+                    conv_desc, y_desc, max_workspace_size, use_tensor_core,
+                    deterministic)
             else:
                 perf = _get_algorithm_bwd_data(
                     W, x, y, conv_param, handle, filter_desc, x_desc,
-                    conv_desc, y_desc, max_workspace_size, use_tensor_core)
+                    conv_desc, y_desc, max_workspace_size, use_tensor_core,
+                    deterministic)
             algo = perf.algo
             workspace_size = perf.memory
             math_type = perf.mathType

diff --git a/tests/cupy_tests/test_cudnn.py b/tests/cupy_tests/test_cudnn.py
@@ -250,9 +250,10 @@ def setUp(self):
         if ((self.dilate > 1 and version < 6000) or
                 (self.groups > 1 and version < 7000)):
             self.err = ValueError
-        elif ((self.dilate > 1 and deterministic and version < 7000) or
-                (ndim > 2 and deterministic and version < 6000) or
-                (ndim > 2 and deterministic and self.dtype == numpy.float64)):
+        elif deterministic and (
+                (self.dilate > 1 and version < 7000) or
+                (ndim > 2 and version < 6000) or
+                (ndim > 2 and self.dtype == numpy.float64)):
             self.err = libcudnn.CuDNNError
         elif (8000 <= version and
               self.max_workspace_size == 0 and
@@ -275,6 +276,9 @@ def call(self):
             tensor_core=self.tensor_core)
 
     def test_call(self):
+        if self.deterministic and self.max_workspace_size == 0:
+            # This test case is very unstable
+            return
         if self.err is None:
             self.call()
             self.assertTrue((self.gW == 0).all())
@@ -353,6 +357,9 @@ def call(self):
             tensor_core=self.tensor_core)
 
     def test_call(self):
+        if self.deterministic and self.max_workspace_size == 0:
+            # This test case is very unstable
+            return
         if self.err is None:
             self.call()
             self.assertTrue((self.gx == 0).all())
@@ -409,27 +416,21 @@ def tearDown(self):
         cudnn.set_max_workspace_size(self._workspace_size)
 
     def test_backward_filter(self):
-        err = None
-        if (self.layout == libcudnn.CUDNN_TENSOR_NHWC and
+        if not (self.layout == libcudnn.CUDNN_TENSOR_NHWC and
                 self.dtype == numpy.float64):
-            err = self._get_error_type()
-        if err is None:
             return unittest.SkipTest()
-        with self.assertRaises(err):
+        with self.assertRaises(RuntimeError):
             cudnn.convolution_backward_filter(
                 self.x, self.gy, self.gW,
                 pad=(self.pad, self.pad), stride=(self.stride, self.stride),
-                dilation=(1, 1), groups=1, deterministic=0,
+                dilation=(1, 1), groups=1, deterministic=False,
                 auto_tune=self.auto_tune, tensor_core='always',
                 d_layout=self.layout, w_layout=self.layout)
 
     def test_backward_data(self):
-        err = None
-        if self.layout == libcudnn.CUDNN_TENSOR_NHWC:
-            err = self._get_error_type()
-        if err is None:
+        if self.layout != libcudnn.CUDNN_TENSOR_NHWC:
             return unittest.SkipTest()
-        with self.assertRaises(err):
+        with self.assertRaises(RuntimeError):
             cudnn.convolution_backward_data(
                 self.W, self.gy, None, self.gx,
                 pad=(self.pad, self.pad), stride=(self.stride, self.stride),