Commit

Merge pull request #8149 from jemiryguo/main
add incontiguous support for cutensor functions
asi1024 committed Feb 7, 2024
2 parents 7a20c1a + 32bb3ad commit f643379
Showing 2 changed files with 133 additions and 19 deletions.
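
Before this commit, every routine in cupyx/cutensor.pyx rejected arrays that were not C-contiguous; the diff below removes those guards and relaxes the tensor-descriptor alignment requirement so that strided views work end to end. A minimal before/after sketch (illustrative shapes; not part of the diff, and it requires a CUDA device with cuTENSOR installed):

import cupy
from cupyx import cutensor

A = cupy.random.rand(30, 40, 30, dtype=cupy.float32)
C = cupy.zeros(40, dtype=cupy.float32)
mode_a = cutensor.create_mode('a', 'b', 'c')
mode_c = cutensor.create_mode('b')

view = A[:, 0:7, :]  # non-contiguous view, like self.a[:, b0:b0+delta, :] in the new tests

# Before: ValueError('The inputs should be contiguous arrays.')
# After:  the reduction runs directly on the view.
cutensor.reduction(1.0, view, mode_a, 0.0, C[0:7], mode_c)
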
23 changes: 4 additions & 19 deletions cupyx/cutensor.pyx
@@ -242,8 +242,9 @@ cpdef TensorDescriptor create_tensor_descriptor(_ndarray_base a):
         (TensorDescriptor): A instance of class TensorDescriptor.
     """
     handle = _get_handle()
-    key = (handle.ptr, a.dtype, tuple(a.shape), tuple(a.strides))
-    alignment_req = 256
+    alignment_req = a.itemsize
+    key = (handle.ptr, a.dtype, tuple(a.shape),
+           tuple(a.strides), alignment_req)
     if a.data.ptr & (alignment_req - 1) != 0:
         raise ValueError("Missaligned array")
     if key not in _tensor_descriptors:
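
The fixed 256-byte requirement is what made views unusable: a view starts at an arbitrary element offset inside its base allocation, so only alignment to the element size is guaranteed, and the cache key now carries alignment_req alongside shape and strides. A quick illustration of the difference (a sketch; the 192 below assumes a 256-byte-aligned base pointer, which CuPy's memory pool provides):

import cupy

x = cupy.zeros((30, 40, 30), dtype=cupy.float32)  # itemsize == 4
v = x[1:]                       # view starting 40 * 30 * 4 == 4800 bytes into x
print(v.data.ptr % 256)         # 192: fails the old fixed requirement
print(v.data.ptr % v.itemsize)  # 0: element-size alignment holds for any element offset
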
@@ -479,18 +480,13 @@ def elementwise_binary(
     Examples:
         See examples/cutensor/elementwise_binary.py
     """
-    if not (A._c_contiguous and C._c_contiguous):
-        raise ValueError('The inputs should be contiguous arrays.')
-
     if out is None:
         out = core._ndarray_init(
             _cupy.ndarray, C._shape, dtype=C.dtype, obj=None)
     elif C.dtype != out.dtype:
         raise ValueError('dtype mismatch: {} != {}'.format(C.dtype, out.dtype))
     elif not internal.vector_equal(C._shape, out._shape):
         raise ValueError('shape mismatch: {} != {}'.format(C.shape, out.shape))
-    elif not out._c_contiguous:
-        raise ValueError('`out` should be a contiguous array.')
 
     desc_A = create_tensor_descriptor(A)
     desc_C = create_tensor_descriptor(C)
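
With these guards gone, elementwise_binary accepts strided views for A, C, and out alike, and out may alias the C view, as the new test below does; elementwise_trinary in the next hunk is relaxed the same way. A minimal sketch mirroring the test's call pattern (illustrative shapes; not part of the diff):

import cupy
from cupyx import cutensor

mode_a = cutensor.create_mode('a', 'b', 'c')
mode_c = cutensor.create_mode('c', 'a', 'b')
A = cupy.random.rand(30, 40, 30, dtype=cupy.float32)
C = cupy.random.rand(30, 30, 40, dtype=cupy.float32)

# Slice mode 'b' on both operands; every operand is a non-contiguous view.
cutensor.elementwise_binary(1.0, A[:, 0:7], mode_a,
                            1.0, C[:, :, 0:7], mode_c,
                            out=C[:, :, 0:7])
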
@@ -625,18 +621,13 @@ def elementwise_trinary(
     Examples:
         See examples/cutensor/elementwise_trinary.py
     """
-    if not (A._c_contiguous and B._c_contiguous and C._c_contiguous):
-        raise ValueError('The inputs should be contiguous arrays.')
-
     if out is None:
         out = core._ndarray_init(
             _cupy.ndarray, C._shape, dtype=C.dtype, obj=None)
     elif C.dtype != out.dtype:
         raise ValueError('dtype mismatch: {} != {}'.format(C.dtype, out.dtype))
     elif not internal.vector_equal(C._shape, out._shape):
         raise ValueError('shape mismatch: {} != {}'.format(C.shape, out.shape))
-    elif not out._c_contiguous:
-        raise ValueError('`out` should be a contiguous array.')
 
     desc_A = create_tensor_descriptor(A)
     desc_B = create_tensor_descriptor(B)
@@ -783,9 +774,6 @@ def contraction(
     Examples:
         See examples/cutensor/contraction.py
     """
-    if not (A._c_contiguous and B._c_contiguous and C._c_contiguous):
-        raise ValueError('The inputs should be contiguous arrays.')
-
     desc_A = create_tensor_descriptor(A)
     desc_B = create_tensor_descriptor(B)
     desc_C = create_tensor_descriptor(C)
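
contraction likewise now runs directly on views, so a large contraction can be computed tile by tile into slices of C, which is exactly the pattern the new test_contraction below drives. A condensed sketch (shapes as in the test; not part of the diff):

import cupy
from cupyx import cutensor

mode_a = cutensor.create_mode('a', 'b', 'c')
mode_b = cutensor.create_mode('c', 'd', 'b')
mode_c = cutensor.create_mode('d', 'a')
A = cupy.random.rand(30, 40, 30, dtype=cupy.float32)
B = cupy.random.rand(30, 35, 40, dtype=cupy.float32)
C = cupy.zeros((35, 30), dtype=cupy.float32)

for a0 in range(0, 30, 7):  # tile over mode 'a'; C[:, a0:a0+7] is a strided view
    cutensor.contraction(1.0, A[a0:a0+7], mode_a, B, mode_b,
                         0.0, C[:, a0:a0+7], mode_c)
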
@@ -876,7 +864,7 @@ def reduction(
     This routine computes the tensor reduction:
 
-        C = alpha * reduce_op(op_A(A)) + beta * op_C(C))
+        C = alpha * reduce_op(op_A(A)) + beta * op_C(C)
 
     Args:
         alpha (scalar): Scaling factor for A.
@@ -897,9 +885,6 @@ def reduction(
     Examples:
         See examples/cutensor/reduction.py
     """
-    if not (A._c_contiguous and C._c_contiguous):
-        raise ValueError('The inputs should be contiguous arrays.')
-
     desc_A = create_tensor_descriptor(A)
     desc_C = create_tensor_descriptor(C)
     mode_A = _auto_create_mode(A, mode_A)
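
The docstring hunk above also drops a stray closing parenthesis from the formula, which reads C = alpha * reduce_op(op_A(A)) + beta * op_C(C): reduce_op collapses every mode of A that does not appear in mode_C. A small numeric check (a sketch assuming the default reduce_op is summation; not part of the diff):

import cupy
from cupyx import cutensor

A = cupy.arange(24, dtype=cupy.float32).reshape(2, 3, 4)
C = cupy.zeros(3, dtype=cupy.float32)
mode_a = cutensor.create_mode('a', 'b', 'c')
mode_c = cutensor.create_mode('b')

out = cutensor.reduction(1.0, A, mode_a, 0.0, C, mode_c)
print(cupy.allclose(out, A.sum(axis=(0, 2))))  # True: modes 'a' and 'c' are reduced
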
129 changes: 129 additions & 0 deletions tests/cupyx_tests/test_cutensor.py
@@ -337,3 +337,132 @@ def test_contraction(self):
                                      self.c, mode_c)
         cupy.testing.assert_allclose(self.c, self.c_ref,
                                      rtol=self.tol, atol=self.tol)
+
+
+@testing.parameterize(*testing.product({
+    'dtype_char': ['e', 'f', 'd', 'F', 'D'],
+    'shape': [(30, 40, 30, 35)],
+    'alpha': [0.5, 1.0],
+    'beta': [0.0, 1.0],
+    'order': ['C', 'F']
+}))
+@pytest.mark.skipif(not ct.available, reason='cuTensor is unavailable')
+class TestCuTensorIncontiguous:
+    _tol = {'e': 1e-3, 'f': 2e-6, 'd': 1e-12}
+
+    @pytest.fixture(autouse=True)
+    def setUp(self):
+        compute_capability = int(device.get_compute_capability())
+        if compute_capability < 70 and self.dtype_char == 'e':
+            pytest.skip("Not supported")
+        self.dtype = numpy.dtype(self.dtype_char)
+        self.tol = self._tol[self.dtype_char.lower()]
+
+    def test_contraction(self):
+        mode_a = cutensor.create_mode('a', 'b', 'c')
+        mode_b = cutensor.create_mode('c', 'd', 'b')
+        mode_c = cutensor.create_mode('d', 'a')
+        a, b, c, d = self.shape
+        self.a = testing.shaped_random(
+            (a, b, c), cupy, dtype=self.dtype, order=self.order)
+        self.b = testing.shaped_random(
+            (c, d, b), cupy, dtype=self.dtype, order=self.order)
+        self.c = testing.shaped_random(
+            (d, a), cupy, dtype=self.dtype, order=self.order)
+        delta = 7
+        c_ref = self.c.copy()
+        c_ref = cutensor.contraction(self.alpha,
+                                     self.a, mode_a,
+                                     self.b, mode_b,
+                                     self.beta,
+                                     c_ref, mode_c)
+        for a0 in range(0, a, delta):
+            for d0 in range(0, d, delta):
+                cutensor.contraction(self.alpha,
+                                     self.a[a0:a0+delta], mode_a,
+                                     self.b[:, d0:d0+delta], mode_b,
+                                     self.beta,
+                                     self.c[d0:d0+delta, a0:a0+delta], mode_c)
+                cupy.testing.assert_allclose(self.c[d0:d0+delta, a0:a0+delta],
+                                             c_ref[d0:d0+delta, a0:a0+delta],
+                                             rtol=self.tol, atol=self.tol)
+
+    def test_reduction(self):
+        mode_a = cutensor.create_mode('a', 'b', 'c')
+        mode_c = cutensor.create_mode('b')
+        a, b, c, _ = self.shape
+        self.a = testing.shaped_random(
+            (a, b, c), cupy, dtype=self.dtype, order=self.order)
+        self.c = testing.shaped_random(
+            (b,), cupy, dtype=self.dtype, order=self.order)
+
+        c_ref = self.c.copy()
+        c_ref = cutensor.reduction(self.alpha,
+                                   self.a, mode_a,
+                                   self.beta,
+                                   c_ref, mode_c)
+        delta = 7
+        for b0 in range(0, b, delta):
+            cutensor.reduction(self.alpha,
+                               self.a[:, b0:b0+delta, :], mode_a,
+                               self.beta,
+                               self.c[b0:b0+delta], mode_c)
+            cupy.testing.assert_allclose(self.c[b0:b0+delta],
+                                         c_ref[b0:b0+delta],
+                                         rtol=self.tol, atol=self.tol)
+
+    def test_elementwise_binary(self):
+        mode_a = cutensor.create_mode('a', 'b', 'c')
+        mode_c = cutensor.create_mode('c', 'a', 'b')
+        a, b, c, _ = self.shape
+        self.a = testing.shaped_random(
+            (a, b, c), cupy, dtype=self.dtype, order=self.order)
+        self.c = testing.shaped_random(
+            (c, a, b), cupy, dtype=self.dtype, order=self.order)
+
+        c_ref = self.c.copy()
+        c_ref = cutensor.elementwise_binary(self.alpha,
+                                            self.a, mode_a,
+                                            self.beta,
+                                            c_ref, mode_c)
+        delta = 7
+        for b0 in range(0, b, delta):
+            cutensor.elementwise_binary(self.alpha,
+                                        self.a[:, b0:b0+delta], mode_a,
+                                        self.beta,
+                                        self.c[:, :, b0:b0+delta], mode_c,
+                                        out=self.c[:, :, b0:b0+delta])
+            cupy.testing.assert_allclose(self.c[:, :, b0:b0+delta],
+                                         c_ref[:, :, b0:b0+delta],
+                                         rtol=self.tol, atol=self.tol)
+
+    def test_elementwise_trinary(self):
+        mode_a = cutensor.create_mode('a', 'b', 'c')
+        mode_b = cutensor.create_mode('b', 'c', 'a')
+        mode_c = cutensor.create_mode('c', 'a', 'b')
+        a, b, c, _ = self.shape
+        self.a = testing.shaped_random(
+            (a, b, c), cupy, dtype=self.dtype, order=self.order)
+        self.b = testing.shaped_random(
+            (b, c, a), cupy, dtype=self.dtype, order=self.order)
+        self.c = testing.shaped_random(
+            (c, a, b), cupy, dtype=self.dtype, order=self.order)
+
+        for gamma in [0.0, 1.0]:
+            c_ref = self.c.copy()
+            c_ref = cutensor.elementwise_trinary(self.alpha, self.a, mode_a,
+                                                 self.beta, self.b, mode_b,
+                                                 gamma, c_ref, mode_c,
+                                                 out=c_ref)
+            delta = 7
+            for a0 in range(0, a, delta):
+                cutensor.elementwise_trinary(self.alpha,
+                                             self.a[a0:a0+delta],
+                                             mode_a, self.beta,
+                                             self.b[:, :, a0:a0+delta],
+                                             mode_b, gamma,
+                                             self.c[:, a0:a0+delta], mode_c,
+                                             out=self.c[:, a0:a0+delta])
+                cupy.testing.assert_allclose(self.c[:, a0:a0+delta],
+                                             c_ref[:, a0:a0+delta],
+                                             rtol=self.tol, atol=self.tol)
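
For reference, the inner-axis slices these tests take (for example x[:, 0:7] on a C-order array) are genuinely non-contiguous views, so the loops above exercise the relaxed descriptors rather than falling back to contiguous inputs. A quick check with the parameterized shape (a sketch, not part of the diff):

import cupy

x = cupy.zeros((30, 40, 30), dtype=cupy.float32)
print(x[:, 0:7].flags.c_contiguous)    # False: slicing an inner axis breaks contiguity
print(x[:, 0:7].strides == x.strides)  # True: the view inherits the base array's strides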
