facebookresearch · fmassa · Nov 29, 2022 · Nov 28, 2022 · Nov 29, 2022
diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
@@ -605,7 +605,9 @@ def test_backward(
 
     grad_out = torch.ones_like(out)
     if grad_out_contiguous is False:
-        grad_out = torch.tensor([1.0], device=device)[None, None, :].expand_as(out)
+        grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[
+            None, None, :
+        ].expand_as(out)
 
     out.backward(grad_out)
     del out

diff --git a/xformers/ops/memory_efficient_attention.py b/xformers/ops/memory_efficient_attention.py
@@ -624,7 +624,7 @@ def _backward(cls, ctx, grad, saved_tensors):
 
         assert grad.dtype in cls.SUPPORTED_DTYPES
         cls._flash_attn_backward(
-            grad.reshape(ctx.kernel_output_shape),
+            grad.reshape(ctx.kernel_output_shape).contiguous(),
             q,
             k,
             v,