Memory efficient attention - backward pass #281

Merged: 45 commits, Apr 25, 2022

Commits
df60515
Add naive CPU implementation for memory-efficient attention backward
fmassa Apr 13, 2022
77535fe
Optimize (at least) by a factor 2
fmassa Apr 13, 2022
bd6d1f2
More cleanups
fmassa Apr 13, 2022
54988fe
A few more comments
fmassa Apr 13, 2022
9ab94a2
Add very naive CUDA implementation
fmassa Apr 13, 2022
aeb49e8
Speedup CUDA kernel by 5x
fmassa Apr 13, 2022
643baaa
Make logsumexp an argument
fmassa Apr 13, 2022
788e756
Make it 30% faster
fmassa Apr 14, 2022
ade14e7
3.5x speedup by blocking strategy
fmassa Apr 14, 2022
f6427d7
Use vector loads and improve tile selection
fmassa Apr 14, 2022
70bfeda
Recompute attention for grad_q computation
fmassa Apr 15, 2022
7628805
Small cleanups
fmassa Apr 15, 2022
bd749c9
clang-format
fmassa Apr 15, 2022
86e87f9
Make it 0.5% faster
fmassa Apr 15, 2022
d3e2140
Make it 1% faster by caching the loads
fmassa Apr 15, 2022
6cc5768
Make it 6% faster with better hyperparameters
fmassa Apr 15, 2022
8d493a7
Slightly better hyperparameter
fmassa Apr 15, 2022
40b9f43
axpy == FMA
fmassa Apr 17, 2022
ca2bb72
Separate grad_q into its own kernel
fmassa Apr 19, 2022
63bd286
Avoid additional global writes by recomputing grad_aatn_v in grad_k
fmassa Apr 19, 2022
f1e7c7c
Trying out new idea
fmassa Apr 20, 2022
b6b0cfc
Almost on par with my previous best implementation
fmassa Apr 20, 2022
c83ebb3
Improve perf by 5%
fmassa Apr 20, 2022
2497f96
Remove query-key from shared memory and increase tile size
fmassa Apr 20, 2022
24ed9bb
Make it 20% faster with better hyperparameters
fmassa Apr 20, 2022
33f0c71
Make it another 12% faster
fmassa Apr 20, 2022
253b3eb
Code cleanup
fmassa Apr 20, 2022
e94d0cd
Further cleanups
fmassa Apr 20, 2022
5706777
Variable rename
fmassa Apr 20, 2022
69d1aa8
clang-format
fmassa Apr 20, 2022
220e046
Add alternative implementation for grad_v
fmassa Apr 20, 2022
0b38bf1
Speed it up by 10% with better hyperparameters
fmassa Apr 20, 2022
a7d1eac
Delete old implementation
fmassa Apr 20, 2022
99a6418
Centralize all input accesses in the beginning
fmassa Apr 21, 2022
5bf4431
Bugfix
fmassa Apr 21, 2022
222c136
Make kernels generic wrt sequence length
fmassa Apr 22, 2022
011b2cd
Add template argument to skip bound checking
fmassa Apr 22, 2022
2552da0
Make it support all use-cases
fmassa Apr 22, 2022
a8c1f4b
Let logsumexp be returned by forward
fmassa Apr 22, 2022
726d3c5
clang-format
fmassa Apr 22, 2022
3f8f954
Add scaling factor
fmassa Apr 22, 2022
a43d72b
Add tests + silly bugfix
fmassa Apr 22, 2022
45ed14c
Add benchmark function for backward
fmassa Apr 22, 2022
05ea687
Add comment
fmassa Apr 22, 2022
8e7bbb9
clang-format
fmassa Apr 22, 2022
55 changes: 55 additions & 0 deletions tests/test_mem_eff_attention.py
@@ -49,3 +49,58 @@ def test_key_query_all_ones(device, q_len, kv_len, batch_size, k_len):
ref = value.mean(1, keepdim=True).expand_as(query)

assert torch.allclose(out, ref, atol=1e-5)


@pytest.mark.parametrize("k_len", [5, 6, 32])
@pytest.mark.parametrize("batch_size", [1, 4])
@pytest.mark.parametrize("kv_len", [3, 15, 32, 33])
@pytest.mark.parametrize("q_len", [2, 3, 5])
@pytest.mark.parametrize("device", _devices)
def test_logsumexp(device, q_len, kv_len, batch_size, k_len):
scale = 3
query = torch.randn((batch_size, q_len, k_len), device=device) * scale
key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
value = torch.randn((batch_size, kv_len, k_len), device=device) * scale

_, lse = torch.ops.xformers.efficient_attention(query, key, value, True)
ref_lse = ((query / k_len ** 0.5) @ key.transpose(-2, -1)).logsumexp(-1)

assert torch.allclose(lse, ref_lse, atol=2e-4)
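
For context (an editorial note, not part of the diff): saving the per-row logsumexp from the forward pass is what lets the backward kernels rebuild attention weights on the fly instead of storing the full softmax matrix. A minimal sketch, assuming the same 1/sqrt(k_len) scaling used for ref_lse above:

```python
# Sketch only: softmax rows recovered from the logits and the saved logsumexp,
# assuming the same scaling as in this test.
attn = torch.exp((query / k_len ** 0.5) @ key.transpose(-2, -1) - lse.unsqueeze(-1))
```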


@pytest.mark.parametrize("k_len", [5, 6, 32])
@pytest.mark.parametrize("batch_size", [1, 4])
@pytest.mark.parametrize("kv_len", [3, 15, 32, 33])
@pytest.mark.parametrize("q_len", [2, 3, 5])
@pytest.mark.parametrize("device", _devices)
def test_memory_efficient_attention_backward(device, q_len, kv_len, batch_size, k_len):
scale = 3
query = torch.randn((batch_size, q_len, k_len), device=device) * scale
key = torch.randn((batch_size, kv_len, k_len), device=device) * scale
value = torch.randn((batch_size, kv_len, k_len), device=device) * scale

query.requires_grad_(True)
key.requires_grad_(True)
value.requires_grad_(True)

out = xformers.ops.memory_efficient_attention(query, key, value)
out.backward(torch.ones_like(query))

grad_q = query.grad
grad_k = key.grad
grad_v = value.grad

query.grad = None
key.grad = None
value.grad = None

ref = ref_attention(query, key, value)
ref.backward(torch.ones_like(query))

Reviewer (Contributor): thanks for the comment, helpful!

# there is some extra precision loss in the CPU implementation due to an
# extra accumulation step in grad_q, which is not present in the CUDA
# implementation
atol = 3e-4 if device == "cuda" else 4e-4
assert torch.allclose(grad_q, query.grad, atol=atol), "grad_q doesn't match"
assert torch.allclose(grad_k, key.grad, atol=atol), "grad_k doesn't match"
assert torch.allclose(grad_v, value.grad, atol=atol), "grad_v doesn't match"
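
To make the quantities being compared concrete, here is a hedged reference for the gradients the kernels are expected to produce, written with plain PyTorch ops. This is an editorial sketch using the standard softmax-attention backward formulas and the same 1/sqrt(k_len) scaling as the tests; it is not code from this PR.

```python
import torch

def attention_backward_reference(grad_out, query, key, value):
    # Forward: S = scale * Q K^T, P = softmax(S), O = P V
    scale = query.shape[-1] ** -0.5
    attn = torch.softmax((query * scale) @ key.transpose(-2, -1), dim=-1)
    grad_v = attn.transpose(-2, -1) @ grad_out            # dL/dV = P^T dO
    grad_attn = grad_out @ value.transpose(-2, -1)        # dL/dP = dO V^T
    # Softmax backward: dL/dS = P * (dL/dP - rowsum(P * dL/dP))
    grad_s = attn * (grad_attn - (attn * grad_attn).sum(-1, keepdim=True))
    grad_q = (grad_s @ key) * scale                       # dL/dQ
    grad_k = grad_s.transpose(-2, -1) @ (query * scale)   # dL/dK
    return grad_q, grad_k, grad_v
```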
207 changes: 144 additions & 63 deletions xformers/benchmarks/benchmark_mem_eff_attention.py
@@ -30,66 +30,147 @@ def ref_attention(q, k, v):
results = []
mem_use: Dict[str, Dict[str, float]] = dict(optimized={}, vanilla={})

print(f"Processing {len(SHAPES)} cases")
for num_threads in NUM_THREADS:
for shape in SHAPES:
print(f"===== {shape} =====")
B, M, K = shape
q = torch.rand(shape, device=device)
sub_label = f"B={B}, M={M}, K={K}"

if True:
r = xformers.ops.memory_efficient_attention(q, q, q)

rr = ref_attention(q, q, q)
assert (r - rr).abs().max() < 1e-5

torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()
results.append(
benchmark.Timer(
stmt="fn(q, q, q)",
globals={
"q": q,
"fn": torch.ops.xformers.efficient_attention,
},
label="attention",
description="optimized",
sub_label=sub_label,
num_threads=num_threads,
).blocked_autorange(min_run_time=min_run_time)
)
torch.cuda.synchronize()
memory = torch.cuda.max_memory_allocated() / 2 ** 20
mem_use["optimized"][sub_label] = memory
memory_str = f"Memory used: {memory} MB"

print("Optimized", memory_str)

torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()
results.append(
benchmark.Timer(
stmt="fn(q, q, q)",
globals={
"q": q,
"fn": ref_attention,
},
label="attention",
description="vanilla",
sub_label=sub_label,
num_threads=num_threads,
).blocked_autorange(min_run_time=min_run_time)
)

torch.cuda.synchronize()
memory = torch.cuda.max_memory_allocated() / 2 ** 20
mem_use["vanilla"][sub_label] = memory
memory_str = f"Memory used: {memory} MB"
print("Vanilla", memory_str)


compare = benchmark.Compare(results)
compare.print()

pprint.pprint(mem_use)

def benchmark_forward():
print(f"Processing {len(SHAPES)} cases")
print("Forward")
for num_threads in NUM_THREADS:
for shape in SHAPES:
print(f"===== {shape} =====")
B, M, K = shape
q = torch.rand(shape, device=device)
sub_label = f"B={B}, M={M}, K={K}"

if True:

Reviewer (Contributor): debug?

Author (fmassa): Yeah, it's a debug flag that is sometimes helpful: sometimes I "break" the kernel by removing some parts of the computation to see what speedup I would get. Doing so means the computation won't be correct anymore, so it's useful to be able to disable the correctness checks. I can remove it if you want, but as I expect to do some more performance tuning, I'd like to keep it around for a bit longer if that's OK with you.

Reviewer (Contributor): It's totally OK; flagging it just in case, but understood, no worries.

r = xformers.ops.memory_efficient_attention(q, q, q)

rr = ref_attention(q, q, q)
assert (r - rr).abs().max() < 1e-5

Reviewer (Contributor): This does not pass on a 3080 / CUDA 11.6; it could be interesting to test with the T4s on CircleCI. It may well be because of TF32 (you would need to switch the torch flag forcing fp32 computations). Implicitly this probably means that the torch implementation switched to tensor cores, which changes the time difference between the two implementations (not a fundamental issue, though).

Author (fmassa): Good point, I should probably change those defaults, or just disable TF32 in the benchmarks (but that makes for slower baselines), or just disable this correctness check by default. Which one would you prefer?

Reviewer (Contributor): I would switch TF32 off here; I think that's the best correctness check: you assume fp32 in the kernel, so let's check correctness against fp32 (torch.backends.cuda.matmul.allow_tf32 = False). Good to keep in mind in the benchmarks that the comparison is not iso-accuracy, by the way: your implementation is actually more precise :)
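
A minimal sketch of how that suggestion could look around the correctness check (hypothetical placement only, assuming the check stays in the benchmark; the flag is the one named in the comment above):

```python
# Hypothetical: run the fp32 correctness check with TF32 matmuls disabled,
# then restore the previous setting for the timed runs.
tf32_was_enabled = torch.backends.cuda.matmul.allow_tf32
torch.backends.cuda.matmul.allow_tf32 = False
try:
    r = xformers.ops.memory_efficient_attention(q, q, q)
    rr = ref_attention(q, q, q)
    assert (r - rr).abs().max() < 1e-5
finally:
    torch.backends.cuda.matmul.allow_tf32 = tf32_was_enabled
```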


torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()
results.append(
benchmark.Timer(
stmt="fn(q, q, q)",
globals={
"q": q,
"fn": xformers.ops.memory_efficient_attention,
},
label="attention",
description="optimized",
sub_label=sub_label,
num_threads=num_threads,
).blocked_autorange(min_run_time=min_run_time)
)
torch.cuda.synchronize()
memory = torch.cuda.max_memory_allocated() / 2 ** 20
mem_use["optimized"][sub_label] = memory
memory_str = f"Memory used: {memory} MB"

print("Optimized", memory_str)

torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()
results.append(
benchmark.Timer(
stmt="fn(q, q, q)",
globals={
"q": q,
"fn": ref_attention,
},
label="attention",
description="vanilla",
sub_label=sub_label,
num_threads=num_threads,
).blocked_autorange(min_run_time=min_run_time)
)

torch.cuda.synchronize()
memory = torch.cuda.max_memory_allocated() / 2 ** 20
mem_use["vanilla"][sub_label] = memory
memory_str = f"Memory used: {memory} MB"
print("Vanilla", memory_str)

compare = benchmark.Compare(results)
compare.print()

pprint.pprint(mem_use)


def benchmark_backward():
print(f"Processing {len(SHAPES)} cases")
print("Backward")
for num_threads in NUM_THREADS:
for shape in SHAPES:
print(f"===== {shape} =====")
B, M, K = shape
q = torch.rand(shape, device=device, requires_grad=True)
sub_label = f"B={B}, M={M}, K={K}"

if True:
r = xformers.ops.memory_efficient_attention(q, q, q)
r.backward(torch.ones_like(q))

grad = q.grad
q.grad = None

rr = ref_attention(q, q, q)
rr.backward(torch.ones_like(q))
assert (grad - q.grad).abs().max() < 1e-5

Reviewer (Contributor): Same as above, this does not pass on a 3080; my guess is it's because of TF32 vs. float32 (it would be the same on an A100, not sure about TF32 on a V100).


out = xformers.ops.memory_efficient_attention(q, q, q)
grad = torch.ones_like(q)

torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()
results.append(
benchmark.Timer(
stmt="out.backward(grad, retain_graph=True)",
globals={
"out": out,
"grad": grad,
},
label="attention",
description="optimized",
sub_label=sub_label,
num_threads=num_threads,
).blocked_autorange(min_run_time=min_run_time)
)
torch.cuda.synchronize()
memory = torch.cuda.max_memory_allocated() / 2 ** 20
mem_use["optimized"][sub_label] = memory
memory_str = f"Memory used: {memory} MB"

print("Optimized", memory_str)

out = ref_attention(q, q, q)
torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()
results.append(
benchmark.Timer(
stmt="out.backward(grad, retain_graph=True)",
globals={
"out": out,
"grad": grad,
},
label="attention",
description="vanilla",
sub_label=sub_label,
num_threads=num_threads,
).blocked_autorange(min_run_time=min_run_time)
)

torch.cuda.synchronize()
memory = torch.cuda.max_memory_allocated() / 2 ** 20
mem_use["vanilla"][sub_label] = memory
memory_str = f"Memory used: {memory} MB"
print("Vanilla", memory_str)

compare = benchmark.Compare(results)
compare.print()

pprint.pprint(mem_use)


benchmark_forward()

Reviewer (Contributor): [nit] It may be possible to factorize the two, but not super important; a good tool to have already!

Author (fmassa): Yeah, it's totally possible. I've also added a benchmark_forward_and_backward case in a separate branch, and it started to have quite a bit of duplication. I can look into refactoring this in a follow-up PR.

Reviewer (Contributor): Not urgent and not blocking; same as above, more of a mental note. Sounds good.

benchmark_backward()
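
Following the review thread above about factorizing the two benchmark functions, one possible shape for a shared helper. This is an editorial sketch only; the name run_timed_case and its signature are hypothetical and not code from this PR, but it reuses the module-level results, mem_use, and min_run_time already defined in this file.

```python
def run_timed_case(stmt, globals_, description, sub_label, num_threads):
    # Shared timing + peak-memory bookkeeping used by both benchmark loops.
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()
    results.append(
        benchmark.Timer(
            stmt=stmt,
            globals=globals_,
            label="attention",
            description=description,
            sub_label=sub_label,
            num_threads=num_threads,
        ).blocked_autorange(min_run_time=min_run_time)
    )
    torch.cuda.synchronize()
    memory = torch.cuda.max_memory_allocated() / 2 ** 20
    mem_use[description][sub_label] = memory
    print(description.capitalize(), f"Memory used: {memory} MB")
```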
4 changes: 3 additions & 1 deletion xformers/components/attention/csrc/attention.cpp
@@ -2,5 +2,7 @@

TORCH_LIBRARY_FRAGMENT(xformers, m) {
m.def(TORCH_SELECTIVE_SCHEMA(
"xformers::efficient_attention(Tensor query, Tensor key, Tensor value) -> Tensor"));
"xformers::efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_logsumexp) -> (Tensor, Tensor)"));
m.def(TORCH_SELECTIVE_SCHEMA(
"xformers::efficient_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor logsumexp) -> (Tensor, Tensor, Tensor)"));
}
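
For reference, a hedged usage sketch of the updated schema, with argument order taken from the TORCH_SELECTIVE_SCHEMA strings above. The input shapes mirror the tests; this is not an excerpt from the PR.

```python
import torch
import xformers.ops  # assumed to register the xformers C++ ops, as in the tests above

# (batch, seq_len, head_dim) inputs, shapes chosen for illustration
query = torch.randn(1, 8, 32, device="cuda")
key = torch.randn(1, 16, 32, device="cuda")
value = torch.randn(1, 16, 32, device="cuda")

# Forward with compute_logsumexp=True returns (output, logsumexp)
out, logsumexp = torch.ops.xformers.efficient_attention(query, key, value, True)

# Backward takes grad_out plus the saved logsumexp and returns the three gradients
grad_q, grad_k, grad_v = torch.ops.xformers.efficient_attention_backward(
    torch.ones_like(out), query, key, value, logsumexp
)
```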