Commit
[feat] Compositional attention (#178)
* Initial implementation
* adding the graphs
* doc + removing seemingly niche options
* minor fixes to the LRA setup
* Refactor the projection, align args on other attentions
* code review, thanks @dianaml0
* adding some more explanations
1 parent c16078b · commit cdbe195
Showing 17 changed files with 496 additions and 15 deletions.
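Before the diffs, a minimal sketch of how the new layer could be built and run. It is assembled from the unit test added below: the "compositional" registry name, the config keys, and the MultiHeadDispatch wiring all come from that test, while treating this reduced config as a stable public API is an assumption.

    import torch

    from xformers.components import MultiHeadDispatch
    from xformers.components.attention import build_attention

    SEQ, MODEL, HEADS = 128, 128, 4

    # Build the attention from a config dict, as the new test does.
    attention = build_attention(
        {
            "name": "compositional",  # registry key used in the test below
            "dropout": 0.1,
            "causal": False,
            "dim_model": MODEL,
            "num_heads": HEADS,
            "num_rules": 2,  # number of retrieval "rules", per the test config
        }
    )

    # Wrap it the same way the test does and run a forward pass.
    multi_head = MultiHeadDispatch(
        seq_len=SEQ,
        dim_model=MODEL,
        num_heads=HEADS,
        attention=attention,
        residual_dropout=0.0,
    )

    x = torch.rand(2, SEQ, MODEL)
    y = multi_head(x, x, x)  # (batch, seq, model)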
@@ -4,5 +4,6 @@
 tensorboard>=2.3.0
 tensorflow>=2.3.1
 tensorflow-datasets>=4.0.1
+tensorflow-text>=2.7.3
 submitit
 fvcore
@@ -0,0 +1,113 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

import pytest
import torch

from xformers.components import MultiHeadDispatch

# Automatically test all the registered attentions
from xformers.components.attention import (
    _DENSITY_THRESHOLD,
    ATTENTION_REGISTRY,
    build_attention,
)

DEVICES = (
    [torch.device("cpu")] if not torch.cuda.is_available() else [torch.device("cuda")]
)

BATCH = 2
SEQ = 128 if torch.cuda.is_available() else 32
MODEL = 128 if torch.cuda.is_available() else 64
GLOBAL_ATTENTION_RATIO = (
    _DENSITY_THRESHOLD * 0.9
)  # Make sure that we test the sparse implementation, no matter the threshold

assert ATTENTION_REGISTRY.keys(), "Attention layers should have been registered"


@pytest.mark.parametrize("attn_dropout", [0.0, 0.3])
@pytest.mark.parametrize("causal", [True, False])
@pytest.mark.parametrize("heads", [1, 4])
@pytest.mark.parametrize("rules", [1, 4])
@pytest.mark.parametrize("q_compose", [False, True])
@pytest.mark.parametrize("dim_selection", [MODEL // 2, None])
@pytest.mark.parametrize("bias", [True, False])
@pytest.mark.parametrize("qk_rule", [True, False])
@pytest.mark.parametrize("nonlinear", [True, False])
@pytest.mark.parametrize("device", DEVICES)
def test_build_and_run(
    heads: int,
    attn_dropout: float,
    causal: bool,
    rules: int,
    q_compose: bool,
    dim_selection: int,
    bias: bool,
    qk_rule: bool,
    nonlinear: bool,
    device: torch.device,
):
    torch.manual_seed(42)

    test_config = {
        "name": "compositional",
        "dropout": attn_dropout,
        "causal": causal,
        "seq_len": SEQ,
        "window_size": SEQ // 8 + 1,  # local attention
        "attention_query_mask": torch.rand((SEQ, 1)) < GLOBAL_ATTENTION_RATIO,
        "dim_model": MODEL,
        "num_heads": heads,
        "num_rules": 2,  # Compositional Attention
        "q_compose": q_compose,
        "rules": rules,
        "dim_selection": dim_selection,
        "bias": bias,
        "qk_rule": qk_rule,
        "nonlinear": nonlinear,
    }

    # Add some blocksparse layout to test the corresponding attention
    block_size = 16
    test_config["layout"] = torch.eye(
        SEQ // block_size, SEQ // block_size, dtype=torch.long
    )
    test_config["block_size"] = block_size

    attention = build_attention(test_config)

    # build a multi head dispatch to test this attention mechanism
    multi_head = MultiHeadDispatch(
        seq_len=SEQ,
        dim_model=MODEL,
        num_heads=heads,
        attention=attention,
        residual_dropout=0.0,
    ).to(device)

    # Check that a shuffled input produces the same results
    seqs = [SEQ, SEQ - 16]

    for seq in seqs:
        # Check that we can pass a smaller sequence
        inputs = torch.rand(BATCH, seq, MODEL, device=device)
        shuffle = torch.randperm(inputs.shape[1])
        inputs_shuffled = inputs[:, shuffle, :].clone()

        results = multi_head(inputs, inputs, inputs)
        results_shuffled = multi_head(inputs_shuffled, inputs_shuffled, inputs_shuffled)

        # Smoke check only - strict permutation equivariance does not hold
        # once causal masking or dropout is enabled, so the result is not asserted
        torch.allclose(results[:, shuffle, :], results_shuffled)

        # Test the non-self-attention codepath
        att = multi_head(inputs, inputs_shuffled, inputs)

        # Check that dropout actually drops some values
        if attn_dropout > 0:
            att_2 = multi_head(inputs, inputs_shuffled, inputs)
            assert (att != att_2).any()
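For context on what the new layer computes: compositional attention (from the paper "Compositional Attention: Disentangling Search and Retrieval") splits multi-head attention into S "searches" (query/key pairs) and R "rules" (value projections), and learns, per search and per position, which rule's retrieval to use. The sketch below is only an illustration of that idea, not the code added by this commit; the placeholder selection logits stand in for the learned scoring that options such as qk_rule and dim_selection configure in the real layer.

    import torch

    B, S, R, T, Dk, Dv = 2, 4, 2, 16, 32, 32  # batch, searches, rules, seq, dims

    q = torch.rand(B, S, T, Dk)  # one query/key pair per *search*
    k = torch.rand(B, S, T, Dk)
    v = torch.rand(B, R, T, Dv)  # one value projection per *rule* (retrieval)

    # Search: standard scaled dot-product attention map, one per search
    attn = torch.softmax(q @ k.transpose(-2, -1) / Dk**0.5, dim=-1)  # (B, S, T, T)

    # Retrieval: every search retrieves with every rule
    retrieved = torch.einsum("bstu,brud->bsrtd", attn, v)  # (B, S, R, T, Dv)

    # Selection: each search softly picks which rule's retrieval to use.
    # In the real layer this score is learned (a query/key style product when qk_rule=True);
    # random logits are used here purely to keep the sketch self-contained.
    scores = torch.rand(B, S, R, T)
    w = torch.softmax(scores, dim=2).unsqueeze(-1)  # (B, S, R, T, 1)

    out = (w * retrieved).sum(dim=2)  # (B, S, T, Dv)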