[metaformers] handling different normalizations + layer repetition (#345)

* handling different normalizations + layer repetition

* bugfix localizing the layers in the stack (#348)

* renaming the layer_norm_style param when building from config

Co-authored-by: Benjamin Lefaudeux <lefaudeux@Benjamins-MacBook-Pro.local>
blefaudeux and Benjamin Lefaudeux committed Jul 14, 2022
1 parent 769cfe3 commit 3a7b713
Showing 29 changed files with 330 additions and 235 deletions.
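
The central configuration change here is the rename of the residual-path normalization parameter. As a minimal, hypothetical before/after sketch (only the renamed key and `dim_model` are shown; a real block config needs the additional keys visible in the file diffs below):

```python
# Hypothetical, abbreviated block configs illustrating the rename only.
# The "pre"/"post" values mirror the "# Optional, pre/post" comments in the diffs below.

old_style = {
    "dim_model": 384,
    "layer_norm_style": "pre",  # parameter name before this commit
}

new_style = {
    "dim_model": 384,
    "residual_norm_style": "pre",  # parameter name from this commit onwards
}
```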
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -10,14 +10,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Removed duplicated biases in the FusedMLP layers [#317]
- Rotary embeddings respecting input types [#326]
- Poolformer style instantiating useless projection layers [#349]
- Fix layer position not being properly tracked, causing extra layernorms for programmatic xformers [#348]

### Added
- Four blocksparsity layouts from DeepSpeed [#320]
- Support several initialization options [#312]
- Conv2DFeedforward feedforward part [#321]
- VisualAttention [#329]
- Automatic blocksparse for causal attention [#334]

- Better hierarchical transformer generation [#345]

## [0.0.11] - 2022-05-30
### Fixed
13 changes: 8 additions & 5 deletions HOWTO.md
@@ -30,7 +30,7 @@ Let's present here a couple of code snippets on how to solve a couple of questio
- [Intro](#intro)
- [Transformer](#transformer)
- [In practice](#in-practice)
- [Hierarchical Transformers](#hierarchical-transformers)
- [Hierarchical Transformers](#hierarchical-transformers)


## Understanding the dimension conventions
@@ -405,7 +405,7 @@ VOCAB = 64

encoder_config = {
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": SEQ,
@@ -489,7 +489,7 @@ my_config = [
"block_type": "encoder",
"num_layers": 3, # Optional, this means that this config will repeat N times
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": 1024,
@@ -520,7 +520,7 @@ my_config = [
"block_type": "decoder",
"num_layers": 3, # Optional, this means that this config will repeat N times
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": SEQ,
@@ -778,6 +778,7 @@ A small helper is provided to make it easier to generate matching configurations
stride=4,
padding=2,
seq_len=image_size * image_size // 16,
feedforward="MLP",
),
BasicLayerConfig(
embedding=128,
@@ -786,6 +787,7 @@ A small helper is provided to make it easier to generate matching configurations
stride=2,
padding=1,
seq_len=image_size * image_size // 64,
feedforward="MLP",
),
BasicLayerConfig(
embedding=320,
@@ -794,13 +796,14 @@ A small helper is provided to make it easier to generate matching configurations
stride=2,
padding=1,
seq_len=image_size * image_size // 256,
feedforward="MLP",
),
]

# Fill in the gaps in the config
xformer_config = get_hierarchical_configuration(
base_hierarchical_configs,
layernorm_style="pre",
residual_norm_style="pre",
use_rotary_embeddings=False,
mlp_multiplier=4,
dim_head=32,
5 changes: 4 additions & 1 deletion docs/source/tutorials/hierarchical.rst
@@ -26,6 +26,7 @@ A small helper is provided to make it easier to generate matching configurations
stride=4,
padding=2,
seq_len=image_size * image_size // 16,
feedforward="MLP",
),
BasicLayerConfig(
embedding=128,
@@ -34,6 +35,7 @@ A small helper is provided to make it easier to generate matching configurations
stride=2,
padding=1,
seq_len=image_size * image_size // 64,
feedforward="MLP",
),
BasicLayerConfig(
embedding=320,
@@ -42,13 +44,14 @@ A small helper is provided to make it easier to generate matching configurations
stride=2,
padding=1,
seq_len=image_size * image_size // 256,
feedforward="MLP",
),
]
# Fill in the gaps in the config
xformer_config = get_hierarchical_configuration(
base_hierarchical_configs,
layernorm_style="pre",
residual_norm_style="pre",
use_rotary_embeddings=False,
mlp_multiplier=4,
dim_head=32,
6 changes: 3 additions & 3 deletions docs/source/tutorials/pytorch_encoder.rst
@@ -59,7 +59,7 @@ With this said, you can build an encoder directly as follows:
encoder_config = {
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": SEQ,
@@ -158,7 +158,7 @@ There's also an added flexibility with xFormers in that attention mechanisms can
"block_type": "encoder",
"num_layers": 3, # Optional, this means that this config will repeat N times
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": 1024,
@@ -186,7 +186,7 @@ There's also an added flexibility with xFormers in that attention mechanisms can
"block_type": "decoder",
"num_layers": 3, # Optional, this means that this config will repeat N times
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": SEQ,
4 changes: 2 additions & 2 deletions docs/source/xformers_mingpt.ipynb
@@ -124,7 +124,7 @@
" \"block_type\": \"encoder\",\n",
" \"num_layers\": self.hparams.n_layer,\n",
" \"dim_model\": self.hparams.n_embd,\n",
" \"layer_norm_style\": \"pre\",\n",
" \"residual_norm_style\": \"pre\",\n",
" \"position_encoding_config\": {\n",
" \"name\": \"vocab\",\n",
" \"seq_len\": self.hparams.block_size,\n",
@@ -491,4 +491,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
}
}
13 changes: 11 additions & 2 deletions examples/README.md
@@ -20,14 +20,23 @@ and finally an inference example or some test loss and accuracy.
If your current machine does not expose enough RAM and the example reports an `OutOfMemoryError`, please adjust the batch size.


### MicroViT
## NLP: microGPT

This is an homage to [minGPT](https://github.com/karpathy/minGPT), in particular training an autoregressive model on Shakespeare dialogs. The default configuration is that of a standard Transformer, but you can change parts as you see fit. You can get to reasonable results within an hour or so on a single GPU.

## Vision models

You can find a couple of very small examples of models being trained on the CIFAR10 dataset. They can be modified to train on something like ImageNet with minimal changes, although running them out of the box in that case requires a bit more work.


### ViT

This is meant to be an easy introduction to using xformers in practice, closely mirroring [this PyTorch Lightning](https://pytorchlightning.github.io/lightning-tutorials/notebooks/lightning_examples/cifar10-baseline.html) tutorial. The default settings are close to those of the tutorial, which trains an 11M-parameter ResNet on CIFAR10; here we train a 10.6M-parameter ViT on the same dataset. The ViT configuration is not optimal for CIFAR10, since the pictures are very small to begin with and some information is probably lost when they are split into patches. Nevertheless, you should be able to reach about 80% accuracy within about an hour on a single GPU.

![Example curves](../docs/assets/microViT.png)


### MicroMetaformer
### Metaformer

This is very close to the ViT example above, but this time illustrates the use of a hierarchical Transformer ([Metaformer](https://arxiv.org/pdf/2111.11418.pdf)), built through a helper function which generates the required configuration given the pooling parameters. The suggested configuration is about 6.6M parameters (half the size of a ResNet18) and trains to about 86% top-1 accuracy on CIFAR10 within minutes.
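
For reference, here is a minimal sketch of what the helper-based construction can look like. It assumes the `BasicLayerConfig` / `get_hierarchical_configuration` helpers and the `xFormer` factory shown elsewhere in this commit; the `attention_mechanism` and `patch_size` values are illustrative placeholders, not the example's actual settings.

```python
# Minimal, illustrative sketch only -- not the exact configuration used by the example script.
from xformers.factory import xFormer, xFormerConfig
from xformers.helpers.hierarchical_configs import (
    BasicLayerConfig,
    get_hierarchical_configuration,
)

image_size = 32  # CIFAR10 resolution

# Two shrinking stages; each BasicLayerConfig describes one stage of the hierarchy.
# The attention_mechanism and patch_size values here are placeholders.
base_hierarchical_configs = [
    BasicLayerConfig(
        embedding=64,
        attention_mechanism="pooling",  # Metaformer-style token mixing
        patch_size=3,
        stride=2,
        padding=1,
        seq_len=image_size * image_size // 4,
        feedforward="MLP",
        repeat_layer=1,  # new in this commit: repeat this stage N times
    ),
    BasicLayerConfig(
        embedding=128,
        attention_mechanism="pooling",
        patch_size=3,
        stride=2,
        padding=1,
        seq_len=image_size * image_size // 16,
        feedforward="MLP",
        repeat_layer=1,
    ),
]

# Fill in the gaps in the config, then build the trunk through the factory
xformer_config = get_hierarchical_configuration(
    base_hierarchical_configs,
    residual_norm_style="pre",  # renamed from layer_norm_style in this commit
    use_rotary_embeddings=False,
    mlp_multiplier=4,
    dim_head=32,
)
model = xFormer.from_config(xFormerConfig(xformer_config))
```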

10 changes: 5 additions & 5 deletions examples/build_model/conf/stack/base_decoder.yaml
@@ -6,21 +6,21 @@ reversible: False # Optionally make these layers reversible to save memory
num_layers: 3 # Optional this means that this config will repeat N times
block_type: decoder
dim_model: ${emb}
layer_norm_style: pre # Optional pre/post
position_encoding_config:
residual_norm_style: pre # Optional pre/post
position_encoding_config:
name: vocab # whatever position encoding makes sense
seq_len: ${seq}
vocab_size: ${vocab}
dropout: 0
multi_head_config_masked:
multi_head_config_masked:
num_heads: 4
residual_dropout: 0
attention: ???
multi_head_config_cross:
multi_head_config_cross:
num_heads: 4
residual_dropout: 0
attention: ???
feedforward_config:
feedforward_config:
name: MLP
dropout: 0
activation: relu
6 changes: 3 additions & 3 deletions examples/build_model/conf/stack/base_encoder.yaml
@@ -6,17 +6,17 @@ reversible: False
num_layers: 4
use_triton: True
dim_model: ${emb}
layer_norm_style: pre
residual_norm_style: pre
position_encoding_config:
name: vocab
seq_len: 1024
vocab_size: ${vocab}
dropout: 0
multi_head_config:
multi_head_config:
num_heads: 4
residual_dropout: 0
attention: ???
feedforward_config:
feedforward_config:
name: MLP
dropout: 0
activation: relu
57 changes: 27 additions & 30 deletions examples/cifarMetaformer.py → examples/cifar_MetaFormer.py
@@ -7,18 +7,25 @@
import pytorch_lightning as pl
import torch
from pl_bolts.datamodules import CIFAR10DataModule
from pl_bolts.transforms.dataset_normalizations import cifar10_normalization
from torch import nn
from torchmetrics import Accuracy
from torchvision import transforms

from examples.microViT import Classifier, VisionTransformer
from examples.cifar_ViT import Classifier, VisionTransformer
from xformers.factory import xFormer, xFormerConfig
from xformers.helpers.hierarchical_configs import (
BasicLayerConfig,
get_hierarchical_configuration,
)

# This is very close to the cifarViT example and reuses a lot of the training code; only the model part is different.
# There are many ways one can use xformers to write down a MetaFormer, for instance by
# picking up the parts from `xformers.components` and implementing the model explicitly,
# or by patching another existing ViT-like implementation.

# This example takes another approach, as we define the whole model configuration in one go (dict structure)
# and then use the xformers factory to generate the model. This obfuscates a lot of the model building
# (though you can inspect the resulting implementation), but makes it trivial to do some hyperparameter search


class MetaVisionTransformer(VisionTransformer):
def __init__(
@@ -32,7 +39,7 @@ def __init__(
dim=384,
attention="scaled_dot_product",
feedforward="MLP",
layer_norm_style="pre",
residual_norm_style="pre",
use_rotary_embeddings=True,
linear_warmup_ratio=0.1,
classifier=Classifier.GAP,
@@ -44,9 +51,10 @@ def __init__(
self.save_hyperparameters()

# Generate the skeleton of our hierarchical Transformer

# This is a small poolformer configuration, adapted to the small CIFAR10 pictures (32x32)
# Any other related config would work, and the attention mechanisms don't have to be the same across layers
# - This is a small poolformer configuration, adapted to the small CIFAR10 pictures (32x32)
# - Please note that this does not match the L1 configuration in the paper, as this would correspond to repeated
# layers. CIFAR pictures are too small for this config to be directly meaningful (although that would run)
# - Any other related config would work, and the attention mechanisms don't have to be the same across layers
base_hierarchical_configs = [
BasicLayerConfig(
embedding=64,
@@ -55,6 +63,8 @@ def __init__(
stride=2,
padding=1,
seq_len=image_size * image_size // 4,
feedforward=feedforward,
repeat_layer=1,
),
BasicLayerConfig(
embedding=128,
@@ -63,6 +73,8 @@ def __init__(
stride=2,
padding=1,
seq_len=image_size * image_size // 16,
feedforward=feedforward,
repeat_layer=1,
),
BasicLayerConfig(
embedding=320,
@@ -71,6 +83,8 @@ def __init__(
stride=2,
padding=1,
seq_len=image_size * image_size // 64,
feedforward=feedforward,
repeat_layer=1,
),
BasicLayerConfig(
embedding=512,
@@ -79,17 +93,18 @@ def __init__(
stride=2,
padding=1,
seq_len=image_size * image_size // 256,
feedforward=feedforward,
repeat_layer=1,
),
]

# Fill in the gaps in the config
xformer_config = get_hierarchical_configuration(
base_hierarchical_configs,
layernorm_style=layer_norm_style,
residual_norm_style=residual_norm_style,
use_rotary_embeddings=use_rotary_embeddings,
mlp_multiplier=4,
dim_head=32,
feedforward="Conv2DFeedforward",
)

# Now instantiate the metaformer trunk
@@ -131,34 +146,16 @@ def forward(self, x):
torch.cuda.manual_seed_all(42)
torch.manual_seed(42)

train_transforms = transforms.Compose(
[
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
cifar10_normalization(),
]
)

test_transforms = transforms.Compose(
[
transforms.ToTensor(),
cifar10_normalization(),
]
)

# We'll use a datamodule here, which already handles dataset/dataloader/sampler
# See https://pytorchlightning.github.io/lightning-tutorials/notebooks/lightning_examples/cifar10-baseline.html
# - See https://pytorchlightning.github.io/lightning-tutorials/notebooks/lightning_examples/cifar10-baseline.html
# for a full tutorial
# - Please note that default transforms are being used
dm = CIFAR10DataModule(
data_dir="data",
batch_size=BATCH,
num_workers=NUM_WORKERS,
pin_memory=True,
)
dm.train_transforms = train_transforms
dm.test_transforms = test_transforms
dm.val_transforms = test_transforms

image_size = dm.size(-1) # 32 for CIFAR
num_classes = dm.num_classes # 10 for CIFAR
@@ -171,7 +168,7 @@ def forward(self, x):
image_size=image_size,
num_classes=num_classes,
attention="scaled_dot_product",
layer_norm_style="pre",
residual_norm_style="pre",
feedforward="MLP",
use_rotary_embeddings=True,
)