
Commit

[Minor] Expose bias options for both MLP and FusedMLP, use same defaults (#220)

* expose the bias options for both MLP and FusedMLP, use the same defaults
* use the same eps in layernorm as the torch default (#221)
blefaudeux committed Mar 2, 2022
1 parent a65c243 commit d4c28fb
Showing 6 changed files with 17 additions and 10 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+ ## [0.0.x] - TBD
+ ### Fixed
+ - Expose bias flag for feedforwards, same default as Timm [#220]
+ - Update eps value for layernorm, same default as torch [#221]

## [0.0.9] - 2022-02-09
### Added
- Compositional Attention [#41]
2 changes: 1 addition & 1 deletion examples/microViT.py
@@ -208,7 +208,7 @@ def test_step(self, batch, _):
# Adjust batch depending on the available memory on your machine.
# You can also use reversible layers to save memory
REF_BATCH = 4096
- BATCH = 512
+ BATCH = 256

MAX_EPOCHS = 20
NUM_WORKERS = 4
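For context, REF_BATCH and BATCH are usually reconciled through gradient accumulation, so the effective batch stays at REF_BATCH while the smaller per-step batch fits in GPU memory. A minimal sketch of that relationship (the exact Trainer wiring is not part of this hunk, so the Lightning argument below is an assumption):

    REF_BATCH = 4096
    BATCH = 256  # per-step batch, halved here to fit on a smaller GPU

    # Effective batch = BATCH * accumulation factor = REF_BATCH
    ACCUMULATE_GRAD_BATCHES = REF_BATCH // BATCH  # -> 16

    # e.g. with a PyTorch Lightning Trainer (assumed, since the script defines test_step):
    # trainer = pl.Trainer(accumulate_grad_batches=ACCUMULATE_GRAD_BATCHES, max_epochs=MAX_EPOCHS)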
4 changes: 2 additions & 2 deletions tests/test_triton_layernorm.py
@@ -53,8 +53,8 @@ def test_layernorm_parity(shape, amp):
eps = 1e-5

# Initialize the two layers, weights are 1 and 0 by default, no randomness
- torch_layernorm = torch.nn.LayerNorm(X.shape[-1], eps).to("cuda")
- triton_layernorm = FusedLayerNorm(X.shape[-1], eps).to("cuda")
+ torch_layernorm = torch.nn.LayerNorm(X.shape[-1], eps=eps).to("cuda")
+ triton_layernorm = FusedLayerNorm(X.shape[-1], affine=True, eps=eps).to("cuda")

with autocast(enabled=amp):
assert torch.allclose(X, X_) # sanity checking, else all hell breaks loose
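Why the keyword form matters: torch.nn.LayerNorm takes eps as its second positional argument, but FusedLayerNorm (see the signature in xformers/triton/layer_norm.py below) takes affine second, so a positional eps would be swallowed as a truthy affine flag and the intended epsilon ignored. A minimal sketch of the distinction (import path as used by the test; a Triton-capable CUDA setup is assumed):

    import torch
    from xformers.triton import FusedLayerNorm

    eps = 1e-5

    # torch.nn.LayerNorm(normalized_shape, eps=1e-05, elementwise_affine=True):
    # the second positional argument really is eps, so this is equivalent to eps=eps.
    ref = torch.nn.LayerNorm(64, eps)

    # FusedLayerNorm(normalized_shape, affine=True, eps=...): the second positional
    # slot is `affine`, so eps must be passed by keyword to take effect.
    fused = FusedLayerNorm(64, affine=True, eps=eps)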
5 changes: 3 additions & 2 deletions xformers/components/feedforward/fused_mlp.py
@@ -37,6 +37,7 @@ def __init__(
dropout: float,
activation: Activation,
hidden_layer_multiplier: int,
+ bias: bool = True,
*args,
**kwargs,
):
@@ -45,13 +46,13 @@
dim_mlp = hidden_layer_multiplier * dim_model

self.mlp = nn.Sequential(
- nn.Linear(in_features=dim_model, out_features=dim_mlp, bias=False),
+ nn.Linear(in_features=dim_model, out_features=dim_mlp, bias=bias),
# pyre-ignore[16]: TODO(T101400990): Pyre did not recognize
# the `FusedLinear` import.
FusedDropoutBias(
p=dropout, bias_shape=dim_mlp, activation=activation
),
- nn.Linear(in_features=dim_mlp, out_features=dim_model, bias=False),
+ nn.Linear(in_features=dim_mlp, out_features=dim_model, bias=bias),
# pyre-ignore[16]: TODO(T101400990): Pyre did not recognize
# the `FusedLinear` import.
FusedDropoutBias(p=dropout, bias_shape=dim_model, activation=None),
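With the flag exposed, callers can toggle the projection biases on FusedMLP directly. A minimal sketch, assuming the import paths and Activation enum member below and a Triton-capable GPU (bias defaults to True, matching the Timm default noted in the changelog):

    import torch
    from xformers.components import Activation
    from xformers.components.feedforward import FusedMLP

    ff = FusedMLP(
        dim_model=384,
        dropout=0.1,
        activation=Activation.GeLU,
        hidden_layer_multiplier=4,
        bias=True,  # new flag; both nn.Linear projections now honour it
    ).cuda()

    y = ff(torch.randn(2, 197, 384, device="cuda"))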
7 changes: 4 additions & 3 deletions xformers/components/feedforward/mlp.py
@@ -28,16 +28,17 @@ def __init__(
dropout: float,
activation: Activation,
hidden_layer_multiplier: int,
+ bias: bool = True,
*args,
- **kwargs
+ **kwargs,
):
super().__init__()

self.mlp = nn.Sequential(
- nn.Linear(dim_model, hidden_layer_multiplier * dim_model),
+ nn.Linear(dim_model, hidden_layer_multiplier * dim_model, bias=bias),
build_activation(activation),
nn.Dropout(dropout),
- nn.Linear(hidden_layer_multiplier * dim_model, dim_model),
+ nn.Linear(hidden_layer_multiplier * dim_model, dim_model, bias=bias),
nn.Dropout(dropout),
)

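The vanilla MLP gets the same keyword with the same default, so the two feedforwards stay interchangeable. A minimal sketch, this time turning the biases off (import paths and enum member assumed as above; no Triton required):

    import torch
    from xformers.components import Activation
    from xformers.components.feedforward import MLP

    ff = MLP(
        dim_model=384,
        dropout=0.0,
        activation=Activation.ReLU,
        hidden_layer_multiplier=4,
        bias=False,  # drops the bias on both nn.Linear layers
    )

    y = ff(torch.randn(2, 16, 384))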
4 changes: 2 additions & 2 deletions xformers/triton/layer_norm.py
@@ -32,7 +32,7 @@ class FusedLayerNorm(nn.Module):
"""

- def __init__(self, normalized_shape, affine=True, eps=1e-05):
+ def __init__(self, normalized_shape, affine=True, eps=1e-06):
super().__init__()
if affine:
self.weight = nn.Parameter(torch.ones(normalized_shape))
@@ -49,7 +49,7 @@ def layer_norm(
x: torch.Tensor,
weight: Optional[torch.Tensor] = None,
bias: Optional[torch.Tensor] = None,
- eps: float = 1e-05,
+ eps: float = 1e-06,
) -> torch.Tensor:

global _triton_registered_warnings
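When comparing FusedLayerNorm against torch.nn.LayerNorm, passing eps explicitly to both modules (as the updated test above now does) keeps the parity check independent of either default. A minimal sketch, assuming a CUDA device with Triton available and the import path used by the test:

    import torch
    from xformers.triton import FusedLayerNorm

    eps = 1e-5  # torch.nn.LayerNorm's own default, made explicit on both sides
    x = torch.randn(8, 384, device="cuda")

    ref = torch.nn.LayerNorm(x.shape[-1], eps=eps).to("cuda")
    fused = FusedLayerNorm(x.shape[-1], affine=True, eps=eps).to("cuda")

    # tolerance is illustrative only
    assert torch.allclose(ref(x), fused(x), atol=1e-5, rtol=1e-4)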
