
Commit

[Minor] Expose bias options for both MLP and FusedMLP, use same defaults (#220)

* expose the bias options for both MLP and FusedMLP, use the same defaults
* use the same eps in layernorm as the torch default (#221)
blefaudeux committed Mar 2, 2022
1 parent a65c243 commit d4c28fb
Showing 6 changed files with 17 additions and 10 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+ ## [0.0.x] - TBD
+ ### Fixed
+ - Expose bias flag for feedforwards, same default as Timm [#220]
+ - Update eps value for layernorm, same default as torch [#221]

## [0.0.9] - 2022-02-09
### Added
- Compositional Attention [#41]
2 changes: 1 addition & 1 deletion examples/microViT.py
@@ -208,7 +208,7 @@ def test_step(self, batch, _):
# Adjust batch depending on the available memory on your machine.
# You can also use reversible layers to save memory
REF_BATCH = 4096
- BATCH = 512
+ BATCH = 256

MAX_EPOCHS = 20
NUM_WORKERS = 4
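For context, REF_BATCH and BATCH are usually reconciled through gradient accumulation, so the effective batch stays at REF_BATCH while the smaller per-step batch fits in GPU memory. A minimal sketch of that relationship (the exact Trainer wiring is not part of this hunk, so the Lightning argument below is an assumption):

    REF_BATCH = 4096
    BATCH = 256  # per-step batch, halved here to fit on a smaller GPU

    # Effective batch = BATCH * accumulation factor = REF_BATCH
    ACCUMULATE_GRAD_BATCHES = REF_BATCH // BATCH  # -> 16

    # e.g. with a PyTorch Lightning Trainer (assumed, since the script defines test_step):
    # trainer = pl.Trainer(accumulate_grad_batches=ACCUMULATE_GRAD_BATCHES, max_epochs=MAX_EPOCHS)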
4 changes: 2 additions & 2 deletions tests/test_triton_layernorm.py
@@ -53,8 +53,8 @@ def test_layernorm_parity(shape, amp):
eps = 1e-5

# Initialize the two layers, weights are 1 and 0 by default, no randomness
- torch_layernorm = torch.nn.LayerNorm(X.shape[-1], eps).to("cuda")
- triton_layernorm = FusedLayerNorm(X.shape[-1], eps).to("cuda")
+ torch_layernorm = torch.nn.LayerNorm(X.shape[-1], eps=eps).to("cuda")
+ triton_layernorm = FusedLayerNorm(X.shape[-1], affine=True, eps=eps).to("cuda")

with autocast(enabled=amp):
assert torch.allclose(X, X_) # sanity checking, else all hell breaks loose
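Why the keyword form matters: torch.nn.LayerNorm takes eps as its second positional argument, but FusedLayerNorm (see the signature in xformers/triton/layer_norm.py below) takes affine second, so a positional eps would be swallowed as a truthy affine flag and the intended epsilon ignored. A minimal sketch of the distinction (import path as used by the test; a Triton-capable CUDA setup is assumed):

    import torch
    from xformers.triton import FusedLayerNorm

    eps = 1e-5

    # torch.nn.LayerNorm(normalized_shape, eps=1e-05, elementwise_affine=True):
    # the second positional argument really is eps, so this is equivalent to eps=eps.
    ref = torch.nn.LayerNorm(64, eps)

    # FusedLayerNorm(normalized_shape, affine=True, eps=...): the second positional
    # slot is `affine`, so eps must be passed by keyword to take effect.
    fused = FusedLayerNorm(64, affine=True, eps=eps)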
5 changes: 3 additions & 2 deletions xformers/components/feedforward/fused_mlp.py
@@ -37,6 +37,7 @@ def __init__(
dropout: float,
activation: Activation,
hidden_layer_multiplier: int,
+ bias: bool = True,
*args,
**kwargs,
):
@@ -45,13 +46,13 @@
dim_mlp = hidden_layer_multiplier * dim_model

self.mlp = nn.Sequential(
- nn.Linear(in_features=dim_model, out_features=dim_mlp, bias=False),
+ nn.Linear(in_features=dim_model, out_features=dim_mlp, bias=bias),
# pyre-ignore[16]: TODO(T101400990): Pyre did not recognize
# the `FusedLinear` import.
FusedDropoutBias(
p=dropout, bias_shape=dim_mlp, activation=activation
),
- nn.Linear(in_features=dim_mlp, out_features=dim_model, bias=False),
+ nn.Linear(in_features=dim_mlp, out_features=dim_model, bias=bias),
# pyre-ignore[16]: TODO(T101400990): Pyre did not recognize
# the `FusedLinear` import.
FusedDropoutBias(p=dropout, bias_shape=dim_model, activation=None),
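With the flag exposed, callers can toggle the projection biases on FusedMLP directly. A minimal sketch, assuming the import paths and Activation enum member below and a Triton-capable GPU (bias defaults to True, matching the Timm default noted in the changelog):

    import torch
    from xformers.components import Activation
    from xformers.components.feedforward import FusedMLP

    ff = FusedMLP(
        dim_model=384,
        dropout=0.1,
        activation=Activation.GeLU,
        hidden_layer_multiplier=4,
        bias=True,  # new flag; both nn.Linear projections now honour it
    ).cuda()

    y = ff(torch.randn(2, 197, 384, device="cuda"))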
7 changes: 4 additions & 3 deletions xformers/components/feedforward/mlp.py
@@ -28,16 +28,17 @@ def __init__(
dropout: float,
activation: Activation,
hidden_layer_multiplier: int,
+ bias: bool = True,
*args,
- **kwargs
+ **kwargs,
):
super().__init__()

self.mlp = nn.Sequential(
- nn.Linear(dim_model, hidden_layer_multiplier * dim_model),
+ nn.Linear(dim_model, hidden_layer_multiplier * dim_model, bias=bias),
build_activation(activation),
nn.Dropout(dropout),
- nn.Linear(hidden_layer_multiplier * dim_model, dim_model),
+ nn.Linear(hidden_layer_multiplier * dim_model, dim_model, bias=bias),
nn.Dropout(dropout),
)

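The vanilla MLP gets the same keyword with the same default, so the two feedforwards stay interchangeable. A minimal sketch, this time turning the biases off (import paths and enum member assumed as above; no Triton required):

    import torch
    from xformers.components import Activation
    from xformers.components.feedforward import MLP

    ff = MLP(
        dim_model=384,
        dropout=0.0,
        activation=Activation.ReLU,
        hidden_layer_multiplier=4,
        bias=False,  # drops the bias on both nn.Linear layers
    )

    y = ff(torch.randn(2, 16, 384))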
4 changes: 2 additions & 2 deletions xformers/triton/layer_norm.py
@@ -32,7 +32,7 @@ class FusedLayerNorm(nn.Module):
"""

- def __init__(self, normalized_shape, affine=True, eps=1e-05):
+ def __init__(self, normalized_shape, affine=True, eps=1e-06):
super().__init__()
if affine:
self.weight = nn.Parameter(torch.ones(normalized_shape))
@@ -49,7 +49,7 @@ def layer_norm(
x: torch.Tensor,
weight: Optional[torch.Tensor] = None,
bias: Optional[torch.Tensor] = None,
- eps: float = 1e-05,
+ eps: float = 1e-06,
) -> torch.Tensor:

global _triton_registered_warnings
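When comparing FusedLayerNorm against torch.nn.LayerNorm, passing eps explicitly to both modules (as the updated test above now does) keeps the parity check independent of either default. A minimal sketch, assuming a CUDA device with Triton available and the import path used by the test:

    import torch
    from xformers.triton import FusedLayerNorm

    eps = 1e-5  # torch.nn.LayerNorm's own default, made explicit on both sides
    x = torch.randn(8, 384, device="cuda")

    ref = torch.nn.LayerNorm(x.shape[-1], eps=eps).to("cuda")
    fused = FusedLayerNorm(x.shape[-1], affine=True, eps=eps).to("cuda")

    # tolerance is illustrative only
    assert torch.allclose(ref(x), fused(x), atol=1e-5, rtol=1e-4)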
