renaming the layer_norm_style param when building from config
Benjamin Lefaudeux authored and blefaudeux committed Jul 8, 2022
1 parent 58b36eb commit 78f8b7b
Showing 22 changed files with 132 additions and 127 deletions.
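For context, the sketch below shows what a config-driven build looks like after this rename. It is condensed from the HOWTO.md snippet updated in this commit; the xFormer/xFormerConfig factory calls and the exact attention and feedforward settings are assumptions based on the existing xFormers examples, not part of this diff.

# Minimal sketch, assuming the config-driven factory used in HOWTO.md.
# The only point here is "residual_norm_style" (formerly "layer_norm_style").
from xformers.factory.model_factory import xFormer, xFormerConfig

EMB, SEQ, VOCAB = 384, 1024, 64

my_config = [
    {
        "block_type": "encoder",
        "num_layers": 3,
        "dim_model": EMB,
        "residual_norm_style": "pre",  # renamed from "layer_norm_style"
        "position_encoding_config": {
            "name": "vocab",
            "seq_len": SEQ,
            "vocab_size": VOCAB,
        },
        "multi_head_config": {
            "num_heads": 4,
            "residual_dropout": 0.0,
            "attention": {"name": "scaled_dot_product", "dropout": 0.0, "causal": False},
        },
        "feedforward_config": {
            "name": "MLP",
            "dropout": 0.0,
            "activation": "relu",
            "hidden_layer_multiplier": 4,
        },
    }
]

model = xFormer.from_config(xFormerConfig(my_config))  # builds the encoder stack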
6 changes: 3 additions & 3 deletions HOWTO.md
@@ -405,7 +405,7 @@ VOCAB = 64

encoder_config = {
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": SEQ,
@@ -489,7 +489,7 @@ my_config = [
"block_type": "encoder",
"num_layers": 3, # Optional, this means that this config will repeat N times
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": 1024,
@@ -520,7 +520,7 @@ my_config = [
"block_type": "decoder",
"num_layers": 3, # Optional, this means that this config will repeat N times
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": SEQ,
6 changes: 3 additions & 3 deletions docs/source/tutorials/pytorch_encoder.rst
@@ -59,7 +59,7 @@ With this said, you can build an encoder directly as follows:
encoder_config = {
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": SEQ,
@@ -158,7 +158,7 @@ There's also an added flexibility with xFormers in that attention mechanisms can
"block_type": "encoder",
"num_layers": 3, # Optional, this means that this config will repeat N times
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": 1024,
@@ -186,7 +186,7 @@ There's also an added flexibility with xFormers in that attention mechanisms can
"block_type": "decoder",
"num_layers": 3, # Optional, this means that this config will repeat N times
"dim_model": EMB,
"layer_norm_style": "pre", # Optional, pre/post
"residual_norm_style": "pre", # Optional, pre/post
"position_encoding_config": {
"name": "vocab", # whatever position encodinhg makes sense
"seq_len": SEQ,
4 changes: 2 additions & 2 deletions docs/source/xformers_mingpt.ipynb
@@ -124,7 +124,7 @@
" \"block_type\": \"encoder\",\n",
" \"num_layers\": self.hparams.n_layer,\n",
" \"dim_model\": self.hparams.n_embd,\n",
" \"layer_norm_style\": \"pre\",\n",
" \"residual_norm_style\": \"pre\",\n",
" \"position_encoding_config\": {\n",
" \"name\": \"vocab\",\n",
" \"seq_len\": self.hparams.block_size,\n",
@@ -491,4 +491,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
}
}
10 changes: 5 additions & 5 deletions examples/build_model/conf/stack/base_decoder.yaml
@@ -6,21 +6,21 @@ reversible: False # Optionally make these layers reversible to save memory
num_layers: 3 # Optional this means that this config will repeat N times
block_type: decoder
dim_model: ${emb}
layer_norm_style: pre # Optional pre/post
position_encoding_config:
residual_norm_style: pre # Optional pre/post
position_encoding_config:
name: vocab # whatever position encoding makes sense
seq_len: ${seq}
vocab_size: ${vocab}
dropout: 0
multi_head_config_masked:
multi_head_config_masked:
num_heads: 4
residual_dropout: 0
attention: ???
multi_head_config_cross:
multi_head_config_cross:
num_heads: 4
residual_dropout: 0
attention: ???
feedforward_config:
feedforward_config:
name: MLP
dropout: 0
activation: relu
6 changes: 3 additions & 3 deletions examples/build_model/conf/stack/base_encoder.yaml
@@ -6,17 +6,17 @@ reversible: False
num_layers: 4
use_triton: True
dim_model: ${emb}
layer_norm_style: pre
residual_norm_style: pre
position_encoding_config:
name: vocab
seq_len: 1024
vocab_size: ${vocab}
dropout: 0
multi_head_config:
multi_head_config:
num_heads: 4
residual_dropout: 0
attention: ???
feedforward_config:
feedforward_config:
name: MLP
dropout: 0
activation: relu
6 changes: 3 additions & 3 deletions examples/cifar_MetaFormer.py
@@ -39,7 +39,7 @@ def __init__(
dim=384,
attention="scaled_dot_product",
feedforward="MLP",
layer_norm_style="pre",
residual_norm_style="pre",
use_rotary_embeddings=True,
linear_warmup_ratio=0.1,
classifier=Classifier.GAP,
@@ -101,7 +101,7 @@ def __init__(
# Fill in the gaps in the config
xformer_config = get_hierarchical_configuration(
base_hierarchical_configs,
layernorm_style=layer_norm_style,
layernorm_style=residual_norm_style,
use_rotary_embeddings=use_rotary_embeddings,
mlp_multiplier=4,
dim_head=32,
@@ -168,7 +168,7 @@ def forward(self, x):
image_size=image_size,
num_classes=num_classes,
attention="scaled_dot_product",
layer_norm_style="pre",
residual_norm_style="pre",
feedforward="MLP",
use_rotary_embeddings=True,
)
6 changes: 3 additions & 3 deletions examples/cifar_ViT.py
@@ -45,7 +45,7 @@ def __init__(
attn_pdrop=0.0,
mlp_pdrop=0.0,
attention="scaled_dot_product",
layer_norm_style="pre",
residual_norm_style="pre",
hidden_layer_multiplier=4,
use_rotary_embeddings=True,
linear_warmup_ratio=0.1,
@@ -67,7 +67,7 @@ def __init__(
"block_type": "encoder",
"num_layers": n_layer,
"dim_model": dim,
"layer_norm_style": layer_norm_style,
"residual_norm_style": residual_norm_style,
"multi_head_config": {
"num_heads": n_head,
"residual_dropout": resid_pdrop,
@@ -226,7 +226,7 @@ def test_step(self, batch, _):
num_classes=num_classes,
attention="scaled_dot_product",
classifier=Classifier.TOKEN,
layer_norm_style="pre",
residual_norm_style="pre",
use_rotary_embeddings=True,
)
trainer = pl.Trainer(
2 changes: 1 addition & 1 deletion examples/microGPT.py
@@ -53,7 +53,7 @@ def __init__(
"block_type": "encoder",
"num_layers": self.hparams.n_layer,
"dim_model": self.hparams.n_embd,
"layer_norm_style": "post",
"residual_norm_style": "post",
"position_encoding_config": {
"name": "vocab",
"seq_len": self.hparams.block_size,
20 changes: 10 additions & 10 deletions tests/test_block_factory.py
@@ -33,7 +33,7 @@
@pytest.mark.parametrize("activation", [a.value for a in Activation])
@pytest.mark.parametrize("attention_name", ATTENTION_REGISTRY.keys())
@pytest.mark.parametrize("feedforward_name", FEEDFORWARD_REGISTRY.keys())
@pytest.mark.parametrize("layer_norm_style", ["pre", "post", "deepnorm"])
@pytest.mark.parametrize("residual_norm_style", ["pre", "post", "deepnorm"])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("reversible", [True, False])
@pytest.mark.skipif(
@@ -46,7 +46,7 @@ def test_xformer_encoder_block(
attn_dropout: float,
residual_dropout: float,
activation: Activation,
layer_norm_style: str,
residual_norm_style: str,
device: torch.device,
reversible: bool,
):
@@ -100,7 +100,7 @@ def test_xformer_encoder_block(
multi_head_config=multi_head_config,
feedforward_config=feedforward_config,
position_encoding_config=position_encoding_config,
layer_norm_style=layer_norm_style,
residual_norm_style=residual_norm_style,
reversible=reversible,
)

@@ -136,7 +136,7 @@ def test_xformer_encoder_block(
@pytest.mark.parametrize("rotary_embeddings", [False, True])
@pytest.mark.parametrize("attention_name", ATTENTION_REGISTRY.keys())
@pytest.mark.parametrize("feedforward_name", FEEDFORWARD_REGISTRY.keys())
@pytest.mark.parametrize("layer_norm_style", ["pre", "post", "deepnorm"])
@pytest.mark.parametrize("residual_norm_style", ["pre", "post", "deepnorm"])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.skipif(
not torch.cuda.is_available(), reason="This test requires a CUDA device"
@@ -150,7 +150,7 @@ def test_xformer_decoder_block(
residual_dropout: float,
causal: bool,
activation: Activation,
layer_norm_style: str,
residual_norm_style: str,
device: torch.device,
):

@@ -202,7 +202,7 @@ def test_xformer_decoder_block(
multi_head_config=multi_head_config,
feedforward_config=feedforward_config,
position_encoding_config=position_encoding_config,
layer_norm_style=layer_norm_style,
residual_norm_style=residual_norm_style,
)

decoder_block_config = xFormerDecoderConfig(
@@ -211,7 +211,7 @@
multi_head_config_cross=multi_head_config,
feedforward_config=feedforward_config,
position_encoding_config=position_encoding_config,
layer_norm_style=layer_norm_style,
residual_norm_style=residual_norm_style,
)

# Test that the whole block can be instantiated
@@ -303,7 +303,7 @@ def test_embedding_projection():
multi_head_config=multi_head_config,
feedforward_config=feedforward_config,
position_encoding_config=position_encoding_config,
layer_norm_style="pre",
residual_norm_style="pre",
reversible=False,
)

@@ -371,7 +371,7 @@ def test_simplicial_embedding(
multi_head_config=multi_head_config,
feedforward_config=feedforward_config,
position_encoding_config=position_encoding_config,
layer_norm_style="pre",
residual_norm_style="pre",
reversible=False,
simplicial_embeddings={"L": 4},
)
@@ -398,7 +398,7 @@ def test_simplicial_embedding(
multi_head_config=multi_head_config,
feedforward_config=feedforward_config,
position_encoding_config=position_encoding_config,
layer_norm_style="pre",
residual_norm_style="pre",
reversible=False,
simplicial_embeddings={"L": 3},
)
26 changes: 14 additions & 12 deletions tests/test_model_factory.py
@@ -28,7 +28,7 @@
"reversible": False,
"block_type": "encoder",
"dim_model": EMB,
"layer_norm_style": "pre",
"residual_norm_style": "pre",
"position_encoding_config": {
"name": "vocab",
"seq_len": SEQ,
@@ -61,7 +61,7 @@
decoder_configs = {
"block_type": "decoder",
"dim_model": EMB,
"layer_norm_style": "pre",
"residual_norm_style": "pre",
"position_encoding_config": {
"name": "vocab",
"seq_len": SEQ,
@@ -109,9 +109,11 @@
@pytest.mark.parametrize("config", [test_configs_list, test_configs_dict])
@pytest.mark.parametrize("reversible", [True, False])
@pytest.mark.parametrize("tie_embedding_weights", [True, False])
@pytest.mark.parametrize("layer_norm_style", ["pre", "post", "deepnorm"])
@pytest.mark.parametrize("residual_norm_style", ["pre", "post", "deepnorm"])
@pytest.mark.parametrize("device", DEVICES)
def test_presets(config, reversible, tie_embedding_weights, layer_norm_style, device):
def test_presets(
config, reversible, tie_embedding_weights, residual_norm_style, device
):
torch.cuda.manual_seed(42)
torch.manual_seed(42)

@@ -120,12 +122,12 @@ def test_presets(config, reversible, tie_embedding_weights, layer_norm_style, de
# Only the encoder can be reversible
config[0]["reversible"] = reversible

config[0]["layer_norm_style"] = layer_norm_style
config[1]["layer_norm_style"] = layer_norm_style
config[0]["residual_norm_style"] = residual_norm_style
config[1]["residual_norm_style"] = residual_norm_style
else:
config["encoder"]["reversible"] = reversible
config["encoder"]["layer_norm_style"] = layer_norm_style
config["decoder"]["layer_norm_style"] = layer_norm_style
config["encoder"]["residual_norm_style"] = residual_norm_style
config["decoder"]["residual_norm_style"] = residual_norm_style

modelConfig = xFormerConfig(config, tie_embedding_weights)
if isinstance(modelConfig.stack_configs, dict):
@@ -137,7 +139,7 @@ def test_presets(config, reversible, tie_embedding_weights, layer_norm_style, de

context = (
pytest.raises(AssertionError)
if reversible and (tie_embedding_weights or layer_norm_style == "deepnorm")
if reversible and (tie_embedding_weights or residual_norm_style == "deepnorm")
else nullcontext()
)

@@ -152,7 +154,7 @@ def check_against_default(p):
assert change > 0.1

# Check deepnorm init, if applicable
if layer_norm_style == "deepnorm":
if residual_norm_style == "deepnorm":
for n, p in model.encoders.named_parameters():
# Check the MHA
if "in_proj_weight" in n:
@@ -209,10 +211,10 @@ def test_weight_init(weight_init, feedforward, deepnorm, device):
config = test_configs_dict

if deepnorm:
config["encoder"]["layer_norm_style"] = "deepnorm"
config["encoder"]["residual_norm_style"] = "deepnorm"
config["encoder"]["feedforward_config"]["name"] = feedforward

config["decoder"]["layer_norm_style"] = "deepnorm"
config["decoder"]["residual_norm_style"] = "deepnorm"

# Make sure that all the init methods catch all the weights
xformers_weight_init._assert_if_not_initialized = True
2 changes: 1 addition & 1 deletion tests/test_pickling.py
@@ -21,7 +21,7 @@
"block_type": "encoder",
"num_layers": 2,
"dim_model": 768,
"layer_norm_style": "pre",
"residual_norm_style": "pre",
"multi_head_config": {
"num_heads": 12,
"residual_dropout": 0.1,
4 changes: 2 additions & 2 deletions tests/test_pytorch_transformer_parity.py
@@ -27,7 +27,7 @@
"block_type": "encoder",
"dim_model": EMB,
"num_layers": LAYERS,
"layer_norm_style": "post",
"residual_norm_style": "post",
"multi_head_config": {
"num_heads": HEADS,
"residual_dropout": DROP,
@@ -52,7 +52,7 @@
"block_type": "decoder",
"dim_model": EMB,
"num_layers": LAYERS,
"layer_norm_style": "post",
"residual_norm_style": "post",
"multi_head_config_masked": {
"num_heads": HEADS,
"residual_dropout": DROP,
