From c0f065e0a310c40354f6bf03097a46a06e71109b Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Fri, 23 Oct 2020 06:01:57 -0700
Subject: [PATCH 1/7] Rename type -> key in fairseq/tasks/sentence_prediction.py (fixes #2746)

---
 fairseq/tasks/sentence_prediction.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fairseq/tasks/sentence_prediction.py b/fairseq/tasks/sentence_prediction.py
index 69dc996e6a..0ec3824d04 100644
--- a/fairseq/tasks/sentence_prediction.py
+++ b/fairseq/tasks/sentence_prediction.py
@@ -135,11 +135,11 @@ def setup_task(cls, args, **kwargs):
     def load_dataset(self, split, combine=False, **kwargs):
         """Load a given dataset split (e.g., train, valid, test)."""

-        def get_path(type, split):
-            return os.path.join(self.args.data, type, split)
+        def get_path(key, split):
+            return os.path.join(self.args.data, key, split)

-        def make_dataset(type, dictionary):
-            split_path = get_path(type, split)
+        def make_dataset(key, dictionary):
+            split_path = get_path(key, split)

             dataset = data_utils.load_indexed_dataset(
                 split_path,
@@ -151,7 +151,7 @@ def make_dataset(type, dictionary):

         input0 = make_dataset("input0", self.source_dictionary)
         assert input0 is not None, "could not find dataset: {}".format(
-            get_path(type, split)
+            get_path("input0", split)
         )

         input1 = make_dataset("input1", self.source_dictionary)

From c518c2c0c782d195fd0cec9fd079d09e1bf33588 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Fri, 23 Oct 2020 06:03:46 -0700
Subject: [PATCH 2/7] Turn off logging in test_fp16_optimizer.TestGradientScaling

---
 tests/test_fp16_optimizer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_fp16_optimizer.py b/tests/test_fp16_optimizer.py
index aa6a863d32..8de8e28ce0 100644
--- a/tests/test_fp16_optimizer.py
+++ b/tests/test_fp16_optimizer.py
@@ -5,6 +5,7 @@

 import argparse
 import copy
+import logging
 import unittest

 import torch
@@ -46,6 +47,10 @@ def setUp(self):
                 },
             }
         )
+        logging.disable(logging.CRITICAL)
+
+    def tearDown(self):
+        logging.disable(logging.NOTSET)

     def run_iter(self, model, params, optimizer):
         optimizer.zero_grad()

From 135ba51afa58a810a5004ed08e7726e35ffebcbc Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Fri, 23 Oct 2020 06:13:02 -0700
Subject: [PATCH 3/7] Update preprocessing docs (fixes #2565)

---
 fairseq/options.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fairseq/options.py b/fairseq/options.py
index f2a3e7cfb1..b79443a177 100644
--- a/fairseq/options.py
+++ b/fairseq/options.py
@@ -249,11 +249,13 @@ def add_preprocess_args(parser):
     group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
                        help="target language")
     group.add_argument("--trainpref", metavar="FP", default=None,
-                       help="train file prefix")
+                       help="train file prefix (also used to build dictionaries)")
     group.add_argument("--validpref", metavar="FP", default=None,
-                       help="comma separated, valid file prefixes")
+                       help="comma separated, valid file prefixes "
+                            "(words missing from train set are replaced with <unk>)")
     group.add_argument("--testpref", metavar="FP", default=None,
-                       help="comma separated, test file prefixes")
+                       help="comma separated, test file prefixes "
+                            "(words missing from train set are replaced with <unk>)")
     group.add_argument("--align-suffix", metavar="FP", default=None,
                        help="alignment file suffix")
     group.add_argument("--destdir", metavar="DIR", default="data-bin",

From ade599805d97abc116561d5440f6f8c7928222f6 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Fri, 23 Oct 2020 06:17:58 -0700
Subject: [PATCH 4/7] Documentation updates

---
 docs/getting_started.rst     | 15 ++++++++++++---
 examples/roberta/README.md   |  1 -
 fairseq/dataclass/configs.py |  2 +-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index fa5971dd31..d227b95544 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -170,13 +170,14 @@ The easiest way to launch jobs is with the `torch.distributed.launch

 For example, to train a large English-German Transformer model on 2 nodes each
 with 8 GPUs (in total 16 GPUs), run the following command on each node,
-replacing ``node_rank=0`` with ``node_rank=1`` on the second node:
+replacing ``node_rank=0`` with ``node_rank=1`` on the second node and making
+sure to update ``--master_addr`` to the IP address of the first node:

 .. code-block:: console

     > python -m torch.distributed.launch --nproc_per_node=8 \
         --nnodes=2 --node_rank=0 --master_addr="192.168.1.1" \
-        --master_port=1234 \
+        --master_port=12345 \
         $(which fairseq-train) data-bin/wmt16_en_de_bpe32k \
         --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
         --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
@@ -184,7 +185,15 @@ replacing ``node_rank=0`` with ``node_rank=1`` on the second node:
         --lr 0.0005 --min-lr 1e-09 \
         --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
         --max-tokens 3584 \
-        --fp16 --distributed-no-spawn
+        --fp16
+
+On SLURM clusters, fairseq will automatically detect the number of nodes and
+GPUs, but a port number must be provided:
+
+.. code-block:: console
+
+    > salloc --gpus=16 --nodes 2 (...)
+    > srun fairseq-train --distributed-port 12345 (...).

 Sharding very large datasets
 ----------------------------
diff --git a/examples/roberta/README.md b/examples/roberta/README.md
index fdddd5b8d2..ca86131eea 100644
--- a/examples/roberta/README.md
+++ b/examples/roberta/README.md
@@ -276,7 +276,6 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples))
 - [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md)
 - [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md)
 - [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md)
-- Finetuning on SQuAD: coming soon

 ## Pretraining using your own data

diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py
index abcb9c4c48..484d2526d7 100644
--- a/fairseq/dataclass/configs.py
+++ b/fairseq/dataclass/configs.py
@@ -400,7 +400,7 @@ class DatasetConfig(FairseqDataclass):
     batch_size_valid: Optional[int] = field(
         default=None,
         metadata={
-            "help": "batch size of the validation batch" " (defaults to --batch-size)",
+            "help": "batch size of the validation batch (defaults to --batch-size)",
             "argparse_alias": "--max-sentences-valid",
         },
     )

From ca574a010280949e2aa29a4adf6e7c4e37e74009 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Mon, 26 Oct 2020 07:04:12 -0700
Subject: [PATCH 5/7] Removed unused Linear definition in transformer_layer.py

---
 fairseq/modules/transformer_layer.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/fairseq/modules/transformer_layer.py b/fairseq/modules/transformer_layer.py
index 8775aa7766..6f3c79de7c 100644
--- a/fairseq/modules/transformer_layer.py
+++ b/fairseq/modules/transformer_layer.py
@@ -144,7 +144,6 @@ def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None):
         residual = x
         if self.normalize_before:
             x = self.final_layer_norm(x)
-
         x = self.activation_fn(self.fc1(x))
         x = self.activation_dropout_module(x)
         x = self.fc2(x)
@@ -413,11 +412,3 @@ def forward(

     def make_generation_fast_(self, need_attn: bool = False, **kwargs):
         self.need_attn = need_attn
-
-
-def Linear(in_features, out_features, bias=True):
-    m = nn.Linear(in_features, out_features, bias)
-    nn.init.xavier_uniform_(m.weight)
-    if bias:
-        nn.init.constant_(m.bias, 0.0)
-    return m

From d985198855e8fd7209daf9b6a0785ac8d9222a74 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Mon, 26 Oct 2020 07:05:26 -0700
Subject: [PATCH 6/7] Move default untie_weights_roberta option into base architecture

---
 fairseq/models/roberta/model.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py
index 5c9f92a149..0f6efe5b33 100644
--- a/fairseq/models/roberta/model.py
+++ b/fairseq/models/roberta/model.py
@@ -393,6 +393,9 @@ class RobertaEncoder(FairseqEncoder):

     def __init__(self, args, dictionary):
         super().__init__(dictionary)
+
+        # set any missing default values
+        base_architecture(args)
         self.args = args

         if args.encoder_layers_to_keep:
@@ -417,7 +420,6 @@ def __init__(self, args, dictionary):
             q_noise=args.quant_noise_pq,
             qn_block_size=args.quant_noise_pq_block_size,
         )
-        args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)

         self.lm_head = RobertaLMHead(
             embed_dim=args.encoder_embed_dim,
@@ -495,6 +497,7 @@ def base_architecture(args):
     args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None)
     args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
     args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
+    args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)
     args.spectral_norm_classification_head = getattr(
         args, "spectral_norm_classification_head", False
     )

From cbd35836951d4135ad2239fe79cb634cb1fc6cf2 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Mon, 26 Oct 2020 07:31:39 -0700
Subject: [PATCH 7/7] Fix noisychannel example (fixes #2213)

---
 examples/noisychannel/rerank.py          | 2 +-
 examples/noisychannel/rerank_generate.py | 2 +-
 examples/noisychannel/rerank_score_bw.py | 2 +-
 examples/noisychannel/rerank_score_lm.py | 2 +-
 examples/noisychannel/rerank_tune.py     | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/noisychannel/rerank.py b/examples/noisychannel/rerank.py
index b5ffd1ca34..bb80d11a67 100644
--- a/examples/noisychannel/rerank.py
+++ b/examples/noisychannel/rerank.py
@@ -11,7 +11,7 @@
 from fairseq.data import dictionary
 from fairseq.scoring import bleu

-from . import (
+from examples.noisychannel import (
     rerank_generate,
     rerank_options,
     rerank_score_bw,
diff --git a/examples/noisychannel/rerank_generate.py b/examples/noisychannel/rerank_generate.py
index d512088de8..daeeae059a 100644
--- a/examples/noisychannel/rerank_generate.py
+++ b/examples/noisychannel/rerank_generate.py
@@ -15,7 +15,7 @@
 from fairseq import options
 from fairseq_cli import generate, preprocess

-from . import rerank_options, rerank_utils
+from examples.noisychannel import rerank_options, rerank_utils


 def gen_and_reprocess_nbest(args):
diff --git a/examples/noisychannel/rerank_score_bw.py b/examples/noisychannel/rerank_score_bw.py
index 895673b1cc..b0bc913651 100644
--- a/examples/noisychannel/rerank_score_bw.py
+++ b/examples/noisychannel/rerank_score_bw.py
@@ -9,7 +9,7 @@
 from fairseq import options
 from fairseq_cli import generate

-from . import rerank_options, rerank_utils
+from examples.noisychannel import rerank_options, rerank_utils


 def score_bw(args):
diff --git a/examples/noisychannel/rerank_score_lm.py b/examples/noisychannel/rerank_score_lm.py
index 89ebf61cce..e80948d78b 100644
--- a/examples/noisychannel/rerank_score_lm.py
+++ b/examples/noisychannel/rerank_score_lm.py
@@ -7,7 +7,7 @@

 from fairseq import options

-from . import rerank_options, rerank_utils
+from examples.noisychannel import rerank_options, rerank_utils


 def score_lm(args):
diff --git a/examples/noisychannel/rerank_tune.py b/examples/noisychannel/rerank_tune.py
index 1be71744a3..b2e8b7594a 100644
--- a/examples/noisychannel/rerank_tune.py
+++ b/examples/noisychannel/rerank_tune.py
@@ -9,7 +9,7 @@
 import numpy as np
 from fairseq import options

-from . import rerank, rerank_options
+from examples.noisychannel import rerank, rerank_options


 def random_search(args):