Misc fixes #2786

Closed · wants to merge 7 commits
15 changes: 12 additions & 3 deletions docs/getting_started.rst
@@ -170,21 +170,30 @@ The easiest way to launch jobs is with the `torch.distributed.launch

For example, to train a large English-German Transformer model on 2 nodes each
with 8 GPUs (in total 16 GPUs), run the following command on each node,
- replacing ``node_rank=0`` with ``node_rank=1`` on the second node:
+ replacing ``node_rank=0`` with ``node_rank=1`` on the second node and making
+ sure to update ``--master_addr`` to the IP address of the first node:

.. code-block:: console

> python -m torch.distributed.launch --nproc_per_node=8 \
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" \
- --master_port=1234 \
+ --master_port=12345 \
$(which fairseq-train) data-bin/wmt16_en_de_bpe32k \
--arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
--lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
--lr 0.0005 --min-lr 1e-09 \
--dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--max-tokens 3584 \
- --fp16 --distributed-no-spawn
+ --fp16

+ On SLURM clusters, fairseq will automatically detect the number of nodes and
+ GPUs, but a port number must be provided:
+
+ .. code-block:: console
+
+ > salloc --gpus=16 --nodes 2 (...)
+ > srun fairseq-train --distributed-port 12345 (...).

Sharding very large datasets
----------------------------
2 changes: 1 addition & 1 deletion examples/noisychannel/rerank.py
@@ -11,7 +11,7 @@
from fairseq.data import dictionary
from fairseq.scoring import bleu

- from . import (
+ from examples.noisychannel import (
rerank_generate,
rerank_options,
rerank_score_bw,
2 changes: 1 addition & 1 deletion examples/noisychannel/rerank_generate.py
@@ -15,7 +15,7 @@
from fairseq import options
from fairseq_cli import generate, preprocess

- from . import rerank_options, rerank_utils
+ from examples.noisychannel import rerank_options, rerank_utils


def gen_and_reprocess_nbest(args):
2 changes: 1 addition & 1 deletion examples/noisychannel/rerank_score_bw.py
@@ -9,7 +9,7 @@
from fairseq import options
from fairseq_cli import generate

- from . import rerank_options, rerank_utils
+ from examples.noisychannel import rerank_options, rerank_utils


def score_bw(args):
2 changes: 1 addition & 1 deletion examples/noisychannel/rerank_score_lm.py
@@ -7,7 +7,7 @@

from fairseq import options

- from . import rerank_options, rerank_utils
+ from examples.noisychannel import rerank_options, rerank_utils


def score_lm(args):
2 changes: 1 addition & 1 deletion examples/noisychannel/rerank_tune.py
@@ -9,7 +9,7 @@
import numpy as np
from fairseq import options

- from . import rerank, rerank_options
+ from examples.noisychannel import rerank, rerank_options


def random_search(args):
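The switch from relative to absolute imports in the rerank scripts above keeps them usable both as package modules and as plain files. As a rough standalone illustration (this snippet is not part of the PR): a file executed directly is loaded as the top-level module ``__main__`` with no parent package, so a ``from . import ...`` statement there raises ImportError, whereas ``from examples.noisychannel import ...`` resolves in both cases, provided the repository root is on ``sys.path``:

    # Hypothetical sketch, not part of the PR: how a module can tell whether
    # relative imports would work in its current execution context.
    if __package__ in (None, ""):
        # e.g. `python examples/noisychannel/rerank.py`
        print("running as a script: a relative import here would raise ImportError")
    else:
        # e.g. `python -m examples.noisychannel.rerank` or a normal import
        print(f"imported from package {__package__!r}: relative imports would work")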
1 change: 0 additions & 1 deletion examples/roberta/README.md
@@ -276,7 +276,6 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples))
- [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md)
- [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md)
- [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md)
- - Finetuning on SQuAD: coming soon

## Pretraining using your own data

2 changes: 1 addition & 1 deletion fairseq/dataclass/configs.py
@@ -400,7 +400,7 @@ class DatasetConfig(FairseqDataclass):
batch_size_valid: Optional[int] = field(
default=None,
metadata={
"help": "batch size of the validation batch" " (defaults to --batch-size)",
"help": "batch size of the validation batch (defaults to --batch-size)",
"argparse_alias": "--max-sentences-valid",
},
)
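This help-string change is behaviour-preserving: the old value relied on Python's implicit concatenation of adjacent string literals, which produces exactly the same text as the new single literal. A quick standalone check (not part of the PR):

    # Adjacent string literals are joined at compile time.
    old = "batch size of the validation batch" " (defaults to --batch-size)"
    new = "batch size of the validation batch (defaults to --batch-size)"
    assert old == new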
5 changes: 4 additions & 1 deletion fairseq/models/roberta/model.py
@@ -393,6 +393,9 @@ class RobertaEncoder(FairseqEncoder):

def __init__(self, args, dictionary):
super().__init__(dictionary)

+ # set any missing default values
+ base_architecture(args)
self.args = args

if args.encoder_layers_to_keep:
@@ -417,7 +420,6 @@ def __init__(self, args, dictionary):
q_noise=args.quant_noise_pq,
qn_block_size=args.quant_noise_pq_block_size,
)
- args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)

self.lm_head = RobertaLMHead(
embed_dim=args.encoder_embed_dim,
@@ -495,6 +497,7 @@ def base_architecture(args):
args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None)
args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
+ args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)
args.spectral_norm_classification_head = getattr(
args, "spectral_norm_classification_head", False
)
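Calling ``base_architecture(args)`` from ``RobertaEncoder.__init__`` is safe because that function only fills in attributes the caller has not set, via ``getattr`` defaults; moving the ``untie_weights_roberta`` default there keeps all defaults in one place. A minimal sketch of the pattern, with made-up field values rather than fairseq's real defaults:

    import argparse

    def toy_base_architecture(args):
        # each attribute is set only if it is missing, so explicit values survive
        args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
        args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)

    args = argparse.Namespace(encoder_embed_dim=1024)  # caller overrides one field
    toy_base_architecture(args)
    print(args.encoder_embed_dim, args.untie_weights_roberta)  # 1024 False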
9 changes: 0 additions & 9 deletions fairseq/modules/transformer_layer.py
@@ -144,7 +144,6 @@ def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None):
residual = x
if self.normalize_before:
x = self.final_layer_norm(x)

x = self.activation_fn(self.fc1(x))
x = self.activation_dropout_module(x)
x = self.fc2(x)
@@ -413,11 +412,3 @@ def forward(

def make_generation_fast_(self, need_attn: bool = False, **kwargs):
self.need_attn = need_attn


- def Linear(in_features, out_features, bias=True):
-     m = nn.Linear(in_features, out_features, bias)
-     nn.init.xavier_uniform_(m.weight)
-     if bias:
-         nn.init.constant_(m.bias, 0.0)
-     return m
8 changes: 5 additions & 3 deletions fairseq/options.py
@@ -249,11 +249,13 @@ def add_preprocess_args(parser):
group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
help="target language")
group.add_argument("--trainpref", metavar="FP", default=None,
help="train file prefix")
help="train file prefix (also used to build dictionaries)")
group.add_argument("--validpref", metavar="FP", default=None,
help="comma separated, valid file prefixes")
help="comma separated, valid file prefixes "
"(words missing from train set are replaced with <unk>)")
group.add_argument("--testpref", metavar="FP", default=None,
help="comma separated, test file prefixes")
help="comma separated, test file prefixes "
"(words missing from train set are replaced with <unk>)")
group.add_argument("--align-suffix", metavar="FP", default=None,
help="alignment file suffix")
group.add_argument("--destdir", metavar="DIR", default="data-bin",
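The expanded help text documents existing preprocessing behaviour: dictionaries are built from ``--trainpref`` only, so tokens that occur solely in the valid or test prefixes are binarized as ``<unk>``. A toy illustration of that mapping (hypothetical tokens, not fairseq's actual Dictionary class):

    # Build a vocabulary from the training tokens only, then encode validation
    # tokens against it; unseen tokens fall back to <unk>.
    train_tokens = ["the", "cat", "sat"]
    valid_tokens = ["the", "dog", "sat"]

    vocab = {tok: i for i, tok in enumerate(["<unk>"] + sorted(set(train_tokens)))}
    encoded_valid = [vocab.get(tok, vocab["<unk>"]) for tok in valid_tokens]
    print(encoded_valid)  # [3, 0, 2]: "dog" never appeared in train, so it maps to <unk>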
10 changes: 5 additions & 5 deletions fairseq/tasks/sentence_prediction.py
@@ -135,11 +135,11 @@ def setup_task(cls, args, **kwargs):
def load_dataset(self, split, combine=False, **kwargs):
"""Load a given dataset split (e.g., train, valid, test)."""

- def get_path(type, split):
-     return os.path.join(self.args.data, type, split)
+ def get_path(key, split):
+     return os.path.join(self.args.data, key, split)

- def make_dataset(type, dictionary):
-     split_path = get_path(type, split)
+ def make_dataset(key, dictionary):
+     split_path = get_path(key, split)

dataset = data_utils.load_indexed_dataset(
split_path,
@@ -151,7 +151,7 @@ def make_dataset(type, dictionary):

input0 = make_dataset("input0", self.source_dictionary)
assert input0 is not None, "could not find dataset: {}".format(
- get_path(type, split)
+ get_path("input0", split)
)
input1 = make_dataset("input1", self.source_dictionary)

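Renaming the helper parameters from ``type`` to ``key`` also fixes the assertion message: at the point of the assert, ``type`` referred to the Python builtin, so a missing ``input0`` dataset would surface as a TypeError from ``os.path.join`` rather than the intended error message. A standalone sketch of the difference, using a made-up data directory:

    import os

    def get_path(key, split):
        return os.path.join("/data", key, split)

    try:
        get_path(type, "train")  # old message: the builtin `type` leaks in
    except TypeError as exc:
        print("old assert message would itself fail:", exc)

    print(get_path("input0", "train"))  # new message: /data/input0/train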
5 changes: 5 additions & 0 deletions tests/test_fp16_optimizer.py
@@ -5,6 +5,7 @@

import argparse
import copy
+ import logging
import unittest

import torch
@@ -46,6 +47,10 @@ def setUp(self):
},
}
)
+ logging.disable(logging.CRITICAL)
+
+ def tearDown(self):
+     logging.disable(logging.NOTSET)

def run_iter(self, model, params, optimizer):
optimizer.zero_grad()
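The setUp/tearDown additions silence library logging for the duration of each test and then restore it. The same pattern in a self-contained test case (illustrative only, not taken from the PR):

    import logging
    import unittest

    class QuietTestCase(unittest.TestCase):
        def setUp(self):
            logging.disable(logging.CRITICAL)  # drop all records up to CRITICAL

        def tearDown(self):
            logging.disable(logging.NOTSET)    # restore normal logging

        def test_quiet(self):
            logging.getLogger(__name__).warning("not shown while logging is disabled")
            self.assertTrue(True)

    if __name__ == "__main__":
        unittest.main()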