From c0f065e0a310c40354f6bf03097a46a06e71109b Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Fri, 23 Oct 2020 06:01:57 -0700
Subject: [PATCH 1/7] Rename type -> key in fairseq/tasks/sentence_prediction.py (fixes #2746)

---
 fairseq/tasks/sentence_prediction.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fairseq/tasks/sentence_prediction.py b/fairseq/tasks/sentence_prediction.py
index 69dc996e6a..0ec3824d04 100644
--- a/fairseq/tasks/sentence_prediction.py
+++ b/fairseq/tasks/sentence_prediction.py
@@ -135,11 +135,11 @@ def setup_task(cls, args, **kwargs):
     def load_dataset(self, split, combine=False, **kwargs):
         """Load a given dataset split (e.g., train, valid, test)."""

-        def get_path(type, split):
-            return os.path.join(self.args.data, type, split)
+        def get_path(key, split):
+            return os.path.join(self.args.data, key, split)

-        def make_dataset(type, dictionary):
-            split_path = get_path(type, split)
+        def make_dataset(key, dictionary):
+            split_path = get_path(key, split)

             dataset = data_utils.load_indexed_dataset(
                 split_path,
@@ -151,7 +151,7 @@ def make_dataset(type, dictionary):

         input0 = make_dataset("input0", self.source_dictionary)
         assert input0 is not None, "could not find dataset: {}".format(
-            get_path(type, split)
+            get_path("input0", split)
         )

         input1 = make_dataset("input1", self.source_dictionary)

From c518c2c0c782d195fd0cec9fd079d09e1bf33588 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Fri, 23 Oct 2020 06:03:46 -0700
Subject: [PATCH 2/7] Turn off logging in test_fp16_optimizer.TestGradientScaling

---
 tests/test_fp16_optimizer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_fp16_optimizer.py b/tests/test_fp16_optimizer.py
index aa6a863d32..8de8e28ce0 100644
--- a/tests/test_fp16_optimizer.py
+++ b/tests/test_fp16_optimizer.py
@@ -5,6 +5,7 @@

 import argparse
 import copy
+import logging
 import unittest

 import torch
@@ -46,6 +47,10 @@ def setUp(self):
                 },
             }
         )
+        logging.disable(logging.CRITICAL)
+
+    def tearDown(self):
+        logging.disable(logging.NOTSET)

     def run_iter(self, model, params, optimizer):
         optimizer.zero_grad()

From 135ba51afa58a810a5004ed08e7726e35ffebcbc Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Fri, 23 Oct 2020 06:13:02 -0700
Subject: [PATCH 3/7] Update preprocessing docs (fixes #2565)

---
 fairseq/options.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fairseq/options.py b/fairseq/options.py
index f2a3e7cfb1..b79443a177 100644
--- a/fairseq/options.py
+++ b/fairseq/options.py
@@ -249,11 +249,13 @@ def add_preprocess_args(parser):
     group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
                        help="target language")
     group.add_argument("--trainpref", metavar="FP", default=None,
-                       help="train file prefix")
+                       help="train file prefix (also used to build dictionaries)")
     group.add_argument("--validpref", metavar="FP", default=None,
-                       help="comma separated, valid file prefixes")
+                       help="comma separated, valid file prefixes "
+                            "(words missing from train set are replaced with <unk>)")
     group.add_argument("--testpref", metavar="FP", default=None,
-                       help="comma separated, test file prefixes")
+                       help="comma separated, test file prefixes "
+                            "(words missing from train set are replaced with <unk>)")
     group.add_argument("--align-suffix", metavar="FP", default=None,
                        help="alignment file suffix")
     group.add_argument("--destdir", metavar="DIR", default="data-bin",

From ade599805d97abc116561d5440f6f8c7928222f6 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Fri, 23 Oct 2020 06:17:58 -0700
Subject: [PATCH 4/7] Documentation updates

---
 docs/getting_started.rst     | 15 ++++++++++++---
 examples/roberta/README.md   |  1 -
 fairseq/dataclass/configs.py |  2 +-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index fa5971dd31..d227b95544 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -170,13 +170,14 @@ The easiest way to launch jobs is with the `torch.distributed.launch

 For example, to train a large English-German Transformer model on 2 nodes each
 with 8 GPUs (in total 16 GPUs), run the following command on each node,
-replacing ``node_rank=0`` with ``node_rank=1`` on the second node:
+replacing ``node_rank=0`` with ``node_rank=1`` on the second node and making
+sure to update ``--master_addr`` to the IP address of the first node:

 .. code-block:: console

     > python -m torch.distributed.launch --nproc_per_node=8 \
         --nnodes=2 --node_rank=0 --master_addr="192.168.1.1" \
-        --master_port=1234 \
+        --master_port=12345 \
         $(which fairseq-train) data-bin/wmt16_en_de_bpe32k \
         --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
         --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
@@ -184,7 +185,15 @@ replacing ``node_rank=0`` with ``node_rank=1`` on the second node:
         --lr 0.0005 --min-lr 1e-09 \
         --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
         --max-tokens 3584 \
-        --fp16 --distributed-no-spawn
+        --fp16
+
+On SLURM clusters, fairseq will automatically detect the number of nodes and
+GPUs, but a port number must be provided:
+
+.. code-block:: console
+
+    > salloc --gpus=16 --nodes 2 (...)
+    > srun fairseq-train --distributed-port 12345 (...).

 Sharding very large datasets
 ----------------------------
diff --git a/examples/roberta/README.md b/examples/roberta/README.md
index fdddd5b8d2..ca86131eea 100644
--- a/examples/roberta/README.md
+++ b/examples/roberta/README.md
@@ -276,7 +276,6 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples))
 - [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md)
 - [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md)
 - [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md)
-- Finetuning on SQuAD: coming soon

 ## Pretraining using your own data

diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py
index abcb9c4c48..484d2526d7 100644
--- a/fairseq/dataclass/configs.py
+++ b/fairseq/dataclass/configs.py
@@ -400,7 +400,7 @@ class DatasetConfig(FairseqDataclass):
     batch_size_valid: Optional[int] = field(
         default=None,
         metadata={
-            "help": "batch size of the validation batch" " (defaults to --batch-size)",
+            "help": "batch size of the validation batch (defaults to --batch-size)",
             "argparse_alias": "--max-sentences-valid",
         },
     )

From ca574a010280949e2aa29a4adf6e7c4e37e74009 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Mon, 26 Oct 2020 07:04:12 -0700
Subject: [PATCH 5/7] Removed unused Linear definition in transformer_layer.py

---
 fairseq/modules/transformer_layer.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/fairseq/modules/transformer_layer.py b/fairseq/modules/transformer_layer.py
index 8775aa7766..6f3c79de7c 100644
--- a/fairseq/modules/transformer_layer.py
+++ b/fairseq/modules/transformer_layer.py
@@ -144,7 +144,6 @@ def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None):
         residual = x
         if self.normalize_before:
             x = self.final_layer_norm(x)
-
         x = self.activation_fn(self.fc1(x))
         x = self.activation_dropout_module(x)
         x = self.fc2(x)
@@ -413,11 +412,3 @@ def forward(

     def make_generation_fast_(self, need_attn: bool = False, **kwargs):
         self.need_attn = need_attn
-
-
-def Linear(in_features, out_features, bias=True):
-    m = nn.Linear(in_features, out_features, bias)
-    nn.init.xavier_uniform_(m.weight)
-    if bias:
-        nn.init.constant_(m.bias, 0.0)
-    return m

From d985198855e8fd7209daf9b6a0785ac8d9222a74 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Mon, 26 Oct 2020 07:05:26 -0700
Subject: [PATCH 6/7] Move default untie_weights_roberta option into base architecture

---
 fairseq/models/roberta/model.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py
index 5c9f92a149..0f6efe5b33 100644
--- a/fairseq/models/roberta/model.py
+++ b/fairseq/models/roberta/model.py
@@ -393,6 +393,9 @@ class RobertaEncoder(FairseqEncoder):

     def __init__(self, args, dictionary):
         super().__init__(dictionary)
+
+        # set any missing default values
+        base_architecture(args)
         self.args = args

         if args.encoder_layers_to_keep:
@@ -417,7 +420,6 @@ def __init__(self, args, dictionary):
             q_noise=args.quant_noise_pq,
             qn_block_size=args.quant_noise_pq_block_size,
         )
-        args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)

         self.lm_head = RobertaLMHead(
             embed_dim=args.encoder_embed_dim,
@@ -495,6 +497,7 @@ def base_architecture(args):
     args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None)
     args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
     args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
+    args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)
     args.spectral_norm_classification_head = getattr(
         args, "spectral_norm_classification_head", False
     )

From cbd35836951d4135ad2239fe79cb634cb1fc6cf2 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Mon, 26 Oct 2020 07:31:39 -0700
Subject: [PATCH 7/7] Fix noisychannel example (fixes #2213)

---
 examples/noisychannel/rerank.py          | 2 +-
 examples/noisychannel/rerank_generate.py | 2 +-
 examples/noisychannel/rerank_score_bw.py | 2 +-
 examples/noisychannel/rerank_score_lm.py | 2 +-
 examples/noisychannel/rerank_tune.py     | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/noisychannel/rerank.py b/examples/noisychannel/rerank.py
index b5ffd1ca34..bb80d11a67 100644
--- a/examples/noisychannel/rerank.py
+++ b/examples/noisychannel/rerank.py
@@ -11,7 +11,7 @@
 from fairseq.data import dictionary
 from fairseq.scoring import bleu

-from . import (
+from examples.noisychannel import (
     rerank_generate,
     rerank_options,
     rerank_score_bw,
diff --git a/examples/noisychannel/rerank_generate.py b/examples/noisychannel/rerank_generate.py
index d512088de8..daeeae059a 100644
--- a/examples/noisychannel/rerank_generate.py
+++ b/examples/noisychannel/rerank_generate.py
@@ -15,7 +15,7 @@
 from fairseq import options
 from fairseq_cli import generate, preprocess

-from . import rerank_options, rerank_utils
+from examples.noisychannel import rerank_options, rerank_utils


 def gen_and_reprocess_nbest(args):
diff --git a/examples/noisychannel/rerank_score_bw.py b/examples/noisychannel/rerank_score_bw.py
index 895673b1cc..b0bc913651 100644
--- a/examples/noisychannel/rerank_score_bw.py
+++ b/examples/noisychannel/rerank_score_bw.py
@@ -9,7 +9,7 @@
 from fairseq import options
 from fairseq_cli import generate

-from . import rerank_options, rerank_utils
+from examples.noisychannel import rerank_options, rerank_utils


 def score_bw(args):
diff --git a/examples/noisychannel/rerank_score_lm.py b/examples/noisychannel/rerank_score_lm.py
index 89ebf61cce..e80948d78b 100644
--- a/examples/noisychannel/rerank_score_lm.py
+++ b/examples/noisychannel/rerank_score_lm.py
@@ -7,7 +7,7 @@

 from fairseq import options

-from . import rerank_options, rerank_utils
+from examples.noisychannel import rerank_options, rerank_utils


 def score_lm(args):
diff --git a/examples/noisychannel/rerank_tune.py b/examples/noisychannel/rerank_tune.py
index 1be71744a3..b2e8b7594a 100644
--- a/examples/noisychannel/rerank_tune.py
+++ b/examples/noisychannel/rerank_tune.py
@@ -9,7 +9,7 @@
 import numpy as np
 from fairseq import options

-from . import rerank, rerank_options
+from examples.noisychannel import rerank, rerank_options


 def random_search(args):