Misc fixes #2786

Closed · wants to merge 7 commits
15 changes: 12 additions & 3 deletions docs/getting_started.rst
@@ -170,21 +170,30 @@ The easiest way to launch jobs is with the `torch.distributed.launch

For example, to train a large English-German Transformer model on 2 nodes each
with 8 GPUs (in total 16 GPUs), run the following command on each node,
- replacing ``node_rank=0`` with ``node_rank=1`` on the second node:
+ replacing ``node_rank=0`` with ``node_rank=1`` on the second node and making
+ sure to update ``--master_addr`` to the IP address of the first node:

.. code-block:: console

> python -m torch.distributed.launch --nproc_per_node=8 \
--nnodes=2 --node_rank=0 --master_addr="192.168.1.1" \
- --master_port=1234 \
+ --master_port=12345 \
$(which fairseq-train) data-bin/wmt16_en_de_bpe32k \
--arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
--lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
--lr 0.0005 --min-lr 1e-09 \
--dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--max-tokens 3584 \
- --fp16 --distributed-no-spawn
+ --fp16

+ On SLURM clusters, fairseq will automatically detect the number of nodes and
+ GPUs, but a port number must be provided:
+
+ .. code-block:: console
+
+ > salloc --gpus=16 --nodes 2 (...)
+ > srun fairseq-train --distributed-port 12345 (...).

Sharding very large datasets
----------------------------
2 changes: 1 addition & 1 deletion examples/noisychannel/rerank.py
@@ -11,7 +11,7 @@
from fairseq.data import dictionary
from fairseq.scoring import bleu

- from . import (
+ from examples.noisychannel import (
rerank_generate,
rerank_options,
rerank_score_bw,
2 changes: 1 addition & 1 deletion examples/noisychannel/rerank_generate.py
@@ -15,7 +15,7 @@
from fairseq import options
from fairseq_cli import generate, preprocess

- from . import rerank_options, rerank_utils
+ from examples.noisychannel import rerank_options, rerank_utils


def gen_and_reprocess_nbest(args):
2 changes: 1 addition & 1 deletion examples/noisychannel/rerank_score_bw.py
@@ -9,7 +9,7 @@
from fairseq import options
from fairseq_cli import generate

- from . import rerank_options, rerank_utils
+ from examples.noisychannel import rerank_options, rerank_utils


def score_bw(args):
2 changes: 1 addition & 1 deletion examples/noisychannel/rerank_score_lm.py
@@ -7,7 +7,7 @@

from fairseq import options

- from . import rerank_options, rerank_utils
+ from examples.noisychannel import rerank_options, rerank_utils


def score_lm(args):
2 changes: 1 addition & 1 deletion examples/noisychannel/rerank_tune.py
@@ -9,7 +9,7 @@
import numpy as np
from fairseq import options

- from . import rerank, rerank_options
+ from examples.noisychannel import rerank, rerank_options


def random_search(args):
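The switch from relative to absolute imports in the rerank scripts above keeps them usable both as package modules and as plain files. As a rough standalone illustration (this snippet is not part of the PR): a file executed directly is loaded as the top-level module ``__main__`` with no parent package, so a ``from . import ...`` statement there raises ImportError, whereas ``from examples.noisychannel import ...`` resolves in both cases, provided the repository root is on ``sys.path``:

    # Hypothetical sketch, not part of the PR: how a module can tell whether
    # relative imports would work in its current execution context.
    if __package__ in (None, ""):
        # e.g. `python examples/noisychannel/rerank.py`
        print("running as a script: a relative import here would raise ImportError")
    else:
        # e.g. `python -m examples.noisychannel.rerank` or a normal import
        print(f"imported from package {__package__!r}: relative imports would work")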
1 change: 0 additions & 1 deletion examples/roberta/README.md
@@ -276,7 +276,6 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples))
- [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md)
- [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md)
- [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md)
- - Finetuning on SQuAD: coming soon

## Pretraining using your own data

2 changes: 1 addition & 1 deletion fairseq/dataclass/configs.py
@@ -400,7 +400,7 @@ class DatasetConfig(FairseqDataclass):
batch_size_valid: Optional[int] = field(
default=None,
metadata={
"help": "batch size of the validation batch" " (defaults to --batch-size)",
"help": "batch size of the validation batch (defaults to --batch-size)",
"argparse_alias": "--max-sentences-valid",
},
)
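This help-string change is behaviour-preserving: the old value relied on Python's implicit concatenation of adjacent string literals, which produces exactly the same text as the new single literal. A quick standalone check (not part of the PR):

    # Adjacent string literals are joined at compile time.
    old = "batch size of the validation batch" " (defaults to --batch-size)"
    new = "batch size of the validation batch (defaults to --batch-size)"
    assert old == new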
5 changes: 4 additions & 1 deletion fairseq/models/roberta/model.py
@@ -393,6 +393,9 @@ class RobertaEncoder(FairseqEncoder):

def __init__(self, args, dictionary):
super().__init__(dictionary)

+ # set any missing default values
+ base_architecture(args)
self.args = args

if args.encoder_layers_to_keep:
@@ -417,7 +420,6 @@ def __init__(self, args, dictionary):
q_noise=args.quant_noise_pq,
qn_block_size=args.quant_noise_pq_block_size,
)
- args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)

self.lm_head = RobertaLMHead(
embed_dim=args.encoder_embed_dim,
@@ -495,6 +497,7 @@ def base_architecture(args):
args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None)
args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
+ args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)
args.spectral_norm_classification_head = getattr(
args, "spectral_norm_classification_head", False
)
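Calling ``base_architecture(args)`` from ``RobertaEncoder.__init__`` is safe because that function only fills in attributes the caller has not set, via ``getattr`` defaults; moving the ``untie_weights_roberta`` default there keeps all defaults in one place. A minimal sketch of the pattern, with made-up field values rather than fairseq's real defaults:

    import argparse

    def toy_base_architecture(args):
        # each attribute is set only if it is missing, so explicit values survive
        args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
        args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False)

    args = argparse.Namespace(encoder_embed_dim=1024)  # caller overrides one field
    toy_base_architecture(args)
    print(args.encoder_embed_dim, args.untie_weights_roberta)  # 1024 False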
9 changes: 0 additions & 9 deletions fairseq/modules/transformer_layer.py
@@ -144,7 +144,6 @@ def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None):
residual = x
if self.normalize_before:
x = self.final_layer_norm(x)

x = self.activation_fn(self.fc1(x))
x = self.activation_dropout_module(x)
x = self.fc2(x)
@@ -413,11 +412,3 @@ def forward(

def make_generation_fast_(self, need_attn: bool = False, **kwargs):
self.need_attn = need_attn


- def Linear(in_features, out_features, bias=True):
-     m = nn.Linear(in_features, out_features, bias)
-     nn.init.xavier_uniform_(m.weight)
-     if bias:
-         nn.init.constant_(m.bias, 0.0)
-     return m
8 changes: 5 additions & 3 deletions fairseq/options.py
@@ -249,11 +249,13 @@ def add_preprocess_args(parser):
group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
help="target language")
group.add_argument("--trainpref", metavar="FP", default=None,
help="train file prefix")
help="train file prefix (also used to build dictionaries)")
group.add_argument("--validpref", metavar="FP", default=None,
help="comma separated, valid file prefixes")
help="comma separated, valid file prefixes "
"(words missing from train set are replaced with <unk>)")
group.add_argument("--testpref", metavar="FP", default=None,
help="comma separated, test file prefixes")
help="comma separated, test file prefixes "
"(words missing from train set are replaced with <unk>)")
group.add_argument("--align-suffix", metavar="FP", default=None,
help="alignment file suffix")
group.add_argument("--destdir", metavar="DIR", default="data-bin",
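The expanded help text documents existing preprocessing behaviour: dictionaries are built from ``--trainpref`` only, so tokens that occur solely in the valid or test prefixes are binarized as ``<unk>``. A toy illustration of that mapping (hypothetical tokens, not fairseq's actual Dictionary class):

    # Build a vocabulary from the training tokens only, then encode validation
    # tokens against it; unseen tokens fall back to <unk>.
    train_tokens = ["the", "cat", "sat"]
    valid_tokens = ["the", "dog", "sat"]

    vocab = {tok: i for i, tok in enumerate(["<unk>"] + sorted(set(train_tokens)))}
    encoded_valid = [vocab.get(tok, vocab["<unk>"]) for tok in valid_tokens]
    print(encoded_valid)  # [3, 0, 2]: "dog" never appeared in train, so it maps to <unk>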
10 changes: 5 additions & 5 deletions fairseq/tasks/sentence_prediction.py
@@ -135,11 +135,11 @@ def setup_task(cls, args, **kwargs):
def load_dataset(self, split, combine=False, **kwargs):
"""Load a given dataset split (e.g., train, valid, test)."""

- def get_path(type, split):
-     return os.path.join(self.args.data, type, split)
+ def get_path(key, split):
+     return os.path.join(self.args.data, key, split)

- def make_dataset(type, dictionary):
-     split_path = get_path(type, split)
+ def make_dataset(key, dictionary):
+     split_path = get_path(key, split)

dataset = data_utils.load_indexed_dataset(
split_path,
@@ -151,7 +151,7 @@ def make_dataset(type, dictionary):

input0 = make_dataset("input0", self.source_dictionary)
assert input0 is not None, "could not find dataset: {}".format(
- get_path(type, split)
+ get_path("input0", split)
)
input1 = make_dataset("input1", self.source_dictionary)

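Renaming the helper parameters from ``type`` to ``key`` also fixes the assertion message: at the point of the assert, ``type`` referred to the Python builtin, so a missing ``input0`` dataset would surface as a TypeError from ``os.path.join`` rather than the intended error message. A standalone sketch of the difference, using a made-up data directory:

    import os

    def get_path(key, split):
        return os.path.join("/data", key, split)

    try:
        get_path(type, "train")  # old message: the builtin `type` leaks in
    except TypeError as exc:
        print("old assert message would itself fail:", exc)

    print(get_path("input0", "train"))  # new message: /data/input0/train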
5 changes: 5 additions & 0 deletions tests/test_fp16_optimizer.py
@@ -5,6 +5,7 @@

import argparse
import copy
+ import logging
import unittest

import torch
@@ -46,6 +47,10 @@ def setUp(self):
},
}
)
+ logging.disable(logging.CRITICAL)
+
+ def tearDown(self):
+     logging.disable(logging.NOTSET)

def run_iter(self, model, params, optimizer):
optimizer.zero_grad()
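The setUp/tearDown additions silence library logging for the duration of each test and then restore it. The same pattern in a self-contained test case (illustrative only, not taken from the PR):

    import logging
    import unittest

    class QuietTestCase(unittest.TestCase):
        def setUp(self):
            logging.disable(logging.CRITICAL)  # drop all records up to CRITICAL

        def tearDown(self):
            logging.disable(logging.NOTSET)    # restore normal logging

        def test_quiet(self):
            logging.getLogger(__name__).warning("not shown while logging is disabled")
            self.assertTrue(True)

    if __name__ == "__main__":
        unittest.main()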