From dbfca6e6d6c9f185144cca9d5ab55a86d824905e Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 20 Nov 2020 05:59:25 -0800 Subject: [PATCH] Add fairseq-hydra-train and update docs (#1449) Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/1449 Test Plan: Imported from OSS Reviewed By: alexeib Differential Revision: D25094525 Pulled By: myleott fbshipit-source-id: 430387d11196d3292933bb168cf09ea16ebc0d3b --- docs/hydra_integration.md | 226 ++++++++++++++++++------------- examples/wav2vec/README.md | 21 ++- fairseq/config/config.yaml | 6 +- fairseq/dataclass/configs.py | 6 + fairseq/modules/cross_entropy.py | 6 +- fairseq_cli/hydra_train.py | 34 ++++- setup.py | 17 ++- 7 files changed, 203 insertions(+), 113 deletions(-) diff --git a/docs/hydra_integration.md b/docs/hydra_integration.md index f924de961b..8e4082cb24 100644 --- a/docs/hydra_integration.md +++ b/docs/hydra_integration.md @@ -1,57 +1,70 @@ ## Hydra -[Hydra](https://github.com/facebookresearch/hydra) is an open-source Python framework that simplifies the development of -research and other complex applications. The key feature is the ability to dynamically create a hierarchical -configuration by composition and override it through config files and the command line. The name Hydra comes from its -ability to run multiple similar jobs - much like a Hydra with multiple heads. +[Hydra](https://github.com/facebookresearch/hydra) is an open-source Python +framework that simplifies the development of research and other complex +applications. The key feature is the ability to dynamically create a +hierarchical configuration by composition and override it through config files +and the command line. The name Hydra comes from its ability to run multiple +similar jobs - much like a Hydra with multiple heads. ## Motivation -Until recently, all components in fairseq were configured through a shared "args" namespace that was created at -application startup. Components declared their own "add_args" method to update the argparse parser, hoping that -the names would not clash with arguments from other components. While this model works for smaller applications, -as fairseq grew and became integrated into other applications, this became problematic. -In order to determine how to configure each component, one needed to a) examine what args were added by this component, and -b) read the code to figure out what shared arguments it is using that were added in other places. Reproducing -models involved sharing commands that often contained dozens of command line switches. - -The model described above is still supported by fairseq for backward compatibility, but will be deprecated some time -in the future. - -New components in fairseq should now create a dataclass that encapsulates all parameters required to configure this -component. The dataclass is registered along with the component, and fairseq takes care of constructing and -providing this configuration object to the component's constructor. Note that sharing parameters can optionally -still work, but one has to explicitly point to the "source of truth" (see inheritance example below). -These changes make components in fairseq -more independent and re-usable by other applications: all that is needed to create a component is to initialize its -dataclass and overwrite some of the defaults. 
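To make the dataclass-based workflow concrete, here is a minimal, illustrative sketch using plain Python dataclasses; the field names mirror the `InteractiveConfig` example shown later in this document, and the sketch is not taken verbatim from fairseq:

```python
from dataclasses import dataclass, field

# Minimal sketch: a config dataclass with typed fields, help metadata and defaults.
@dataclass
class InteractiveConfig:
    buffer_size: int = field(
        default=0, metadata={"help": "read this many sentences into a buffer"}
    )
    input: str = field(
        default="-", metadata={"help": "file to read from; use - for stdin"}
    )

# Another application only needs to instantiate the dataclass and overwrite
# some of the defaults in order to configure the component.
cfg = InteractiveConfig(buffer_size=16)
print(cfg.buffer_size, cfg.input)  # 16 -
```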
- -While configuring fairseq through command line (using either the legacy argparse based or the new Hydra based entry points) is still -fully supported, you can now take advantage of configuring fairseq completely or piece-by-piece through -hierarchical YAML configuration files. These files can also be shipped as examples that others can use to run -an identically configured job. - -Additionally, Hydra has a rich and growing -[library of plugins](https://github.com/facebookresearch/hydra/tree/master/plugins) that provide functionality such as -hyperparameter sweeping (including using bayesian optimization through the [Ax](https://github.com/facebook/Ax) library), -job launching across various platforms, and more. +Until recently, all components in fairseq were configured through a shared +`args` namespace that was created at application startup. Components declared +their own `add_args` method to update the argparse parser, hoping that the names +would not clash with arguments from other components. While this model works for +smaller applications, as fairseq grew and became integrated into other +applications, this became problematic. In order to determine how to configure +each component, one needed to a) examine what args were added by this component, +and b) read the code to figure out what shared arguments it is using that were +added in other places. Reproducing models involved sharing commands that often +contained dozens of command line switches. + +The model described above is still supported by fairseq for backward +compatibility, but will be deprecated some time in the future. + +New components in fairseq should now create a dataclass that encapsulates all +parameters required to configure this component. The dataclass is registered +along with the component, and fairseq takes care of constructing and providing +this configuration object to the component's constructor. Note that sharing +parameters can optionally still work, but one has to explicitly point to the +"source of truth" (see inheritance example below). These changes make components +in fairseq more independent and re-usable by other applications: all that is +needed to create a component is to initialize its dataclass and overwrite some +of the defaults. + +While configuring fairseq through command line (using either the legacy argparse +based or the new Hydra based entry points) is still fully supported, you can now +take advantage of configuring fairseq completely or piece-by-piece through +hierarchical YAML configuration files. These files can also be shipped as +examples that others can use to run an identically configured job. + +Additionally, Hydra has a rich and growing [library of +plugins](https://github.com/facebookresearch/hydra/tree/master/plugins) that +provide functionality such as hyperparameter sweeping (including using bayesian +optimization through the [Ax](https://github.com/facebook/Ax) library), job +launching across various platforms, and more. ## Creating or migrating components -In general, each new (or updated) component should provide a companion [dataclass](https://www.python.org/dev/peps/pep-0557/). These dataclass are typically located in the same -file as the component and are passed as arguments to the register_*() functions. Top-level configs that should be -present in every fairseq application are placed in the [global](fairseq/dataclass/configs.py) config file and added -to the FairseqConfig object. - -Each dataclass is a plain-old-data object, similar to a NamedTuple. 
These classes are decorated with a @dataclass -decorator, and typically inherit from `FairseqDataclass` (which adds some functionality for backward compatibility). -Each field must have a type, and generally has metadata (such as a help string) and a default value. Only primitive types or other config objects are allowed as +In general, each new (or updated) component should provide a companion +[dataclass](https://www.python.org/dev/peps/pep-0557/). These dataclass are +typically located in the same file as the component and are passed as arguments +to the `register_*()` functions. Top-level configs that should be present in +every fairseq application are placed in the +[global](fairseq/dataclass/configs.py) config file and added to the +`FairseqConfig` object. + +Each dataclass is a plain-old-data object, similar to a `NamedTuple`. These +classes are decorated with a `@dataclass` decorator, and typically inherit from +`FairseqDataclass` (which adds some functionality for backward compatibility). +Each field must have a type, and generally has metadata (such as a help string) +and a default value. Only primitive types or other config objects are allowed as data types for each field. - Example: - +#### Example: -``` python +```python from dataclasses import dataclass, field from fairseq.dataclass import FairseqDataclass @@ -71,11 +84,12 @@ class InteractiveConfig(FairseqDataclass): ### Inherting values -Some components require sharing a value. For example, a learning rate scheduler and an optimizer may both need to -know the initial learning rate value. One can declare a field that, by default, will -inherit its value from another config node in the same hierarchy: +Some components require sharing a value. For example, a learning rate scheduler +and an optimizer may both need to know the initial learning rate value. One can +declare a field that, by default, will inherit its value from another config +node in the same hierarchy: -``` python +```python @dataclass FairseqAdamConfig(FairseqDataclass): ... @@ -83,18 +97,21 @@ FairseqAdamConfig(FairseqDataclass): ... ``` -`II("optimization.lr")` is syntactic sugar for `"${optimization.lr}"` , which is the value one can use in a YAML config file or through -command line to achieve the same effect. Note that this assumes that there is an "optimization" config object -in the root config and it has a field called "lr". +`II("optimization.lr")` is syntactic sugar for `"${optimization.lr}"`, which is +the value one can use in a YAML config file or through command line to achieve +the same effect. Note that this assumes that there is an "optimization" config +object in the root config and it has a field called "lr". ### Tasks and Models -Creating Tasks and Models works same as before, except that legacy implementations now inherit from Legacy* base classes, -while new components inherit from FairseqTask and FairseqModel and provide a dataclass to the register_*() functions. +Creating Tasks and Models works same as before, except that legacy +implementations now inherit from `LegacyFairseq*` base classes, while new +components inherit from `FairseqTask` and `FairseqModel` and provide a dataclass +to the `register_*()` functions. -Task example: +#### Task example: -``` python +```python @dataclass class LanguageModelingConfig(FairseqDataclass): data: Optional[str] = field( @@ -110,9 +127,9 @@ class LanguageModelingTask(LegacyFairseqTask): ... 
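    # (Illustrative aside, assuming the usual fairseq pattern described above:
    # the task class is registered together with its dataclass, e.g. via
    # @register_task("language_modeling", dataclass=LanguageModelingConfig),
    # which is how fairseq knows to construct the config object and pass it
    # to the task's constructor.)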
``` -Model example: +#### Model example: -``` python +```python @dataclass class TransformerLanguageModelConfig(FairseqDataclass): activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( @@ -131,9 +148,10 @@ class TransformerLanguageModel(FairseqLanguageModel): ### Other components -Other components work as before, but they now take their configuration dataclass as the only constructor argument: +Other components work as before, but they now take their configuration dataclass +as the only constructor argument: -``` python +```python @dataclass class MosesTokenizerConfig(FairseqDataclass): source_lang: str = field(default="en", metadata={"help": "source language"}) @@ -145,50 +163,61 @@ class MosesTokenizer(object): ... ``` -Note that if you are adding a new registry for a new set of components, you need to add it to the FairseqConfig object in -fairseq/dataclass/configs.py: +Note that if you are adding a new registry for a new set of components, you need +to add it to the `FairseqConfig` object in `fairseq/dataclass/configs.py`: -``` python +```python @dataclass class FairseqConfig(object): ... my_new_registry: Any = None ``` -## Training with hydra_train.py +## Training with `fairseq-hydra-train` -To fully take advantage of configuration flexibility offered by Hydra, you may want to train new models using the -hydra_train.py entry point located in the fairseq_cli directory. Legacy CLI tools such as train.py, -will remain supported for the foreseeable future but will be deprecated eventually. +To fully take advantage of configuration flexibility offered by Hydra, you may +want to train new models using the `fairseq-hydra-train` entry point. Legacy CLI +tools such as `fairseq-train` will remain supported for the foreseeable future +but will be deprecated eventually. -On startup, Hydra will create a configuration object that contains a hierarchy of all the necessary dataclasses -populated with their default values in the code. The default values are overwritten by values found in YAML files in -fairseq/config directory (which currently just set default task, optimizer, etc) and then further overwritten by values -provided through command line arguments. +On startup, Hydra will create a configuration object that contains a hierarchy +of all the necessary dataclasses populated with their default values in the +code. The default values are overwritten by values found in YAML files in +`fairseq/config` directory (which currently sets minimal defaults) and then +further overwritten by values provided through command line arguments. Some of the most common use cases are shown below: -### 1. Overwrite default values through command line: +### 1. Override default values through command line: ```shell script -python fairseq_cli/hydra_train.py distributed_training.distributed_world_size=1 dataset.batch_size=2 task.data=data-bin \ -model=transformer_lm/transformer_lm_gpt task=language_modeling optimization.max_update=5000 - +$ fairseq-hydra-train \ + distributed_training.distributed_world_size=1 \ + dataset.batch_size=2 \ + task.data=data-bin \ + model=transformer_lm/transformer_lm_gpt \ + task=language_modeling \ + optimization.max_update=5000 ``` -Note that along with explicitly providing values for parameters such as dataset.batch_size, this also tells Hydra to overlay configuration found in `fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml` -over the default values in the dataclass. 
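To build intuition for how these dot-separated overrides compose with the defaults, here is a small, self-contained sketch using OmegaConf (the configuration library Hydra builds on); the keys and values below are illustrative and not the full fairseq config:

```python
from omegaconf import OmegaConf

# Defaults, as they might come from the dataclasses and bundled YAML files.
defaults = OmegaConf.create(
    {"dataset": {"batch_size": 8}, "optimization": {"max_update": 50000}}
)

# Command line overrides are dot-separated key=value pairs ("dotlist" style).
overrides = OmegaConf.from_dotlist(
    ["dataset.batch_size=2", "optimization.max_update=5000"]
)

cfg = OmegaConf.merge(defaults, overrides)
print(cfg.dataset.batch_size)       # 2
print(cfg.optimization.max_update)  # 5000
```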
If you want to train a model without specifying a particular architecture -you can simply specify model=transformer_lm. This only works for migrated tasks and models. +Note that along with explicitly providing values for parameters such as +`dataset.batch_size`, this also tells Hydra to overlay configuration found in +`fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml` over the default +values in the dataclass. If you want to train a model without specifying a +particular architecture you can simply specify `model=transformer_lm`. This only +works for migrated tasks and models. ### 2. Replace bundled configs with an external config: ```shell script -python fairseq_cli/hydra_train.py --config-path /path/to/external/configs --config-name wiki103 +$ fairseq-hydra-train \ + --config-path /path/to/external/configs \ + --config-name wiki103 ``` -where /path/to/external/configs/wiki103.yaml contains: +where `/path/to/external/configs/wiki103.yaml` contains: -``` yaml +```yaml # @package _group_ model: @@ -211,24 +240,38 @@ lr_scheduler: _name: cosine ``` -Note that here bundled configs from `fairseq/config` directory are not used, however the defaults from each dataclass will still be used (unless overwritten by your external config). +Note that here bundled configs from `fairseq/config` directory are not used, +however the defaults from each dataclass will still be used (unless overwritten +by your external config). -Additionally you can choose to break up your configs by creating a directory structure in the same location as your main config file, with the names of the top-level fields -(such as "model", "dataset", etc), and placing config files with meaningful names that would populate that specific section of your -top-level config file (for example, you might have model/small_transformer_lm.yaml, model/big_transformer_lm.yaml, etc). You can then specify the correct configuration via command line, defaults in the main config, or even launch all of them as a sweep (see Hydra documentation on how to do this). +Additionally you can choose to break up your configs by creating a directory +structure in the same location as your main config file, with the names of the +top-level fields (such as "model", "dataset", etc), and placing config files +with meaningful names that would populate that specific section of your +top-level config file (for example, you might have +`model/small_transformer_lm.yaml`, `model/big_transformer_lm.yaml`, etc). You +can then specify the correct configuration via command line, defaults in the +main config, or even launch all of them as a sweep (see Hydra documentation on +how to do this). ### 3. Add an external config directory to Hydra search path: -This allows combining default configuration (including using any bundled config files), while specifying your own config files for some parts of the configuration. +This allows combining default configuration (including using any bundled config +files), while specifying your own config files for some parts of the +configuration. 
```shell script -python fairseq_cli/hydra_train.py distributed_training.distributed_world_size=1 dataset.batch_size=2 \ -task.data=/path/to/data/ model=transformer_lm/2_layers task=language_modeling optimization.max_update=5000 \ ---config-dir /path/to/external/configs - +$ fairseq-hydra-train \ + distributed_training.distributed_world_size=1 \ + dataset.batch_size=2 \ + task.data=/path/to/data/ \ + model=transformer_lm/2_layers \ + task=language_modeling \ + optimization.max_update=5000 \ + --config-dir /path/to/external/configs ``` -where /path/to/external/configs has the following structure: +where `/path/to/external/configs` has the following structure: ``` . +-- model @@ -236,5 +279,6 @@ where /path/to/external/configs has the following structure: | | +-- 2_layers.yaml ``` -and 2_layers.yaml contains a copy of transformer_lm_gpt.yaml but with decoder_layers set to 2. You can add -other configs to configure other components as well. +and `2_layers.yaml` contains a copy of `transformer_lm_gpt.yaml` but with +`decoder_layers` set to 2. You can add other configs to configure other +components as well. diff --git a/examples/wav2vec/README.md b/examples/wav2vec/README.md index 442a92553a..fdbf844ec7 100644 --- a/examples/wav2vec/README.md +++ b/examples/wav2vec/README.md @@ -56,8 +56,10 @@ This configuration was used for the base model trained on the Librispeech datase Note that the input is expected to be single channel, sampled at 16 kHz ```shell script -$ python fairseq_cli/hydra_train.py task.data=/path/to/data \ ---config-path /path/to/fairseq-py/examples/wav2vec/config/pretraining --config-name wav2vec2_base_librispeech +$ fairseq-hydra-train \ + task.data=/path/to/data \ + --config-path /path/to/fairseq-py/examples/wav2vec/config/pretraining \ + --config-name wav2vec2_base_librispeech ``` Note: you can simulate 64 GPUs by using k GPUs and adding command line parameters (before --config-path) @@ -68,8 +70,10 @@ Note: you can simulate 64 GPUs by using k GPUs and adding command line parameter This configuration was used for the large model trained on the Libri-light dataset in the wav2vec 2.0 paper ```shell script -$ python fairseq_cli/hydra_train.py task.data=/path/to/data \ ---config-path /path/to/fairseq-py/examples/wav2vec/config/pretraining --config-name wav2vec2_large_librivox +$ fairseq-hydra-train \ + task.data=/path/to/data \ + --config-path /path/to/fairseq-py/examples/wav2vec/config/pretraining \ + --config-name wav2vec2_large_librivox ``` Note: you can simulate 128 GPUs by using k GPUs and adding command line parameters (before --config-path) @@ -88,9 +92,12 @@ $ python libri_labels.py /path/to/tsv --output-dir /output/dir --output-name $sp Fine-tuning on 100h of Librispeech with letter targets: ```shell script -python fairseq_cli/hydra_train.py distributed_training.distributed_port=$PORT task.data=/path/to/data \ -model.w2v_path=/path/to/model.pt --config-path /path/to/fairseq-py/examples/wav2vec/config/finetuning \ ---config-name base_100h +$ fairseq-hydra-train \ + distributed_training.distributed_port=$PORT \ + task.data=/path/to/data \ + model.w2v_path=/path/to/model.pt \ + --config-path /path/to/fairseq-py/examples/wav2vec/config/finetuning \ + --config-name base_100h ``` There are other config files in the config/finetuning directory that can be used to fine-tune on other splits. 
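The idea behind the "simulate 64 (or 128) GPUs" notes above is gradient accumulation: keep the effective batch size constant by updating less often. A rough sketch of the arithmetic follows; the parameter names are taken from fairseq's `distributed_training` and `optimization` config groups, and this is an illustration rather than the canonical command, so check the surrounding README text and config dataclasses for the authoritative flags:

```python
# Sketch: simulate training on `target_gpus` GPUs when only `actual_gpus`
# are available, by accumulating gradients over several steps per update.
target_gpus = 64  # world size the published config assumes
actual_gpus = 8   # GPUs actually available (k)
update_freq = target_gpus // actual_gpus  # accumulate 8 steps per update

print(
    f"fairseq-hydra-train "
    f"distributed_training.distributed_world_size={actual_gpus} "
    f"optimization.update_freq='[{update_freq}]' ..."
)
```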
diff --git a/fairseq/config/config.yaml b/fairseq/config/config.yaml index 039609aece..9621baa5e9 100644 --- a/fairseq/config/config.yaml +++ b/fairseq/config/config.yaml @@ -1,10 +1,10 @@ # @package _group_ defaults: - - task: language_modeling + - task: null - model: null - criterion: cross_entropy - - optimizer: adam - - lr_scheduler: cosine + - optimizer: null + - lr_scheduler: fixed - bpe: null - tokenizer: null - scoring: null diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py index 28dc8905c7..36d88d83f7 100644 --- a/fairseq/dataclass/configs.py +++ b/fairseq/dataclass/configs.py @@ -173,6 +173,12 @@ class CommonConfig(FairseqDataclass): profile: bool = field( default=False, metadata={"help": "enable autograd profiler emit_nvtx"} ) + reset_logging: bool = field( + default=True, + metadata={ + "help": "when using Hydra, reset the logging at the beginning of training" + }, + ) @dataclass diff --git a/fairseq/modules/cross_entropy.py b/fairseq/modules/cross_entropy.py index 0d2beb44bb..6f33c24cb5 100644 --- a/fairseq/modules/cross_entropy.py +++ b/fairseq/modules/cross_entropy.py @@ -26,12 +26,14 @@ def _cross_entropy_pytorch(logits, target, ignore_index=None, reduction="mean"): import xentropy_cuda from apex.contrib import xentropy - logger.info("using fused cross entropy") - def cross_entropy(logits, target, ignore_index=-100, reduction="mean"): if logits.device == torch.device("cpu"): return _cross_entropy_pytorch(logits, target, ignore_index, reduction) else: + if not getattr(cross_entropy, "_has_logged_once", False): + logger.info("using fused cross entropy") + cross_entropy._has_logged_once = True + half_to_float = logits.dtype == torch.half losses = xentropy.SoftmaxCrossEntropyLoss.apply( logits, diff --git a/fairseq_cli/hydra_train.py b/fairseq_cli/hydra_train.py index ffd3c5cd07..b092ce14ee 100644 --- a/fairseq_cli/hydra_train.py +++ b/fairseq_cli/hydra_train.py @@ -4,29 +4,32 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import hydra -from omegaconf import OmegaConf +import logging import os +import sys from fairseq.dataclass.initialize import hydra_init from fairseq_cli.train import main as pre_main from fairseq import distributed_utils from fairseq.dataclass.configs import FairseqConfig -import logging +import hydra import torch +from omegaconf import OmegaConf -logger = logging.getLogger(__name__) +logger = logging.getLogger("fairseq_cli.hydra_train") @hydra.main(config_path=os.path.join("..", "fairseq", "config"), config_name="config") def hydra_main(cfg: FairseqConfig) -> None: - cfg = OmegaConf.create(OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)) OmegaConf.set_struct(cfg, True) + if cfg.common.reset_logging: + reset_logging() # Hydra hijacks logging, fix that + if cfg.common.profile: with torch.cuda.profiler.profile(): with torch.autograd.profiler.emit_nvtx(): @@ -35,7 +38,22 @@ def hydra_main(cfg: FairseqConfig) -> None: distributed_utils.call_main(cfg, pre_main) -if __name__ == "__main__": +def reset_logging(): + root = logging.getLogger() + for handler in root.handlers: + root.removeHandler(handler) + root.setLevel(os.environ.get("LOGLEVEL", "INFO").upper()) + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter( + logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + ) + root.addHandler(handler) + + +def cli_main(): try: from hydra._internal.utils import get_args @@ -46,3 +64,7 @@ def hydra_main(cfg: FairseqConfig) -> None: hydra_init(cfg_name) hydra_main() + + +if __name__ == "__main__": + cli_main() diff --git a/setup.py b/setup.py index 2aae720d7e..6bc450a7fa 100644 --- a/setup.py +++ b/setup.py @@ -22,14 +22,18 @@ def write_version_py(): # append latest commit hash to version string try: - sha = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("ascii").strip() + sha = ( + subprocess.check_output(["git", "rev-parse", "HEAD"]) + .decode("ascii") + .strip() + ) version += "+" + sha[:7] except Exception: pass # write version info to fairseq/version.py with open(os.path.join("fairseq", "version.py"), "w") as f: - f.write("__version__ = \"{}\"\n".format(version)) + f.write('__version__ = "{}"\n'.format(version)) return version @@ -194,7 +198,8 @@ def do_setup(package_data): "tests", "tests.*", ] - ) + extra_packages, + ) + + extra_packages, package_data=package_data, ext_modules=extensions, test_suite="tests", @@ -202,6 +207,7 @@ def do_setup(package_data): "console_scripts": [ "fairseq-eval-lm = fairseq_cli.eval_lm:cli_main", "fairseq-generate = fairseq_cli.generate:cli_main", + "fairseq-hydra-train = fairseq_cli.hydra_train:cli_main", "fairseq-interactive = fairseq_cli.interactive:cli_main", "fairseq-preprocess = fairseq_cli.preprocess:cli_main", "fairseq-score = fairseq_cli.score:cli_main", @@ -230,8 +236,11 @@ def get_files(path, relative_to="fairseq"): fairseq_examples = os.path.join("fairseq", "examples") if "build_ext" not in sys.argv[1:] and not os.path.exists(fairseq_examples): os.symlink(os.path.join("..", "examples"), fairseq_examples) + package_data = { - "fairseq": get_files("fairseq/examples"), + "fairseq": ( + get_files(fairseq_examples) + get_files(os.path.join("fairseq", "config")) + ) } do_setup(package_data) finally:
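With the new console script and `package_data` entries above, a quick, informal sanity check after installing fairseq might look like the following; this is illustrative only, with the module and directory names following the entry point and packaging changes in this patch:

```python
import importlib
import os

import fairseq

# The fairseq-hydra-train console script points at fairseq_cli.hydra_train:cli_main.
hydra_train = importlib.import_module("fairseq_cli.hydra_train")
assert callable(hydra_train.cli_main)

# The bundled Hydra configs should ship as package data under fairseq/config.
config_dir = os.path.join(os.path.dirname(fairseq.__file__), "config")
print(sorted(os.listdir(config_dir)))  # should include config.yaml
```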