From dbfca6e6d6c9f185144cca9d5ab55a86d824905e Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Fri, 20 Nov 2020 05:59:25 -0800 Subject: [PATCH] Add fairseq-hydra-train and update docs (#1449) Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/1449 Test Plan: Imported from OSS Reviewed By: alexeib Differential Revision: D25094525 Pulled By: myleott fbshipit-source-id: 430387d11196d3292933bb168cf09ea16ebc0d3b --- docs/hydra_integration.md | 226 ++++++++++++++++++------------- examples/wav2vec/README.md | 21 ++- fairseq/config/config.yaml | 6 +- fairseq/dataclass/configs.py | 6 + fairseq/modules/cross_entropy.py | 6 +- fairseq_cli/hydra_train.py | 34 ++++- setup.py | 17 ++- 7 files changed, 203 insertions(+), 113 deletions(-) diff --git a/docs/hydra_integration.md b/docs/hydra_integration.md index f924de961b..8e4082cb24 100644 --- a/docs/hydra_integration.md +++ b/docs/hydra_integration.md @@ -1,57 +1,70 @@ ## Hydra -[Hydra](https://github.com/facebookresearch/hydra) is an open-source Python framework that simplifies the development of -research and other complex applications. The key feature is the ability to dynamically create a hierarchical -configuration by composition and override it through config files and the command line. The name Hydra comes from its -ability to run multiple similar jobs - much like a Hydra with multiple heads. +[Hydra](https://github.com/facebookresearch/hydra) is an open-source Python +framework that simplifies the development of research and other complex +applications. The key feature is the ability to dynamically create a +hierarchical configuration by composition and override it through config files +and the command line. The name Hydra comes from its ability to run multiple +similar jobs - much like a Hydra with multiple heads. ## Motivation -Until recently, all components in fairseq were configured through a shared "args" namespace that was created at -application startup. Components declared their own "add_args" method to update the argparse parser, hoping that -the names would not clash with arguments from other components. While this model works for smaller applications, -as fairseq grew and became integrated into other applications, this became problematic. -In order to determine how to configure each component, one needed to a) examine what args were added by this component, and -b) read the code to figure out what shared arguments it is using that were added in other places. Reproducing -models involved sharing commands that often contained dozens of command line switches. - -The model described above is still supported by fairseq for backward compatibility, but will be deprecated some time -in the future. - -New components in fairseq should now create a dataclass that encapsulates all parameters required to configure this -component. The dataclass is registered along with the component, and fairseq takes care of constructing and -providing this configuration object to the component's constructor. Note that sharing parameters can optionally -still work, but one has to explicitly point to the "source of truth" (see inheritance example below). -These changes make components in fairseq -more independent and re-usable by other applications: all that is needed to create a component is to initialize its -dataclass and overwrite some of the defaults. 
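To make the dataclass-based workflow concrete, here is a minimal, illustrative sketch using plain Python dataclasses; the field names mirror the `InteractiveConfig` example shown later in this document, and the sketch is not taken verbatim from fairseq:

```python
from dataclasses import dataclass, field

# Minimal sketch: a config dataclass with typed fields, help metadata and defaults.
@dataclass
class InteractiveConfig:
    buffer_size: int = field(
        default=0, metadata={"help": "read this many sentences into a buffer"}
    )
    input: str = field(
        default="-", metadata={"help": "file to read from; use - for stdin"}
    )

# Another application only needs to instantiate the dataclass and overwrite
# some of the defaults in order to configure the component.
cfg = InteractiveConfig(buffer_size=16)
print(cfg.buffer_size, cfg.input)  # 16 -
```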
- -While configuring fairseq through command line (using either the legacy argparse based or the new Hydra based entry points) is still -fully supported, you can now take advantage of configuring fairseq completely or piece-by-piece through -hierarchical YAML configuration files. These files can also be shipped as examples that others can use to run -an identically configured job. - -Additionally, Hydra has a rich and growing -[library of plugins](https://github.com/facebookresearch/hydra/tree/master/plugins) that provide functionality such as -hyperparameter sweeping (including using bayesian optimization through the [Ax](https://github.com/facebook/Ax) library), -job launching across various platforms, and more. +Until recently, all components in fairseq were configured through a shared +`args` namespace that was created at application startup. Components declared +their own `add_args` method to update the argparse parser, hoping that the names +would not clash with arguments from other components. While this model works for +smaller applications, as fairseq grew and became integrated into other +applications, this became problematic. In order to determine how to configure +each component, one needed to a) examine what args were added by this component, +and b) read the code to figure out what shared arguments it is using that were +added in other places. Reproducing models involved sharing commands that often +contained dozens of command line switches. + +The model described above is still supported by fairseq for backward +compatibility, but will be deprecated some time in the future. + +New components in fairseq should now create a dataclass that encapsulates all +parameters required to configure this component. The dataclass is registered +along with the component, and fairseq takes care of constructing and providing +this configuration object to the component's constructor. Note that sharing +parameters can optionally still work, but one has to explicitly point to the +"source of truth" (see inheritance example below). These changes make components +in fairseq more independent and re-usable by other applications: all that is +needed to create a component is to initialize its dataclass and overwrite some +of the defaults. + +While configuring fairseq through command line (using either the legacy argparse +based or the new Hydra based entry points) is still fully supported, you can now +take advantage of configuring fairseq completely or piece-by-piece through +hierarchical YAML configuration files. These files can also be shipped as +examples that others can use to run an identically configured job. + +Additionally, Hydra has a rich and growing [library of +plugins](https://github.com/facebookresearch/hydra/tree/master/plugins) that +provide functionality such as hyperparameter sweeping (including using bayesian +optimization through the [Ax](https://github.com/facebook/Ax) library), job +launching across various platforms, and more. ## Creating or migrating components -In general, each new (or updated) component should provide a companion [dataclass](https://www.python.org/dev/peps/pep-0557/). These dataclass are typically located in the same -file as the component and are passed as arguments to the register_*() functions. Top-level configs that should be -present in every fairseq application are placed in the [global](fairseq/dataclass/configs.py) config file and added -to the FairseqConfig object. - -Each dataclass is a plain-old-data object, similar to a NamedTuple. 
These classes are decorated with a @dataclass -decorator, and typically inherit from `FairseqDataclass` (which adds some functionality for backward compatibility). -Each field must have a type, and generally has metadata (such as a help string) and a default value. Only primitive types or other config objects are allowed as +In general, each new (or updated) component should provide a companion +[dataclass](https://www.python.org/dev/peps/pep-0557/). These dataclass are +typically located in the same file as the component and are passed as arguments +to the `register_*()` functions. Top-level configs that should be present in +every fairseq application are placed in the +[global](fairseq/dataclass/configs.py) config file and added to the +`FairseqConfig` object. + +Each dataclass is a plain-old-data object, similar to a `NamedTuple`. These +classes are decorated with a `@dataclass` decorator, and typically inherit from +`FairseqDataclass` (which adds some functionality for backward compatibility). +Each field must have a type, and generally has metadata (such as a help string) +and a default value. Only primitive types or other config objects are allowed as data types for each field. - Example: - +#### Example: -``` python +```python from dataclasses import dataclass, field from fairseq.dataclass import FairseqDataclass @@ -71,11 +84,12 @@ class InteractiveConfig(FairseqDataclass): ### Inherting values -Some components require sharing a value. For example, a learning rate scheduler and an optimizer may both need to -know the initial learning rate value. One can declare a field that, by default, will -inherit its value from another config node in the same hierarchy: +Some components require sharing a value. For example, a learning rate scheduler +and an optimizer may both need to know the initial learning rate value. One can +declare a field that, by default, will inherit its value from another config +node in the same hierarchy: -``` python +```python @dataclass FairseqAdamConfig(FairseqDataclass): ... @@ -83,18 +97,21 @@ FairseqAdamConfig(FairseqDataclass): ... ``` -`II("optimization.lr")` is syntactic sugar for `"${optimization.lr}"` , which is the value one can use in a YAML config file or through -command line to achieve the same effect. Note that this assumes that there is an "optimization" config object -in the root config and it has a field called "lr". +`II("optimization.lr")` is syntactic sugar for `"${optimization.lr}"`, which is +the value one can use in a YAML config file or through command line to achieve +the same effect. Note that this assumes that there is an "optimization" config +object in the root config and it has a field called "lr". ### Tasks and Models -Creating Tasks and Models works same as before, except that legacy implementations now inherit from Legacy* base classes, -while new components inherit from FairseqTask and FairseqModel and provide a dataclass to the register_*() functions. +Creating Tasks and Models works same as before, except that legacy +implementations now inherit from `LegacyFairseq*` base classes, while new +components inherit from `FairseqTask` and `FairseqModel` and provide a dataclass +to the `register_*()` functions. -Task example: +#### Task example: -``` python +```python @dataclass class LanguageModelingConfig(FairseqDataclass): data: Optional[str] = field( @@ -110,9 +127,9 @@ class LanguageModelingTask(LegacyFairseqTask): ... 
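    # (Illustrative aside, assuming the usual fairseq pattern described above:
    # the task class is registered together with its dataclass, e.g. via
    # @register_task("language_modeling", dataclass=LanguageModelingConfig),
    # which is how fairseq knows to construct the config object and pass it
    # to the task's constructor.)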
``` -Model example: +#### Model example: -``` python +```python @dataclass class TransformerLanguageModelConfig(FairseqDataclass): activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( @@ -131,9 +148,10 @@ class TransformerLanguageModel(FairseqLanguageModel): ### Other components -Other components work as before, but they now take their configuration dataclass as the only constructor argument: +Other components work as before, but they now take their configuration dataclass +as the only constructor argument: -``` python +```python @dataclass class MosesTokenizerConfig(FairseqDataclass): source_lang: str = field(default="en", metadata={"help": "source language"}) @@ -145,50 +163,61 @@ class MosesTokenizer(object): ... ``` -Note that if you are adding a new registry for a new set of components, you need to add it to the FairseqConfig object in -fairseq/dataclass/configs.py: +Note that if you are adding a new registry for a new set of components, you need +to add it to the `FairseqConfig` object in `fairseq/dataclass/configs.py`: -``` python +```python @dataclass class FairseqConfig(object): ... my_new_registry: Any = None ``` -## Training with hydra_train.py +## Training with `fairseq-hydra-train` -To fully take advantage of configuration flexibility offered by Hydra, you may want to train new models using the -hydra_train.py entry point located in the fairseq_cli directory. Legacy CLI tools such as train.py, -will remain supported for the foreseeable future but will be deprecated eventually. +To fully take advantage of configuration flexibility offered by Hydra, you may +want to train new models using the `fairseq-hydra-train` entry point. Legacy CLI +tools such as `fairseq-train` will remain supported for the foreseeable future +but will be deprecated eventually. -On startup, Hydra will create a configuration object that contains a hierarchy of all the necessary dataclasses -populated with their default values in the code. The default values are overwritten by values found in YAML files in -fairseq/config directory (which currently just set default task, optimizer, etc) and then further overwritten by values -provided through command line arguments. +On startup, Hydra will create a configuration object that contains a hierarchy +of all the necessary dataclasses populated with their default values in the +code. The default values are overwritten by values found in YAML files in +`fairseq/config` directory (which currently sets minimal defaults) and then +further overwritten by values provided through command line arguments. Some of the most common use cases are shown below: -### 1. Overwrite default values through command line: +### 1. Override default values through command line: ```shell script -python fairseq_cli/hydra_train.py distributed_training.distributed_world_size=1 dataset.batch_size=2 task.data=data-bin \ -model=transformer_lm/transformer_lm_gpt task=language_modeling optimization.max_update=5000 - +$ fairseq-hydra-train \ + distributed_training.distributed_world_size=1 \ + dataset.batch_size=2 \ + task.data=data-bin \ + model=transformer_lm/transformer_lm_gpt \ + task=language_modeling \ + optimization.max_update=5000 ``` -Note that along with explicitly providing values for parameters such as dataset.batch_size, this also tells Hydra to overlay configuration found in `fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml` -over the default values in the dataclass. 
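To build intuition for how these dot-separated overrides compose with the defaults, here is a small, self-contained sketch using OmegaConf (the configuration library Hydra builds on); the keys and values below are illustrative and not the full fairseq config:

```python
from omegaconf import OmegaConf

# Defaults, as they might come from the dataclasses and bundled YAML files.
defaults = OmegaConf.create(
    {"dataset": {"batch_size": 8}, "optimization": {"max_update": 50000}}
)

# Command line overrides are dot-separated key=value pairs ("dotlist" style).
overrides = OmegaConf.from_dotlist(
    ["dataset.batch_size=2", "optimization.max_update=5000"]
)

cfg = OmegaConf.merge(defaults, overrides)
print(cfg.dataset.batch_size)       # 2
print(cfg.optimization.max_update)  # 5000
```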
If you want to train a model without specifying a particular architecture -you can simply specify model=transformer_lm. This only works for migrated tasks and models. +Note that along with explicitly providing values for parameters such as +`dataset.batch_size`, this also tells Hydra to overlay configuration found in +`fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml` over the default +values in the dataclass. If you want to train a model without specifying a +particular architecture you can simply specify `model=transformer_lm`. This only +works for migrated tasks and models. ### 2. Replace bundled configs with an external config: ```shell script -python fairseq_cli/hydra_train.py --config-path /path/to/external/configs --config-name wiki103 +$ fairseq-hydra-train \ + --config-path /path/to/external/configs \ + --config-name wiki103 ``` -where /path/to/external/configs/wiki103.yaml contains: +where `/path/to/external/configs/wiki103.yaml` contains: -``` yaml +```yaml # @package _group_ model: @@ -211,24 +240,38 @@ lr_scheduler: _name: cosine ``` -Note that here bundled configs from `fairseq/config` directory are not used, however the defaults from each dataclass will still be used (unless overwritten by your external config). +Note that here bundled configs from `fairseq/config` directory are not used, +however the defaults from each dataclass will still be used (unless overwritten +by your external config). -Additionally you can choose to break up your configs by creating a directory structure in the same location as your main config file, with the names of the top-level fields -(such as "model", "dataset", etc), and placing config files with meaningful names that would populate that specific section of your -top-level config file (for example, you might have model/small_transformer_lm.yaml, model/big_transformer_lm.yaml, etc). You can then specify the correct configuration via command line, defaults in the main config, or even launch all of them as a sweep (see Hydra documentation on how to do this). +Additionally you can choose to break up your configs by creating a directory +structure in the same location as your main config file, with the names of the +top-level fields (such as "model", "dataset", etc), and placing config files +with meaningful names that would populate that specific section of your +top-level config file (for example, you might have +`model/small_transformer_lm.yaml`, `model/big_transformer_lm.yaml`, etc). You +can then specify the correct configuration via command line, defaults in the +main config, or even launch all of them as a sweep (see Hydra documentation on +how to do this). ### 3. Add an external config directory to Hydra search path: -This allows combining default configuration (including using any bundled config files), while specifying your own config files for some parts of the configuration. +This allows combining default configuration (including using any bundled config +files), while specifying your own config files for some parts of the +configuration. 
```shell script -python fairseq_cli/hydra_train.py distributed_training.distributed_world_size=1 dataset.batch_size=2 \ -task.data=/path/to/data/ model=transformer_lm/2_layers task=language_modeling optimization.max_update=5000 \ ---config-dir /path/to/external/configs - +$ fairseq-hydra-train \ + distributed_training.distributed_world_size=1 \ + dataset.batch_size=2 \ + task.data=/path/to/data/ \ + model=transformer_lm/2_layers \ + task=language_modeling \ + optimization.max_update=5000 \ + --config-dir /path/to/external/configs ``` -where /path/to/external/configs has the following structure: +where `/path/to/external/configs` has the following structure: ``` . +-- model @@ -236,5 +279,6 @@ where /path/to/external/configs has the following structure: | | +-- 2_layers.yaml ``` -and 2_layers.yaml contains a copy of transformer_lm_gpt.yaml but with decoder_layers set to 2. You can add -other configs to configure other components as well. +and `2_layers.yaml` contains a copy of `transformer_lm_gpt.yaml` but with +`decoder_layers` set to 2. You can add other configs to configure other +components as well. diff --git a/examples/wav2vec/README.md b/examples/wav2vec/README.md index 442a92553a..fdbf844ec7 100644 --- a/examples/wav2vec/README.md +++ b/examples/wav2vec/README.md @@ -56,8 +56,10 @@ This configuration was used for the base model trained on the Librispeech datase Note that the input is expected to be single channel, sampled at 16 kHz ```shell script -$ python fairseq_cli/hydra_train.py task.data=/path/to/data \ ---config-path /path/to/fairseq-py/examples/wav2vec/config/pretraining --config-name wav2vec2_base_librispeech +$ fairseq-hydra-train \ + task.data=/path/to/data \ + --config-path /path/to/fairseq-py/examples/wav2vec/config/pretraining \ + --config-name wav2vec2_base_librispeech ``` Note: you can simulate 64 GPUs by using k GPUs and adding command line parameters (before --config-path) @@ -68,8 +70,10 @@ Note: you can simulate 64 GPUs by using k GPUs and adding command line parameter This configuration was used for the large model trained on the Libri-light dataset in the wav2vec 2.0 paper ```shell script -$ python fairseq_cli/hydra_train.py task.data=/path/to/data \ ---config-path /path/to/fairseq-py/examples/wav2vec/config/pretraining --config-name wav2vec2_large_librivox +$ fairseq-hydra-train \ + task.data=/path/to/data \ + --config-path /path/to/fairseq-py/examples/wav2vec/config/pretraining \ + --config-name wav2vec2_large_librivox ``` Note: you can simulate 128 GPUs by using k GPUs and adding command line parameters (before --config-path) @@ -88,9 +92,12 @@ $ python libri_labels.py /path/to/tsv --output-dir /output/dir --output-name $sp Fine-tuning on 100h of Librispeech with letter targets: ```shell script -python fairseq_cli/hydra_train.py distributed_training.distributed_port=$PORT task.data=/path/to/data \ -model.w2v_path=/path/to/model.pt --config-path /path/to/fairseq-py/examples/wav2vec/config/finetuning \ ---config-name base_100h +$ fairseq-hydra-train \ + distributed_training.distributed_port=$PORT \ + task.data=/path/to/data \ + model.w2v_path=/path/to/model.pt \ + --config-path /path/to/fairseq-py/examples/wav2vec/config/finetuning \ + --config-name base_100h ``` There are other config files in the config/finetuning directory that can be used to fine-tune on other splits. 
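The idea behind the "simulate 64 (or 128) GPUs" notes above is gradient accumulation: keep the effective batch size constant by updating less often. A rough sketch of the arithmetic follows; the parameter names are taken from fairseq's `distributed_training` and `optimization` config groups, and this is an illustration rather than the canonical command, so check the surrounding README text and config dataclasses for the authoritative flags:

```python
# Sketch: simulate training on `target_gpus` GPUs when only `actual_gpus`
# are available, by accumulating gradients over several steps per update.
target_gpus = 64  # world size the published config assumes
actual_gpus = 8   # GPUs actually available (k)
update_freq = target_gpus // actual_gpus  # accumulate 8 steps per update

print(
    f"fairseq-hydra-train "
    f"distributed_training.distributed_world_size={actual_gpus} "
    f"optimization.update_freq='[{update_freq}]' ..."
)
```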
diff --git a/fairseq/config/config.yaml b/fairseq/config/config.yaml index 039609aece..9621baa5e9 100644 --- a/fairseq/config/config.yaml +++ b/fairseq/config/config.yaml @@ -1,10 +1,10 @@ # @package _group_ defaults: - - task: language_modeling + - task: null - model: null - criterion: cross_entropy - - optimizer: adam - - lr_scheduler: cosine + - optimizer: null + - lr_scheduler: fixed - bpe: null - tokenizer: null - scoring: null diff --git a/fairseq/dataclass/configs.py b/fairseq/dataclass/configs.py index 28dc8905c7..36d88d83f7 100644 --- a/fairseq/dataclass/configs.py +++ b/fairseq/dataclass/configs.py @@ -173,6 +173,12 @@ class CommonConfig(FairseqDataclass): profile: bool = field( default=False, metadata={"help": "enable autograd profiler emit_nvtx"} ) + reset_logging: bool = field( + default=True, + metadata={ + "help": "when using Hydra, reset the logging at the beginning of training" + }, + ) @dataclass diff --git a/fairseq/modules/cross_entropy.py b/fairseq/modules/cross_entropy.py index 0d2beb44bb..6f33c24cb5 100644 --- a/fairseq/modules/cross_entropy.py +++ b/fairseq/modules/cross_entropy.py @@ -26,12 +26,14 @@ def _cross_entropy_pytorch(logits, target, ignore_index=None, reduction="mean"): import xentropy_cuda from apex.contrib import xentropy - logger.info("using fused cross entropy") - def cross_entropy(logits, target, ignore_index=-100, reduction="mean"): if logits.device == torch.device("cpu"): return _cross_entropy_pytorch(logits, target, ignore_index, reduction) else: + if not getattr(cross_entropy, "_has_logged_once", False): + logger.info("using fused cross entropy") + cross_entropy._has_logged_once = True + half_to_float = logits.dtype == torch.half losses = xentropy.SoftmaxCrossEntropyLoss.apply( logits, diff --git a/fairseq_cli/hydra_train.py b/fairseq_cli/hydra_train.py index ffd3c5cd07..b092ce14ee 100644 --- a/fairseq_cli/hydra_train.py +++ b/fairseq_cli/hydra_train.py @@ -4,29 +4,32 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import hydra -from omegaconf import OmegaConf +import logging import os +import sys from fairseq.dataclass.initialize import hydra_init from fairseq_cli.train import main as pre_main from fairseq import distributed_utils from fairseq.dataclass.configs import FairseqConfig -import logging +import hydra import torch +from omegaconf import OmegaConf -logger = logging.getLogger(__name__) +logger = logging.getLogger("fairseq_cli.hydra_train") @hydra.main(config_path=os.path.join("..", "fairseq", "config"), config_name="config") def hydra_main(cfg: FairseqConfig) -> None: - cfg = OmegaConf.create(OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)) OmegaConf.set_struct(cfg, True) + if cfg.common.reset_logging: + reset_logging() # Hydra hijacks logging, fix that + if cfg.common.profile: with torch.cuda.profiler.profile(): with torch.autograd.profiler.emit_nvtx(): @@ -35,7 +38,22 @@ def hydra_main(cfg: FairseqConfig) -> None: distributed_utils.call_main(cfg, pre_main) -if __name__ == "__main__": +def reset_logging(): + root = logging.getLogger() + for handler in root.handlers: + root.removeHandler(handler) + root.setLevel(os.environ.get("LOGLEVEL", "INFO").upper()) + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter( + logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + ) + root.addHandler(handler) + + +def cli_main(): try: from hydra._internal.utils import get_args @@ -46,3 +64,7 @@ def hydra_main(cfg: FairseqConfig) -> None: hydra_init(cfg_name) hydra_main() + + +if __name__ == "__main__": + cli_main() diff --git a/setup.py b/setup.py index 2aae720d7e..6bc450a7fa 100644 --- a/setup.py +++ b/setup.py @@ -22,14 +22,18 @@ def write_version_py(): # append latest commit hash to version string try: - sha = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("ascii").strip() + sha = ( + subprocess.check_output(["git", "rev-parse", "HEAD"]) + .decode("ascii") + .strip() + ) version += "+" + sha[:7] except Exception: pass # write version info to fairseq/version.py with open(os.path.join("fairseq", "version.py"), "w") as f: - f.write("__version__ = \"{}\"\n".format(version)) + f.write('__version__ = "{}"\n'.format(version)) return version @@ -194,7 +198,8 @@ def do_setup(package_data): "tests", "tests.*", ] - ) + extra_packages, + ) + + extra_packages, package_data=package_data, ext_modules=extensions, test_suite="tests", @@ -202,6 +207,7 @@ def do_setup(package_data): "console_scripts": [ "fairseq-eval-lm = fairseq_cli.eval_lm:cli_main", "fairseq-generate = fairseq_cli.generate:cli_main", + "fairseq-hydra-train = fairseq_cli.hydra_train:cli_main", "fairseq-interactive = fairseq_cli.interactive:cli_main", "fairseq-preprocess = fairseq_cli.preprocess:cli_main", "fairseq-score = fairseq_cli.score:cli_main", @@ -230,8 +236,11 @@ def get_files(path, relative_to="fairseq"): fairseq_examples = os.path.join("fairseq", "examples") if "build_ext" not in sys.argv[1:] and not os.path.exists(fairseq_examples): os.symlink(os.path.join("..", "examples"), fairseq_examples) + package_data = { - "fairseq": get_files("fairseq/examples"), + "fairseq": ( + get_files(fairseq_examples) + get_files(os.path.join("fairseq", "config")) + ) } do_setup(package_data) finally:
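With the new console script and `package_data` entries above, a quick, informal sanity check after installing fairseq might look like the following; this is illustrative only, with the module and directory names following the entry point and packaging changes in this patch:

```python
import importlib
import os

import fairseq

# The fairseq-hydra-train console script points at fairseq_cli.hydra_train:cli_main.
hydra_train = importlib.import_module("fairseq_cli.hydra_train")
assert callable(hydra_train.cli_main)

# The bundled Hydra configs should ship as package data under fairseq/config.
config_dir = os.path.join(os.path.dirname(fairseq.__file__), "config")
print(sorted(os.listdir(config_dir)))  # should include config.yaml
```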