Skip to content
This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit

Permalink
Introduce slurm evaluator, Continuous monitoring and launching of eva…
Browse files Browse the repository at this point in the history
…luations for slurm jobs
  • Loading branch information
iseessel committed Jun 14, 2021
1 parent b5fefc2 commit 2d17ad6
Show file tree
Hide file tree
Showing 11 changed files with 931 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Expand Up @@ -97,5 +97,5 @@ website/pages/tutorials/*
**/.ipynb_checkpoints/**

# Configs for local development
configs/config_local/*
configs/config/config_local/*
train_config.yaml
Empty file added dev/__init__.py
Empty file.
22 changes: 22 additions & 0 deletions dev/benchmark_suite/benchmark_suite_scheduler_defaults.json
@@ -0,0 +1,22 @@
{
"params": {
"evaluation_iter_freq": -1,
"evaluation_phase_freq": -1,
"evaluate_final_phase": true,
"autoload_slurm_evaluator_checkpoint": false,
"slurm_evaluator_checkpoint": null,
"auto_retry_evaluations": false,
"retry_evaluation_job_ids": [],
"max_retries": 3,
"pytorch_ports": [40050]
},
"slurm_options": {
"NAME": "vissl",
"COMMENT": "vissl evaluation job",
"CONSTRAINT": "",
"TIMEOUT_MIN": 4320,
"CPUS_PER_TASK": 8,
"MEM_GB": 16,
"ADDITIONAL_PARAMETERS": {}
}
}
33 changes: 33 additions & 0 deletions dev/benchmark_suite/benchmark_suite_scheduler_template.json
@@ -0,0 +1,33 @@
{
"params": {
"training_checkpoint_dir": "(str) Training checkpoint directory. That is the CHECKPOINT.DIR of the training config",
"benchmarks": [
{
"evaluation_name": "(str) Name of benchmark for convenience",
"config_files": [
"config=path/to/evaluation/config",
"config.OVERRIDES=new_value"
]
}
],
"evaluation_iter_freq": "(int, default=-1) Evaluate the checkpoint every N iterations",
"evaluation_phase_freq": "(int, default=-1) Evaluate the checkpoint every N phases",
"evaluate_final_phase": "(bool, default=True) Evaluate the final phase",
"autoload_slurm_evaluator_checkpoint": "(bool, default=False) Whether or not to automatically load the benchmark checkpoint",
"slurm_evaluator_checkpoint": "(str, default=None) Path to load the benchmark checkpoint",
"auto_retry_evaluations": "(bool, default=False) Whether or not to automatically retry the evaluations",
"retry_evaluation_job_ids": "(array[int], default=[]) Array of job_ids to retry",
"max_retries": "(int, default=3) Maximum number of retries",
"pytorch_ports": "(List[int], default=[40050]) List of pytorch ports to cycle through as you are launching your evaluations, in order to prevent PyTorch DDP port collisions."
},
"slurm_options": {
"PARTITION": "(str) Partition",
"NAME": "(str, default=vissl) Name of slurm job",
"COMMENT": "(str, default=vissl evaluation job) Comment of slurm job",
"CONSTRAINT": "(str, default='') Constraint of slurm job",
"TIMEOUT_MIN": "(int, default=72 * 60) Minimum amount of minutes to timeout",
"CPUS_PER_TASK": "(int, default=8) Number of cpus per task.",
"MEM_GB": "(int, default=16) Amount of RAM to request from slurm",
"ADDITIONAL_PARAMETERS": "(Dict[str, Any], default={}) Any default slurm options to pass to submitit"
}
}
39 changes: 39 additions & 0 deletions dev/launch_benchmark_suite_scheduler_slurm.sh
@@ -0,0 +1,39 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# This benchmark suite script launches a benchmark suite scheduler slurm job.
# The job takes an absolute json config path (see benchmark_suite_scheduler_template.json for info)
# The job continuously monitors training benchmarks, and dynamically launches evaluation jobs
# and amalgamates the results.

######################### EXAMPLE USAGE #################################

# cd into vissl root directory.
#
# bash ./dev/launch_benchmark_suite_scheduler_slurm.sh /path/to/benchmark_suite_scheduler.json

# See benchmark_suite_scheduler_template.json for config information or
# slurm_evaluator.py for class structure.
######################### INPUT PARAMS ##################################

# All CLI arguments are forwarded verbatim to the python entry point.
FILE=( "$@" )

####################### setup experiment dir ###################################

# Create a temporary experiment folder to run the SLURM job in isolation.
RUN_ID=$(date +'%Y-%m-%d-%H-%M-%S')
EXP_ROOT_DIR="/checkpoint/$USER/vissl/$RUN_ID"

echo "EXP_ROOT_DIR: $EXP_ROOT_DIR"
# Print every forwarded argument, not just the first array element.
echo "CONFIG_FILE: ${FILE[*]}"

# Quote all path expansions so the script survives whitespace in $USER/paths.
rm -rf "$EXP_ROOT_DIR"
mkdir -p "$EXP_ROOT_DIR"
cp -r . "$EXP_ROOT_DIR"

####################### launch the scheduler ###################################
export PYTHONPATH="$EXP_ROOT_DIR/:$PYTHONPATH"
python -u "$EXP_ROOT_DIR/tools/launch_benchmark_suite_scheduler_slurm.py" \
  "${FILE[@]}"
105 changes: 105 additions & 0 deletions tools/launch_benchmark_suite_scheduler_slurm.py
@@ -0,0 +1,105 @@
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import copy
import sys

import pkg_resources
import submitit
from fvcore.common.file_io import PathManager
from vissl.config.attr_dict import AttrDict
from vissl.utils.benchmark_suite_scheduler import BenchmarkSuiteScheduler
from vissl.utils.hydra_config import is_hydra_available
from vissl.utils.io import load_file
from vissl.utils.misc import recursive_dict_merge
from vissl.utils.slurm import is_submitit_available


# Default config options.
# Loaded once at import time from the json packaged under dev/benchmark_suite;
# user-provided configs are merged on top of these defaults at launch time.
default_config_file = pkg_resources.resource_filename(
    "dev", "benchmark_suite/benchmark_suite_scheduler_defaults.json"
)
_DEFAULT_CONFIG = load_file(default_config_file)


class SlurmEvaluatorJob:
    """
    Thin submitit-compatible wrapper around a BenchmarkSuiteScheduler.

    Its main purpose is to run multiple evaluations for a single training,
    and to support submitit's checkpoint/requeue protocol on preemption.
    """

    def __init__(self, benchmark_suite_scheduler: BenchmarkSuiteScheduler):
        self.benchmark_suite_scheduler = benchmark_suite_scheduler

    def __call__(self):
        # Entry point executed on the slurm node: run all the evaluations.
        self.benchmark_suite_scheduler.evaluate()

    def checkpoint(self):
        """
        Called by submitit whenever the job is preempted, timed out, etc.

        Save the evaluation benchmarks so that the resubmitted job can
        reload them and continue where it left off.
        """
        self.benchmark_suite_scheduler.save_evaluation_benchmarks()
        # Force the scheduler to automatically reload its checkpoint
        # (the benchmark results) when the job is requeued.
        self.benchmark_suite_scheduler.autoload_benchmark_suite_scheduler_checkpoint = (
            True
        )

        resubmitted_job = SlurmEvaluatorJob(
            benchmark_suite_scheduler=self.benchmark_suite_scheduler
        )
        return submitit.helpers.DelayedSubmission(resubmitted_job)


def launch_benchmark_suite_scheduler(config_file):
    """
    Launch a benchmark suite scheduler job on slurm.

    Loads the user's json config, merges it on top of the packaged defaults,
    builds a BenchmarkSuiteScheduler, and submits it via submitit.

    Args:
        config_file: absolute path to the user's json config
            (see benchmark_suite_scheduler_template.json for the schema).
    """
    assert PathManager.exists(config_file), "Slurm evaluator config file must exist"

    user_config = load_file(config_file)
    # Deep-copy the defaults: recursive_dict_merge mutates nested dicts in
    # place, so a shallow .copy() would corrupt the shared _DEFAULT_CONFIG
    # nested dicts across calls.
    config = copy.deepcopy(_DEFAULT_CONFIG)
    recursive_dict_merge(config, user_config)

    benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"])
    benchmark_suite_scheduler_job = SlurmEvaluatorJob(
        benchmark_suite_scheduler=benchmark_suite_scheduler
    )
    executor = submitit.AutoExecutor(folder=benchmark_suite_scheduler.evaluation_dir())

    assert "slurm_options" in config, "slurm_options must be specified"
    assert (
        "PARTITION" in config["slurm_options"]
    ), "slurm_options.PARTITION is a required field to launch the benchmark suite on slurm"

    slurm_options = AttrDict(config["slurm_options"])
    executor.update_parameters(
        name=slurm_options.NAME,
        slurm_comment=slurm_options.COMMENT,
        slurm_partition=slurm_options.PARTITION,
        slurm_constraint=slurm_options.CONSTRAINT,
        timeout_min=slurm_options.TIMEOUT_MIN,
        nodes=1,
        cpus_per_task=slurm_options.CPUS_PER_TASK,
        tasks_per_node=1,
        mem_gb=slurm_options.MEM_GB,
        slurm_additional_parameters=slurm_options.ADDITIONAL_PARAMETERS,
    )

    job = executor.submit(benchmark_suite_scheduler_job)
    print(f"SUBMITTED EVALUATION JOB: {job.job_id}")


if __name__ == "__main__":
    """
    Example usage:
    python -u "./tools/launch_benchmark_suite_scheduler_slurm.py" \
        "/path/to/benchmark_suite_scheduler_example.json"
    """
    assert is_hydra_available(), "Make sure to install hydra"

    assert (
        is_submitit_available()
    ), "Please 'pip install submitit' to schedule jobs on SLURM"

    # Require exactly one positional argument (the json config path) so a
    # missing argument yields a usage message instead of a bare IndexError.
    assert len(sys.argv) == 2, (
        "Usage: launch_benchmark_suite_scheduler_slurm.py /path/to/config.json"
    )

    config_file = sys.argv[1]
    launch_benchmark_suite_scheduler(config_file)
5 changes: 5 additions & 0 deletions vissl/hooks/log_hooks.py
Expand Up @@ -245,6 +245,11 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
"eta": eta_string,
"peak_mem(M)": peak_mem_used,
}

if iteration == 1:
# Set max iterations. Currently used in benchmark_suite_scheduler.py
log_data["max_iterations"] = task.max_iteration

if self.btime_freq and len(batch_times) >= self.btime_freq:
rolling_avg_time = (
sum(batch_times[-self.btime_freq :]) / self.btime_freq
Expand Down

0 comments on commit 2d17ad6

Please sign in to comment.