diff --git a/dev/__init__.py b/dev/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/dev/benchmark_suite/benchmark_suite_scheduler_defaults.json b/dev/benchmark_suite/benchmark_suite_scheduler_defaults.json
new file mode 100644
index 000000000..8aa3c5126
--- /dev/null
+++ b/dev/benchmark_suite/benchmark_suite_scheduler_defaults.json
@@ -0,0 +1,21 @@
+{
+    "params": {
+        "evaluation_iter_freq": -1,
+        "evaluation_phase_freq": -1,
+        "evaluate_final_phase": true,
+        "autoload_slurm_evaluator_checkpoint": false,
+        "slurm_evaluator_checkpoint": null,
+        "auto_retry_evaluations": false,
+        "retry_evaluation_job_ids": [],
+        "max_retries": 3
+    },
+    "slurm_options": {
+        "NAME": "vissl",
+        "COMMENT": "vissl evaluation job",
+        "CONSTRAINT": "",
+        "TIMEOUT_MIN": 4320,
+        "CPUS_PER_TASK": 8,
+        "MEM_GB": 16,
+        "ADDITIONAL_PARAMETERS": {}
+    }
+}
diff --git a/dev/benchmark_suite/benchmark_suite_scheduler_swav.json b/dev/benchmark_suite/benchmark_suite_scheduler_swav.json
new file mode 100644
index 000000000..2256d6cac
--- /dev/null
+++ b/dev/benchmark_suite/benchmark_suite_scheduler_swav.json
@@ -0,0 +1,25 @@
+{
+    "params": {
+        "training_checkpoint_dir": "/checkpoint/iseessel/vissl/2021-05-27-12-52-01/checkpoints",
+        "benchmarks": [
+            {
+                "evaluation_name": "quick_eval_in1k_linear",
+                "config_files": [
+                    "config=test/integration_test/quick_eval_in1k_linear.yaml"
+                ]
+            }
+        ],
+        "evaluation_iter_freq": 20,
+        "evaluation_phase_freq": 2,
+        "evaluate_final_phase": true,
+        "autoload_slurm_evaluator_checkpoint": false,
+        "slurm_evaluator_checkpoint": null,
+        "max_training_iterations": null,
+        "auto_retry_evaluations": true,
+        "retry_evaluation_job_ids": [],
+        "max_retries": 3
+    },
+    "slurm_options": {
+        "PARTITION": "learnfair"
+    }
+}
\ No newline at end of file
diff --git a/tools/launch_benchmark_suite_scheduler_slurm.py b/tools/launch_benchmark_suite_scheduler_slurm.py
index c36b44d40..467863e8c 100644
--- a/tools/launch_benchmark_suite_scheduler_slurm.py
+++ b/tools/launch_benchmark_suite_scheduler_slurm.py
@@ -1,30 +1,26 @@
-#!/bin/bash
 # Copyright (c) Facebook, Inc. and its affiliates.

 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.

+import pkg_resources
 import sys

 import submitit
 from fvcore.common.file_io import PathManager
-from tools.benchmark_suite_scheduler import BenchmarkSuiteScheduler
+from vissl.utils.benchmark_suite_scheduler import BenchmarkSuiteScheduler
 from vissl.config.attr_dict import AttrDict
 from vissl.utils.hydra_config import is_hydra_available
 from vissl.utils.io import load_file
+from vissl.utils.misc import recursive_dict_merge
 from vissl.utils.slurm import is_submitit_available


-# Default slurm options to pass to the executor.
-_DEFAULT_SLURM_OPTIONS = {
-    "NAME": "vissl",
-    "COMMENT": "vissl evaluation job",
-    "CONSTRAINT": "",
-    "TIMEOUT_MIN": 72 * 60,  # Timeout in minutes.
- "CPUS_PER_TASK": 8, - "MEM_GB": 16, - "ADDITIONAL_PARAMETERS": {}, -} +# Default config options +default_config_file = pkg_resources.resource_filename( + "dev", "benchmark_suite/benchmark_suite_scheduler_defaults.json" +) +_DEFAULT_CONFIG = load_file(default_config_file) class SlurmEvaluatorJob: @@ -60,7 +56,10 @@ def checkpoint(self): def launch_benchmark_suite_scheduler(config_file): assert PathManager.exists(config_file), "Slurm evaluator config file must exist" - config = load_file(config_file) + user_config = load_file(config_file) + config = _DEFAULT_CONFIG.copy() + recursive_dict_merge(config ,user_config) + benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"]) benchmark_suite_scheduler_job = SlurmEvaluatorJob( benchmark_suite_scheduler=benchmark_suite_scheduler @@ -68,14 +67,12 @@ def launch_benchmark_suite_scheduler(config_file): executor = submitit.AutoExecutor(folder=benchmark_suite_scheduler.evaluation_dir()) assert "slurm_options" in config, "slurm_options must be specified" - override_slurm_options = config["slurm_options"] assert ( - "PARTITION" in override_slurm_options + "PARTITION" in config["slurm_options"] ), "slurm_options.PARTITION is a required field to launch the benchmark suite on slurm" - slurm_options = {**_DEFAULT_SLURM_OPTIONS, **override_slurm_options} - slurm_options = AttrDict(slurm_options) - + slurm_options = AttrDict(config["slurm_options"]) + benchmark_suite_scheduler.evaluate() executor.update_parameters( name=slurm_options.NAME, slurm_comment=slurm_options.COMMENT, diff --git a/tools/benchmark_suite_scheduler.py b/vissl/utils/benchmark_suite_scheduler.py similarity index 96% rename from tools/benchmark_suite_scheduler.py rename to vissl/utils/benchmark_suite_scheduler.py index cd6027363..67a186f5b 100644 --- a/tools/benchmark_suite_scheduler.py +++ b/vissl/utils/benchmark_suite_scheduler.py @@ -18,18 +18,18 @@ from vissl.trainer.trainer_main import build_task from vissl.utils.distributed_launcher import launch_distributed_on_slurm from vissl.utils.hydra_config import convert_to_attrdict -from vissl.utils.io import load_file +from vissl.utils.io import load_file, makedir from vissl.utils.misc import flatten_dict, retry """ - This class is designed to be used to run multiple evaluations on a single (pre)training. - Using the #evaluate method we continuously monitor training checkpoints, launch evaluations - dynamically as they become available, and amalgamate the evaluation results as they become - available. +This class is designed to be used to run multiple evaluations on a single (pre)training. +Using the #evaluate method we continuously monitor training checkpoints, launch evaluations +dynamically as they become available, and amalgamate the evaluation results as they become +available. - For SLURM usage, you should create a JSON configuration file (see benchmark_suite_scheduler_template.json) - and use the launch_benchmark_suite_scheduler_slurm.sh for convenience. +For SLURM usage, you should create a JSON configuration file (see benchmark_suite_scheduler_template.json) +and use the launch_benchmark_suite_scheduler_slurm.sh for convenience. """ # How many times to retry a slurm job submission. 
@@ -203,8 +203,7 @@ def save_evaluation_benchmarks(self):
         evaluation_dir = self.evaluation_dir()
         parent_metrics_file = os.path.join(evaluation_dir, "evaluation_metrics.json")

-        if not PathManager.exists(evaluation_dir):
-            PathManager.mkdirs(evaluation_dir)
+        makedir(evaluation_dir)

         self._write_json_file(self.evaluation_results, parent_metrics_file)

@@ -215,8 +214,7 @@ def save_evaluation_benchmarks(self):
                 child_metrics_dir, "evaluation_metrics.json"
             )

-            if not PathManager.exists(child_metrics_dir):
-                PathManager.mkdirs(child_metrics_dir)
+            makedir(child_metrics_dir)

             self._write_json_file(benchmarks, child_metrics_file)

diff --git a/vissl/utils/misc.py b/vissl/utils/misc.py
index 25810e6ee..7447fb1f0 100644
--- a/vissl/utils/misc.py
+++ b/vissl/utils/misc.py
@@ -406,3 +406,18 @@ def flatten_dict(d: dict, parent_key="", sep="_"):
         else:
             items.append((new_key, v))
     return dict(items)
+
+
+# Credit: https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries
+def recursive_dict_merge(dict1, dict2):
+    """
+    Recursively merges dict2 into dict1
+    """
+    if not isinstance(dict1, dict) or not isinstance(dict2, dict):
+        return dict2
+    for k in dict2:
+        if k in dict1:
+            dict1[k] = recursive_dict_merge(dict1[k], dict2[k])
+        else:
+            dict1[k] = dict2[k]
+    return dict1
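
Note (illustrative sketch, not part of the patch): the snippet below shows how the packaged defaults above are expected to combine with a user-provided config through recursive_dict_merge, mirroring what launch_benchmark_suite_scheduler does; it assumes the repository root as working directory so the JSON paths added in this diff resolve.

    from vissl.utils.io import load_file
    from vissl.utils.misc import recursive_dict_merge

    # Defaults shipped with the repo (added by this patch).
    defaults = load_file("dev/benchmark_suite/benchmark_suite_scheduler_defaults.json")
    # A user config only needs the keys it overrides, e.g. the swav example above.
    user_config = load_file("dev/benchmark_suite/benchmark_suite_scheduler_swav.json")

    config = defaults.copy()
    recursive_dict_merge(config, user_config)  # user values win on conflicting keys

    # Nested dicts such as "slurm_options" are merged key by key, so the merged
    # config keeps the default CPUS_PER_TASK and TIMEOUT_MIN while gaining PARTITION.
    assert config["slurm_options"]["PARTITION"] == "learnfair"
    assert config["slurm_options"]["CPUS_PER_TASK"] == 8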