Skip to content
This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit

Permalink
Address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
iseessel committed Jun 7, 2021
1 parent 8200338 commit 63fec7b
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 29 deletions.
Empty file added dev/__init__.py
Empty file.
21 changes: 21 additions & 0 deletions dev/benchmark_suite/benchmark_suite_scheduler_defaults.json
@@ -0,0 +1,21 @@
{
"params": {
"evaluation_iter_freq": -1,
"evaluation_phase_freq": -1,
"evaluate_final_phase": true,
"autoload_slurm_evaluator_checkpoint": false,
"slurm_evaluator_checkpoint": null,
"auto_retry_evaluations": false,
"retry_evaluation_job_ids": [],
"max_retries": 3
},
"slurm_options": {
"NAME": "vissl",
"COMMENT": "vissl evaluation job",
"CONSTRAINT": "",
"TIMEOUT_MIN": 4320,
"CPUS_PER_TASK": 8,
"MEM_GB": 16,
"ADDITIONAL_PARAMETERS": {}
}
}
25 changes: 25 additions & 0 deletions dev/benchmark_suite/benchmark_suite_scheduler_swav.json
@@ -0,0 +1,25 @@
{
"params": {
"training_checkpoint_dir": "/checkpoint/iseessel/vissl/2021-05-27-12-52-01/checkpoints",
"benchmarks": [
{
"evaluation_name": "quick_eval_in1k_linear",
"config_files": [
"config=test/integration_test/quick_eval_in1k_linear.yaml"
]
}
],
"evaluation_iter_freq": 20,
"evaluation_phase_freq": 2,
"evaluate_final_phase": true,
"autoload_slurm_evaluator_checkpoint": false,
"slurm_evaluator_checkpoint": null,
"max_training_iterations": null,
"auto_retry_evaluations": true,
"retry_evaluation_job_ids": [],
"max_retries": 3
},
"slurm_options": {
"PARTITION": "learnfair"
}
}
33 changes: 15 additions & 18 deletions tools/launch_benchmark_suite_scheduler_slurm.py
@@ -1,30 +1,26 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import pkg_resources
import sys

import submitit
from fvcore.common.file_io import PathManager
from tools.benchmark_suite_scheduler import BenchmarkSuiteScheduler
from vissl.utils.benchmark_suite_scheduler import BenchmarkSuiteScheduler
from vissl.config.attr_dict import AttrDict
from vissl.utils.hydra_config import is_hydra_available
from vissl.utils.io import load_file
from vissl.utils.misc import recursive_dict_merge
from vissl.utils.slurm import is_submitit_available


# Default slurm options to pass to the executor.
_DEFAULT_SLURM_OPTIONS = {
"NAME": "vissl",
"COMMENT": "vissl evaluation job",
"CONSTRAINT": "",
"TIMEOUT_MIN": 72 * 60, # Timeout in minutes.
"CPUS_PER_TASK": 8,
"MEM_GB": 16,
"ADDITIONAL_PARAMETERS": {},
}
# Default config options
default_config_file = pkg_resources.resource_filename(
"dev", "benchmark_suite/benchmark_suite_scheduler_defaults.json"
)
_DEFAULT_CONFIG = load_file(default_config_file)


class SlurmEvaluatorJob:
Expand Down Expand Up @@ -60,22 +56,23 @@ def checkpoint(self):
def launch_benchmark_suite_scheduler(config_file):
assert PathManager.exists(config_file), "Slurm evaluator config file must exist"

config = load_file(config_file)
user_config = load_file(config_file)
config = _DEFAULT_CONFIG.copy()
recursive_dict_merge(config ,user_config)

benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"])
benchmark_suite_scheduler_job = SlurmEvaluatorJob(
benchmark_suite_scheduler=benchmark_suite_scheduler
)
executor = submitit.AutoExecutor(folder=benchmark_suite_scheduler.evaluation_dir())

assert "slurm_options" in config, "slurm_options must be specified"
override_slurm_options = config["slurm_options"]
assert (
"PARTITION" in override_slurm_options
"PARTITION" in config["slurm_options"]
), "slurm_options.PARTITION is a required field to launch the benchmark suite on slurm"

slurm_options = {**_DEFAULT_SLURM_OPTIONS, **override_slurm_options}
slurm_options = AttrDict(slurm_options)

slurm_options = AttrDict(config["slurm_options"])
benchmark_suite_scheduler.evaluate()
executor.update_parameters(
name=slurm_options.NAME,
slurm_comment=slurm_options.COMMENT,
Expand Down
Expand Up @@ -18,18 +18,18 @@
from vissl.trainer.trainer_main import build_task
from vissl.utils.distributed_launcher import launch_distributed_on_slurm
from vissl.utils.hydra_config import convert_to_attrdict
from vissl.utils.io import load_file
from vissl.utils.io import load_file, makedir
from vissl.utils.misc import flatten_dict, retry


"""
This class is designed to be used to run multiple evaluations on a single (pre)training.
Using the #evaluate method we continuously monitor training checkpoints, launch evaluations
dynamically as they become available, and amalgamate the evaluation results as they become
available.
This class is designed to be used to run multiple evaluations on a single (pre)training.
Using the #evaluate method we continuously monitor training checkpoints, launch evaluations
dynamically as they become available, and amalgamate the evaluation results as they become
available.
For SLURM usage, you should create a JSON configuration file (see benchmark_suite_scheduler_template.json)
and use the launch_benchmark_suite_scheduler_slurm.sh for convenience.
For SLURM usage, you should create a JSON configuration file (see benchmark_suite_scheduler_template.json)
and use the launch_benchmark_suite_scheduler_slurm.sh for convenience.
"""

# How many times to retry a slurm job submission.
Expand Down Expand Up @@ -203,8 +203,7 @@ def save_evaluation_benchmarks(self):
evaluation_dir = self.evaluation_dir()
parent_metrics_file = os.path.join(evaluation_dir, "evaluation_metrics.json")

if not PathManager.exists(evaluation_dir):
PathManager.mkdirs(evaluation_dir)
makedir(evaluation_dir)

self._write_json_file(self.evaluation_results, parent_metrics_file)

Expand All @@ -215,8 +214,7 @@ def save_evaluation_benchmarks(self):
child_metrics_dir, "evaluation_metrics.json"
)

if not PathManager.exists(child_metrics_dir):
PathManager.mkdirs(child_metrics_dir)
makedir(child_metrics_dir)

self._write_json_file(benchmarks, child_metrics_file)

Expand Down
15 changes: 15 additions & 0 deletions vissl/utils/misc.py
Expand Up @@ -406,3 +406,18 @@ def flatten_dict(d: dict, parent_key="", sep="_"):
else:
items.append((new_key, v))
return dict(items)


# Credit: https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries
def recursive_dict_merge(dict1, dict2):
    """
    Recursively merges dict2 into dict1
    dict1 is mutated in place and also returned. Wherever both inputs hold a
    dict under the same key, the merge recurses; otherwise the value from
    dict2 wins. If either argument is not a dict, dict2 is returned unchanged.
    """
    # Non-dict leaves terminate the recursion: dict2's value takes precedence.
    if not (isinstance(dict1, dict) and isinstance(dict2, dict)):
        return dict2
    for key, value in dict2.items():
        dict1[key] = (
            recursive_dict_merge(dict1[key], value) if key in dict1 else value
        )
    return dict1

0 comments on commit 63fec7b

Please sign in to comment.