This repository has been archived by the owner on Mar 19, 2024. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Introduce slurm evaluator, Continuous monitoring and launching of eva…
…luations for slurm jobs
- Loading branch information
Showing
11 changed files
with
931 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
22 changes: 22 additions & 0 deletions
22
dev/benchmark_suite/benchmark_suite_scheduler_defaults.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
{ | ||
"params": { | ||
"evaluation_iter_freq": -1, | ||
"evaluation_phase_freq": -1, | ||
"evaluate_final_phase": true, | ||
"autoload_slurm_evaluator_checkpoint": false, | ||
"slurm_evaluator_checkpoint": null, | ||
"auto_retry_evaluations": false, | ||
"retry_evaluation_job_ids": [], | ||
"max_retries": 3, | ||
"pytorch_ports": [40050] | ||
}, | ||
"slurm_options": { | ||
"NAME": "vissl", | ||
"COMMENT": "vissl evaluation job", | ||
"CONSTRAINT": "", | ||
"TIMEOUT_MIN": 4320, | ||
"CPUS_PER_TASK": 8, | ||
"MEM_GB": 16, | ||
"ADDITIONAL_PARAMETERS": {} | ||
} | ||
} |
33 changes: 33 additions & 0 deletions
33
dev/benchmark_suite/benchmark_suite_scheduler_template.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
{ | ||
"params": { | ||
"training_checkpoint_dir": "(str) Training checkpoint directory. That is the CHECKPOINT.DIR of the training config", | ||
"benchmarks": [ | ||
{ | ||
"evaluation_name": "(str) Name of benchmark for convenience", | ||
"config_files": [ | ||
"config=path/to/evaluation/config", | ||
"config.OVERRIDES=new_value" | ||
] | ||
} | ||
], | ||
"evaluation_iter_freq": "(int, default=-1) Evaluate the checkpoint every N iterations", | ||
"evaluation_phase_freq": "(int, default=-1) Evaluate the checkpoint every N phases", | ||
"evaluate_final_phase": "(bool, default=True) Evaluate the final phase", | ||
"autoload_slurm_evaluator_checkpoint": "(bool, default=False) Whether or not to automatically load the benchmark checkpoint", | ||
"slurm_evaluator_checkpoint": "(str, default=None) Path to load the benchmark checkpoint", | ||
"auto_retry_evaluations": "(bool, default=False) Whether or not to automatically retry the evaluations", | ||
"retry_evaluation_job_ids": "(array[int], default=[]) Array of job_ids to retry", | ||
"max_retries": "(int, default=3) Maximum number of retries", | ||
"pytorch_ports": "(List[int], default=[40500]) List of pytorch ports to cycle through as you are launching your evaluations, in order to prevent Pytorch DDP port colissions." | ||
}, | ||
"slurm_options": { | ||
"PARTITION": "(str) Partition", | ||
"NAME": "(str, default=vissl) Name of slurm job", | ||
"COMMENT": "(str, default=vissl evaluation job) Comment of slurm job", | ||
"CONSTRAINT": "(str, default='') Constraing of slurm job", | ||
"TIMEOUT_MIN": "(int, default=72 * 60) Minimum amount of minutes to timeout", | ||
"CPUS_PER_TASK": "(int, default=8) Numer of cpus per task.", | ||
"MEM_GB": "(int, default=32) Amount of RAM to request from slurm", | ||
"ADDITIONAL_PARAMETERS": "(Dict[[str, Any]], default={}) Any default slurm options to pass to submitit", | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# This benchmark suite script launches a benchmark suite scheduler slurm job.
# The job takes an absolute json config path (see benchmark_suite_scheduler_template.json for info).
# The job continuously monitors training benchmarks, dynamically launches evaluation jobs,
# and amalgamates the results.

######################### EXAMPLE USAGE #################################

# cd into vissl root directory.
#
# bash ./dev/launch_benchmark_suite_scheduler_slurm.sh /path/to/benchmark_suite_scheduler.json

# See benchmark_suite_scheduler_template.json for config information,
# or slurm_evaluator.py for class structure.
######################### INPUT PARAMS ##################################

FILE=( "$@" )

# Fail fast, before copying the whole repository, when no config was given.
if [ "${#FILE[@]}" -eq 0 ]; then
  echo "Usage: bash $0 /path/to/benchmark_suite_scheduler.json" >&2
  exit 1
fi

####################### setup experiment dir ###################################

# create a temporary experiment folder to run the SLURM job in isolation
RUN_ID=$(date +'%Y-%m-%d-%H-%M-%S')
EXP_ROOT_DIR="/checkpoint/$USER/vissl/$RUN_ID"

echo "EXP_ROOT_DIR: $EXP_ROOT_DIR"
echo "CONFIG_FILE: ${FILE[0]}"

# Quote the directory everywhere so a path containing spaces or glob
# characters cannot be word-split into unintended rm/cp targets.
rm -rf "$EXP_ROOT_DIR"
mkdir -p "$EXP_ROOT_DIR"
cp -r . "$EXP_ROOT_DIR"

####################### launch the scheduler job ###############################
# Run the launcher from the copied tree so later edits to the working
# directory do not affect the submitted job.
export PYTHONPATH="$EXP_ROOT_DIR/:$PYTHONPATH"
python -u "$EXP_ROOT_DIR/tools/launch_benchmark_suite_scheduler_slurm.py" \
  "${FILE[@]}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
# Copyright (c) Facebook, Inc. and its affiliates. | ||
|
||
# This source code is licensed under the MIT license found in the | ||
# LICENSE file in the root directory of this source tree. | ||
|
||
import pkg_resources | ||
import sys | ||
|
||
import submitit | ||
from fvcore.common.file_io import PathManager | ||
from vissl.utils.benchmark_suite_scheduler import BenchmarkSuiteScheduler | ||
from vissl.config.attr_dict import AttrDict | ||
from vissl.utils.hydra_config import is_hydra_available | ||
from vissl.utils.io import load_file | ||
from vissl.utils.misc import recursive_dict_merge | ||
from vissl.utils.slurm import is_submitit_available | ||
|
||
|
||
# Default config options: resolve the defaults JSON shipped in the `dev` package | ||
# and load it once at import time. User configs are merged on top of this dict. | ||
default_config_file = pkg_resources.resource_filename( | ||
"dev", "benchmark_suite/benchmark_suite_scheduler_defaults.json" | ||
) | ||
_DEFAULT_CONFIG = load_file(default_config_file) | ||
|
||
|
||
class SlurmEvaluatorJob:
    """
    The slurm evaluator job is a thin wrapper around the BenchmarkSuiteScheduler
    used by submitit. Its main function is to run multiple evaluations on a
    single training.

    Calling the instance runs the scheduler's ``evaluate()``. ``checkpoint()``
    saves the evaluation benchmarks and returns a resubmission that wraps the
    same scheduler.
    """

    def __init__(self, benchmark_suite_scheduler: BenchmarkSuiteScheduler):
        self.benchmark_suite_scheduler = benchmark_suite_scheduler

    def __call__(self):
        self.benchmark_suite_scheduler.evaluate()

    def checkpoint(self):
        """
        This method is called whenever a job is pre-empted, timed out, etc.
        Here we save the evaluation benchmarks, so that we can reload them
        and continue where we left off.

        Returns:
            A ``submitit.helpers.DelayedSubmission`` wrapping a new
            SlurmEvaluatorJob built around the same scheduler.
        """
        scheduler = self.benchmark_suite_scheduler
        scheduler.save_evaluation_benchmarks()

        # Force the scheduler to automatically reload its checkpoint (the
        # benchmark results) when the job is resubmitted.
        scheduler.autoload_benchmark_suite_scheduler_checkpoint = True
        # NOTE(review): the constructor option in the json config is named
        # `autoload_slurm_evaluator_checkpoint`, so the attribute set above is
        # presumably never read by the scheduler. Set the config-named flag as
        # well so the reload is honored; confirm against
        # BenchmarkSuiteScheduler and drop whichever name is unused.
        scheduler.autoload_slurm_evaluator_checkpoint = True

        resubmitted_job = SlurmEvaluatorJob(benchmark_suite_scheduler=scheduler)
        return submitit.helpers.DelayedSubmission(resubmitted_job)
|
||
|
||
def launch_benchmark_suite_scheduler(config_file):
    """
    Build a BenchmarkSuiteScheduler from a json config and submit it to slurm.

    The user config is merged on top of the packaged defaults. Its "params"
    section is forwarded as keyword arguments to BenchmarkSuiteScheduler, and
    its "slurm_options" section configures the submitit executor.

    Args:
        config_file (str): path to the benchmark suite scheduler json config.

    Raises:
        AssertionError: if the config file does not exist, or if
            "slurm_options" or "slurm_options.PARTITION" is missing.
    """
    import copy

    assert PathManager.exists(config_file), "Slurm evaluator config file must exist"

    user_config = load_file(config_file)
    # Deep copy: recursive_dict_merge updates `config` in place, and a shallow
    # copy would share the nested "params" / "slurm_options" dicts with
    # _DEFAULT_CONFIG, leaking this launch's overrides into the module defaults.
    config = copy.deepcopy(_DEFAULT_CONFIG)
    recursive_dict_merge(config, user_config)

    # Validate the slurm section before building the scheduler and executor,
    # so a bad config fails without doing any other work first.
    assert "slurm_options" in config, "slurm_options must be specified"
    assert (
        "PARTITION" in config["slurm_options"]
    ), "slurm_options.PARTITION is a required field to launch the benchmark suite on slurm"

    benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"])
    benchmark_suite_scheduler_job = SlurmEvaluatorJob(
        benchmark_suite_scheduler=benchmark_suite_scheduler
    )
    executor = submitit.AutoExecutor(folder=benchmark_suite_scheduler.evaluation_dir())

    slurm_options = AttrDict(config["slurm_options"])
    # The scheduler job itself is submitted as a single task on a single node.
    executor.update_parameters(
        name=slurm_options.NAME,
        slurm_comment=slurm_options.COMMENT,
        slurm_partition=slurm_options.PARTITION,
        slurm_constraint=slurm_options.CONSTRAINT,
        timeout_min=slurm_options.TIMEOUT_MIN,
        nodes=1,
        cpus_per_task=slurm_options.CPUS_PER_TASK,
        tasks_per_node=1,
        mem_gb=slurm_options.MEM_GB,
        slurm_additional_parameters=slurm_options.ADDITIONAL_PARAMETERS,
    )

    job = executor.submit(benchmark_suite_scheduler_job)
    print(f"SUBMITTED EVALUATION JOB: {job.job_id}")
|
||
|
||
if __name__ == "__main__":
    # Example usage:
    #   python -u "./tools/launch_benchmark_suite_scheduler_slurm.py" \
    #       "/path/to/benchmark_suite_scheduler_example.json"
    assert is_hydra_available(), "Make sure to install hydra"

    assert (
        is_submitit_available()
    ), "Please 'pip install submitit' to schedule jobs on SLURM"

    # Give a usage message instead of a bare IndexError when the config path
    # is missing from the command line.
    if len(sys.argv) < 2:
        sys.exit(
            f"Usage: python {sys.argv[0]} /path/to/benchmark_suite_scheduler.json"
        )

    config_file = sys.argv[1]
    launch_benchmark_suite_scheduler(config_file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.