Skip to content
This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit

Permalink
Introduce slurm evaluator, Continuous monitoring and launching of eva…
Browse files Browse the repository at this point in the history
…luations for slurm jobs
  • Loading branch information
iseessel committed Jun 14, 2021
1 parent b5fefc2 commit 2d17ad6
Show file tree
Hide file tree
Showing 11 changed files with 931 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Expand Up @@ -97,5 +97,5 @@ website/pages/tutorials/*
**/.ipynb_checkpoints/**

# Configs for local development
configs/config_local/*
configs/config/config_local/*
train_config.yaml
Empty file added dev/__init__.py
Empty file.
22 changes: 22 additions & 0 deletions dev/benchmark_suite/benchmark_suite_scheduler_defaults.json
@@ -0,0 +1,22 @@
{
"params": {
"evaluation_iter_freq": -1,
"evaluation_phase_freq": -1,
"evaluate_final_phase": true,
"autoload_slurm_evaluator_checkpoint": false,
"slurm_evaluator_checkpoint": null,
"auto_retry_evaluations": false,
"retry_evaluation_job_ids": [],
"max_retries": 3,
"pytorch_ports": [40050]
},
"slurm_options": {
"NAME": "vissl",
"COMMENT": "vissl evaluation job",
"CONSTRAINT": "",
"TIMEOUT_MIN": 4320,
"CPUS_PER_TASK": 8,
"MEM_GB": 16,
"ADDITIONAL_PARAMETERS": {}
}
}
33 changes: 33 additions & 0 deletions dev/benchmark_suite/benchmark_suite_scheduler_template.json
@@ -0,0 +1,33 @@
{
"params": {
"training_checkpoint_dir": "(str) Training checkpoint directory. That is the CHECKPOINT.DIR of the training config",
"benchmarks": [
{
"evaluation_name": "(str) Name of benchmark for convenience",
"config_files": [
"config=path/to/evaluation/config",
"config.OVERRIDES=new_value"
]
}
],
"evaluation_iter_freq": "(int, default=-1) Evaluate the checkpoint every N iterations",
"evaluation_phase_freq": "(int, default=-1) Evaluate the checkpoint every N phases",
"evaluate_final_phase": "(bool, default=True) Evaluate the final phase",
"autoload_slurm_evaluator_checkpoint": "(bool, default=False) Whether or not to automatically load the benchmark checkpoint",
"slurm_evaluator_checkpoint": "(str, default=None) Path to load the benchmark checkpoint",
"auto_retry_evaluations": "(bool, default=False) Whether or not to automatically retry the evaluations",
"retry_evaluation_job_ids": "(array[int], default=[]) Array of job_ids to retry",
"max_retries": "(int, default=3) Maximum number of retries",
"pytorch_ports": "(List[int], default=[40050]) List of pytorch ports to cycle through as you are launching your evaluations, in order to prevent PyTorch DDP port collisions."
},
"slurm_options": {
"PARTITION": "(str) Partition",
"NAME": "(str, default=vissl) Name of slurm job",
"COMMENT": "(str, default=vissl evaluation job) Comment of slurm job",
"CONSTRAINT": "(str, default='') Constraint of slurm job",
"TIMEOUT_MIN": "(int, default=72 * 60) Minimum amount of minutes to timeout",
"CPUS_PER_TASK": "(int, default=8) Number of cpus per task.",
"MEM_GB": "(int, default=16) Amount of RAM to request from slurm",
"ADDITIONAL_PARAMETERS": "(Dict[str, Any], default={}) Any default slurm options to pass to submitit"
}
}
39 changes: 39 additions & 0 deletions dev/launch_benchmark_suite_scheduler_slurm.sh
@@ -0,0 +1,39 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# This benchmark suite script launches a benchmark suite scheduler slurm job.
# The job takes an absolute json config path (see benchmark_suite_scheduler_template.json for info)
# The job continuously monitors training benchmarks, and dynamically launches evaluation jobs
# and amalgamates the results.

######################### EXAMPLE USAGE #################################

# cd into vissl root directory.
#
# bash ./dev/launch_benchmark_suite_scheduler_slurm.sh /path/to/benchmark_suite_scheduler.json

# See benchmark_suite_scheduler_template.json for config information or
# slurm_evaluator.py for class structure.
######################### INPUT PARAMS ##################################

# All CLI arguments are forwarded verbatim to the python entry point.
FILE=( "$@" )

####################### setup experiment dir ###################################

# Create a temporary experiment folder to run the SLURM job in isolation.
RUN_ID=$(date +'%Y-%m-%d-%H-%M-%S')
EXP_ROOT_DIR="/checkpoint/$USER/vissl/$RUN_ID"

echo "EXP_ROOT_DIR: $EXP_ROOT_DIR"
# Print every forwarded argument, not just the first array element.
echo "CONFIG_FILE: ${FILE[*]}"

# Quote all path expansions so the script survives whitespace in $USER/paths.
rm -rf "$EXP_ROOT_DIR"
mkdir -p "$EXP_ROOT_DIR"
cp -r . "$EXP_ROOT_DIR"

####################### launch the scheduler ###################################
export PYTHONPATH="$EXP_ROOT_DIR/:$PYTHONPATH"
python -u "$EXP_ROOT_DIR/tools/launch_benchmark_suite_scheduler_slurm.py" \
  "${FILE[@]}"
105 changes: 105 additions & 0 deletions tools/launch_benchmark_suite_scheduler_slurm.py
@@ -0,0 +1,105 @@
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import copy
import sys

import pkg_resources
import submitit
from fvcore.common.file_io import PathManager
from vissl.config.attr_dict import AttrDict
from vissl.utils.benchmark_suite_scheduler import BenchmarkSuiteScheduler
from vissl.utils.hydra_config import is_hydra_available
from vissl.utils.io import load_file
from vissl.utils.misc import recursive_dict_merge
from vissl.utils.slurm import is_submitit_available


# Default config options.
# Loaded once at import time from the json packaged under dev/benchmark_suite;
# user-provided configs are merged on top of these defaults at launch time.
default_config_file = pkg_resources.resource_filename(
    "dev", "benchmark_suite/benchmark_suite_scheduler_defaults.json"
)
_DEFAULT_CONFIG = load_file(default_config_file)


class SlurmEvaluatorJob:
    """
    Thin submitit-compatible wrapper around a BenchmarkSuiteScheduler.

    Its main purpose is to run multiple evaluations for a single training,
    and to support submitit's checkpoint/requeue protocol on preemption.
    """

    def __init__(self, benchmark_suite_scheduler: BenchmarkSuiteScheduler):
        self.benchmark_suite_scheduler = benchmark_suite_scheduler

    def __call__(self):
        # Entry point executed on the slurm node: run all the evaluations.
        self.benchmark_suite_scheduler.evaluate()

    def checkpoint(self):
        """
        Called by submitit whenever the job is preempted, timed out, etc.

        Save the evaluation benchmarks so that the resubmitted job can
        reload them and continue where it left off.
        """
        self.benchmark_suite_scheduler.save_evaluation_benchmarks()
        # Force the scheduler to automatically reload its checkpoint
        # (the benchmark results) when the job is requeued.
        self.benchmark_suite_scheduler.autoload_benchmark_suite_scheduler_checkpoint = (
            True
        )

        resubmitted_job = SlurmEvaluatorJob(
            benchmark_suite_scheduler=self.benchmark_suite_scheduler
        )
        return submitit.helpers.DelayedSubmission(resubmitted_job)


def launch_benchmark_suite_scheduler(config_file):
    """
    Launch a benchmark suite scheduler job on slurm.

    Loads the user's json config, merges it on top of the packaged defaults,
    builds a BenchmarkSuiteScheduler, and submits it via submitit.

    Args:
        config_file: absolute path to the user's json config
            (see benchmark_suite_scheduler_template.json for the schema).
    """
    assert PathManager.exists(config_file), "Slurm evaluator config file must exist"

    user_config = load_file(config_file)
    # Deep-copy the defaults: recursive_dict_merge mutates nested dicts in
    # place, so a shallow .copy() would corrupt the shared _DEFAULT_CONFIG
    # nested dicts across calls.
    config = copy.deepcopy(_DEFAULT_CONFIG)
    recursive_dict_merge(config, user_config)

    benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"])
    benchmark_suite_scheduler_job = SlurmEvaluatorJob(
        benchmark_suite_scheduler=benchmark_suite_scheduler
    )
    executor = submitit.AutoExecutor(folder=benchmark_suite_scheduler.evaluation_dir())

    assert "slurm_options" in config, "slurm_options must be specified"
    assert (
        "PARTITION" in config["slurm_options"]
    ), "slurm_options.PARTITION is a required field to launch the benchmark suite on slurm"

    slurm_options = AttrDict(config["slurm_options"])
    executor.update_parameters(
        name=slurm_options.NAME,
        slurm_comment=slurm_options.COMMENT,
        slurm_partition=slurm_options.PARTITION,
        slurm_constraint=slurm_options.CONSTRAINT,
        timeout_min=slurm_options.TIMEOUT_MIN,
        nodes=1,
        cpus_per_task=slurm_options.CPUS_PER_TASK,
        tasks_per_node=1,
        mem_gb=slurm_options.MEM_GB,
        slurm_additional_parameters=slurm_options.ADDITIONAL_PARAMETERS,
    )

    job = executor.submit(benchmark_suite_scheduler_job)
    print(f"SUBMITTED EVALUATION JOB: {job.job_id}")


if __name__ == "__main__":
    """
    Example usage:
    python -u "./tools/launch_benchmark_suite_scheduler_slurm.py" \
        "/path/to/benchmark_suite_scheduler_example.json"
    """
    assert is_hydra_available(), "Make sure to install hydra"

    assert (
        is_submitit_available()
    ), "Please 'pip install submitit' to schedule jobs on SLURM"

    # Require exactly one positional argument (the json config path) so a
    # missing argument yields a usage message instead of a bare IndexError.
    assert len(sys.argv) == 2, (
        "Usage: launch_benchmark_suite_scheduler_slurm.py /path/to/config.json"
    )

    config_file = sys.argv[1]
    launch_benchmark_suite_scheduler(config_file)
5 changes: 5 additions & 0 deletions vissl/hooks/log_hooks.py
Expand Up @@ -245,6 +245,11 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
"eta": eta_string,
"peak_mem(M)": peak_mem_used,
}

if iteration == 1:
# Set max iterations. Currently used in benchmark_suite_scheduler.py
log_data["max_iterations"] = task.max_iteration

if self.btime_freq and len(batch_times) >= self.btime_freq:
rolling_avg_time = (
sum(batch_times[-self.btime_freq :]) / self.btime_freq
Expand Down

0 comments on commit 2d17ad6

Please sign in to comment.