
Commit 82d000a: Address comments
iseessel committed Jun 1, 2021 (1 parent: 85fa89a)
Showing 6 changed files with 103 additions and 100 deletions.

[One file in this commit was deleted; its name and contents are not shown in this view.]

New file (example benchmark suite scheduler config, 23 lines):
@@ -0,0 +1,23 @@
{
    "params": {
        "training_checkpoint_dir": "(str) Training checkpoint directory. That is the CHECKPOINT.DIR of the training config",
        "benchmarks": [
            {
                "evaluation_name": "(str) Name of benchmark for convenience",
                "config_files": [
                    "config=path/to/evaluation/config",
                    "config.OVERRIDES=new_value"
                ]
            }
        ],
        "evaluation_iter_freq": "(int, default=-1) Evaluate the checkpoint every N iterations",
        "evaluation_phase_freq": "(int, default=-1) Evaluate the checkpoint every N phases",
        "evaluate_final_phase": "(bool, default=True) Evaluate the final phase",
        "autoload_slurm_evaluator_checkpoint": "(bool, default=False) Whether or not to automatically load the benchmark checkpoint",
        "slurm_evaluator_checkpoint": "(str, default=None) Path to load the benchmark checkpoint",
        "max_training_iterations": "(int, default=None) Max training iterations",
        "auto_retry_evaluations": "(bool, default=False) Whether or not to automatically retry the evaluations",
        "retry_evaluation_job_ids": "(array[int], default=[]) Array of job_ids to retry",
        "max_retries": "(int, default=3) Maximum number of retries"
    }
}
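
For concreteness, a filled-in config might look like the sketch below. All paths, the benchmark name, and the frequencies are hypothetical placeholders; the optional "slurm_options" block is merged over _DEFAULT_SLURM_OPTIONS by the launcher (see tools/launch_benchmark_suite_scheduler_slurm.py below):

{
    "params": {
        "training_checkpoint_dir": "/checkpoints/my_pretraining",
        "benchmarks": [
            {
                "evaluation_name": "in1k_linear_eval",
                "config_files": [
                    "config=path/to/benchmark/eval_config",
                    "config.DATA.TRAIN.BATCHSIZE_PER_REPLICA=64"
                ]
            }
        ],
        "evaluation_iter_freq": -1,
        "evaluation_phase_freq": 1,
        "evaluate_final_phase": true,
        "autoload_slurm_evaluator_checkpoint": false,
        "slurm_evaluator_checkpoint": null,
        "auto_retry_evaluations": true,
        "retry_evaluation_job_ids": [],
        "max_retries": 3
    },
    "slurm_options": {
        "PARTITION": "learnfair",
        "TIMEOUT_MIN": 1440
    }
}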
dev/launch_benchmark_suite_scheduler_slurm.sh
@@ -7,8 +7,8 @@
 ######################### EXAMPLE USAGE #################################
 # cd into vissl root directory.
 #
-# bash ./dev/launch_slurm_evaluator.sh
-# /private/home/iseessel/vissl/configs/config/benchmark/slurm_evaluations/slurm_evaluation_example.json
+# bash ./dev/launch_benchmark_suite_scheduler_slurm.sh \
+# /private/home/iseessel/vissl/configs/config/benchmark/slurm_evaluations/slurm_evaluation_example.json
 #
 # See slurm_evaluation_example.json for an example config, or benchmark_suite_scheduler.py for the class structure.
 ######################### INPUT PARAMS ##################################
@@ -30,5 +30,5 @@ cp -r . $EXP_ROOT_DIR

 ####################### setup experiment dir ###################################
 export PYTHONPATH="$EXP_ROOT_DIR/:$PYTHONPATH"
-python -u "$EXP_ROOT_DIR/vissl/engines/slurm_evaluator.py" \
+python -u "$EXP_ROOT_DIR/tools/launch_benchmark_suite_scheduler_slurm.py" \
 "${FILE[@]}"
tools/launch_benchmark_suite_scheduler_slurm.py (new file, 73 additions)
@@ -0,0 +1,73 @@
import sys

import submitit
from fvcore.common.file_io import PathManager
from vissl.engines.benchmark_suite_scheduler import BenchmarkSuiteScheduler
from vissl.utils.hydra_config import AttrDict  # wraps the merged slurm options below
from vissl.utils.io import load_file

# Default slurm options to pass to the executor.
_DEFAULT_SLURM_OPTIONS = {
    "NAME": "vissl",
    "COMMENT": "vissl evaluation job",
    "PARTITION": "learnfair",
    "CONSTRAINT": "",
    "TIMEOUT_MIN": 4320,  # Timeout in minutes.
    "CPUS_PER_TASK": 8,
    "MEM_GB": 16,
    "ADDITIONAL_PARAMETERS": {},
}

class SlurmEvaluatorJob:
    def __init__(self, benchmark_suite_scheduler: BenchmarkSuiteScheduler):
        self.benchmark_suite_scheduler = benchmark_suite_scheduler

    def __call__(self):
        self.benchmark_suite_scheduler.evaluate()

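    # submitit invokes checkpoint() when the job is preempted or hits its time
    # limit: results are saved, the autoload flag tells the requeued run to
    # resume from them, and the returned DelayedSubmission requeues a fresh
    # callable with the same scheduler state.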
    def checkpoint(self):
        self.benchmark_suite_scheduler.save_evaluation_results()
        self.benchmark_suite_scheduler.autoload_benchmark_suite_scheduler_checkpoint = True

        trainer = SlurmEvaluatorJob(
            benchmark_suite_scheduler=self.benchmark_suite_scheduler
        )
        return submitit.helpers.DelayedSubmission(trainer)

def launch_benchmark_suite_scheduler(config_file):
    assert PathManager.exists(config_file), "Slurm evaluator config file must exist."

    config = load_file(config_file)
    benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"])
    benchmark_suite_scheduler.evaluate()
    benchmark_suite_scheduler_job = SlurmEvaluatorJob(
        benchmark_suite_scheduler=benchmark_suite_scheduler
    )
    executor = submitit.AutoExecutor(folder=benchmark_suite_scheduler.evaluation_dir())

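    # User-supplied "slurm_options" (if any) take precedence over the defaults;
    # AttrDict exposes the merged dict with attribute access for the block below.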
    override_slurm_options = config.get("slurm_options", {})
    slurm_options = {**_DEFAULT_SLURM_OPTIONS, **override_slurm_options}
    slurm_options = AttrDict(slurm_options)

    executor.update_parameters(
        name=slurm_options.NAME,
        slurm_comment=slurm_options.COMMENT,
        slurm_partition=slurm_options.PARTITION,
        slurm_constraint=slurm_options.CONSTRAINT,
        timeout_min=slurm_options.TIMEOUT_MIN,
        nodes=1,
        cpus_per_task=slurm_options.CPUS_PER_TASK,
        tasks_per_node=1,
        mem_gb=slurm_options.MEM_GB,
        slurm_additional_parameters=slurm_options.ADDITIONAL_PARAMETERS,
    )

    job = executor.submit(benchmark_suite_scheduler_job)
    print(f"SUBMITTED EVALUATION JOB: {job.job_id}")

    return job


if __name__ == "__main__":
    """
    Example usage:

    python -u "./tools/launch_benchmark_suite_scheduler_slurm.py" \
        "/path/to/slurm_evaluation_example.json"
    """
    config_file = sys.argv[1]
    launch_benchmark_suite_scheduler(config_file)
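
A usage note: the launcher can also be invoked programmatically. A minimal sketch, assuming the vissl repo root is on PYTHONPATH; the config path is a placeholder:

from tools.launch_benchmark_suite_scheduler_slurm import launch_benchmark_suite_scheduler

# Submits the benchmark suite scheduler as a SLURM job and returns the
# submitit Job handle; point the path at a real config.
job = launch_benchmark_suite_scheduler("/path/to/slurm_evaluation_example.json")
print(job.job_id)  # SLURM job id assigned by submitit
# job.result() would block until the scheduler process exits.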
vissl/engines/benchmark_suite_scheduler.py (renamed from vissl/engines/slurm_evaluator.py)
@@ -6,7 +6,6 @@
 import collections
 import json
 import os
-import sys
 import time
 from pathlib import Path
 from typing import List
@@ -44,20 +43,8 @@
 # Wait for the training checkpoint folder to be available for 1 hour.
 _TRAINING_CHECKPOINT_FILE_WAIT_SECONDS = 60 * 60
 
-# Default slurm options to pass to the executor.
-_DEFAULT_SLURM_OPTIONS = {
-    "NAME": "vissl",
-    "COMMENT": "vissl evaluation job",
-    "PARTITION": "learnfair",
-    "CONSTRAINT": "",
-    "TIMEOUT_MIN": 4320,  # Timeout in minutes.
-    "CPUS_PER_TASK": 8,
-    "MEM_GB": 16,
-    "ADDITIONAL_PARAMETERS": {},
-}
-
-
-class SlurmEvaluator:
+class BenchmarkSuiteScheduler:
     """
     The Slurm Evaluator is a class designed to continuously monitor VISSL pretrainings
     and launch evaluations as checkpoints become available. The method takes a
@@ -350,7 +337,7 @@ def _monitor_benchmark_job(self, benchmark):

         if job.state != benchmark["slurm_state"]:
             # Job state has changed, log transition, and update state in json file.
-            checkpoint_str = benchmark["weights_init_params_file"].split("/")[-1]
+            checkpoint_str = os.path.split(benchmark["weights_init_params_file"])[-1]
 
             log = f"""
             Slurm Evaluation job changed states.
@@ -419,7 +406,7 @@ def _generate_initial_evaluation_results(self):
         evaluation_configs = {}
 
         for benchmark in self.benchmarks:
-            default_evaluation_name = benchmark["config_files"][0].split("/")[-1]
+            default_evaluation_name = os.path.split(benchmark["config_files"][0])[-1]
             evaluation_name = (
                 benchmark.get("evaluation_name") or default_evaluation_name
             )
@@ -543,7 +530,6 @@ def _set_initial_benchmark_result(
         config_files.insert(1, f"config.SLURM.LOG_FOLDER='{log_dir}'")
         config_files.insert(1, f"config.CHECKPOINT.DIR='{checkpoint_dir}'")
         config_files.insert(1, f"hydra.run.dir='{ log_dir }'")
-        config_files.insert(1, "hydra.verbose=true")
 
         evaluation_configs[training_checkpoint].append(benchmark_result)
 
@@ -571,60 +557,3 @@ def _generate_config(self, config):
         config = compose("defaults", overrides=config)
 
         return convert_to_attrdict(config)


-class SlurmEvaluatorJob:
-    def __init__(self, slurm_evaluator: SlurmEvaluator):
-        self.slurm_evaluator = slurm_evaluator
-
-    def __call__(self):
-        self.slurm_evaluator.evaluate()
-
-    def checkpoint(self):
-        self.slurm_evaluator.save_evaluation_results()
-        self.slurm_evaluator.autoload_slurm_evaluator_checkpoint = True
-
-        trainer = SlurmEvaluatorJob(slurm_evaluator=self.slurm_evaluator)
-        return submitit.helpers.DelayedSubmission(trainer)
-
-
-def launch_slurm_evaluator(config_file):
-    assert PathManager.exists(config_file), "Slurm evaluator config file must exist."
-
-    config = load_file(config_file)
-    slurm_evaluator = SlurmEvaluator(**config["params"])
-    slurm_evaluator.evaluate()
-    slurm_evaluator_job = SlurmEvaluatorJob(slurm_evaluator=slurm_evaluator)
-    executor = submitit.AutoExecutor(folder=slurm_evaluator.evaluation_dir())
-
-    override_slurm_options = config.get("slurm_options", {})
-    slurm_options = {**_DEFAULT_SLURM_OPTIONS, **override_slurm_options}
-    slurm_options = AttrDict(slurm_options)
-
-    executor.update_parameters(
-        name=slurm_options.NAME,
-        slurm_comment=slurm_options.COMMENT,
-        slurm_partition=slurm_options.PARTITION,
-        slurm_constraint=slurm_options.CONSTRAINT,
-        timeout_min=slurm_options.TIMEOUT_MIN,
-        nodes=1,
-        cpus_per_task=slurm_options.CPUS_PER_TASK,
-        tasks_per_node=1,
-        mem_gb=slurm_options.MEM_GB,
-        slurm_additional_parameters=slurm_options.ADDITIONAL_PARAMETERS,
-    )
-
-    job = executor.submit(slurm_evaluator_job)
-    print(f"SUBMITTED EVALUATION JOB: {job.job_id}")
-
-    return job
-
-
-if __name__ == "__main__":
-    """
-    Example usage:
-    python -u "./vissl/engines/slurm_evaluator.py" \
-        "/path/to/checkpoint_file/checkpoint.torch"
-    """
-    config_file = sys.argv[1]
-    launch_slurm_evaluator(config_file)
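
For context on the _generate_config hunk above: the config_files list from the JSON config becomes a flat list of hydra override strings before being passed to compose. A sketch of what that list might look like after _set_initial_benchmark_result prepends its entries (all paths are hypothetical placeholders):

# Hypothetical final overrides list; the first and last entries come from the
# user's JSON config, the middle three are inserted by the scheduler.
overrides = [
    "config=path/to/evaluation/config",
    "hydra.run.dir='/path/to/eval/logs'",
    "config.CHECKPOINT.DIR='/path/to/eval/checkpoints'",
    "config.SLURM.LOG_FOLDER='/path/to/eval/logs'",
    "config.OVERRIDES=new_value",
]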
vissl/utils/distributed_launcher.py (1 addition, 1 deletion)
@@ -275,4 +275,4 @@ def launch_distributed_on_slurm(cfg: AttrDict, engine_name: str):
     job = executor.submit(trainer)
     print(f"SUBMITTED: {job.job_id}")
 
-    return job
\ No newline at end of file
+    return job
