Continuous evaluations init commit (#325)
Summary:
Create a script that continuously evaluates benchmarks on checkpoints as they become available from a pretraining run.

<img width="593" alt="Screen Shot 2021-06-02 at 10 22 37 AM" src="https://user-images.githubusercontent.com/25669348/120497511-7888c880-c38c-11eb-8bc1-78bacc5d968b.png">
<img width="1237" alt="Screen Shot 2021-06-02 at 10 22 59 AM" src="https://user-images.githubusercontent.com/25669348/120497575-85a5b780-c38c-11eb-9445-2076e15be888.png">

Next Steps:
1. Deal with sharded checkpoints and their conversion
1. Improve max_iteration logic
1. Extend to FB infra.
1. Write unit tests
1. Think about how to handle these tricky evaluation tests: #325 (comment)
1. Try not to replicate so much logic in the class (e.g. get path names from vissl code, requires some refactoring).
1. Look into email notifications.

Testing:

1. Ran an 8-node SwAV pretraining for 10 epochs with 3 different benchmark evaluations, each with different resource requirements. SUCCESS.

JSON config:

```json
{
    "params": {
        "training_checkpoint_dir": "/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints",
        "benchmarks": [
            {
                "evaluation_name": "clevr_count_linear",
                "config_files": [
                    "config=config_local/eval_resnet_8gpu_transfer_clevr_count_linear_benchmark_suite_scheduler_test.yaml"
                ]
            },
            {
                "evaluation_name": "clevr_dist_linear",
                "config_files": [
                    "config=config_local/eval_resnet_8gpu_transfer_clevr_dist_linear_benchmark_suite_scheduler_test.yaml"
                ]
            },
            {
                "evaluation_name": "in1k_linear",
                "config_files": [
                    "config=config_local/eval_resnet_8gpu_transfer_in1k_linear_benchmark_suite_scheduler_test.yaml"
                ]
            }
        ],
        "evaluation_iter_freq": 600,
        "evaluation_phase_freq": 2,
        "evaluate_final_phase": true,
        "autoload_slurm_evaluator_checkpoint": false,
        "slurm_evaluator_checkpoint": null,
        "auto_retry_evaluations": true,
        "retry_evaluation_job_ids": [],
        "max_retries": 3,
        "pytorch_ports": [40050, 40051, 40052, 40053, 40054, 40055, 40056, 40057, 40058, 40059, 40060, 40061, 40062, 40063]
    },
    "slurm_options": {
        "PARTITION": "learnfair"
    }
}
```
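
For reference, the suite can also be launched programmatically via the entry point added in this PR (a minimal sketch; assumes the vissl repo root is on `PYTHONPATH`, and the JSON path is a placeholder):

```python
# Minimal sketch: launch the scheduler from Python rather than the shell
# script. launch_benchmark_suite_scheduler() is defined in
# tools/launch_benchmark_suite_scheduler_slurm.py (added below); the config
# path is a placeholder.
from tools.launch_benchmark_suite_scheduler_slurm import (
    launch_benchmark_suite_scheduler,
)

launch_benchmark_suite_scheduler("/path/to/benchmark_suite_scheduler.json")
```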

Example snippet from `evaluation_metrics.json`:

```json
{
    "model_final_checkpoint_phase9": [
        {
            "checkpoint_dir": "/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear/checkpoints",
            "config_files": [
                "config=config_local/eval_resnet_8gpu_transfer_clevr_count_linear_benchmark_suite_scheduler_test.yaml",
                "hydra.run.dir='/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear'",
                "config.CHECKPOINT.DIR='/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear/checkpoints'",
                "config.SLURM.LOG_FOLDER='/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear'",
                "config.SLURM.LOG_FOLDER='/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear'",
                "config.SLURM.USE_SLURM=true",
                "config.MODEL.WEIGHTS_INIT.PARAMS_FILE='/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/model_final_checkpoint_phase9.torch'"
            ],
            "evaluation_name": "clevr_count_linear",
            "job_id": "42410489",
            "metrics": {
                "test_accuracy_list_meter_top_1_res5": {
                    "iteration": 822,
                    "metric": 34.62,
                    "train_phase_idx": 2
                },
                "train_accuracy_list_meter_top_1_res5": {
                    "iteration": 822,
                    "metric": 33.8514,
                    "train_phase_idx": 2
                }
            },
            "num_retries": 1,
            "slurm_checkpoint_dir": "/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear/checkpoints",
            "slurm_log_dir": "/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear",
            "slurm_state": "COMPLETED",
            "weights_init_params_file": "/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/model_final_checkpoint_phase9.torch"
        }, ...
```
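
One way to consume this file (a sketch, not part of the PR; assumes the structure shown above, where each checkpoint name maps to a list of benchmark records):

```python
import json

# Sketch: report the best test metric per (benchmark, metric) pair from
# evaluation_metrics.json, assuming the layout in the snippet above.
with open("evaluation_metrics.json") as f:
    evaluations = json.load(f)

best = {}
for checkpoint_name, records in evaluations.items():
    for record in records:
        benchmark = record["evaluation_name"]
        for metric_name, info in record["metrics"].items():
            if not metric_name.startswith("test_"):
                continue
            key = (benchmark, metric_name)
            if key not in best or info["metric"] > best[key]:
                best[key] = info["metric"]

for (benchmark, metric_name), value in sorted(best.items()):
    print(f"{benchmark} {metric_name}: {value}")
```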

The following hold:
1. Training completes as expected, without errors.
1. Checkpoints can be resumed.
1. The evaluation folder structure is as expected above.
1. Best metrics are extracted.

Pull Request resolved: #325

Reviewed By: prigoyal

Differential Revision: D28901750

Pulled By: iseessel

fbshipit-source-id: 732074043200ac51f3e709d5e67e686f26d36835
iseessel authored and facebook-github-bot committed Jun 14, 2021
1 parent 5376448 commit b9c5b34
Showing 11 changed files with 944 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .gitignore
```diff
@@ -97,5 +97,5 @@ website/pages/tutorials/*
 **/.ipynb_checkpoints/**
 
 # Configs for local development
-configs/config_local/*
+configs/config/config_local/*
 train_config.yaml
```
Empty file added dev/__init__.py
22 changes: 22 additions & 0 deletions dev/benchmark_suite/benchmark_suite_scheduler_defaults.json
@@ -0,0 +1,22 @@
```json
{
    "params": {
        "evaluation_iter_freq": -1,
        "evaluation_phase_freq": -1,
        "evaluate_final_phase": true,
        "autoload_slurm_evaluator_checkpoint": false,
        "slurm_evaluator_checkpoint": null,
        "auto_retry_evaluations": false,
        "retry_evaluation_job_ids": [],
        "max_retries": 3,
        "pytorch_ports": [40050]
    },
    "slurm_options": {
        "NAME": "vissl",
        "COMMENT": "vissl evaluation job",
        "CONSTRAINT": "",
        "TIMEOUT_MIN": 4320,
        "CPUS_PER_TASK": 8,
        "MEM_GB": 16,
        "ADDITIONAL_PARAMETERS": {}
    }
}
```
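
These defaults are deep-merged with the user's JSON by the launcher below (via `recursive_dict_merge`). A hypothetical stand-in to illustrate the intended semantics, where nested dicts merge key by key and user leaves win (`merge_into` is not the actual vissl helper):

```python
# Hypothetical stand-in for vissl.utils.misc.recursive_dict_merge, shown only
# to illustrate how a user config overrides the defaults file above.
def merge_into(defaults: dict, overrides: dict) -> None:
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(defaults.get(key), dict):
            merge_into(defaults[key], value)  # recurse into nested sections
        else:
            defaults[key] = value  # user-supplied leaf wins

defaults = {"params": {"max_retries": 3, "pytorch_ports": [40050]}}
user = {"params": {"max_retries": 5}}
merge_into(defaults, user)
assert defaults["params"] == {"max_retries": 5, "pytorch_ports": [40050]}
```
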
33 changes: 33 additions & 0 deletions dev/benchmark_suite/benchmark_suite_scheduler_template.json
@@ -0,0 +1,33 @@
```json
{
    "params": {
        "training_checkpoint_dir": "(str) Training checkpoint directory. That is the CHECKPOINT.DIR of the training config",
        "benchmarks": [
            {
                "evaluation_name": "(str) Name of benchmark for convenience",
                "config_files": [
                    "config=path/to/evaluation/config",
                    "config.OVERRIDES=new_value"
                ]
            }
        ],
        "evaluation_iter_freq": "(int, default=-1) Evaluate the checkpoint every N iterations",
        "evaluation_phase_freq": "(int, default=-1) Evaluate the checkpoint every N phases",
        "evaluate_final_phase": "(bool, default=True) Evaluate the final phase",
        "autoload_slurm_evaluator_checkpoint": "(bool, default=False) Whether or not to automatically load the benchmark checkpoint",
        "slurm_evaluator_checkpoint": "(str, default=None) Path to load the benchmark checkpoint",
        "auto_retry_evaluations": "(bool, default=False) Whether or not to automatically retry the evaluations",
        "retry_evaluation_job_ids": "(array[int], default=[]) Array of job_ids to retry",
        "max_retries": "(int, default=3) Maximum number of retries",
        "pytorch_ports": "(List[int], default=[40050]) List of PyTorch ports to cycle through as you launch your evaluations, in order to prevent PyTorch DDP port collisions"
    },
    "slurm_options": {
        "PARTITION": "(str) Partition",
        "NAME": "(str, default=vissl) Name of slurm job",
        "COMMENT": "(str, default=vissl evaluation job) Comment of slurm job",
        "CONSTRAINT": "(str, default='') Constraint of slurm job",
        "TIMEOUT_MIN": "(int, default=72 * 60) Number of minutes before the job times out",
        "CPUS_PER_TASK": "(int, default=8) Number of CPUs per task",
        "MEM_GB": "(int, default=32) Amount of RAM to request from slurm",
        "ADDITIONAL_PARAMETERS": "(Dict[str, Any], default={}) Any additional slurm options to pass to submitit"
    }
}
```
39 changes: 39 additions & 0 deletions dev/launch_benchmark_suite_scheduler_slurm.sh
@@ -0,0 +1,39 @@
```bash
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# This benchmark suite script launches a benchmark suite scheduler slurm job.
# The job takes an absolute json config path (see benchmark_suite_scheduler_template.json for info).
# The job continuously monitors training benchmarks, dynamically launches
# evaluation jobs, and amalgamates the results.

######################### EXAMPLE USAGE #################################

# cd into vissl root directory.
#
# bash ./dev/launch_benchmark_suite_scheduler_slurm.sh /path/to/benchmark_suite_scheduler.json

# See benchmark_suite_scheduler_template.json for config information or slurm_evaluator.py for class structure.
######################### INPUT PARAMS ##################################

FILE=( "$@" )

####################### setup experiment dir ###################################

# create a temporary experiment folder to run the SLURM job in isolation
RUN_ID=$(date +'%Y-%m-%d-%H-%M-%S')
EXP_ROOT_DIR="/checkpoint/$USER/vissl/$RUN_ID"

echo "EXP_ROOT_DIR: $EXP_ROOT_DIR"
echo "CONFIG_FILE: ${FILE[0]}"

rm -rf "$EXP_ROOT_DIR"
mkdir -p "$EXP_ROOT_DIR"
cp -r . "$EXP_ROOT_DIR"

####################### launch scheduler #######################################
export PYTHONPATH="$EXP_ROOT_DIR/:$PYTHONPATH"
python -u "$EXP_ROOT_DIR/tools/launch_benchmark_suite_scheduler_slurm.py" \
    "${FILE[@]}"
```
107 changes: 107 additions & 0 deletions tools/launch_benchmark_suite_scheduler_slurm.py
@@ -0,0 +1,107 @@
```python
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import sys

import pkg_resources
import submitit
from fvcore.common.file_io import PathManager
from vissl.config.attr_dict import AttrDict
from vissl.utils.benchmark_suite_scheduler import BenchmarkSuiteScheduler
from vissl.utils.hydra_config import is_hydra_available
from vissl.utils.io import load_file
from vissl.utils.misc import recursive_dict_merge
from vissl.utils.slurm import is_submitit_available


# Default config options
default_config_file = pkg_resources.resource_filename(
    "dev", "benchmark_suite/benchmark_suite_scheduler_defaults.json"
)
_DEFAULT_CONFIG = load_file(default_config_file)


class SlurmEvaluatorJob:
    """
    The slurm evaluator job is a thin wrapper around BenchmarkSuiteScheduler
    used by submitit. Its main function is to run multiple evaluations
    on a single training.
    """

    def __init__(self, benchmark_suite_scheduler: BenchmarkSuiteScheduler):
        self.benchmark_suite_scheduler = benchmark_suite_scheduler

    def __call__(self):
        self.benchmark_suite_scheduler.evaluate()

    def checkpoint(self):
        """
        This method is called whenever a job is preempted, timed out, etc.
        Here we save the evaluation benchmarks, so that we can reload them
        and continue where we left off.
        """
        self.benchmark_suite_scheduler.save_evaluation_benchmarks()
        # Forces the benchmark_suite_scheduler to automatically reload its
        # checkpoint, the benchmark results.
        self.benchmark_suite_scheduler.autoload_benchmark_suite_scheduler_checkpoint = (
            True
        )

        trainer = SlurmEvaluatorJob(
            benchmark_suite_scheduler=self.benchmark_suite_scheduler
        )
        return submitit.helpers.DelayedSubmission(trainer)


def launch_benchmark_suite_scheduler(config_file):
    assert PathManager.exists(config_file), "Slurm evaluator config file must exist"

    user_config = load_file(config_file)
    config = _DEFAULT_CONFIG.copy()
    recursive_dict_merge(config, user_config)

    benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"])
    benchmark_suite_scheduler_job = SlurmEvaluatorJob(
        benchmark_suite_scheduler=benchmark_suite_scheduler
    )
    executor = submitit.AutoExecutor(folder=benchmark_suite_scheduler.evaluation_dir())

    assert "slurm_options" in config, "slurm_options must be specified"
    assert (
        "PARTITION" in config["slurm_options"]
    ), "slurm_options.PARTITION is a required field to launch the benchmark suite on slurm"

    slurm_options = AttrDict(config["slurm_options"])
    executor.update_parameters(
        name=slurm_options.NAME,
        slurm_comment=slurm_options.COMMENT,
        slurm_partition=slurm_options.PARTITION,
        slurm_constraint=slurm_options.CONSTRAINT,
        timeout_min=slurm_options.TIMEOUT_MIN,
        nodes=1,
        cpus_per_task=slurm_options.CPUS_PER_TASK,
        tasks_per_node=1,
        mem_gb=slurm_options.MEM_GB,
        slurm_additional_parameters=slurm_options.ADDITIONAL_PARAMETERS,
    )

    job = executor.submit(benchmark_suite_scheduler_job)
    print(f"SUBMITTED EVALUATION JOB: {job.job_id}")


if __name__ == "__main__":
    """
    Example usage:
    python -u "./tools/launch_benchmark_suite_scheduler_slurm.py" \
        "/path/to/benchmark_suite_scheduler_example.json"
    """
    assert is_hydra_available(), "Make sure to install hydra"

    assert (
        is_submitit_available()
    ), "Please 'pip install submitit' to schedule jobs on SLURM"

    config_file = sys.argv[1]
    launch_benchmark_suite_scheduler(config_file)
```
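
The `checkpoint()`/`DelayedSubmission` pattern above is how the job survives preemption: submitit calls `checkpoint()` when the job is interrupted or times out, and requeues the returned submission. A minimal, self-contained sketch of the same pattern (the `Counter` class is a toy, not part of the PR; `AutoExecutor` falls back to local execution when slurm is unavailable):

```python
import submitit


class Counter:
    """Toy stateful job showing submitit's checkpoint/requeue pattern."""

    def __init__(self, n: int = 0):
        self.n = n

    def __call__(self) -> int:
        self.n += 1
        return self.n

    def checkpoint(self) -> "submitit.helpers.DelayedSubmission":
        # Called on preemption/timeout: requeue with the current state so the
        # job resumes where it left off, mirroring SlurmEvaluatorJob above.
        return submitit.helpers.DelayedSubmission(Counter(self.n))


executor = submitit.AutoExecutor(folder="submitit_logs")
executor.update_parameters(timeout_min=60)
job = executor.submit(Counter())
print(job.result())  # 1
```
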
5 changes: 5 additions & 0 deletions vissl/hooks/log_hooks.py
```diff
@@ -245,6 +245,11 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
                 "eta": eta_string,
                 "peak_mem(M)": peak_mem_used,
             }
+
+            if iteration == 1:
+                # Set max iterations. Currently used in benchmark_suite_scheduler.py
+                log_data["max_iterations"] = task.max_iteration
+
             if self.btime_freq and len(batch_times) >= self.btime_freq:
                 rolling_avg_time = (
                     sum(batch_times[-self.btime_freq :]) / self.btime_freq
```
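
On the scheduler side, this `max_iterations` entry can be recovered from the training logs. A hypothetical reader (the helper name and the one-JSON-object-per-line log format are assumptions, not the PR's actual parsing code):

```python
import json
from typing import Optional


def find_max_iterations(log_file: str) -> Optional[int]:
    # Hypothetical helper: scan a log of JSON lines for the "max_iterations"
    # field that the hook above emits at iteration 1.
    with open(log_file) as f:
        for line in f:
            try:
                record = json.loads(line)
            except ValueError:  # skip non-JSON lines
                continue
            if isinstance(record, dict) and "max_iterations" in record:
                return record["max_iterations"]
    return None
```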
