Skip to content
This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit

Permalink
Address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
iseessel committed Jun 7, 2021
1 parent 8200338 commit 63fec7b
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 29 deletions.
Empty file added dev/__init__.py
Empty file.
21 changes: 21 additions & 0 deletions dev/benchmark_suite/benchmark_suite_scheduler_defaults.json
@@ -0,0 +1,21 @@
{
"params": {
"evaluation_iter_freq": -1,
"evaluation_phase_freq": -1,
"evaluate_final_phase": true,
"autoload_slurm_evaluator_checkpoint": false,
"slurm_evaluator_checkpoint": null,
"auto_retry_evaluations": false,
"retry_evaluation_job_ids": [],
"max_retries": 3
},
"slurm_options": {
"NAME": "vissl",
"COMMENT": "vissl evaluation job",
"CONSTRAINT": "",
"TIMEOUT_MIN": 4320,
"CPUS_PER_TASK": 8,
"MEM_GB": 16,
"ADDITIONAL_PARAMETERS": {}
}
}
25 changes: 25 additions & 0 deletions dev/benchmark_suite/benchmark_suite_scheduler_swav.json
@@ -0,0 +1,25 @@
{
"params": {
"training_checkpoint_dir": "/checkpoint/iseessel/vissl/2021-05-27-12-52-01/checkpoints",
"benchmarks": [
{
"evaluation_name": "quick_eval_in1k_linear",
"config_files": [
"config=test/integration_test/quick_eval_in1k_linear.yaml"
]
}
],
"evaluation_iter_freq": 20,
"evaluation_phase_freq": 2,
"evaluate_final_phase": true,
"autoload_slurm_evaluator_checkpoint": false,
"slurm_evaluator_checkpoint": null,
"max_training_iterations": null,
"auto_retry_evaluations": true,
"retry_evaluation_job_ids": [],
"max_retries": 3
},
"slurm_options": {
"PARTITION": "learnfair"
}
}
33 changes: 15 additions & 18 deletions tools/launch_benchmark_suite_scheduler_slurm.py
@@ -1,30 +1,26 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import pkg_resources
import sys

import submitit
from fvcore.common.file_io import PathManager
from tools.benchmark_suite_scheduler import BenchmarkSuiteScheduler
from vissl.utils.benchmark_suite_scheduler import BenchmarkSuiteScheduler
from vissl.config.attr_dict import AttrDict
from vissl.utils.hydra_config import is_hydra_available
from vissl.utils.io import load_file
from vissl.utils.misc import recursive_dict_merge
from vissl.utils.slurm import is_submitit_available


# Default slurm options to pass to the executor.
_DEFAULT_SLURM_OPTIONS = {
"NAME": "vissl",
"COMMENT": "vissl evaluation job",
"CONSTRAINT": "",
"TIMEOUT_MIN": 72 * 60, # Timeout in minutes.
"CPUS_PER_TASK": 8,
"MEM_GB": 16,
"ADDITIONAL_PARAMETERS": {},
}
# Default config options
default_config_file = pkg_resources.resource_filename(
"dev", "benchmark_suite/benchmark_suite_scheduler_defaults.json"
)
_DEFAULT_CONFIG = load_file(default_config_file)


class SlurmEvaluatorJob:
Expand Down Expand Up @@ -60,22 +56,23 @@ def checkpoint(self):
def launch_benchmark_suite_scheduler(config_file):
assert PathManager.exists(config_file), "Slurm evaluator config file must exist"

config = load_file(config_file)
user_config = load_file(config_file)
config = _DEFAULT_CONFIG.copy()
recursive_dict_merge(config ,user_config)

benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"])
benchmark_suite_scheduler_job = SlurmEvaluatorJob(
benchmark_suite_scheduler=benchmark_suite_scheduler
)
executor = submitit.AutoExecutor(folder=benchmark_suite_scheduler.evaluation_dir())

assert "slurm_options" in config, "slurm_options must be specified"
override_slurm_options = config["slurm_options"]
assert (
"PARTITION" in override_slurm_options
"PARTITION" in config["slurm_options"]
), "slurm_options.PARTITION is a required field to launch the benchmark suite on slurm"

slurm_options = {**_DEFAULT_SLURM_OPTIONS, **override_slurm_options}
slurm_options = AttrDict(slurm_options)

slurm_options = AttrDict(config["slurm_options"])
benchmark_suite_scheduler.evaluate()
executor.update_parameters(
name=slurm_options.NAME,
slurm_comment=slurm_options.COMMENT,
Expand Down
Expand Up @@ -18,18 +18,18 @@
from vissl.trainer.trainer_main import build_task
from vissl.utils.distributed_launcher import launch_distributed_on_slurm
from vissl.utils.hydra_config import convert_to_attrdict
from vissl.utils.io import load_file
from vissl.utils.io import load_file, makedir
from vissl.utils.misc import flatten_dict, retry


"""
This class is designed to be used to run multiple evaluations on a single (pre)training.
Using the #evaluate method we continuously monitor training checkpoints, launch evaluations
dynamically as they become available, and amalgamate the evaluation results as they become
available.
This class is designed to be used to run multiple evaluations on a single (pre)training.
Using the #evaluate method we continuously monitor training checkpoints, launch evaluations
dynamically as they become available, and amalgamate the evaluation results as they become
available.
For SLURM usage, you should create a JSON configuration file (see benchmark_suite_scheduler_template.json)
and use the launch_benchmark_suite_scheduler_slurm.sh for convenience.
For SLURM usage, you should create a JSON configuration file (see benchmark_suite_scheduler_template.json)
and use the launch_benchmark_suite_scheduler_slurm.sh for convenience.
"""

# How many times to retry a slurm job submission.
Expand Down Expand Up @@ -203,8 +203,7 @@ def save_evaluation_benchmarks(self):
evaluation_dir = self.evaluation_dir()
parent_metrics_file = os.path.join(evaluation_dir, "evaluation_metrics.json")

if not PathManager.exists(evaluation_dir):
PathManager.mkdirs(evaluation_dir)
makedir(evaluation_dir)

self._write_json_file(self.evaluation_results, parent_metrics_file)

Expand All @@ -215,8 +214,7 @@ def save_evaluation_benchmarks(self):
child_metrics_dir, "evaluation_metrics.json"
)

if not PathManager.exists(child_metrics_dir):
PathManager.mkdirs(child_metrics_dir)
makedir(child_metrics_dir)

self._write_json_file(benchmarks, child_metrics_file)

Expand Down
15 changes: 15 additions & 0 deletions vissl/utils/misc.py
Expand Up @@ -406,3 +406,18 @@ def flatten_dict(d: dict, parent_key="", sep="_"):
else:
items.append((new_key, v))
return dict(items)


# Credit: https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries
def recursive_dict_merge(dict1, dict2):
    """
    Recursively merges dict2 into dict1
    dict1 is mutated in place and also returned. Wherever both inputs hold a
    dict under the same key, the merge recurses; otherwise the value from
    dict2 wins. If either argument is not a dict, dict2 is returned unchanged.
    """
    # Non-dict leaves terminate the recursion: dict2's value takes precedence.
    if not (isinstance(dict1, dict) and isinstance(dict2, dict)):
        return dict2
    for key, value in dict2.items():
        dict1[key] = (
            recursive_dict_merge(dict1[key], value) if key in dict1 else value
        )
    return dict1

0 comments on commit 63fec7b

Please sign in to comment.