From b3de180d1bcd7411bb213c744c256d5f02819799 Mon Sep 17 00:00:00 2001 From: Isaac Seessel Date: Tue, 5 Oct 2021 13:43:25 -0400 Subject: [PATCH] Fixes for release --- dev/packaging/apex_conda/README.md | 3 ++ dev/packaging/apex_conda/inside/build.py | 16 ++++++++- dev/packaging/apex_pip/README.md | 4 ++- dev/packaging/apex_pip/inside/a.sh | 4 +++ dev/packaging/vissl_conda/build_all_conda.sh | 5 +++ dev/packaging/vissl_conda/vissl/meta.yaml | 4 +-- dev/run_quick_tests.sh | 34 +------------------- tests/test_tasks.py | 5 +++ vissl/utils/test_utils.py | 6 ++-- 9 files changed, 42 insertions(+), 39 deletions(-) mode change 100755 => 100644 dev/run_quick_tests.sh diff --git a/dev/packaging/apex_conda/README.md b/dev/packaging/apex_conda/README.md index 8a85f1471..74320b5a6 100644 --- a/dev/packaging/apex_conda/README.md +++ b/dev/packaging/apex_conda/README.md @@ -11,6 +11,9 @@ nothing to do here. 2. Go into the `inside` directory and clone apex with `git clone https://github.com/NVIDIA/apex.git`. +Then, inside the cloned `apex` directory, check out the pinned commit: +`git checkout 1f2aa9156547377a023932a1512752c392d9bbdf` + 3. You may want to `docker pull pytorch/conda-cuda:latest`. 4. Run `bash go.sh` in this directory.
This takes ages diff --git a/dev/packaging/apex_conda/inside/build.py b/dev/packaging/apex_conda/inside/build.py index f7b6e41c3..2d8e19a33 100644 --- a/dev/packaging/apex_conda/inside/build.py +++ b/dev/packaging/apex_conda/inside/build.py @@ -24,6 +24,9 @@ "1.7.0": ["cu101", "cu102", "cu110"], "1.7.1": ["cu101", "cu102", "cu110"], "1.8.0": ["cu101", "cu102", "cu111"], + "1.8.1": ["cu101", "cu102", "cu111"], + "1.9.0": ["cu102", "cu111"], + "1.9.1": ["cu102", "cu111"], } CUDA_HOMES = { @@ -72,6 +75,15 @@ def pytorch_versions_for_python(python_version): print() print("python", python_version, "pytorch", ptv, "cuda", cuv, flush=True) + + apex_file_name = ( + f"apex-{VERSION}-py{python_version_nodot}_{cuv}_pyt{ptv_nodot}.tar.bz2" + ) + + if os.path.exists(f"./inside/packaging/{apex_file_name}"): + print(f"Package: {apex_file_name} already found") + continue + args = [ "conda", "build", @@ -87,8 +99,10 @@ def pytorch_versions_for_python(python_version): if python_version == "3.9" or cuv == "cu111": args.insert(4, "conda-forge") args.insert(4, "-c") + subprocess.check_call(args) - file = f"/opt/conda/conda-bld/linux-64/apex-{VERSION}-py{python_version_nodot}_{cuv}_pyt{ptv_nodot}.tar.bz2" + + file = f"/opt/conda/conda-bld/linux-64/{apex_file_name}" shutil.copy(file, "inside/packaging") print("DONE") diff --git a/dev/packaging/apex_pip/README.md b/dev/packaging/apex_pip/README.md index d5a131552..0f6070642 100644 --- a/dev/packaging/apex_pip/README.md +++ b/dev/packaging/apex_pip/README.md @@ -11,6 +11,9 @@ nothing to do here. 2. Go into the `inside` directory and clone apex with `git clone https://github.com/NVIDIA/apex.git`. +Then, inside the cloned `apex` directory, check out the pinned commit: +`git checkout 1f2aa9156547377a023932a1512752c392d9bbdf` + 3. You may want to `docker pull pytorch/conda-cuda:latest`. 4. Run `bash go.sh` in this directory. This takes ages @@ -19,7 +22,6 @@ and writes packages to `inside/output`. 5.
You can upload the packages to s3, along with basic html files which enable them to be used, with `bash after.sh`. - In particular, if you are in a jupyter/colab notebook you can then install using these wheels with the following series of commands. diff --git a/dev/packaging/apex_pip/inside/a.sh b/dev/packaging/apex_pip/inside/a.sh index 53c0cbfa1..89780864f 100644 --- a/dev/packaging/apex_pip/inside/a.sh +++ b/dev/packaging/apex_pip/inside/a.sh @@ -34,6 +34,9 @@ declare -A CONDA_CUDA_VERSIONS=( ["1.7.0"]="cu101 cu102 cu110" ["1.7.1"]="cu101 cu102 cu110" ["1.8.0"]="cu101 cu102 cu111" + ["1.8.1"]="cu101 cu102 cu111" + ["1.9.0"]="cu102 cu111" + ["1.9.1"]="cu102 cu111" ) #VERSION=$(python -c "exec(open('${script_dir}/apex/__init__.py').read()); print(__version__)") @@ -85,6 +88,7 @@ do outdir="../output/$tag" if [[ -d "$outdir" ]] then + echo "skipping" "$outdir" continue fi diff --git a/dev/packaging/vissl_conda/build_all_conda.sh b/dev/packaging/vissl_conda/build_all_conda.sh index 3334a93d5..cdf9c9b6e 100644 --- a/dev/packaging/vissl_conda/build_all_conda.sh +++ b/dev/packaging/vissl_conda/build_all_conda.sh @@ -8,8 +8,13 @@ set -ex rm -rf dev/packaging/vissl_conda/ClassyVision git clone https://github.com/facebookresearch/ClassyVision.git dev/packaging/vissl_conda/ClassyVision + rm -rf dev/packaging/vissl_conda/fairscale git clone https://github.com/facebookresearch/fairscale.git dev/packaging/vissl_conda/fairscale +cd dev/packaging/vissl_conda/fairscale +git reset --hard df7db85cef7f9c30a5b821007754b96eb1f977b6 +cd ../../../../ + rm -rf classy_vision cp -r dev/packaging/vissl_conda/ClassyVision/classy_vision classy_vision rm -rf fairscale diff --git a/dev/packaging/vissl_conda/vissl/meta.yaml b/dev/packaging/vissl_conda/vissl/meta.yaml index fc0ed7fe3..b22c87eb8 100644 --- a/dev/packaging/vissl_conda/vissl/meta.yaml +++ b/dev/packaging/vissl_conda/vissl/meta.yaml @@ -38,7 +38,7 @@ test: source_files: - tests - tools - - dev/run_quick_tests.sh + - 
dev/run_quick_integration_tests.sh - configs requires: - ca-certificates @@ -47,7 +47,7 @@ test: - tensorboard commands: - python -m unittest discover -v -s tests - - ./dev/run_quick_tests.sh + - ./dev/run_quick_integration_tests.sh build: string: py{{py}} diff --git a/dev/run_quick_tests.sh b/dev/run_quick_tests.sh old mode 100755 new mode 100644 index 1f98f3236..76fe825b1 --- a/dev/run_quick_tests.sh +++ b/dev/run_quick_tests.sh @@ -43,36 +43,4 @@ popd # - verify that the associated jobs run to the end # ----------------------------------------------------------------------------- -CFG_LIST=( - "test/integration_test/quick_barlow_twins" - "test/integration_test/quick_deepcluster_v2" - "test/integration_test/quick_pirl" - "test/integration_test/quick_simclr" - "test/integration_test/quick_simclr_efficientnet" - "test/integration_test/quick_simclr_multicrop" - "test/integration_test/quick_simclr_regnet" - "test/integration_test/quick_swav" -) - -echo "========================================================================" -echo "Configs to run:" -echo "${CFG_LIST[@]}" -echo "========================================================================" - -BINARY="python ${SRC_DIR}/tools/run_distributed_engines.py" - -for cfg in "${CFG_LIST[@]}"; do - echo "========================================================================" - echo "Running $cfg ..." 
- echo "========================================================================" - CHECKPOINT_DIR=$(mktemp -d) - # shellcheck disable=SC2102 - # shellcheck disable=SC2086 - CUDA_LAUNCH_BLOCKING=1 $BINARY config=$cfg \ - config.DATA.TRAIN.DATA_SOURCES=[synthetic] \ - hydra.verbose=true \ - config.HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD=true \ - config.CHECKPOINT.DIR="$CHECKPOINT_DIR" && echo "TEST OK" || exit - - rm -rf $CHECKPOINT_DIR -done +bash "${SRC_DIR}/dev/run_quick_integration_tests.sh" diff --git a/tests/test_tasks.py b/tests/test_tasks.py index 5464b6c27..f03cd40ab 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -7,6 +7,7 @@ import unittest import pkg_resources +import torch from parameterized import parameterized from utils import UNIT_TEST_CONFIGS, SSLHydraConfig from vissl.engines.train import train_main @@ -38,6 +39,10 @@ def test_run(self, config_file_path: str): pkg_resources.resource_filename(__name__, "test_data") ] + # Destroy process groups as torch may be initialized with NCCL, which + # is incompatible with test_cpu_regnet_moco.yaml + torch.distributed.destroy_process_group() + # run training and make sure no exception is raised dist_run_id = get_dist_run_id(config, config.DISTRIBUTED.NUM_NODES) train_main( diff --git a/vissl/utils/test_utils.py b/vissl/utils/test_utils.py index d41b3655d..8e6ea4f30 100644 --- a/vissl/utils/test_utils.py +++ b/vissl/utils/test_utils.py @@ -29,8 +29,10 @@ def in_temporary_directory(enabled: bool = True): old_cwd = os.getcwd() with tempfile.TemporaryDirectory() as temp_dir: os.chdir(temp_dir) - yield temp_dir - os.chdir(old_cwd) + try: + yield temp_dir + finally: + os.chdir(old_cwd) else: yield os.getcwd()