From ca300c0a04f843da2c5c8559e7d728926f7e8bf2 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 Nov 2023 11:41:57 -0500 Subject: [PATCH] New CI Runners (#2087) * Try merge tests * Fix * Checkout branch * Fix pip install * rebase * Colons * right one * use master * Rm * Add needs * Better clean * always * Forgot other * test on AWS * update all labels * fix multi-gpu working directory * limit to 2 GPU * force run on kube * move build docker image to new ci * test build on CPU instance * move build docker image release to new ci * move scheduled slow tests to new ci * move integration test to new ci * Comments * Right CPU tags * Right machines * PR comments --------- Co-authored-by: Guillaume LEGENDRE --- .../workflows/build-docker-images-release.yml | 4 +- .github/workflows/build_docker_images.yml | 15 +---- .github/workflows/nightly.yml | 23 +++++--- .github/workflows/run_merge_tests.yml | 59 +++++++++++-------- .../self_hosted_integration_tests.yml | 44 +++++++------- 5 files changed, 75 insertions(+), 70 deletions(-) diff --git a/.github/workflows/build-docker-images-release.yml b/.github/workflows/build-docker-images-release.yml index 8d5f2194b78..efb6a95da6c 100644 --- a/.github/workflows/build-docker-images-release.yml +++ b/.github/workflows/build-docker-images-release.yml @@ -21,7 +21,7 @@ jobs: version-cpu: name: "Latest Accelerate CPU [version]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, intel-cpu, 8-cpu, ci] needs: get-version steps: - name: Set up Docker Buildx @@ -41,7 +41,7 @@ jobs: version-cuda: name: "Latest Accelerate GPU [version]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] needs: get-version steps: - name: Set up Docker Buildx diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 59f3e4dda61..75b9fb9eefe 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -11,19 +11,9 @@ concurrency: cancel-in-progress: false jobs: - clean-storage: - name: "Clean docker image storage" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - steps: - - name: Clean storage - run: | - docker image prune --all -f --filter "until=48h" - docker system prune --all -f --filter "until=48h" - latest-cpu: name: "Latest Accelerate CPU [dev]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - needs: clean-storage + runs-on: [self-hosted, intel-cpu, 8-cpu, ci] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -41,8 +31,7 @@ jobs: latest-cuda: name: "Latest Accelerate GPU [dev]" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] - needs: clean-storage + runs-on: [self-hosted, nvidia-gpu, t4, daily-ci] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 09e64e060e3..a06cae176c7 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -13,7 +13,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci] env: CUDA_VISIBLE_DEVICES: "0" TEST_TYPE: "single_gpu" @@ -22,23 +22,25 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone & pip install run: | source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run test on GPUs + working-directory: accelerate run: | source activate accelerate make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | source activate accelerate @@ -46,13 +48,14 @@ jobs: make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | pip install slack_sdk tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, daily-ci] env: CUDA_VISIBLE_DEVICES: "0,1" TEST_TYPE: "multi_gpu" @@ -61,18 +64,19 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone run: | source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; pip install -e . --no-deps pip install pytest-reportlog tabulate - name: Run core and big modeling tests on GPUs + working-directory: accelerate run: | source activate accelerate make test_core @@ -80,12 +84,14 @@ jobs: make test_cli - name: Run Integration tests on GPUs + working-directory: accelerate if: always() run: | source activate accelerate make test_integrations - name: Run examples on GPUs + working-directory: accelerate if: always() run: | source activate accelerate @@ -93,6 +99,7 @@ jobs: make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | pip install slack_sdk tabulate diff --git a/.github/workflows/run_merge_tests.yml b/.github/workflows/run_merge_tests.yml index 7dacab8c508..42bf11e7516 100644 --- a/.github/workflows/run_merge_tests.yml +++ b/.github/workflows/run_merge_tests.yml @@ -10,7 +10,7 @@ env: jobs: run_all_tests_single_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, push-ci] env: CUDA_VISIBLE_DEVICES: "0" container: @@ -18,72 +18,81 @@ jobs: options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - - name: Update clone & pip install + - name: Install accelerate run: | - source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} - pip install -e .[testing,test_trackers] -U - pip install pytest-reportlog tabulate + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing,test_trackers] -U; + pip install pytest-reportlog tabulate ; - - name: Run CLI tests + - name: Run CLI tests (use make cli) + working-directory: accelerate run: | - source activate accelerate + source activate accelerate; make test_cli - name: Run test on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate + source activate accelerate; make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate - pip uninstall comet_ml -y + source activate accelerate; + pip uninstall comet_ml -y; make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | - pip install tabulate + pip install tabulate; python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] + env: + CUDA_VISIBLE_DEVICES: 0,1 container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" defaults: run: - working-directory: accelerate/ shell: bash steps: - name: Update clone run: | - source activate accelerate - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} - pip install -e .[testing,test_trackers] -U + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing,test_trackers] -U; pip install pytest-reportlog tabulate - name: Run test on GPUs + working-directory: accelerate run: | - source activate accelerate + source activate accelerate; make test - name: Run examples on GPUs + working-directory: accelerate if: always() run: | - source activate accelerate - pip uninstall comet_ml -y + source activate accelerate; + pip uninstall comet_ml -y; make test_examples - name: Generate Report + working-directory: accelerate if: always() run: | - pip install tabulate - python utils/log_reports.py >> $GITHUB_STEP_SUMMARY \ No newline at end of file + source activate accelerate; + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml index 94e50e61ff3..cd82295e4e2 100644 --- a/.github/workflows/self_hosted_integration_tests.yml +++ b/.github/workflows/self_hosted_integration_tests.yml @@ -25,7 +25,7 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] strategy: fail-fast: false matrix: @@ -34,22 +34,22 @@ jobs: "0,1" ] steps: - - name: Update accelerate clone and pip install - working-directory: accelerate/ - run: + - name: Install transformers + run: | source activate accelerate; - git config --global --add safe.directory '*'; - git checkout main && git fetch && git checkout ${{ github.sha }}; - pip install -e .; + git clone https://github.com/huggingface/transformers --depth 1; + cd transformers; + pip install .[torch,deepspeed-testing]; + cd ..; - - name: Update transformers clone & pip install - working-directory: transformers/ + - name: Install accelerate run: | - source activate accelerate - git config --global --add safe.directory '*' - git checkout main && git pull - pip install .[torch,deepspeed-testing] - pip uninstall comet_ml wandb -y + source activate accelerate; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }} ; + pip install -e .[testing]; + cd ..; - name: Show installed libraries run: | @@ -89,20 +89,20 @@ jobs: container: image: huggingface/accelerate-gpu:latest options: --gpus all --shm-size "16gb" - runs-on: [self-hosted, docker-gpu, multi-gpu, gcp] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci] strategy: fail-fast: false steps: - - name: Update accelerate clone and pip install - working-directory: accelerate/ + - name: Install accelerate run: source activate accelerate; - git config --global --add safe.directory '*'; - git checkout main && git fetch && git checkout ${{ github.sha }}; - pip install -e .; + git clone https://github.com/huggingface/accelerate; + cd accelerate; + git checkout ${{ github.sha }}; + pip install -e .[testing]; + cd .. - - name: Update skorch clone & pip install - working-directory: skorch/ + - name: Install skorch run: | source activate accelerate git config --global --add safe.directory '*'