New CI Runners (huggingface#2087)

* Try merge tests
* Fix
* Checkout branch
* Fix pip install
* rebase
* Colons
* right one
* use master
* Rm
* Add needs
* Better clean
* always
* Forgot other
* test on AWS
* update all labels
* fix multi-gpu working directory
* limit to 2 GPU
* force run on kube
* move build docker image to new ci
* test build on CPU instance
* move build docker image release to new ci
* move scheduled slow tests to new ci
* move integration test to new ci
* Comments
* Right CPU tags
* Right machines
* PR comments

---------

Co-authored-by: Guillaume LEGENDRE <glegendre01@gmail.com>
graemenail · Nov 20, 2023 · ca300c0 · ca300c0
1 parent 427ef8b
commit ca300c0
Show file tree

Hide file tree

Showing 5 changed files with 75 additions and 70 deletions.
diff --git a/.github/workflows/build-docker-images-release.yml b/.github/workflows/build-docker-images-release.yml
@@ -21,7 +21,7 @@ jobs:
 
   version-cpu:
     name: "Latest Accelerate CPU [version]"
-    runs-on: [self-hosted, docker-gpu, multi-gpu, gcp]
+    runs-on: [self-hosted, intel-cpu, 8-cpu, ci]
     needs: get-version
     steps:
       - name: Set up Docker Buildx
@@ -41,7 +41,7 @@ jobs:
 
   version-cuda:
     name: "Latest Accelerate GPU [version]"
-    runs-on: [self-hosted, docker-gpu, multi-gpu, gcp]
+    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci]
     needs: get-version
     steps:
       - name: Set up Docker Buildx

diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml
@@ -11,19 +11,9 @@ concurrency:
   cancel-in-progress: false
 
 jobs:
-  clean-storage:
-    name: "Clean docker image storage"
-    runs-on: [self-hosted, docker-gpu, multi-gpu, gcp]
-    steps:
-      - name: Clean storage
-        run: |
-          docker image prune --all -f --filter "until=48h"
-          docker system prune --all -f --filter "until=48h"
-
   latest-cpu:
     name: "Latest Accelerate CPU [dev]"
-    runs-on: [self-hosted, docker-gpu, multi-gpu, gcp]
-    needs: clean-storage
+    runs-on: [self-hosted, intel-cpu, 8-cpu, ci]
     steps:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
@@ -41,8 +31,7 @@ jobs:
 
   latest-cuda:
     name: "Latest Accelerate GPU [dev]"
-    runs-on: [self-hosted, docker-gpu, multi-gpu, gcp]
-    needs: clean-storage
+    runs-on: [self-hosted, nvidia-gpu, t4, daily-ci]
     steps:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
@@ -13,7 +13,7 @@ env:
 
 jobs:
   run_all_tests_single_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu, gcp]
+    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, daily-ci]
     env:
       CUDA_VISIBLE_DEVICES: "0"
       TEST_TYPE: "single_gpu"
@@ -22,37 +22,40 @@ jobs:
       options: --gpus all --shm-size "16gb"
     defaults:
       run:
-        working-directory: accelerate/
         shell: bash
     steps:
       - name: Update clone & pip install
         run: |
           source activate accelerate
-          git config --global --add safe.directory '*'
-          git fetch && git checkout ${{ github.sha }} 
+          git clone https://github.com/huggingface/accelerate;
+          cd accelerate;
+          git checkout ${{ github.sha }};
           pip install -e . --no-deps
           pip install pytest-reportlog tabulate
 
       - name: Run test on GPUs
+        working-directory: accelerate
         run: |
           source activate accelerate
           make test
           
       - name: Run examples on GPUs
+        working-directory: accelerate
         if: always()
         run: |
           source activate accelerate
           pip uninstall comet_ml -y
           make test_examples
           
       - name: Generate Report
+        working-directory: accelerate
         if: always()
         run: |
           pip install slack_sdk tabulate
           python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
 
   run_all_tests_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu, gcp]
+    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, daily-ci]
     env:
       CUDA_VISIBLE_DEVICES: "0,1"
       TEST_TYPE: "multi_gpu"
@@ -61,38 +64,42 @@ jobs:
       options: --gpus all --shm-size "16gb"
     defaults:
       run:
-        working-directory: accelerate/
         shell: bash
     steps:
       - name: Update clone
         run: |
           source activate accelerate
-          git config --global --add safe.directory '*'
-          git fetch && git checkout ${{ github.sha }}
+          git clone https://github.com/huggingface/accelerate;
+          cd accelerate;
+          git checkout ${{ github.sha }};
           pip install -e . --no-deps
           pip install pytest-reportlog tabulate
 
       - name: Run core and big modeling tests on GPUs
+        working-directory: accelerate
         run: |
           source activate accelerate
           make test_core
           make test_big_modeling
           make test_cli
 
       - name: Run Integration tests on GPUs
+        working-directory: accelerate
         if: always()
         run: |
           source activate accelerate
           make test_integrations
 
       - name: Run examples on GPUs
+        working-directory: accelerate
         if: always()
         run: |
           source activate accelerate
           pip uninstall comet_ml -y
           make test_examples
 
       - name: Generate Report
+        working-directory: accelerate
         if: always()
         run: |
           pip install slack_sdk tabulate

diff --git a/.github/workflows/run_merge_tests.yml b/.github/workflows/run_merge_tests.yml
@@ -10,80 +10,89 @@ env:
 
 jobs:
   run_all_tests_single_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu, gcp]
+    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, push-ci]
     env:
       CUDA_VISIBLE_DEVICES: "0"
     container:
       image: huggingface/accelerate-gpu:latest
       options: --gpus all --shm-size "16gb"
     defaults:
       run:
-        working-directory: accelerate/
         shell: bash
     steps:
-      - name: Update clone & pip install
+      - name: Install accelerate
         run: |
-          source activate accelerate
-          git config --global --add safe.directory '*'
-          git fetch && git checkout ${{ github.sha }}
-          pip install -e .[testing,test_trackers] -U
-          pip install pytest-reportlog tabulate
+          source activate accelerate;
+          git clone https://github.com/huggingface/accelerate;
+          cd accelerate;
+          git checkout ${{ github.sha }};
+          pip install -e .[testing,test_trackers] -U;
+          pip install pytest-reportlog tabulate  ;
 
-      - name: Run CLI tests
+      - name: Run CLI tests (use make cli)
+        working-directory: accelerate
         run: |
-          source activate accelerate
+          source activate accelerate;
           make test_cli
           
       - name: Run test on GPUs
+        working-directory: accelerate
         if: always()
         run: |
-          source activate accelerate
+          source activate accelerate;
           make test
       - name: Run examples on GPUs
+        working-directory: accelerate
         if: always()
         run: |
-          source activate accelerate
-          pip uninstall comet_ml -y
+          source activate accelerate;
+          pip uninstall comet_ml -y;
           make test_examples
 
       - name: Generate Report
+        working-directory: accelerate
         if: always()
         run: |
-          pip install tabulate
+          pip install tabulate;
           python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
 
   run_all_tests_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu, gcp]
+    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci]
+    env:
+      CUDA_VISIBLE_DEVICES: "0,1"  # quoted: a list of GPU indices, must stay a string
     container:
       image: huggingface/accelerate-gpu:latest
       options: --gpus all --shm-size "16gb"
     defaults:
       run:
-        working-directory: accelerate/
         shell: bash
     steps:
       - name: Update clone
         run: |
-          source activate accelerate
-          git config --global --add safe.directory '*'
-          git fetch && git checkout ${{ github.sha }}
-          pip install -e .[testing,test_trackers] -U
+          source activate accelerate;
+          git clone https://github.com/huggingface/accelerate;
+          cd accelerate;
+          git checkout ${{ github.sha }};
+          pip install -e .[testing,test_trackers] -U;
           pip install pytest-reportlog tabulate
 
       - name: Run test on GPUs
+        working-directory: accelerate
         run: |
-          source activate accelerate
+          source activate accelerate;
           make test
 
       - name: Run examples on GPUs
+        working-directory: accelerate
         if: always()
         run: |
-          source activate accelerate
-          pip uninstall comet_ml -y
+          source activate accelerate;
+          pip uninstall comet_ml -y;
           make test_examples
 
       - name: Generate Report
+        working-directory: accelerate
         if: always()
         run: |
-          pip install tabulate
-          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+          source activate accelerate;
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/self_hosted_integration_tests.yml b/.github/workflows/self_hosted_integration_tests.yml
@@ -25,7 +25,7 @@ jobs:
     container:
       image: huggingface/accelerate-gpu:latest
       options: --gpus all --shm-size "16gb"
-    runs-on: [self-hosted, docker-gpu, multi-gpu, gcp]
+    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci]
     strategy:
       fail-fast: false
       matrix:
@@ -34,22 +34,22 @@ jobs:
           "0,1"
         ]
     steps:
-      - name: Update accelerate clone and pip install
-        working-directory: accelerate/
-        run: 
+      - name: Install transformers
+        run: |
           source activate accelerate;
-          git config --global --add safe.directory '*';
-          git checkout main && git fetch && git checkout ${{ github.sha }};
-          pip install -e .;
+          git clone https://github.com/huggingface/transformers --depth 1;
+          cd transformers;
+          pip install .[torch,deepspeed-testing];
+          cd ..;
 
-      - name: Update transformers clone & pip install
-        working-directory: transformers/
+      - name: Install accelerate
         run: |
-          source activate accelerate
-          git config --global --add safe.directory '*'
-          git checkout main && git pull
-          pip install .[torch,deepspeed-testing]
-          pip uninstall comet_ml wandb -y
+          source activate accelerate;
+          git clone https://github.com/huggingface/accelerate;
+          cd accelerate;
+          git checkout ${{ github.sha }} ;
+          pip install -e .[testing];
+          cd ..;
       
       - name: Show installed libraries
         run: |
@@ -89,20 +89,20 @@ jobs:
     container:
       image: huggingface/accelerate-gpu:latest
       options: --gpus all --shm-size "16gb"
-    runs-on: [self-hosted, docker-gpu, multi-gpu, gcp]
+    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, push-ci]
     strategy:
       fail-fast: false
     steps:
-      - name: Update accelerate clone and pip install
-        working-directory: accelerate/
-        run: 
+      - name: Install accelerate
+        run: |  # literal block scalar: keeps one shell command per line
           source activate accelerate;
-          git config --global --add safe.directory '*';
-          git checkout main && git fetch && git checkout ${{ github.sha }};
-          pip install -e .;
+          git clone https://github.com/huggingface/accelerate;
+          cd accelerate;
+          git checkout ${{ github.sha }};
+          pip install -e .[testing];
+          cd ..
 
-      - name: Update skorch clone & pip install
-        working-directory: skorch/
+      - name: Install skorch
         run: |
           source activate accelerate
           git config --global --add safe.directory '*'