.github/workflows/ci-dockers-pytorch.yml

# Builds the Docker images defined under `dockers/`.
name: Docker

on:
  push:
    branches: [master, "release/*"]
  pull_request:
    branches: [master, "release/*"]
    types: [opened, reopened, ready_for_review, synchronize]  # added `ready_for_review` since draft is skipped
    # Only PRs touching one of these paths trigger the workflow; `!` entries are exclusions.
    paths:
      - ".actions/**"
      # Must match this workflow's own file name so that edits to it are validated on the PR.
      - ".github/workflows/ci-dockers-pytorch.yml"
      - "dockers/**"
      - "requirements/pytorch/**"
      - "requirements/fabric/**"
      - "environment.yml"
      - "setup.py"
      - "!requirements/*/docs.txt"
      - "!*.md"
      - "!**/*.md"
  schedule:
    - cron: "0 0 * * *"  # at the end of every day
  release:
    types: [published]

concurrency:
  # One group per workflow, ref, PR head branch and triggering event.
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.event_name }}
  # Superseded runs are cancelled, except on `master` and `release/*` branches.
  cancel-in-progress: ${{ github.ref != 'refs/heads/master' && !startsWith(github.ref, 'refs/heads/release/') }}

env:
  # 'true' only when the run was triggered by a `release` event.
  PUSH_RELEASE: ${{ github.event_name == 'release' }}
  # 'true' only when the run was triggered by the `schedule` (nightly cron) event.
  PUSH_NIGHTLY: ${{ github.event_name == 'schedule' }}

jobs:
  # Build-only check of `dockers/release/Dockerfile` for each matrix combination.
  build-pl:
    # Skipped for draft pull requests.
    if: github.event.pull_request.draft == false
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        include:
          # We only release one docker image per PyTorch version.
          # The matrix here is the same as the one in release-docker.yml.
          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
          - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.7.1"}
    steps:
      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
      - uses: docker/build-push-action@v3
        with:
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch_version }}
            CUDA_VERSION=${{ matrix.cuda_version }}
          file: dockers/release/Dockerfile
          # Never push from this job: it has no registry login and no `tags`,
          # so a push could only fail. Pushed in release-docker.yml only when PL is released.
          push: false
        timeout-minutes: 50

  # Builds `dockers/base-xla/Dockerfile`; the image is pushed only when PUSH_NIGHTLY is 'true'.
  build-xla:
    runs-on: ubuntu-20.04
    # Skipped for draft pull requests.
    if: github.event.pull_request.draft == false
    strategy:
      fail-fast: false
      matrix:
        # the config used in '.github/workflows/tpu-tests.yml'
        python_version:
          - "3.7"
        xla_version:
          - "1.12"
    steps:
      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
      # Log in to the registry only for runs that push.
      - if: env.PUSH_NIGHTLY == 'true'
        uses: docker/login-action@v2
        with:
          password: ${{ secrets.DOCKER_PASSWORD }}
          username: ${{ secrets.DOCKER_USERNAME }}
      - uses: docker/build-push-action@v3
        timeout-minutes: 60
        with:
          file: dockers/base-xla/Dockerfile
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            XLA_VERSION=${{ matrix.xla_version }}
          tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }}
          push: ${{ env.PUSH_NIGHTLY }}
      # Send a Slack notification when a pushing (nightly) run fails.
      - if: failure() && env.PUSH_NIGHTLY == 'true'
        uses: ravsamhq/notify-slack-action@v2
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          status: ${{ job.status }}
          notification_title: ${{ format('XLA; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.xla_version) }}
          message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>'  # kaushikb11

  # Builds `dockers/base-cuda/Dockerfile`; the image is pushed only when PUSH_NIGHTLY is 'true'.
  build-cuda:
    runs-on: ubuntu-20.04
    # Skipped for draft pull requests.
    if: github.event.pull_request.draft == false
    strategy:
      fail-fast: false
      matrix:
        include:
          # These are the base images for PL release docker images,
          # so include at least all of the combinations in release-docker.yml.
          - python_version: "3.9"
            pytorch_version: "1.10"
            cuda_version: "11.3.1"
          - python_version: "3.9"
            pytorch_version: "1.11"
            cuda_version: "11.3.1"
          - python_version: "3.9"
            pytorch_version: "1.12"
            cuda_version: "11.6.1"
          - python_version: "3.9"
            pytorch_version: "1.13"
            cuda_version: "11.7.1"
    steps:
      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
      # Log in to the registry only for runs that push.
      - if: env.PUSH_NIGHTLY == 'true'
        uses: docker/login-action@v2
        with:
          password: ${{ secrets.DOCKER_PASSWORD }}
          username: ${{ secrets.DOCKER_USERNAME }}
      - uses: docker/build-push-action@v3
        timeout-minutes: 95
        with:
          file: dockers/base-cuda/Dockerfile
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch_version }}
            CUDA_VERSION=${{ matrix.cuda_version }}
          tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
          push: ${{ env.PUSH_NIGHTLY }}
      # Send a Slack notification when a pushing (nightly) run fails.
      - if: failure() && env.PUSH_NIGHTLY == 'true'
        uses: ravsamhq/notify-slack-action@v2
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          status: ${{ job.status }}
          notification_title: ${{ format('CUDA; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }}
          message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01A5T7EY9M>'  # akihironitta

  # Builds the IPU base image and then the IPU CI-runner image;
  # both are pushed only when PUSH_NIGHTLY is 'true'.
  build-ipu:
    runs-on: ubuntu-20.04
    # Skipped for draft pull requests.
    if: github.event.pull_request.draft == false
    strategy:
      fail-fast: false
      matrix:
        include:
          # the config used in 'dockers/ci-runner-ipu/Dockerfile'
          - python_version: "3.9"
            pytorch_version: "1.10"
    steps:
      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
      # Log in to the registry only for runs that push.
      - if: env.PUSH_NIGHTLY == 'true'
        uses: docker/login-action@v2
        with:
          password: ${{ secrets.DOCKER_PASSWORD }}
          username: ${{ secrets.DOCKER_USERNAME }}
      # First image: dockers/base-ipu/Dockerfile
      - uses: docker/build-push-action@v3
        timeout-minutes: 100
        with:
          file: dockers/base-ipu/Dockerfile
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch_version }}
          tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
          push: ${{ env.PUSH_NIGHTLY }}
      # Second image: dockers/ci-runner-ipu/Dockerfile
      - uses: docker/build-push-action@v3
        timeout-minutes: 10
        with:
          file: dockers/ci-runner-ipu/Dockerfile
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch_version }}
          tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }}
          push: ${{ env.PUSH_NIGHTLY }}
      # Send a Slack notification when a pushing (nightly) run fails.
      - if: failure() && env.PUSH_NIGHTLY == 'true'
        uses: ravsamhq/notify-slack-action@v2
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          status: ${{ job.status }}
          notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }}
          message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>'  # kaushikb11

  # Builds `dockers/ci-runner-hpu/Dockerfile`; the image is pushed only when PUSH_NIGHTLY is 'true'.
  build-hpu:
    # Skipped for draft pull requests.
    if: github.event.pull_request.draft == false
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        include:
          # the config used in 'dockers/ci-runner-hpu/Dockerfile'
          - {gaudi_version: "1.5.0", pytorch_version: "1.11.0"}
    steps:
      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
      # Log in to the registry only for runs that push.
      - uses: docker/login-action@v2
        if: env.PUSH_NIGHTLY == 'true'
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
      - uses: docker/build-push-action@v3
        with:
          build-args: |
            DIST=latest
            GAUDI_VERSION=${{ matrix.gaudi_version }}
            PYTORCH_INSTALLER_VERSION=${{ matrix.pytorch_version }}
          file: dockers/ci-runner-hpu/Dockerfile
          push: ${{ env.PUSH_NIGHTLY }}
          tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }}
        timeout-minutes: 10
      # Send a Slack notification when a pushing (nightly) run fails.
      - uses: ravsamhq/notify-slack-action@v2
        if: failure() && env.PUSH_NIGHTLY == 'true'
        with:
          status: ${{ job.status }}
          token: ${{ secrets.GITHUB_TOKEN }}
          # {1} is the Gaudi version (this matrix has no Python version), so label it as such.
          notification_title: ${{ format('HPU; {0} gaudi{1} for *{2}*', runner.os, matrix.gaudi_version, matrix.pytorch_version) }}
          message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U02PV6CL144> <@U0355SJN6HK>'  # arao & Mythravarun N R
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  # Build-only check of `dockers/nvidia/Dockerfile`.
  build-NGC:
    # Skipped for draft pull requests.
    if: github.event.pull_request.draft == false
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Build NGC Docker
        # build check only; this job never pushes the image
        uses: docker/build-push-action@v3
        with:
          file: dockers/nvidia/Dockerfile
          push: false
        timeout-minutes: 55