Skip to content

Commit

Permalink
Merge pull request #6808 from kmaehashi/bp-6804-v10-fix-head-nccl
Browse files Browse the repository at this point in the history
[backport] Fix CI Docker image build failing in head test
  • Loading branch information
kmaehashi committed Jun 24, 2022
2 parents e26c73f + a7e5d2c commit 024ee91
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 22 deletions.
8 changes: 4 additions & 4 deletions .pfnci/coverage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -787,9 +787,9 @@ CuPy CI Test Coverage
- ✅
-
-
- ✅
-
-
- 6
- 7
* -
- 2.8
- ✅
Expand Down Expand Up @@ -903,9 +903,9 @@ CuPy CI Test Coverage
-
-
-
-
- ✅
- ✅
- 6
- 5
* - cutensor
- null
- ✅
Expand Down
16 changes: 1 addition & 15 deletions .pfnci/linux/main-flexci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,7 @@ if [[ "${FLEXCI_BRANCH:-}" == refs/pull/* ]]; then
fi

# TODO(kmaehashi): Hack for CUDA 11.6+ until FlexCI base image update
if [[ "${TARGET}" == cuda116* || "${TARGET}" == cuda117* ]]; then
if dpkg -s cuda-drivers-495; then
killall Xorg
nvidia-smi -pm 0

apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
apt-get purge -qqy "cuda-drivers*" "*nvidia*-495"
apt-get install -qqy "cuda-drivers"

modprobe -r nvidia_drm nvidia_uvm nvidia_modeset nvidia
nvidia-smi -pm 1
nvidia-smi
fi
fi
.pfnci/linux/update-cuda-driver.sh

gcloud auth configure-docker

Expand Down
2 changes: 1 addition & 1 deletion .pfnci/linux/tests/cuda-head.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
liblzma-dev && \
apt-get -qqy install ccache git curl && \
apt-get -qqy --allow-change-held-packages \
--allow-downgrades install 'libnccl2=2.11.*+cuda11.7' 'libnccl-dev=2.11.*+cuda11.7' 'libcutensor1=1.5.*' 'libcutensor-dev=1.5.*' 'libcusparselt0=0.2.0.*' 'libcusparselt-dev=0.2.0.*' 'libcudnn8=8.4.*+cuda11.6' 'libcudnn8-dev=8.4.*+cuda11.6'
--allow-downgrades install 'libcutensor1=1.5.*' 'libcutensor-dev=1.5.*' 'libcusparselt0=0.2.0.*' 'libcusparselt-dev=0.2.0.*' 'libcudnn8=8.4.*+cuda11.6' 'libcudnn8-dev=8.4.*+cuda11.6'

ENV PATH "/usr/lib/ccache:${PATH}"

Expand Down
17 changes: 17 additions & 0 deletions .pfnci/linux/update-cuda-driver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
#
# Replace the NVIDIA 495-series driver on the CI host with the `cuda-drivers`
# package from NVIDIA's Ubuntu 20.04 CUDA apt repository.
#
# The script is a no-op (exit status 0) unless BOTH of the following hold:
#   * the `cuda-drivers-495` package is installed, and
#   * /dev/nvidiactl exists (i.e. an NVIDIA device node is present).
#
# Usage: update-cuda-driver.sh   (takes no arguments; presumably needs root,
#                                 since it calls apt-get and modprobe)

set -euo pipefail

# `[[ -e ... ]]` is used instead of `ls` to test for the device node: it does
# not spawn a process and does not emit "No such file" noise on GPU-less hosts.
if dpkg -s cuda-drivers-495 && [[ -e /dev/nvidiactl ]]; then
  # Stop users of the currently loaded driver before touching it.
  # NOTE(review): under `set -e` this aborts the script if no Xorg process is
  # running (killall exits non-zero when nothing matched) -- confirm that the
  # base image always runs Xorg, or make this step best-effort.
  killall Xorg
  nvidia-smi -pm 0

  # Register NVIDIA's CUDA repository and swap the driver packages.
  apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
  add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
  apt-get purge -qqy "cuda-drivers*" "*nvidia*-495"
  apt-get install -qqy "cuda-drivers"

  # Unload the old kernel modules (presumably so the next nvidia-smi call
  # loads the newly installed ones), then restore the `-pm` setting.
  modprobe -r nvidia_drm nvidia_uvm nvidia_modeset nvidia
  nvidia-smi -pm 1
  # Final plain invocation prints the driver/GPU status to the CI log.
  nvidia-smi
fi
2 changes: 1 addition & 1 deletion .pfnci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@
os: "ubuntu:20.04"
cuda: "11.7"
rocm: null
nccl: "2.11"
nccl: null # TODO use 2.12 once supported
cutensor: "1.5"
cusparselt: "0.2.0"
cudnn: "8.4"
Expand Down
1 change: 0 additions & 1 deletion .pfnci/schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ nccl:
"11.4":
"11.5":
"11.6":
"11.7":
cutensor:
# https://docs.nvidia.com/cuda/cutensor/
# https://developer.nvidia.com/cutensor/downloads
Expand Down

0 comments on commit 024ee91

Please sign in to comment.