diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index a42788984..e97c1b60b 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -2,7 +2,7 @@ ARG UBUNTU_VERSION # Build stage -FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder +FROM nvidia/cuda:12.8.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder ENV NCCL_HOME=/opt/nccl ENV CUDA_HOME=/usr/local/cuda diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index ae76f3012..c585c4dad 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION -FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04 +FROM nvidia/cuda:12.8.1-base-ubuntu${UBUNTU_VERSION}.04 ARG _UV_HOME="/opt/uv" diff --git a/scripts/packer/azure-image-grid.json b/scripts/packer/azure-image-grid.json index c2c31c02e..863bfc769 100644 --- a/scripts/packer/azure-image-grid.json +++ b/scripts/packer/azure-image-grid.json @@ -63,6 +63,15 @@ "./install-docker.sh --version {{user `docker_version`}}" ] }, + { + "type": "shell", + "script": "provisioners/downgrade-azure-kernel.sh" + }, + { + "type": "shell", + "inline": ["sudo reboot"], + "expect_disconnect": true + }, { "type": "shell", "script": "provisioners/install-nvidia-grid-driver-for-azure.sh" diff --git a/scripts/packer/provisioners/downgrade-azure-kernel.sh b/scripts/packer/provisioners/downgrade-azure-kernel.sh new file mode 100755 index 000000000..9c1d278b2 --- /dev/null +++ b/scripts/packer/provisioners/downgrade-azure-kernel.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# based on https://learn.microsoft.com/en-us/azure/virtual-machines/extensions/hpccompute-gpu-linux#known-issues +# this is a temporary solution only required until the issue is fixed + +set -e + +# Install 6.8 kernel +sudo apt-get update +sudo DEBIAN_FRONTEND=noninteractive apt install linux-image-6.8.0-1015-azure linux-headers-6.8.0-1015-azure -y + +# Update the Grub entry name +grub_entry_name="$(sudo grep -Po "menuentry '\KUbuntu, with Linux 6\.8[^(']+" /boot/grub/grub.cfg | sort -V | head -1)" +sudo sed -i "s/^\s*GRUB_DEFAULT=.*$/GRUB_DEFAULT='Advanced options for Ubuntu>$grub_entry_name'/" /etc/default/grub +sudo update-grub + +# Disable the kernel package upgrade +sudo apt-mark hold $(dpkg --get-selections | grep -Po "^linux[^\t]+${grub_entry_name##* }") diff --git a/src/dstack/_internal/core/backends/vastai/compute.py b/src/dstack/_internal/core/backends/vastai/compute.py index 86391cc09..ec853b69e 100644 --- a/src/dstack/_internal/core/backends/vastai/compute.py +++ b/src/dstack/_internal/core/backends/vastai/compute.py @@ -47,7 +47,7 @@ def __init__(self, config: VastAIConfig): "reliability2": {"gte": 0.9}, "inet_down": {"gt": 128}, "verified": {"eq": True}, - "cuda_max_good": {"gte": 12.1}, + "cuda_max_good": {"gte": 12.8}, "compute_cap": {"gte": 600}, } ) diff --git a/src/dstack/version.py b/src/dstack/version.py index cc54f9a7d..a0bac024f 100644 --- a/src/dstack/version.py +++ b/src/dstack/version.py @@ -5,5 +5,5 @@ __version__ = "0.0.0" __is_release__ = False -base_image = "0.11rc2" +base_image = "0.11" base_image_ubuntu_version = "22.04" diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 45a2479e0..be868f1e5 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -333,7 +333,7 @@ def get_dev_env_run_dict( " && tail -f /dev/null" ), ] - image_name = "dstackai/base:0.11rc2-base-ubuntu22.04" + image_name = "dstackai/base:0.11-base-ubuntu22.04" return { "id": run_id,