From e0f7058d74a2ac1d56986b863d5210f34d5c495d Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Wed, 8 Oct 2025 11:03:44 +0200 Subject: [PATCH 1/5] [Docker] Update the CUDA version in the default Docker image to 12.8 (from 12.1) #3163 --- docker/base/Dockerfile | 2 +- docker/base/Dockerfile.common | 2 +- src/dstack/_internal/core/backends/vastai/compute.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index a42788984..e97c1b60b 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -2,7 +2,7 @@ ARG UBUNTU_VERSION # Build stage -FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder +FROM nvidia/cuda:12.8.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder ENV NCCL_HOME=/opt/nccl ENV CUDA_HOME=/usr/local/cuda diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index ae76f3012..c585c4dad 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION -FROM nvidia/cuda:12.1.1-base-ubuntu${UBUNTU_VERSION}.04 +FROM nvidia/cuda:12.8.1-base-ubuntu${UBUNTU_VERSION}.04 ARG _UV_HOME="/opt/uv" diff --git a/src/dstack/_internal/core/backends/vastai/compute.py b/src/dstack/_internal/core/backends/vastai/compute.py index 86391cc09..ec853b69e 100644 --- a/src/dstack/_internal/core/backends/vastai/compute.py +++ b/src/dstack/_internal/core/backends/vastai/compute.py @@ -47,7 +47,7 @@ def __init__(self, config: VastAIConfig): "reliability2": {"gte": 0.9}, "inet_down": {"gt": 128}, "verified": {"eq": True}, - "cuda_max_good": {"gte": 12.1}, + "cuda_max_good": {"gte": 12.8}, "compute_cap": {"gte": 600}, } ) From e3b6571c2916ce2a03e20902a94e3343ec689778 Mon Sep 17 00:00:00 2001 From: Jvst Me Date: Thu, 9 Oct 2025 11:07:26 +0200 Subject: [PATCH 2/5] Pin base image version for Azure GRID image --- scripts/packer/azure-image-grid.json | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/packer/azure-image-grid.json b/scripts/packer/azure-image-grid.json index c2c31c02e..2205f7414 100644 --- a/scripts/packer/azure-image-grid.json +++ b/scripts/packer/azure-image-grid.json @@ -24,6 +24,7 @@ "image_publisher": "canonical", "image_offer": "ubuntu-24_04-lts", "image_sku": "server", + "image_version": "24.04.202509170", "azure_tags": { "Name": "DSTACK-GRID" }, From bf82f440dcf91593653dbb05d8d61765850b0acd Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 9 Oct 2025 14:26:25 +0200 Subject: [PATCH 3/5] [Azure] Downgrade Linux Kernel to 6.8 as a workaround to install Grid driver --- scripts/packer/azure-image-grid.json | 10 +++++++++- .../provisioners/downgrade-azure-kernel.sh | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100755 scripts/packer/provisioners/downgrade-azure-kernel.sh diff --git a/scripts/packer/azure-image-grid.json b/scripts/packer/azure-image-grid.json index 2205f7414..863bfc769 100644 --- a/scripts/packer/azure-image-grid.json +++ b/scripts/packer/azure-image-grid.json @@ -24,7 +24,6 @@ "image_publisher": "canonical", "image_offer": "ubuntu-24_04-lts", "image_sku": "server", - "image_version": "24.04.202509170", "azure_tags": { "Name": "DSTACK-GRID" }, @@ -64,6 +63,15 @@ "./install-docker.sh --version {{user `docker_version`}}" ] }, + { + "type": "shell", + "script": "provisioners/downgrade-azure-kernel.sh" + }, + { + "type": "shell", + "inline": ["sudo reboot"], + "expect_disconnect": true + }, { "type": "shell", "script": "provisioners/install-nvidia-grid-driver-for-azure.sh" diff --git a/scripts/packer/provisioners/downgrade-azure-kernel.sh b/scripts/packer/provisioners/downgrade-azure-kernel.sh new file mode 100755 index 000000000..3e9bc0274 --- /dev/null +++ b/scripts/packer/provisioners/downgrade-azure-kernel.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# based on https://learn.microsoft.com/en-us/azure/virtual-machines/extensions/hpccompute-gpu-linux#known-issues +# this is a temporary solution only required until the issue is fixed + +set -e + +# Install 6.8 kernel +sudo apt-get update +sudo DEBIAN_FRONTEND=noninteractive sudo apt install linux-image-6.8.0-1015-azure linux-headers-6.8.0-1015-azure -y + +# Update the Grub entry name +sudo grub_entry_name=$(sudo grep -Po "menuentry '\KUbuntu, with Linux 6.8[^(']+" /boot/grub/grub.cfg | sort -V | head -1) +sudo sed -i "s/^\s*GRUB_DEFAULT=.*$/GRUB_DEFAULT='Advanced options for Ubuntu>$grub_entry_name'/" /etc/default/grub +sudo update-grub + +# Disable the kernel package upgrade +sudo apt-mark hold $(dpkg --get-selections | grep -Po "^linux[^\t]+${grub_entry_name##* }") From 26345eecffd7a99aff0d73d45aa5b2def52b9269 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 9 Oct 2025 14:44:05 +0200 Subject: [PATCH 4/5] Bugfix --- scripts/packer/provisioners/downgrade-azure-kernel.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/packer/provisioners/downgrade-azure-kernel.sh b/scripts/packer/provisioners/downgrade-azure-kernel.sh index 3e9bc0274..9c1d278b2 100755 --- a/scripts/packer/provisioners/downgrade-azure-kernel.sh +++ b/scripts/packer/provisioners/downgrade-azure-kernel.sh @@ -7,10 +7,10 @@ set -e # Install 6.8 kernel sudo apt-get update -sudo DEBIAN_FRONTEND=noninteractive sudo apt install linux-image-6.8.0-1015-azure linux-headers-6.8.0-1015-azure -y +sudo DEBIAN_FRONTEND=noninteractive apt install linux-image-6.8.0-1015-azure linux-headers-6.8.0-1015-azure -y # Update the Grub entry name -sudo grub_entry_name=$(sudo grep -Po "menuentry '\KUbuntu, with Linux 6.8[^(']+" /boot/grub/grub.cfg | sort -V | head -1) +grub_entry_name="$(sudo grep -Po "menuentry '\KUbuntu, with Linux 6\.8[^(']+" /boot/grub/grub.cfg | sort -V | head -1)" sudo sed -i "s/^\s*GRUB_DEFAULT=.*$/GRUB_DEFAULT='Advanced options for Ubuntu>$grub_entry_name'/" /etc/default/grub sudo update-grub From 0b33c1a961260471308474baa03eb72dd3cfd73a Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 9 Oct 2025 16:12:29 +0200 Subject: [PATCH 5/5] [Docker] Update the CUDA version in the default Docker image to 12.8 (from 12.1) #3163 Updated base_image to 0.11 --- src/dstack/version.py | 2 +- src/tests/_internal/server/routers/test_runs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dstack/version.py b/src/dstack/version.py index cc54f9a7d..a0bac024f 100644 --- a/src/dstack/version.py +++ b/src/dstack/version.py @@ -5,5 +5,5 @@ __version__ = "0.0.0" __is_release__ = False -base_image = "0.11rc2" +base_image = "0.11" base_image_ubuntu_version = "22.04" diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 45a2479e0..be868f1e5 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -333,7 +333,7 @@ def get_dev_env_run_dict( " && tail -f /dev/null" ), ] - image_name = "dstackai/base:0.11rc2-base-ubuntu22.04" + image_name = "dstackai/base:0.11-base-ubuntu22.04" return { "id": run_id,