From 26c8e8f88183d4c0e27cccb61b363bb1466f0afd Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 28 Apr 2026 11:19:39 +0545 Subject: [PATCH 1/3] support per-replica-group image, docker, python, nvcc, privileged --- .../_internal/core/models/configurations.py | 148 +++++++- .../services/jobs/configurators/service.py | 99 ++++- .../core/models/test_configurations.py | 350 ++++++++++++++++++ .../jobs/configurators/test_service.py | 236 ++++++++++++ 4 files changed, 824 insertions(+), 9 deletions(-) diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 15da190b8..5bd45ed41 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -840,6 +840,39 @@ class ReplicaGroup(CoreModel): CommandsList, Field(description="The shell commands to run for replicas in this group"), ] = [] + image: Annotated[ + Optional[str], + Field( + description="The name of the Docker image to run for replicas in this group. " + "Mutually exclusive with group-level `docker` and `python`." + ), + ] = None + python: Annotated[ + Optional[PythonVersion], + Field( + description="The major version of Python for replicas in this group. " + "Mutually exclusive with group-level `image` and `docker`." + ), + ] = None + nvcc: Annotated[ + Optional[bool], + Field( + description="Use the image with NVIDIA CUDA Compiler (NVCC) included for replicas in this group. " + "Mutually exclusive with group-level `docker`." + ), + ] = None + docker: Annotated[ + Optional[bool], + Field( + description="Use the docker-in-docker image for this group " + "(injects `start-dockerd` and runs privileged). Mutually " + "exclusive with group-level `image`, `python`, and `nvcc`." 
+ ), + ] = None + privileged: Annotated[ + Optional[bool], + Field(description="Run replicas in this group in privileged mode."), + ] = None router: Annotated[ Optional[ReplicaGroupRouterConfig], Field( @@ -858,6 +891,42 @@ def validate_name(cls, v: Optional[str]) -> Optional[str]: def convert_count(cls, v: Range[int]) -> Range[int]: return _validate_replica_range(v) + @validator("python", pre=True, always=True) + def convert_python(cls, v, values) -> Optional[PythonVersion]: + if v is not None and values.get("image"): + raise ValueError("`image` and `python` are mutually exclusive within a replica group") + if isinstance(v, float): + v = str(v) + if v == "3.1": + v = "3.10" + if isinstance(v, str): + return PythonVersion(v) + return v + + @validator("docker", pre=True, always=True) + def _docker(cls, v, values) -> Optional[bool]: + if v is True and values.get("image"): + raise ValueError("`image` and `docker` are mutually exclusive within a replica group") + if v is True and values.get("python"): + raise ValueError("`python` and `docker` are mutually exclusive within a replica group") + if v is True and values.get("nvcc"): + raise ValueError("`nvcc` and `docker` are mutually exclusive within a replica group") + return v + + @validator("privileged", pre=True, always=True) + def _privileged(cls, v, values) -> Optional[bool]: + # Docker-in-docker requires privileged mode. The service level + # cannot enforce this rule because its `privileged` field defaults + # to `False` (existing backwards-compatibility constraint), so it + # cannot distinguish "unset" from explicit `False`. At the group + # level we keep `privileged` as `Optional[bool] = None`, so we can. 
+ if v is False and values.get("docker") is True: + raise ValueError( + "`privileged: false` is incompatible with `docker: true` within " + "a replica group (docker-in-docker requires privileged mode)" + ) + return v + @root_validator() def validate_scaling(cls, values): scaling = values.get("scaling") @@ -1057,22 +1126,91 @@ def validate_top_level_properties_with_replica_groups(cls, values): return values + @root_validator() + def validate_no_mixed_service_and_group_container_fields(cls, values): + """ + When replicas is a list, each of `image`, `docker`, `python`, `nvcc`, + and `privileged` may be set at the service level OR in replica groups, + never both. Mixing is rejected — including partial mixing, where only + some groups set a field the service also sets — because it leaves precedence ambiguous. + """ + replicas = values.get("replicas") + if not isinstance(replicas, list): + return values + + checks = [ + ( + "image", + values.get("image") is not None, + lambda g: g.image is not None, + ), + ( + "docker", + values.get("docker") is True, + lambda g: g.docker is True, + ), + ( + "privileged", + values.get("privileged") is True, + lambda g: g.privileged is not None, + ), + ( + "python", + values.get("python") is not None, + lambda g: g.python is not None, + ), + ( + "nvcc", + values.get("nvcc") is True, + lambda g: g.nvcc is True, + ), + ] + + for field, service_set, group_set in checks: + if service_set: + conflicting = [g.name for g in replicas if group_set(g)] + if conflicting: + raise ValueError( + f"`{field}` is set at both the service level and in " + f"replica group(s) {conflicting}. Set `{field}` in one " + f"place only — either at the service level (all groups " + f"inherit) or per group, but not both." + ) + return values + @root_validator() def validate_replica_groups_have_commands_or_image(cls, values): """ - When replicas is a list, ensure each ReplicaGroup has commands OR service has image. + When replicas is a list, ensure each ReplicaGroup has something + to run. 
That means at least one of: + - group.commands + - group.image / group.docker / group.python / group.nvcc + - service-level image / docker / python / nvcc """ replicas = values.get("replicas") - image = values.get("image") if not isinstance(replicas, list): return values + service_has_image_source = ( + values.get("image") is not None + or values.get("docker") is True + or values.get("python") is not None + or values.get("nvcc") is True + ) + for group in replicas: - if not group.commands and not image: + group_has_image_source = ( + group.image is not None + or group.docker is True + or group.python is not None + or group.nvcc is True + ) + if not group.commands and not group_has_image_source and not service_has_image_source: raise ValueError( - f"Replica group '{group.name}' has no commands. " - "Either set `commands` in the replica group or set `image` at the service level." + f"Replica group '{group.name}' has nothing to run. " + "Set `commands`, `image`, `docker`, `python`, or `nvcc` " + "in the group, or set one of these at the service level." 
) return values diff --git a/src/dstack/_internal/server/services/jobs/configurators/service.py b/src/dstack/_internal/server/services/jobs/configurators/service.py index 6b5aa8c2d..7968c8ad7 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/service.py +++ b/src/dstack/_internal/server/services/jobs/configurators/service.py @@ -1,20 +1,111 @@ from typing import List, Optional -from dstack._internal.core.models.configurations import PortMapping, RunConfigurationType +from dstack._internal import settings +from dstack._internal.core.models.configurations import ( + PortMapping, + ReplicaGroup, + RunConfigurationType, +) from dstack._internal.core.models.profiles import SpotPolicy -from dstack._internal.server.services.jobs.configurators.base import JobConfigurator +from dstack._internal.core.models.unix import UnixUser +from dstack._internal.server.services.jobs.configurators.base import ( + JobConfigurator, + get_default_image, +) class ServiceJobConfigurator(JobConfigurator): TYPE: RunConfigurationType = RunConfigurationType.SERVICE - def _shell_commands(self) -> List[str]: + def _current_replica_group(self) -> Optional[ReplicaGroup]: assert self.run_spec.configuration.type == "service" for group in self.run_spec.configuration.replica_groups: if group.name == self.replica_group_name: - return group.commands + return group + return None + + def _shell_commands(self) -> List[str]: + assert self.run_spec.configuration.type == "service" + group = self._current_replica_group() + if group is not None: + return group.commands return self.run_spec.configuration.commands + def _image_name(self) -> str: + group = self._current_replica_group() + if group is not None: + if group.docker is True: + return settings.DSTACK_DIND_IMAGE + if group.image is not None: + return group.image + if group.nvcc is True: + return get_default_image(nvcc=True) + return super()._image_name() + + def _privileged(self) -> bool: + group = self._current_replica_group() + if group 
is not None: + if group.docker is True: + return True + if group.privileged is not None: + return group.privileged + return super()._privileged() + + def _dstack_image_commands(self) -> List[str]: + group = self._current_replica_group() + if group is not None: + if group.docker is True: + return ["start-dockerd"] + if group.image is not None: + return [] + return super()._dstack_image_commands() + + def _shell(self) -> str: + # Shell resolution order: + # 1. If `shell:` is set explicitly, the base honors it. + # 2. If this group sets `docker: true`, use /bin/bash — the + # DIND image ships bash, matching the service-level path. + # 3. If this group sets its own `image`, force /bin/sh. The + # base returns /bin/bash when service-level `image` is None, + # but a group-level custom image (e.g. alpine) may not ship + # bash. + # 4. Otherwise defer to the base (bash for dstackai/base, sh + # for a service-level custom image). + if self.run_spec.configuration.shell is None: + group = self._current_replica_group() + if group is not None: + if group.docker is True: + return "/bin/bash" + if group.image is not None: + return "/bin/sh" + return super()._shell() + + async def _user(self) -> Optional[UnixUser]: + # Base `_user()` only queries the image for a default user when + # `configuration.image` is set at the service level. When the + # group supplies its own `image`, perform the lookup here so the + # container runs as that image's default user. + # + # We intentionally do NOT look up the DIND image when the group + # sets `docker: true`. That matches service-level behavior: when + # `configuration.docker is True`, `configuration.image` is None, + # so the base skips the lookup. DIND is always privileged and + # effectively root anyway. 
+ if self.run_spec.configuration.user is None: + group = self._current_replica_group() + if group is not None and group.image is not None: + image_config = await self._get_image_config() + if image_config.user is None: + return None + return UnixUser.parse(image_config.user) + return await super()._user() + + def _python(self) -> str: + group = self._current_replica_group() + if group is not None and group.python is not None: + return group.python.value + return super()._python() + def _default_single_branch(self) -> bool: return True diff --git a/src/tests/_internal/core/models/test_configurations.py b/src/tests/_internal/core/models/test_configurations.py index 677fa07da..31cb7c36c 100644 --- a/src/tests/_internal/core/models/test_configurations.py +++ b/src/tests/_internal/core/models/test_configurations.py @@ -6,6 +6,7 @@ from dstack._internal.core.models.common import RegistryAuth from dstack._internal.core.models.configurations import ( DevEnvironmentConfigurationParams, + PythonVersion, RepoSpec, ServiceConfiguration, parse_run_configuration, @@ -162,6 +163,355 @@ def test_shell_invalid(self): parse_run_configuration(conf) +class TestReplicaGroupContainerFields: + """Per-replica-group image-source fields: `image`, `docker`, `python`, + `nvcc`, `privileged`. 
Covers field-level mutex validators, the + cross-level no-mixing validator, the runnable-check validator, and + YAML coercion for `python`.""" + + def test_replica_group_accepts_image_python_nvcc_docker(self): + conf = { + "type": "service", + "port": 8000, + "replicas": [ + {"name": "a", "count": 1, "image": "nginx:latest", "commands": ["x"]}, + {"name": "b", "count": 1, "python": "3.12", "commands": ["x"]}, + {"name": "c", "count": 1, "nvcc": True, "commands": ["x"]}, + {"name": "d", "count": 1, "docker": True, "commands": ["x"]}, + ], + } + parsed = parse_run_configuration(conf) + assert isinstance(parsed, ServiceConfiguration) + groups = {g.name: g for g in parsed.replicas} + assert groups["a"].image == "nginx:latest" + assert groups["b"].python == PythonVersion.PY312 + assert groups["c"].nvcc is True + assert groups["d"].docker is True + + def test_replica_group_accepts_privileged(self): + conf = { + "type": "service", + "port": 8000, + "replicas": [ + { + "name": "a", + "count": 1, + "image": "x", + "privileged": True, + "commands": ["x"], + }, + ], + } + parsed = parse_run_configuration(conf) + assert parsed.replicas[0].privileged is True + + @pytest.mark.parametrize( + "yaml_value,expected", + [ + (3.10, PythonVersion.PY310), + (3.12, PythonVersion.PY312), + ("3.10", PythonVersion.PY310), + ("3.12", PythonVersion.PY312), + ], + ) + def test_replica_group_python_yaml_coercion(self, yaml_value, expected): + """YAML may parse `3.10` as float 3.1 — must coerce back to '3.10'.""" + conf = { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "python": yaml_value, "commands": ["x"]}], + } + parsed = parse_run_configuration(conf) + assert parsed.replicas[0].python == expected + + def test_replica_group_image_python_mutex(self): + with pytest.raises( + ConfigurationError, + match="`image` and `python` are mutually exclusive", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "image": "x", "python": 
"3.12", "commands": ["x"]}, + ], + } + ) + + def test_replica_group_image_docker_mutex(self): + with pytest.raises( + ConfigurationError, + match="`image` and `docker` are mutually exclusive", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "image": "x", "docker": True, "commands": ["x"]}, + ], + } + ) + + def test_replica_group_python_docker_mutex(self): + with pytest.raises( + ConfigurationError, + match="`python` and `docker` are mutually exclusive", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "python": "3.12", "docker": True, "commands": ["x"]}, + ], + } + ) + + def test_replica_group_nvcc_docker_mutex(self): + with pytest.raises( + ConfigurationError, + match="`nvcc` and `docker` are mutually exclusive", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "nvcc": True, "docker": True, "commands": ["x"]}, + ], + } + ) + + def test_replica_group_python_nvcc_allowed_together(self): + """python + nvcc is the dstackai/base + CUDA combo, must be allowed.""" + conf = { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "python": "3.12", "nvcc": True, "commands": ["x"]}, + ], + } + parsed = parse_run_configuration(conf) + assert parsed.replicas[0].python == PythonVersion.PY312 + assert parsed.replicas[0].nvcc is True + + def test_replica_group_docker_with_privileged_false_rejected(self): + with pytest.raises( + ConfigurationError, + match="`privileged: false` is incompatible with `docker: true`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + { + "count": 1, + "docker": True, + "privileged": False, + "commands": ["x"], + }, + ], + } + ) + + def test_replica_group_docker_with_privileged_unset_allowed(self): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [ + {"count": 1, "docker": True, "commands": ["x"]}, + ], 
+ } + ) + + def test_image_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`image` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "svc:1.0", + "replicas": [ + {"count": 1, "image": "grp:1.0", "commands": ["x"]}, + ], + } + ) + + def test_docker_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`docker` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "docker": True, + "replicas": [ + {"count": 1, "docker": True, "commands": ["x"]}, + ], + } + ) + + def test_python_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`python` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "python": "3.12", + "replicas": [ + {"count": 1, "python": "3.12", "commands": ["x"]}, + ], + } + ) + + def test_nvcc_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`nvcc` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "nvcc": True, + "replicas": [ + {"count": 1, "nvcc": True, "commands": ["x"]}, + ], + } + ) + + def test_privileged_set_at_both_service_and_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="`privileged` is set at both", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "privileged": True, + "replicas": [ + { + "count": 1, + "image": "x", + "privileged": True, + "commands": ["x"], + }, + ], + } + ) + + def test_image_at_service_with_groups_inheriting_allowed(self): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "svc:1.0", + "replicas": [ + {"count": 1, "commands": ["x"]}, + {"count": 1, "commands": ["x"]}, + ], + } + ) + + def test_docker_at_service_with_groups_inheriting_allowed(self): + """Service-level `docker: true` 
combined with groups that don't set + docker should parse cleanly — groups inherit the service-level value. + Guards against the no-mixing validator accidentally rejecting the + inherit case.""" + parse_run_configuration( + { + "type": "service", + "port": 8000, + "docker": True, + "replicas": [ + {"count": 1, "commands": ["x"]}, + {"count": 1, "commands": ["x"]}, + ], + } + ) + + def test_partial_mix_rejected(self): + """Service sets image; only one group overrides — still a mix.""" + with pytest.raises( + ConfigurationError, + match=r"replica group\(s\) \['b'\]", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "svc:1.0", + "replicas": [ + {"name": "a", "count": 1, "commands": ["x"]}, + {"name": "b", "count": 1, "image": "g:2", "commands": ["x"]}, + ], + } + ) + + def test_replica_group_with_only_image_no_commands_allowed(self): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "image": "nginx:latest"}], + } + ) + + def test_replica_group_with_only_python_no_commands_allowed(self): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "python": "3.12"}], + } + ) + + def test_replica_group_with_only_nvcc_no_commands_allowed(self): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "nvcc": True}], + } + ) + + def test_empty_replica_group_rejected(self): + with pytest.raises( + ConfigurationError, + match="has nothing to run", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1}], + } + ) + + def test_service_level_image_satisfies_groups_runnable_check(self): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "svc:1.0", + "replicas": [{"count": 1}, {"count": 1}], + } + ) + + class TestRepoSpec: @pytest.mark.parametrize("value", [".", "rel/path", "/abs/path/"]) def test_parse_local_path_no_path(self, value: str): diff --git 
a/src/tests/_internal/server/services/jobs/configurators/test_service.py b/src/tests/_internal/server/services/jobs/configurators/test_service.py index cafab73d9..9d5c9bdf7 100644 --- a/src/tests/_internal/server/services/jobs/configurators/test_service.py +++ b/src/tests/_internal/server/services/jobs/configurators/test_service.py @@ -1,11 +1,19 @@ +from unittest.mock import Mock + import pytest +from dstack._internal import settings from dstack._internal.core.models.configurations import ( OPENAI_MODEL_PROBE_TIMEOUT, ProbeConfig, + PythonVersion, + ReplicaGroup, ServiceConfiguration, ) +from dstack._internal.core.models.resources import Range from dstack._internal.core.models.services import OpenAIChatModel +from dstack._internal.server.services.docker import ImageConfig +from dstack._internal.server.services.jobs.configurators.base import get_default_image from dstack._internal.server.services.jobs.configurators.service import ServiceJobConfigurator from dstack._internal.server.testing.common import get_run_spec @@ -95,3 +103,231 @@ async def test_no_probe_when_no_model(self): assert len(job_specs) == 1 assert len(job_specs[0].probes) == 0 + + +def _make_run_spec(replicas, **service_kwargs): + configuration = ServiceConfiguration( + port=80, + replicas=replicas, + **service_kwargs, + ) + return get_run_spec(run_name="run", repo_id="id", configuration=configuration) + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("image_config_mock") +class TestPerGroupOverrides: + """Verifies that ServiceJobConfigurator picks up per-replica-group + image-source fields (image, docker, python, nvcc, privileged).""" + + async def test_image_name_uses_group_image(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + image="custom:1.0", + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._image_name() == "custom:1.0" + + async def 
test_image_name_uses_dind_when_group_docker_true(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + docker=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._image_name() == settings.DSTACK_DIND_IMAGE + + async def test_image_name_uses_nvcc_default_when_group_nvcc_true(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + nvcc=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._image_name() == get_default_image(nvcc=True) + + async def test_image_name_falls_back_to_service_image(self): + run_spec = _make_run_spec( + image="svc:1.0", + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._image_name() == "svc:1.0" + + async def test_privileged_true_when_group_docker(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + docker=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._privileged() is True + + async def test_privileged_returns_group_privileged(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + image="x", + privileged=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._privileged() is True + + async def test_privileged_defers_to_super_when_group_unset(self): + run_spec = _make_run_spec( + image="svc:1.0", + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + 
# Service-level privileged defaults to False + assert configurator._privileged() is False + + async def test_dstack_image_commands_injects_start_dockerd_for_docker(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + docker=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._dstack_image_commands() == ["start-dockerd"] + + async def test_dstack_image_commands_empty_for_group_image(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + image="alpine", + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._dstack_image_commands() == [] + + async def test_shell_bash_when_group_docker(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + docker=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._shell() == "/bin/bash" + + async def test_shell_sh_when_group_image(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + image="alpine", + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._shell() == "/bin/sh" + + async def test_python_uses_group_python(self): + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + python=PythonVersion.PY312, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + assert configurator._python() == "3.12" + + async def test_user_looks_up_group_image(self, monkeypatch: pytest.MonkeyPatch): + """When a group sets its own `image`, _user() queries that image's config.""" + image_config = ImageConfig.parse_obj({"User": "nginx", 
"Entrypoint": None, "Cmd": []}) + monkeypatch.setattr( + "dstack._internal.server.services.jobs.configurators.base._get_image_config", + Mock(return_value=image_config), + ) + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + image="nginxinc/nginx-unprivileged", + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + user = await configurator._user() + assert user is not None + + async def test_user_does_not_lookup_for_group_docker(self, monkeypatch: pytest.MonkeyPatch): + """`docker: true` should not trigger an image-config registry call.""" + mock_get_image_config = Mock() + monkeypatch.setattr( + "dstack._internal.server.services.jobs.configurators.base._get_image_config", + mock_get_image_config, + ) + run_spec = _make_run_spec( + replicas=[ + ReplicaGroup( + name="a", + count=Range(min=1, max=1), + docker=True, + commands=["x"], + ) + ], + ) + configurator = ServiceJobConfigurator(run_spec, replica_group_name="a") + await configurator._user() + mock_get_image_config.assert_not_called() From b17f4f65aeea80afe46106c4c827d40e3eeabea0 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 30 Apr 2026 16:33:09 +0545 Subject: [PATCH 2/3] Merge Conflict Resolved --- src/dstack/_internal/core/compatibility/runs.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/dstack/_internal/core/compatibility/runs.py b/src/dstack/_internal/core/compatibility/runs.py index f9dbaf4e2..cb9c35c05 100644 --- a/src/dstack/_internal/core/compatibility/runs.py +++ b/src/dstack/_internal/core/compatibility/runs.py @@ -108,6 +108,16 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType: replica_group_excludes["router"] = True if all(g.scaling is None or g.scaling.window is None for g in replicas): replica_group_excludes["scaling"] = {"window": True} + if all(g.image is None for g in replicas): + replica_group_excludes["image"] = True + if all(g.docker is 
None for g in replicas): + replica_group_excludes["docker"] = True + if all(g.python is None for g in replicas): + replica_group_excludes["python"] = True + if all(g.nvcc is None for g in replicas): + replica_group_excludes["nvcc"] = True + if all(g.privileged is None for g in replicas): + replica_group_excludes["privileged"] = True if replica_group_excludes: configuration_excludes["replicas"] = {"__all__": replica_group_excludes} From f8a61f6630997c0c4407c780ac674efddb8070f0 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 30 Apr 2026 16:02:09 +0545 Subject: [PATCH 3/3] Resolve Review Comments --- .../_internal/core/models/configurations.py | 66 ++++-- .../core/models/test_configurations.py | 206 +++++++++++++++++- 2 files changed, 243 insertions(+), 29 deletions(-) diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 5bd45ed41..984b61fea 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -1147,7 +1147,7 @@ def validate_no_mixed_service_and_group_container_fields(cls, values): ( "docker", values.get("docker") is True, - lambda g: g.docker is True, + lambda g: g.docker is not None, ), ( "privileged", @@ -1162,7 +1162,7 @@ def validate_no_mixed_service_and_group_container_fields(cls, values): ( "nvcc", values.get("nvcc") is True, - lambda g: g.nvcc is True, + lambda g: g.nvcc is not None, ), ] @@ -1178,39 +1178,61 @@ def validate_no_mixed_service_and_group_container_fields(cls, values): ) return values + @root_validator() + def validate_no_conflicting_image_sources_across_levels(cls, values): + """ + Image-source fields (`image`, `docker`, `python`, `nvcc`) cannot + be mixed across service and group levels in conflicting ways. 
+ """ + replicas = values.get("replicas") + if not isinstance(replicas, list): + return values + + forbidden = [ + ("image", values.get("image") is not None, "docker", lambda g: g.docker is not None), + ("image", values.get("image") is not None, "python", lambda g: g.python is not None), + ("image", values.get("image") is not None, "nvcc", lambda g: g.nvcc is not None), + ("docker", values.get("docker") is True, "image", lambda g: g.image is not None), + ("docker", values.get("docker") is True, "python", lambda g: g.python is not None), + ("docker", values.get("docker") is True, "nvcc", lambda g: g.nvcc is not None), + ("python", values.get("python") is not None, "image", lambda g: g.image is not None), + ("python", values.get("python") is not None, "docker", lambda g: g.docker is not None), + ("nvcc", values.get("nvcc") is True, "image", lambda g: g.image is not None), + ("nvcc", values.get("nvcc") is True, "docker", lambda g: g.docker is not None), + ] + + for s_field, s_set, g_field, g_pred in forbidden: + if s_set: + conflicting = [g.name for g in replicas if g_pred(g)] + if conflicting: + raise ValueError( + f"Service-level `{s_field}` conflicts with group-level " + f"`{g_field}` in replica group(s) {conflicting}. " + f"These image-source fields are mutually exclusive." + ) + return values + @root_validator() def validate_replica_groups_have_commands_or_image(cls, values): """ When replicas is a list, ensure each ReplicaGroup has something - to run. That means at least one of: - - group.commands - - group.image / group.docker / group.python / group.nvcc - - service-level image / docker / python / nvcc + to run. Mirrors the service-level rule: either explicit + `commands` or an `image` (group-level or service-level) is + required. 
""" replicas = values.get("replicas") if not isinstance(replicas, list): return values - service_has_image_source = ( - values.get("image") is not None - or values.get("docker") is True - or values.get("python") is not None - or values.get("nvcc") is True - ) + service_has_image = values.get("image") is not None for group in replicas: - group_has_image_source = ( - group.image is not None - or group.docker is True - or group.python is not None - or group.nvcc is True - ) - if not group.commands and not group_has_image_source and not service_has_image_source: + if not group.commands and group.image is None and not service_has_image: raise ValueError( - f"Replica group '{group.name}' has nothing to run. " - "Set `commands`, `image`, `docker`, `python`, or `nvcc` " - "in the group, or set one of these at the service level." + f"Replica group '{group.name}': either `commands` or " + "`image` must be set in the group, or `image` at the " + "service level." ) return values diff --git a/src/tests/_internal/core/models/test_configurations.py b/src/tests/_internal/core/models/test_configurations.py index 31cb7c36c..5027e2973 100644 --- a/src/tests/_internal/core/models/test_configurations.py +++ b/src/tests/_internal/core/models/test_configurations.py @@ -461,37 +461,229 @@ def test_partial_mix_rejected(self): } ) - def test_replica_group_with_only_image_no_commands_allowed(self): + # ---- Cross-level conflicting image sources ---- + # Validates `validate_no_conflicting_image_sources_across_levels`. 
+ + def test_service_image_conflicts_with_group_docker_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `image` conflicts with group-level `docker`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "alpine", + "replicas": [{"count": 1, "docker": True, "commands": ["x"]}], + } + ) + + def test_service_image_conflicts_with_group_python_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `image` conflicts with group-level `python`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "alpine", + "replicas": [{"count": 1, "python": "3.12", "commands": ["x"]}], + } + ) + + def test_service_image_conflicts_with_group_nvcc_rejected(self): + """Reviewer's exact example.""" + with pytest.raises( + ConfigurationError, + match="Service-level `image` conflicts with group-level `nvcc`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "image": "alpine", + "replicas": [{"count": 1, "nvcc": True, "commands": ["x"]}], + } + ) + + def test_service_docker_conflicts_with_group_image_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `docker` conflicts with group-level `image`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "docker": True, + "replicas": [{"count": 1, "image": "alpine", "commands": ["x"]}], + } + ) + + def test_service_docker_conflicts_with_group_python_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `docker` conflicts with group-level `python`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "docker": True, + "replicas": [{"count": 1, "python": "3.12", "commands": ["x"]}], + } + ) + + def test_service_docker_conflicts_with_group_nvcc_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `docker` conflicts with group-level `nvcc`", + ): + parse_run_configuration( 
+ { + "type": "service", + "port": 8000, + "docker": True, + "replicas": [{"count": 1, "nvcc": True, "commands": ["x"]}], + } + ) + + def test_service_python_conflicts_with_group_image_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `python` conflicts with group-level `image`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "python": "3.12", + "replicas": [{"count": 1, "image": "alpine", "commands": ["x"]}], + } + ) + + def test_service_python_conflicts_with_group_docker_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `python` conflicts with group-level `docker`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "python": "3.12", + "replicas": [{"count": 1, "docker": True, "commands": ["x"]}], + } + ) + + def test_service_nvcc_conflicts_with_group_image_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `nvcc` conflicts with group-level `image`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "nvcc": True, + "replicas": [{"count": 1, "image": "alpine", "commands": ["x"]}], + } + ) + + def test_service_nvcc_conflicts_with_group_docker_rejected(self): + with pytest.raises( + ConfigurationError, + match="Service-level `nvcc` conflicts with group-level `docker`", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "nvcc": True, + "replicas": [{"count": 1, "docker": True, "commands": ["x"]}], + } + ) + + def test_service_python_with_group_nvcc_allowed(self): + """`python` and `nvcc` are compatible base-image knobs and may + coexist across levels.""" parse_run_configuration( { "type": "service", "port": 8000, - "replicas": [{"count": 1, "image": "nginx:latest"}], + "python": "3.12", + "replicas": [{"count": 1, "nvcc": True, "commands": ["x"]}], } ) - def test_replica_group_with_only_python_no_commands_allowed(self): + def 
test_service_nvcc_with_group_python_allowed(self): parse_run_configuration( { "type": "service", "port": 8000, - "replicas": [{"count": 1, "python": "3.12"}], + "nvcc": True, + "replicas": [{"count": 1, "python": "3.12", "commands": ["x"]}], } ) - def test_replica_group_with_only_nvcc_no_commands_allowed(self): + def test_replica_group_with_only_image_no_commands_allowed(self): parse_run_configuration( { "type": "service", "port": 8000, - "replicas": [{"count": 1, "nvcc": True}], + "replicas": [{"count": 1, "image": "nginx:latest"}], } ) + def test_replica_group_with_only_python_no_commands_rejected(self): + """`python` configures the base image but doesn't supply a runnable + workload — must be paired with `commands` or `image`. Matches + service-level behavior.""" + with pytest.raises( + ConfigurationError, + match="either `commands` or `image` must be set", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "python": "3.12"}], + } + ) + + def test_replica_group_with_only_nvcc_no_commands_rejected(self): + with pytest.raises( + ConfigurationError, + match="either `commands` or `image` must be set", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "nvcc": True}], + } + ) + + def test_replica_group_with_only_docker_no_commands_rejected(self): + """`docker: true` runs DIND but injects only `start-dockerd`; + without user commands the replica has no actual workload.""" + with pytest.raises( + ConfigurationError, + match="either `commands` or `image` must be set", + ): + parse_run_configuration( + { + "type": "service", + "port": 8000, + "replicas": [{"count": 1, "docker": True}], + } + ) + def test_empty_replica_group_rejected(self): with pytest.raises( ConfigurationError, - match="has nothing to run", + match="either `commands` or `image` must be set", ): parse_run_configuration( {