Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/dstack/_internal/core/compatibility/runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,16 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
replica_group_excludes["router"] = True
if all(g.scaling is None or g.scaling.window is None for g in replicas):
replica_group_excludes["scaling"] = {"window": True}
if all(g.image is None for g in replicas):
replica_group_excludes["image"] = True
if all(g.docker is None for g in replicas):
replica_group_excludes["docker"] = True
if all(g.python is None for g in replicas):
replica_group_excludes["python"] = True
if all(g.nvcc is None for g in replicas):
replica_group_excludes["nvcc"] = True
if all(g.privileged is None for g in replicas):
replica_group_excludes["privileged"] = True
if replica_group_excludes:
configuration_excludes["replicas"] = {"__all__": replica_group_excludes}

Expand Down
170 changes: 165 additions & 5 deletions src/dstack/_internal/core/models/configurations.py
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One more misconfiguration case not covered by validation is conflicting image sources on the service and replica level. For example, I wouldn't expect this configuration to pass validation, but currently it does:

type: service
port: 8000

image: alpine
replicas:
- count: 1
  commands: ["x"]
  nvcc: true  # conflicts with `image`

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,39 @@ class ReplicaGroup(CoreModel):
CommandsList,
Field(description="The shell commands to run for replicas in this group"),
] = []
image: Annotated[
Optional[str],
Field(
description="The name of the Docker image to run for replicas in this group. "
"Mutually exclusive with group-level `docker` and `python`."
),
] = None
python: Annotated[
Optional[PythonVersion],
Field(
description="The major version of Python for replicas in this group. "
"Mutually exclusive with group-level `image` and `docker`."
),
] = None
nvcc: Annotated[
Optional[bool],
Field(
description="Use the image with NVIDIA CUDA Compiler (NVCC) included for replicas in this group. "
"Mutually exclusive with group-level `docker`."
),
] = None
docker: Annotated[
Optional[bool],
Field(
description="Use the docker-in-docker image for this group "
"(injects `start-dockerd` and runs privileged). Mutually "
"exclusive with group-level `image`, `python`, and `nvcc`."
),
] = None
privileged: Annotated[
Optional[bool],
Field(description="Run replicas in this group in privileged mode."),
] = None
router: Annotated[
Optional[ReplicaGroupRouterConfig],
Field(
Expand All @@ -858,6 +891,42 @@ def validate_name(cls, v: Optional[str]) -> Optional[str]:
def convert_count(cls, v: Range[int]) -> Range[int]:
return _validate_replica_range(v)

@validator("python", pre=True, always=True)
def convert_python(cls, v, values) -> Optional[PythonVersion]:
    """
    Normalize the raw group-level `python` value into a `PythonVersion`.

    Runs before type validation, so `v` is the raw input. Rejects any
    non-None `python` when the group's `image` is set (truthy). Floats
    are converted to their string form, strings are wrapped in
    `PythonVersion`, and every other value is returned unchanged.

    Raises:
        ValueError: if `python` is given together with a group-level `image`.
    """
    if v is not None and values.get("image"):
        raise ValueError("`image` and `python` are mutually exclusive within a replica group")
    if isinstance(v, float):
        # The float 3.1 is mapped to "3.10" — presumably because an unquoted
        # `3.10` in the config is parsed as the float 3.1; confirm with the
        # service-level validator.
        as_text = str(v)
        v = "3.10" if as_text == "3.1" else as_text
    return PythonVersion(v) if isinstance(v, str) else v

@validator("docker", always=True)
def _docker(cls, v, values) -> Optional[bool]:
    """
    Enforce that group-level `docker: true` excludes `image`, `python`, and `nvcc`.

    This validator runs after type validation (no `pre=True`), so `v` is
    already a real `bool` or `None`. As a pre-validator, the `v is True`
    identity test saw the raw input, and coercible truthy values such as
    `"true"` or `1` skipped every check below before being coerced to
    `True`.

    `values` holds the fields validated earlier in declaration order
    (`image`, `python`, `nvcc`).

    Raises:
        ValueError: if `docker` is true while the group also sets `image`,
            `python`, or `nvcc` (truthy).
    """
    if v is not True:
        return v
    if values.get("image"):
        raise ValueError("`image` and `docker` are mutually exclusive within a replica group")
    if values.get("python"):
        raise ValueError("`python` and `docker` are mutually exclusive within a replica group")
    if values.get("nvcc"):
        raise ValueError("`nvcc` and `docker` are mutually exclusive within a replica group")
    return v

@validator("privileged", always=True)
def _privileged(cls, v, values) -> Optional[bool]:
    """
    Reject an explicit `privileged: false` when the group sets `docker: true`.

    This validator runs after type validation (no `pre=True`), so `v` is
    already a real `bool` or `None`. As a pre-validator, the `v is False`
    identity test saw the raw input, and coercible falsy values such as
    `"false"` or `0` slipped past the check before being coerced to
    `False`.

    Raises:
        ValueError: if `privileged` is false while `docker` is true.
    """
    # Docker-in-docker requires privileged mode. The service level
    # cannot enforce this rule because its `privileged` field defaults
    # to `False` (existing backwards-compatibility constraint), so it
    # cannot distinguish "unset" from explicit `False`. At the group
    # level we keep `privileged` as `Optional[bool] = None`, so we can.
    if v is False and values.get("docker") is True:
        raise ValueError(
            "`privileged: false` is incompatible with `docker: true` within "
            "a replica group (docker-in-docker requires privileged mode)"
        )
    return v

@root_validator()
def validate_scaling(cls, values):
scaling = values.get("scaling")
Expand Down Expand Up @@ -1057,22 +1126,113 @@ def validate_top_level_properties_with_replica_groups(cls, values):

return values

@root_validator()
def validate_no_mixed_service_and_group_container_fields(cls, values):
    """
    Reject container fields set at both the service and the group level.

    When `replicas` is a list, each of `image`, `docker`, `privileged`,
    `python`, and `nvcc` may be set at the service level OR in replica
    groups, never both. Mixing is rejected — including partial mixing,
    where only some groups set a field the service also sets — because it
    leaves precedence ambiguous.

    Returns `values` unchanged when `replicas` is not a list or no field
    is set on both levels.

    Raises:
        ValueError: naming the field and the replica groups that also set it.
    """
    replicas = values.get("replicas")
    if not isinstance(replicas, list):
        return values

    # Whether the service level counts as having set each field. The boolean
    # fields count only when `True`; NOTE(review): presumably because they
    # cannot tell "unset" from an explicit `False` at the service level —
    # confirm against the service-level field defaults.
    service_level_set = {
        "image": values.get("image") is not None,
        "docker": values.get("docker") is True,
        "privileged": values.get("privileged") is True,
        "python": values.get("python") is not None,
        "nvcc": values.get("nvcc") is True,
    }

    for field, service_set in service_level_set.items():
        if not service_set:
            continue
        # On the group side ANY explicit value conflicts, not just `True`:
        # e.g. service-level `nvcc: true` with a group-level `nvcc: false`
        # must fail validation rather than silently pick one.
        conflicting = [g.name for g in replicas if getattr(g, field) is not None]
        if conflicting:
            raise ValueError(
                f"`{field}` is set at both the service level and in "
                f"replica group(s) {conflicting}. Set `{field}` in one "
                f"place only — either at the service level (all groups "
                f"inherit) or per group, but not both."
            )
    return values

@root_validator()
def validate_no_conflicting_image_sources_across_levels(cls, values):
    """
    Reject conflicting image-source fields across the service and group levels.

    The image-source fields are `image`, `docker`, `python`, and `nvcc`.
    For every such field set at the service level, no replica group may
    give an explicit (non-None) value to a field that is incompatible
    with it. Only applies when `replicas` is a list.

    Raises:
        ValueError: naming the service-level field, the group-level field,
            and the offending replica groups.
    """
    groups = values.get("replicas")
    if not isinstance(groups, list):
        return values

    # Whether each image-source field counts as set at the service level.
    service_sets = {
        "image": values.get("image") is not None,
        "docker": values.get("docker") is True,
        "python": values.get("python") is not None,
        "nvcc": values.get("nvcc") is True,
    }
    # Service-level field -> group-level fields it cannot be combined with.
    # `python` and `nvcc` are not listed against each other.
    incompatible_with = {
        "image": ("docker", "python", "nvcc"),
        "docker": ("image", "python", "nvcc"),
        "python": ("image", "docker"),
        "nvcc": ("image", "docker"),
    }

    for s_field, g_fields in incompatible_with.items():
        if not service_sets[s_field]:
            continue
        for g_field in g_fields:
            offenders = [g.name for g in groups if getattr(g, g_field) is not None]
            if offenders:
                raise ValueError(
                    f"Service-level `{s_field}` conflicts with group-level "
                    f"`{g_field}` in replica group(s) {offenders}. "
                    f"These image-source fields are mutually exclusive."
                )
    return values

@root_validator()
def validate_replica_groups_have_commands_or_image(cls, values):
    """
    When replicas is a list, ensure each ReplicaGroup has something
    to run. Mirrors the service-level rule: either explicit
    `commands` or an `image` (group-level or service-level) is
    required.

    Returns `values` unchanged when `replicas` is not a list or every
    group passes the check.

    Raises:
        ValueError: for the first group that has no commands, no
            group-level `image`, and no service-level `image` to fall back on.
    """
    replicas = values.get("replicas")
    if not isinstance(replicas, list):
        return values

    service_has_image = values.get("image") is not None

    for group in replicas:
        if not group.commands and group.image is None and not service_has_image:
            raise ValueError(
                f"Replica group '{group.name}': either `commands` or "
                "`image` must be set in the group, or `image` at the "
                "service level."
            )

    return values
Expand Down
99 changes: 95 additions & 4 deletions src/dstack/_internal/server/services/jobs/configurators/service.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,111 @@
from typing import List, Optional

from dstack._internal.core.models.configurations import PortMapping, RunConfigurationType
from dstack._internal import settings
from dstack._internal.core.models.configurations import (
PortMapping,
ReplicaGroup,
RunConfigurationType,
)
from dstack._internal.core.models.profiles import SpotPolicy
from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
from dstack._internal.core.models.unix import UnixUser
from dstack._internal.server.services.jobs.configurators.base import (
JobConfigurator,
get_default_image,
)


class ServiceJobConfigurator(JobConfigurator):
TYPE: RunConfigurationType = RunConfigurationType.SERVICE

def _current_replica_group(self) -> Optional[ReplicaGroup]:
    """
    Return the replica group this job belongs to, if any.

    Scans `configuration.replica_groups` for the group whose name equals
    `self.replica_group_name` and returns the first match, or `None`
    when no group has that name.
    """
    assert self.run_spec.configuration.type == "service"
    for group in self.run_spec.configuration.replica_groups:
        if group.name == self.replica_group_name:
            return group
    return None

def _shell_commands(self) -> List[str]:
    """Return the current replica group's commands, or the service-level commands if there is no such group."""
    assert self.run_spec.configuration.type == "service"
    replica_group = self._current_replica_group()
    if replica_group is None:
        return self.run_spec.configuration.commands
    return replica_group.commands

def _image_name(self) -> str:
    """
    Resolve the image for this job, letting the replica group override the base.

    Precedence within the group: `docker: true` selects the DIND image,
    then an explicit `image`, then `nvcc: true` selects the default NVCC
    image. Anything else defers to the base configurator.
    """
    replica_group = self._current_replica_group()
    if replica_group is None:
        return super()._image_name()
    if replica_group.docker is True:
        return settings.DSTACK_DIND_IMAGE
    if replica_group.image is not None:
        return replica_group.image
    if replica_group.nvcc is True:
        return get_default_image(nvcc=True)
    return super()._image_name()

def _privileged(self) -> bool:
    """
    Decide whether the job runs privileged.

    A group with `docker: true` is always privileged; otherwise an
    explicit group-level `privileged` wins; otherwise the base
    configurator decides.
    """
    replica_group = self._current_replica_group()
    if replica_group is None:
        return super()._privileged()
    if replica_group.docker is True:
        return True
    if replica_group.privileged is not None:
        return replica_group.privileged
    return super()._privileged()

def _dstack_image_commands(self) -> List[str]:
    """
    Return the image-specific setup commands for this job.

    A group with `docker: true` gets `start-dockerd`; a group with its
    own `image` gets no extra commands; everything else defers to the
    base configurator.
    """
    replica_group = self._current_replica_group()
    if replica_group is None:
        return super()._dstack_image_commands()
    if replica_group.docker is True:
        return ["start-dockerd"]
    if replica_group.image is not None:
        return []
    return super()._dstack_image_commands()

def _shell(self) -> str:
    """
    Pick the shell used to run the job's commands.

    Resolution order:
      1. An explicit `shell:` is left to the base configurator.
      2. A group with `docker: true` uses /bin/bash — the DIND image
         ships bash, matching the service-level path.
      3. A group with its own `image` is forced to /bin/sh. The base
         returns /bin/bash when the service-level `image` is None, but
         a group-level custom image (e.g. alpine) may not ship bash.
      4. Otherwise defer to the base (bash for dstackai/base, sh for a
         service-level custom image).
    """
    # The group is only looked up when no shell was configured explicitly.
    shell_unset = self.run_spec.configuration.shell is None
    replica_group = self._current_replica_group() if shell_unset else None
    if replica_group is not None:
        if replica_group.docker is True:
            return "/bin/bash"
        if replica_group.image is not None:
            return "/bin/sh"
    return super()._shell()

async def _user(self) -> Optional[UnixUser]:
    """
    Resolve the user the container runs as.

    The base `_user()` only queries the image for a default user when
    `configuration.image` is set at the service level. When the group
    supplies its own `image` and no `user` is configured, the lookup is
    done here so the container runs as that image's default user
    (`None` if the image declares none).

    The DIND image is intentionally NOT looked up for `docker: true`
    groups. That matches service-level behavior: when
    `configuration.docker is True`, `configuration.image` is None, so
    the base skips the lookup. DIND is always privileged and
    effectively root anyway.
    """
    if self.run_spec.configuration.user is not None:
        return await super()._user()
    replica_group = self._current_replica_group()
    if replica_group is None or replica_group.image is None:
        return await super()._user()
    image_user = (await self._get_image_config()).user
    return None if image_user is None else UnixUser.parse(image_user)

def _python(self) -> str:
    """Return the group-level Python version value when the current group sets one, else defer to the base."""
    replica_group = self._current_replica_group()
    group_python = None if replica_group is None else replica_group.python
    if group_python is None:
        return super()._python()
    return group_python.value

def _default_single_branch(self) -> bool:
    """
    Return `True` unconditionally for service jobs.

    NOTE(review): presumably the default for a single-branch repo checkout
    used by the base configurator — confirm against `JobConfigurator`.
    """
    return True

Expand Down
Loading
Loading