8 changes: 8 additions & 0 deletions docs/docs/reference/dstack.yml/dev-environment.md
@@ -18,6 +18,14 @@ The `dev-environment` configuration type allows running [dev environments](../..
type:
required: true

### `utilization_policy`

#SCHEMA# dstack._internal.core.models.profiles.UtilizationPolicy
overrides:
show_root_heading: false
type:
required: true

### `resources`

#SCHEMA# dstack._internal.core.models.resources.ResourcesSpecSchema
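For context, here is a minimal sketch of how the new section might be used in a dev environment configuration. The field names come from the `UtilizationPolicy` schema referenced above; the values and the termination semantics (stopping a run whose GPU utilization stays below the threshold for the whole window) are illustrative assumptions, not stated in this diff. The same section is added to the `service` and `task` references below.

```yaml
type: dev-environment
ide: vscode

resources:
  gpu: 24GB

# Assumed semantics: terminate the run if GPU utilization stays below
# 10% for a full hour. `time_window` must be at least `5m` (enforced
# client-side) and at most the server's metrics TTL (enforced server-side).
utilization_policy:
  min_gpu_utilization: 10
  time_window: 1h
```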
28 changes: 18 additions & 10 deletions docs/docs/reference/dstack.yml/service.md
@@ -24,31 +24,31 @@ The `service` configuration type allows running [services](../../concepts/servic

> TGI provides an OpenAI-compatible API starting with version 1.4.0,
so models served by TGI can be defined with `format: openai` too.

#SCHEMA# dstack.api.TGIChatModel
overrides:
show_root_heading: false
type:
required: true

??? info "Chat template"

By default, `dstack` loads the [chat template](https://huggingface.co/docs/transformers/main/en/chat_templating)
from the model's repository. If it is not present there, manual configuration is required.

```yaml
type: service

image: ghcr.io/huggingface/text-generation-inference:latest
env:
- MODEL_ID=TheBloke/Llama-2-13B-chat-GPTQ
commands:
- text-generation-launcher --port 8000 --trust-remote-code --quantize gptq
port: 8000

resources:
gpu: 80GB

# Enable the OpenAI-compatible endpoint
model:
type: chat
@@ -57,13 +57,13 @@ The `service` configuration type allows running [services](../../concepts/servic
chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' </s>' }}{% endif %}{% endfor %}"
eos_token: "</s>"
```

Please note that model mapping is an experimental feature with the following limitations:

1. Doesn't work if your `chat_template` uses `bos_token`. As a workaround, replace `bos_token` inside `chat_template` with the token content itself.
2. Doesn't work if `eos_token` is defined in the model repository as a dictionary. As a workaround, set `eos_token` manually, as shown in the example above (see Chat template).

If you encounter any other issues, please make sure to file a
[GitHub issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/new/choose){:target="_blank"}.

### `scaling`
@@ -80,6 +80,14 @@ The `service` configuration type allows running [services](../../concepts/servic
overrides:
show_root_heading: false

### `utilization_policy`

#SCHEMA# dstack._internal.core.models.profiles.UtilizationPolicy
overrides:
show_root_heading: false
type:
required: true

### `resources`

#SCHEMA# dstack._internal.core.models.resources.ResourcesSpecSchema
8 changes: 8 additions & 0 deletions docs/docs/reference/dstack.yml/task.md
@@ -18,6 +18,14 @@ The `task` configuration type allows running [tasks](../../concepts/tasks.md).
type:
required: true

### `utilization_policy`

#SCHEMA# dstack._internal.core.models.profiles.UtilizationPolicy
overrides:
show_root_heading: false
type:
required: true

### `resources`

#SCHEMA# dstack._internal.core.models.resources.ResourcesSpecSchema
21 changes: 18 additions & 3 deletions src/dstack/_internal/core/models/profiles.py
@@ -1,5 +1,5 @@
from enum import Enum
from typing import List, Optional, Union
from typing import List, Optional, Union, overload

from pydantic import Field, root_validator, validator
from typing_extensions import Annotated, Literal
@@ -34,6 +34,14 @@ class TerminationPolicy(str, Enum):
DESTROY_AFTER_IDLE = "destroy-after-idle"


@overload
def parse_duration(v: None) -> None: ...


@overload
def parse_duration(v: Union[int, str]) -> int: ...


def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
if v is None:
return None
@@ -113,6 +121,8 @@ def _validate_fields(cls, values):


class UtilizationPolicy(CoreModel):
_min_time_window = "5m"

min_gpu_utilization: Annotated[
int,
Field(
@@ -130,12 +140,17 @@ class UtilizationPolicy(CoreModel):
Field(
description=(
"The time window of metric samples taking into account to measure utilization"
" (e.g., `30m`, `1h`)"
f" (e.g., `30m`, `1h`). Minimum is `{_min_time_window}`"
)
),
]

_validate_time_window = validator("time_window", pre=True, allow_reuse=True)(parse_duration)
@validator("time_window", pre=True)
def validate_time_window(cls, v: Union[int, str]) -> int:
v = parse_duration(v)
if v < parse_duration(cls._min_time_window):
raise ValueError(f"Minimum time_window is {cls._min_time_window}")
return v


class ProfileParams(CoreModel):
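A quick sketch of the validator's expected behavior, assuming `parse_duration` normalizes duration strings to whole seconds (consistent with the `5m`/`30m`/`1h` examples) and passes plain integers through. In pydantic v1, the `ValueError` raised inside `validate_time_window` surfaces as a `ValidationError`:

```python
from pydantic import ValidationError

from dstack._internal.core.models.profiles import UtilizationPolicy, parse_duration

# The overloads encode: None stays None, int/str inputs yield an int.
print(parse_duration(None))   # None
print(parse_duration("5m"))   # 300 (assumed: seconds)
print(parse_duration("1h"))   # 3600
print(parse_duration(90))     # 90 (assumed: plain ints pass through)

# `time_window` is normalized to seconds by `validate_time_window`.
policy = UtilizationPolicy(min_gpu_utilization=10, time_window="30m")
print(policy.time_window)     # 1800

# Anything below the `_min_time_window` of "5m" is rejected.
try:
    UtilizationPolicy(min_gpu_utilization=10, time_window="2m")
except ValidationError as e:
    print(e)                  # includes "Minimum time_window is 5m"
```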
9 changes: 9 additions & 0 deletions src/dstack/_internal/server/services/runs.py
@@ -48,6 +48,7 @@
)
from dstack._internal.core.services import validate_dstack_resource_name
from dstack._internal.core.services.diff import diff_models
from dstack._internal.server import settings
from dstack._internal.server.db import get_db
from dstack._internal.server.models import (
JobModel,
@@ -838,6 +839,14 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
run_spec.repo_id = DEFAULT_VIRTUAL_REPO_ID
if run_spec.repo_data is None:
run_spec.repo_data = VirtualRunRepoData()
if (
run_spec.merged_profile.utilization_policy is not None
and run_spec.merged_profile.utilization_policy.time_window
> settings.SERVER_METRICS_TTL_SECONDS
):
raise ServerClientError(
f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_TTL_SECONDS}s"
)


_UPDATABLE_SPEC_FIELDS = ["repo_code_hash", "configuration"]
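The cap presumably exists because the server only retains utilization samples for `SERVER_METRICS_TTL_SECONDS`, so a longer window could never be fully populated. For illustration, a config that would trip this check (`720h` is an arbitrary value above any realistic TTL; only the `m`/`h` suffixes shown in the docs are used here):

```yaml
utilization_policy:
  min_gpu_utilization: 10
  # Assuming the server's metrics TTL is shorter than 720 hours, submitting
  # this run fails with: "Maximum utilization_policy.time_window is <TTL>s"
  time_window: 720h
```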
3 changes: 3 additions & 0 deletions src/dstack/api/utils.py
@@ -2,6 +2,7 @@
from typing import Optional, Tuple

import yaml
from pydantic import ValidationError

from dstack._internal.core.errors import ConfigurationError
from dstack._internal.core.models.configurations import AnyRunConfiguration
@@ -96,6 +97,8 @@ def _load_profile_from_path(profiles_path: Path, profile_name: Optional[str]) ->
config = ProfilesConfig.parse_obj(yaml.safe_load(f))
except FileNotFoundError:
return None
except ValidationError as e:
raise ConfigurationError(e)

if profile_name is None:
return config.default()
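With this change, an invalid `profiles.yml` (for example, a `time_window` below the new `5m` minimum) surfaces as dstack's own `ConfigurationError` rather than a raw pydantic traceback. A usage sketch against the private helper shown above; the profiles file location is an assumption:

```python
from pathlib import Path

from dstack._internal.core.errors import ConfigurationError
from dstack.api.utils import _load_profile_from_path

try:
    # `.dstack/profiles.yml` is assumed to be the conventional location;
    # passing profile_name=None returns the default profile per the code above.
    profile = _load_profile_from_path(Path(".dstack/profiles.yml"), profile_name=None)
except ConfigurationError as e:
    # Wraps the underlying pydantic ValidationError,
    # e.g. "Minimum time_window is 5m".
    print(f"Invalid profiles config: {e}")
```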