Merged
5 changes: 1 addition & 4 deletions docs/blog/posts/amd-on-runpod.md
Original file line number Diff line number Diff line change
@@ -64,10 +64,7 @@ you can now specify an AMD GPU under `resources`. Below are a few examples.

spot_policy: auto

model:
type: chat
name: meta-llama/Meta-Llama-3.1-70B-Instruct
format: openai
model: meta-llama/Meta-Llama-3.1-70B-Instruct
```

</div>
5 changes: 1 addition & 4 deletions docs/blog/posts/dstack-sky.md
@@ -115,10 +115,7 @@ resources:
gpu: 48GB..80GB

# Enable OpenAI compatible endpoint
model:
type: chat
name: mixtral
format: openai
model: mixtral
```
</div>

5 changes: 1 addition & 4 deletions docs/blog/posts/tpu-on-gcp.md
@@ -115,10 +115,7 @@ and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-
resources:
gpu: v5litepod-4

model:
format: openai
type: chat
name: meta-llama/Meta-Llama-3.1-8B-Instruct
model: meta-llama/Meta-Llama-3.1-8B-Instruct
```
</div>

10 changes: 2 additions & 8 deletions docs/blog/posts/volumes-on-runpod.md
@@ -46,10 +46,7 @@ spot_policy: auto
resources:
gpu: 24GB

model:
format: openai
type: chat
name: meta-llama/Meta-Llama-3.1-8B-Instruct
model: meta-llama/Meta-Llama-3.1-8B-Instruct
```

</div>
@@ -123,10 +120,7 @@ spot_policy: auto
resources:
gpu: 24GB

model:
format: openai
type: chat
name: meta-llama/Meta-Llama-3.1-8B-Instruct
model: meta-llama/Meta-Llama-3.1-8B-Instruct
```

</div>
122 changes: 69 additions & 53 deletions docs/docs/reference/dstack.yml/service.md
@@ -112,8 +112,9 @@ If you want, you can specify your own Docker image via `image`.

By default, if you run a service, its endpoint is accessible at `https://<run name>.<gateway domain>`.

If you run a model, you can optionally configure the mapping to make it accessible via the
OpenAI-compatible interface.
If you are running a chat model with an OpenAI-compatible interface,
you can optionally set the [`model`](#model) property to make the model accessible via
the model gateway provided by `dstack`.

<div editor-title="service.dstack.yml">

@@ -124,7 +125,9 @@ name: llama31-service

python: "3.10"

# Commands of the service
# Required environment variables
env:
- HF_TOKEN
commands:
- pip install vllm
- vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct --max-model-len 4096
@@ -135,58 +138,22 @@ resources:
# Change to what is required
gpu: 24GB

# Comment if you don't want to access the model via https://gateway.<gateway domain>
model:
type: chat
name: meta-llama/Meta-Llama-3.1-8B-Instruct
format: openai
# Make the model accessible at https://gateway.<gateway domain>
model: meta-llama/Meta-Llama-3.1-8B-Instruct

# Alternatively, use this syntax to set more model settings:
# model:
# type: chat
# name: meta-llama/Meta-Llama-3.1-8B-Instruct
# format: openai
# prefix: /v1
```

</div>

In this case, with such a configuration, once the service is up, you'll be able to access the model at
With such a configuration, once the service is up, you'll be able to access the model at
`https://gateway.<gateway domain>` via the OpenAI-compatible interface.

The `format` supports only `tgi` (Text Generation Inference)
and `openai` (if you are using Text Generation Inference or vLLM with OpenAI-compatible mode).
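Once the service is up, the gateway endpoint speaks the standard OpenAI chat-completions protocol. As an illustrative sketch (the gateway URL and token below are placeholders, not real values), such a request could be built with nothing but the standard library:

```python
import json
import urllib.request

# Placeholder values -- substitute your own gateway domain and dstack token
GATEWAY_URL = "https://gateway.example.com/v1/chat/completions"
DSTACK_TOKEN = "<your dstack token>"

payload = {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "messages": [{"role": "user", "content": "Hello!"}],
}

request = urllib.request.Request(
    GATEWAY_URL,
    data=json.dumps(payload).encode(),
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DSTACK_TOKEN}",
    },
)

# Actually sending the request requires a running service:
# with urllib.request.urlopen(request) as resp:
#     print(json.load(resp)["choices"][0]["message"]["content"])
```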

??? info "Chat template"

By default, `dstack` loads the [chat template](https://huggingface.co/docs/transformers/main/en/chat_templating)
from the model's repository. If it is not present there, manual configuration is required.

```yaml
type: service

image: ghcr.io/huggingface/text-generation-inference:latest
env:
- MODEL_ID=TheBloke/Llama-2-13B-chat-GPTQ
commands:
- text-generation-launcher --port 8000 --trust-remote-code --quantize gptq
port: 8000

resources:
gpu: 80GB

# Enable the OpenAI-compatible endpoint
model:
type: chat
name: TheBloke/Llama-2-13B-chat-GPTQ
format: tgi
chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' </s>' }}{% endif %}{% endfor %}"
eos_token: "</s>"
```

##### Limitations

Please note that model mapping is an experimental feature with the following limitations:

1. Doesn't work if your `chat_template` uses `bos_token`. As a workaround, replace `bos_token` inside `chat_template` with the token content itself.
2. Doesn't work if `eos_token` is defined in the model repository as a dictionary. As a workaround, set `eos_token` manually, as shown in the example above (see Chat template).

If you encounter any other issues, please make sure to file a [GitHub issue](https://github.com/dstackai/dstack/issues/new/choose).


### Auto-scaling

By default, `dstack` runs a single replica of the service.
@@ -201,7 +168,9 @@ name: llama31-service

python: "3.10"

# Commands of the service
# Required environment variables
env:
- HF_TOKEN
commands:
- pip install vllm
- vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct --max-model-len 4096
@@ -461,14 +430,61 @@ The `service` configuration type supports many other options. See below.
type:
required: true

## `model`
## `model[format=openai]`

#SCHEMA# dstack._internal.core.models.gateways.BaseChatModel
#SCHEMA# dstack._internal.core.models.gateways.OpenAIChatModel
overrides:
show_root_heading: false
type:
required: true

## `model[format=tgi]`

> TGI provides an OpenAI-compatible API starting with version 1.4.0,
> so models served by TGI can be defined with `format: openai` too.

#SCHEMA# dstack._internal.core.models.gateways.TGIChatModel
overrides:
show_root_heading: false
type:
required: true

??? info "Chat template"

By default, `dstack` loads the [chat template](https://huggingface.co/docs/transformers/main/en/chat_templating)
from the model's repository. If it is not present there, manual configuration is required.

```yaml
type: service

image: ghcr.io/huggingface/text-generation-inference:latest
env:
- MODEL_ID=TheBloke/Llama-2-13B-chat-GPTQ
commands:
- text-generation-launcher --port 8000 --trust-remote-code --quantize gptq
port: 8000

resources:
gpu: 80GB

# Enable the OpenAI-compatible endpoint
model:
type: chat
name: TheBloke/Llama-2-13B-chat-GPTQ
format: tgi
chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' </s>' }}{% endif %}{% endfor %}"
eos_token: "</s>"
```

##### Limitations

Please note that model mapping is an experimental feature with the following limitations:

1. Doesn't work if your `chat_template` uses `bos_token`. As a workaround, replace `bos_token` inside `chat_template` with the token content itself.
2. Doesn't work if `eos_token` is defined in the model repository as a dictionary. As a workaround, set `eos_token` manually, as shown in the example above (see Chat template).

If you encounter any other issues, please make sure to file a [GitHub issue](https://github.com/dstackai/dstack/issues/new/choose).

## `scaling`

#SCHEMA# dstack._internal.core.models.configurations.ScalingSpec
@@ -486,7 +502,7 @@ The `service` configuration type supports many other options. See below.
required: true
item_id_prefix: resources-

## `resources.gpu` { #resources-gpu data-toc-label="resources.gpu" }

#SCHEMA# dstack._internal.core.models.resources.GPUSpecSchema
overrides:
7 changes: 2 additions & 5 deletions docs/docs/services.md
Expand Up @@ -44,11 +44,8 @@ resources:
# Change to what is required
gpu: 24GB

# Comment if you don't to access the model via https://gateway.<gateway domain>
model:
type: chat
name: meta-llama/Meta-Llama-3.1-8B-Instruct
format: openai
# Comment out if you don't want to access the model via https://gateway.<gateway domain>
model: meta-llama/Meta-Llama-3.1-8B-Instruct
```

</div>
10 changes: 2 additions & 8 deletions examples/accelerators/amd/README.md
Expand Up @@ -39,10 +39,7 @@ Llama 3.1 70B in FP16 using [TGI :material-arrow-top-right-thin:{ .external }](h
spot_policy: auto

# Register the model
model:
type: chat
name: meta-llama/Meta-Llama-3.1-70B-Instruct
format: openai
model: meta-llama/Meta-Llama-3.1-70B-Instruct
```

</div>
@@ -98,10 +95,7 @@ Llama 3.1 70B in FP16 using [TGI :material-arrow-top-right-thin:{ .external }](h
disk: 200GB

# Register the model
model:
format: openai
type: chat
name: meta-llama/Meta-Llama-3.1-70B-Instruct
model: meta-llama/Meta-Llama-3.1-70B-Instruct
```
</div>

5 changes: 1 addition & 4 deletions examples/accelerators/tpu/README.md
Expand Up @@ -87,10 +87,7 @@ and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-
resources:
gpu: v5litepod-4

model:
format: openai
type: chat
name: meta-llama/Meta-Llama-3.1-8B-Instruct
model: meta-llama/Meta-Llama-3.1-8B-Instruct
```
</div>

5 changes: 1 addition & 4 deletions examples/deployment/ollama/serve.dstack.yml
@@ -13,7 +13,4 @@ resources:
gpu: 48GB..80GB

# (Optional) Enable the OpenAI-compatible endpoint
model:
type: chat
name: mixtral
format: openai
model: mixtral
5 changes: 1 addition & 4 deletions examples/deployment/tgi/amd/service.dstack.yml
@@ -17,7 +17,4 @@ resources:

spot_policy: auto

model:
type: chat
name: meta-llama/Meta-Llama-3.1-70B-Instruct
format: openai
model: meta-llama/Meta-Llama-3.1-70B-Instruct
5 changes: 1 addition & 4 deletions examples/deployment/vllm/amd/service.dstack.yml
@@ -43,7 +43,4 @@ resources:
disk: 200GB

# (Optional) Enable the OpenAI-compatible endpoint
model:
format: openai
type: chat
name: meta-llama/Meta-Llama-3.1-70B-Instruct
model: meta-llama/Meta-Llama-3.1-70B-Instruct
5 changes: 1 addition & 4 deletions examples/deployment/vllm/serve.dstack.yml
@@ -13,7 +13,4 @@ resources:
gpu: 24GB

# (Optional) Enable the OpenAI-compatible endpoint
model:
format: openai
type: chat
name: NousResearch/Llama-2-7b-chat-hf
model: NousResearch/Llama-2-7b-chat-hf
5 changes: 1 addition & 4 deletions examples/deployment/vllm/service-tpu.dstack.yml
@@ -34,7 +34,4 @@ resources:
gpu: v5litepod-4

# (Optional) Enable the OpenAI-compatible endpoint
model:
format: openai
type: chat
name: meta-llama/Meta-Llama-3.1-8B-Instruct
model: meta-llama/Meta-Llama-3.1-8B-Instruct
5 changes: 1 addition & 4 deletions examples/llms/mixtral/vllm.dstack.yml
@@ -17,7 +17,4 @@ resources:
disk: 200GB

# (Optional) Enable the OpenAI-compatible endpoint
model:
type: chat
name: TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ
format: openai
model: TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ
19 changes: 16 additions & 3 deletions src/dstack/_internal/core/models/configurations.py
@@ -9,7 +9,7 @@
from dstack._internal.core.models.common import CoreModel, Duration, RegistryAuth
from dstack._internal.core.models.envs import Env
from dstack._internal.core.models.fleets import FleetConfiguration
from dstack._internal.core.models.gateways import AnyModel, GatewayConfiguration
from dstack._internal.core.models.gateways import AnyModel, GatewayConfiguration, OpenAIChatModel
from dstack._internal.core.models.profiles import ProfileParams
from dstack._internal.core.models.repos.base import Repo
from dstack._internal.core.models.repos.virtual import VirtualRepo
@@ -211,8 +211,15 @@ class ServiceConfigurationParams(CoreModel):
Field(description="The port, that application listens on or the mapping"),
]
model: Annotated[
Optional[AnyModel],
Field(description="Mapping of the model for the OpenAI-compatible endpoint"),
Optional[Union[AnyModel, str]],
Field(
description=(
"Mapping of the model for the model gateway."
" Can be a full model format definition or just a model name."
" If it's a name, the service is expected to expose an OpenAI-compatible"
" API at the `/v1` path"
)
),
] = None
https: Annotated[
bool,
@@ -243,6 +250,12 @@ def convert_port(cls, v) -> PortMapping:
return PortMapping.parse(v)
return v

@validator("model")
def convert_model(cls, v: Optional[Union[AnyModel, str]]) -> Optional[AnyModel]:
if isinstance(v, str):
return OpenAIChatModel(type="chat", name=v, format="openai")
return v

@validator("replicas")
def convert_replicas(cls, v: Any) -> Range[int]:
if isinstance(v, str) and ".." in v:
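The `convert_model` validator added in `configurations.py` is what makes the new string shorthand equivalent to the full mapping. A minimal standalone sketch of that normalization logic, using a plain dataclass as an illustrative stand-in for dstack's pydantic `OpenAIChatModel`:

```python
from dataclasses import dataclass
from typing import Optional, Union


@dataclass
class OpenAIChatModel:
    # Illustrative stand-in for dstack._internal.core.models.gateways.OpenAIChatModel
    type: str
    name: str
    format: str


def convert_model(v: Optional[Union[OpenAIChatModel, str]]) -> Optional[OpenAIChatModel]:
    # A bare string is shorthand for an OpenAI-format chat model of that name
    if isinstance(v, str):
        return OpenAIChatModel(type="chat", name=v, format="openai")
    return v


# The shorthand and the full mapping normalize to the same value
short_form = convert_model("mixtral")
full_form = convert_model(OpenAIChatModel(type="chat", name="mixtral", format="openai"))
assert short_form == full_form
```

With this normalization in place, `model: mixtral` in a service configuration behaves the same as the four-line `model:` mapping it replaces throughout this PR.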