Merged
42 changes: 31 additions & 11 deletions docs/examples.md
@@ -13,6 +13,37 @@ hide:
}
</style>

## Deployment
<div class="tx-landing__highlights_grid">
<a href="/examples/deployment/vllm"
class="feature-cell">
<h3>
vLLM
</h3>
<p>
Deploy Llama 3.1 with vLLM
</p>
</a>
<a href="/examples/deployment/tgi"
class="feature-cell">
<h3>
TGI
</h3>
<p>
Deploy Llama 3.1 with TGI
</p>
</a>
<a href="/examples/deployment/nim"
class="feature-cell">
<h3>
NIM
</h3>
<p>
Deploy Llama 3.1 with NIM
</p>
</a>
</div>

## Fine-tuning

<div class="tx-landing__highlights_grid">
@@ -27,17 +58,6 @@ hide:
</p>
</a>

<a href="/examples/fine-tuning/alignment-handbook"
class="feature-cell">
<h3>
Alignment Handbook
</h3>

<p>
Fine-tune Gemma 7B on a custom dataset.
</p>
</a>

<a href="/examples/fine-tuning/trl"
class="feature-cell">
<h3>
Empty file.
Empty file.
1 change: 1 addition & 0 deletions docs/overrides/main.html
@@ -116,6 +116,7 @@

<div class="tx-footer__section">
<div class="tx-footer__section-title">Examples</div>
<a href="/examples#deployment" class="tx-footer__section-link">Deployment</a>
<a href="/examples#fine-tuning" class="tx-footer__section-link">Fine-tuning</a>
<a href="/examples#accelerators" class="tx-footer__section-link">Accelerators</a>
<a href="/examples#llms" class="tx-footer__section-link">LLMs</a>
@@ -1,18 +1,18 @@
type: service

image: nvcr.io/nim/meta/llama3-8b-instruct:latest

env:
- NGC_API_KEY
registry_auth:
username: $oauthtoken
password: ${{ env.NGC_API_KEY }}

port: 8000
model: meta/llama3-8b-instruct

spot_policy: auto

resources:
gpu: 24GB

# Exclude non-VM backends
backends: ["aws", "azure", "cudo", "datacrunch", "gcp", "lambda", "oci", "tensordock"]
60 changes: 41 additions & 19 deletions examples/deployment/nim/README.md
@@ -1,6 +1,6 @@
# NIM

This example shows how to deploy `Meta/LLama3-8b-instruct` with `dstack` using [NIM :material-arrow-top-right-thin:{ .external }](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html).
This example shows how to deploy Llama 3.1 with `dstack` using [NIM :material-arrow-top-right-thin:{ .external }](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html){:target="_blank"}.

??? info "Prerequisites"
Once `dstack` is [installed](https://dstack.ai/docs/installation), go ahead and clone the repo, and run `dstack init`.
@@ -17,40 +17,33 @@ This example shows how to deploy `Meta/LLama3-8b-instruct` with `dstack` using [

## Deployment

### Running as a task
If you'd like to run Meta/Llama 3-8b for development purposes, consider using `dstack` [tasks](https://dstack.ai/docs/tasks/).
Here's an example of a service that deploys Llama 3.1 8B using NIM.

<div editor-title="examples/deployment/nim/task.dstack.yml">
<div editor-title="examples/deployment/nim/.dstack.yml">

```yaml
type: task
type: service

name: llama3-nim-task
image: nvcr.io/nim/meta/llama3-8b-instruct:latest

env:
- NGC_API_KEY
registry_auth:
username: $oauthtoken
password: ${{ env.NGC_API_KEY }}

ports:
- 8000
port: 8000
model: meta/llama3-8b-instruct

spot_policy: auto

resources:
gpu: 24GB

# Exclude non-VM backends
backends: ["aws", "azure", "cudo", "datacrunch", "gcp", "lambda", "oci", "tensordock"]
```
</div>
Note: currently, NIM is supported on every backend except RunPod and Vast.ai.

### Deploying as a service

If you'd like to deploy the model as an auto-scalable and secure endpoint,
use the [service](https://dstack.ai/docs/services) configuration. You can find it at [`examples/deployment/nim/service.dstack.yml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/deployment/nim/service.dstack.yml)
> Currently, NIM is supported on every backend except RunPod, Vast.ai, and Kubernetes.

### Running a configuration

@@ -61,7 +54,7 @@ To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/referenc
```shell
$ NGC_API_KEY=...

$ dstack apply -f examples/deployment/nim/task.dstack.yml
$ dstack apply -f examples/deployment/nim/.dstack.yml

# BACKEND REGION RESOURCES SPOT PRICE
1 gcp asia-northeast3 4xCPU, 16GB, 1xL4 (24GB) yes $0.17
@@ -75,6 +68,33 @@
```
</div>

If no gateway is created, the service’s endpoint will be accessible at
`<dstack server URL>/proxy/services/<project name>/<run name>`.

<div class="termy">

```shell
$ curl http://127.0.0.1:3000/proxy/models/main/chat/completions \
-X POST \
  -H 'Authorization: Bearer <dstack token>' \
-H 'Content-Type: application/json' \
-d '{
"model": "meta/llama3-8b-instruct",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What is Deep Learning?"
}
],
"max_tokens": 128
}'
```

</div>
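The same OpenAI-compatible request can be built from Python. Below is a minimal, hedged sketch using only the standard library: it constructs (but does not send) the request shown in the curl example above, assuming the default local server URL `http://127.0.0.1:3000`, the project name `main`, and a placeholder token — substitute your own values.

```python
import json
from urllib import request

# Assumptions: the dstack server runs locally on port 3000 and the
# project is named "main"; replace the URL and token with your own.
DSTACK_SERVER_URL = "http://127.0.0.1:3000"
DSTACK_TOKEN = "<dstack token>"  # placeholder, not a real token


def chat_completion_request(prompt: str) -> request.Request:
    """Build (but do not send) an OpenAI-compatible chat completion request."""
    payload = {
        "model": "meta/llama3-8b-instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 128,
    }
    return request.Request(
        f"{DSTACK_SERVER_URL}/proxy/models/main/chat/completions",
        data=json.dumps(payload).encode(),
        headers={
            "Authorization": f"Bearer {DSTACK_TOKEN}",
            "Content-Type": "application/json",
        },
        method="POST",
    )


req = chat_completion_request("What is Deep Learning?")
print(req.full_url)
```

To actually send the request, pass `req` to `urllib.request.urlopen` (or use any HTTP client) once the service is running.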

## Source code

@@ -83,6 +103,8 @@ The source code of this example can be found in

## What's next?

1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks),
[services](https://dstack.ai/docs/services), and [protips](https://dstack.ai/docs/protips).
2. Browse [Available models in NGC Catalog :material-arrow-top-right-thin:{ .external }](https://catalog.ngc.nvidia.com/containers?filters=nvidia_nim%7CNVIDIA+NIM%7Cnimmcro_nvidia_nim&orderBy=scoreDESC&query=&page=&pageSize=).
1. Check [services](https://dstack.ai/docs/services)
2. Browse the [Llama 3.1](https://dstack.ai/examples/llms/llama31/), [TGI](https://dstack.ai/examples/deployment/tgi/),
and [vLLM](https://dstack.ai/examples/deployment/vllm/) examples.
3. See also the [AMD](https://dstack.ai/examples/accelerators/amd/) and
[TPU](https://dstack.ai/examples/accelerators/tpu/) examples.
20 changes: 0 additions & 20 deletions examples/deployment/nim/task.dstack.yml

This file was deleted.

17 changes: 13 additions & 4 deletions examples/deployment/ollama/.dstack.yml
@@ -1,9 +1,18 @@
type: dev-environment
# Launches a dev environment to play with Ollama
type: service
name: llama31

image: ollama/ollama
commands:
- ollama serve &
- sleep 3
- ollama pull llama3.1
- fg
port: 11434
model: llama3.1

ide: vscode
# Use either spot or on-demand instances
spot_policy: auto

# Required resources
resources:
gpu: 48GB..80GB
gpu: 24GB
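Besides the OpenAI-compatible route exposed by `dstack`, Ollama also has a native REST API on the container port (`11434`). A small sketch, assuming the port is reachable directly (e.g. via port forwarding) — the URL below is an illustration, not the gateway endpoint:

```python
import json
from urllib import request

# Assumption: port 11434 of the Ollama container is reachable directly;
# in production the model is normally queried through the gateway's
# OpenAI-compatible endpoint instead.
OLLAMA_URL = "http://127.0.0.1:11434"


def generate_request(prompt: str) -> request.Request:
    """Build (but do not send) a request to Ollama's native /api/generate."""
    payload = {"model": "llama3.1", "prompt": prompt, "stream": False}
    return request.Request(
        f"{OLLAMA_URL}/api/generate",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )


req = generate_request("What is Deep Learning?")
print(req.full_url)
```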
16 changes: 0 additions & 16 deletions examples/deployment/ollama/serve.dstack.yml

This file was deleted.

22 changes: 17 additions & 5 deletions examples/deployment/tgi/.dstack.yml
@@ -1,9 +1,21 @@
type: dev-environment
# Launches a dev environment to play with TGI
type: service
name: llama31

image: ghcr.io/huggingface/text-generation-inference:latest
env:
- HF_TOKEN
- MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct
- MAX_INPUT_LENGTH=4000
- MAX_TOTAL_TOKENS=4096
commands:
- NUM_SHARD=$DSTACK_GPUS_NUM text-generation-launcher
port: 80
model: meta-llama/Meta-Llama-3.1-8B-Instruct

ide: vscode
# Use either spot or on-demand instances
spot_policy: auto

resources:
gpu: 24GB
resources:
gpu: 24GB
# Uncomment if using multiple GPUs
#shm_size: 24GB