From 970c864349c4035c12d1cabde56ae2ceef536c6c Mon Sep 17 00:00:00 2001
From: Andrey Cheptsov
Date: Fri, 24 Apr 2026 21:36:53 +0200
Subject: [PATCH 1/2] Add DeepSeek V4 model docs

---
 docs/examples.md                          |  11 ++
 docs/examples/models/deepseek-v4/index.md |   0
 examples/inference/sglang/README.md       |   3 +
 examples/models/deepseek-v4/README.md     | 151 ++++++++++++++++++++++
 mkdocs.yml                                |   5 +-
 5 files changed, 168 insertions(+), 2 deletions(-)
 create mode 100644 docs/examples/models/deepseek-v4/index.md
 create mode 100644 examples/models/deepseek-v4/README.md

diff --git a/docs/examples.md b/docs/examples.md
index b3e3e0d42..5770425b9 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -188,6 +188,17 @@ hide:
 ## Models
 
+    <a href="/examples/models/deepseek-v4">
+        <h3>
+            DeepSeek V4
+        </h3>
+
+        <p>
+            Deploy DeepSeek V4 with SGLang on B200:8
+        </p>
+    </a>
+
diff --git a/docs/examples/models/deepseek-v4/index.md b/docs/examples/models/deepseek-v4/index.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md
index ddf49fd31..c3f7c267c 100644
--- a/examples/inference/sglang/README.md
+++ b/examples/inference/sglang/README.md
@@ -8,6 +8,9 @@ description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs
 This example shows how to deploy `Qwen/Qwen3.6-27B` using
 [SGLang](https://github.com/sgl-project/sglang) and `dstack`.
 
+> For a `DeepSeek-V4-Pro` deployment on `B200:8`, see the
+> [DeepSeek V4](../../models/deepseek-v4/index.md) model page.
+
 ## Apply a configuration
 
 Here's an example of a service that deploys
diff --git a/examples/models/deepseek-v4/README.md b/examples/models/deepseek-v4/README.md
new file mode 100644
index 000000000..35525df6d
--- /dev/null
+++ b/examples/models/deepseek-v4/README.md
@@ -0,0 +1,151 @@
+---
+title: DeepSeek V4
+description: Deploying DeepSeek-V4-Pro using SGLang on NVIDIA B200:8
+---
+
+# DeepSeek V4
+
+This example shows how to deploy `deepseek-ai/DeepSeek-V4-Pro` as a
+[service](https://dstack.ai/docs/services) using
+[SGLang](https://github.com/sgl-project/sglang) and `dstack`.
+
+## Apply a configuration
+
+Save the following configuration as `deepseek-v4.dstack.yml`.
+ +```yaml +type: service +name: deepseek-v4 + +image: lmsysorg/sglang:deepseek-v4-blackwell + +env: + - HF_TOKEN + - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 + +commands: + - | + sglang serve \ + --trust-remote-code \ + --model-path deepseek-ai/DeepSeek-V4-Pro \ + --tp 8 \ + --dp 8 \ + --enable-dp-attention \ + --moe-a2a-backend deepep \ + --mem-fraction-static 0.82 \ + --cuda-graph-max-bs 64 \ + --max-running-requests 256 \ + --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' \ + --tool-call-parser deepseekv4 \ + --reasoning-parser deepseek-v4 \ + --host 0.0.0.0 \ + --port 30000 + +port: 30000 +model: deepseek-ai/DeepSeek-V4-Pro + +resources: + gpu: B200:8 + shm_size: 32GB + disk: 2TB.. +``` + +
+

This configuration follows the single-node Blackwell recipe for
`DeepSeek-V4-Pro`, sized for `8 x NVIDIA B200`.

Export your Hugging Face token and apply the configuration with
[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md).
+

```shell
$ export HF_TOKEN=<your HF token>
$ dstack apply -f deepseek-v4.dstack.yml
```
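While the weights download and the server warms up, you can follow startup progress with `dstack logs`; the run name matches the `name` field in the configuration:

```shell
$ dstack logs deepseek-v4
```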
+

If no gateway is created, the service endpoint will be available at
`<dstack server URL>/proxy/services/<project name>/<run name>/`.
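As a quick sanity check, the OpenAI-compatible server should also answer on the `/v1/models` route through the same proxy path. The example below assumes the default server address `127.0.0.1:3000`, project `main`, and the run name `deepseek-v4` from the configuration:

```shell
curl http://127.0.0.1:3000/proxy/services/main/deepseek-v4/v1/models \
  -H 'Authorization: Bearer <dstack token>'
```

A chat completion request then looks like this: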
+ +```shell +curl http://127.0.0.1:3000/proxy/services/main/deepseek-v4/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <dstack token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "deepseek-ai/DeepSeek-V4-Pro", + "messages": [ + { + "role": "user", + "content": "What is 15% of 240? Reply with just the number." + } + ], + "temperature": 0, + "max_tokens": 32 + }' +``` + +
+ +## Reasoning mode + +To separate the model's reasoning into `reasoning_content`, keep +`--reasoning-parser deepseek-v4` in the server command and send +`chat_template_kwargs` in the request body. + +For raw HTTP requests, `chat_template_kwargs` and `separate_reasoning` must be +top-level JSON fields. + +
+ +```shell +curl http://127.0.0.1:3000/proxy/services/main/deepseek-v4/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <dstack token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "deepseek-ai/DeepSeek-V4-Pro", + "messages": [ + { + "role": "user", + "content": "Solve step by step: If 3x + 5 = 20, what is x?" + } + ], + "temperature": 0, + "max_tokens": 256, + "chat_template_kwargs": { + "thinking": true + }, + "separate_reasoning": true + }' +``` + +
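To inspect just the reasoning trace and the final answer, the same request can be piped through `jq` (assuming `jq` is installed on the client):

```shell
curl -s http://127.0.0.1:3000/proxy/services/main/deepseek-v4/v1/chat/completions \
  -X POST \
  -H 'Authorization: Bearer <dstack token>' \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "deepseek-ai/DeepSeek-V4-Pro",
    "messages": [
      {"role": "user", "content": "Solve step by step: If 3x + 5 = 20, what is x?"}
    ],
    "temperature": 0,
    "max_tokens": 256,
    "chat_template_kwargs": {"thinking": true},
    "separate_reasoning": true
  }' | jq '.choices[0].message | {reasoning_content, content}'
```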
+

This returns both:

- `reasoning_content`: a separate reasoning trace
- `content`: the final user-visible answer

## Deployment notes

- Use `lmsysorg/sglang:deepseek-v4-blackwell` for `B200:8`.
- The first startup can take several minutes while the model loads and SGLang
  finishes CUDA graph capture.
- On container backends such as Vast.ai, avoid `instance_path` cache volumes in
  this service config.
- The endpoint is OpenAI-compatible and served on port `30000`.

## What's next?

1. Read the [DeepSeek-V4-Pro model card](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro)
2. Read the [DeepSeek-V4 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4)
3. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) and [vLLM](https://dstack.ai/examples/inference/vllm/) examples
diff --git a/mkdocs.yml b/mkdocs.yml
index 1baa53015..1b75f0ebe 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -306,12 +306,13 @@ nav:
     - vLLM: examples/inference/vllm/index.md
     - NIM: examples/inference/nim/index.md
     - TensorRT-LLM: examples/inference/trtllm/index.md
+    - Models:
+      - DeepSeek V4: examples/models/deepseek-v4/index.md
+      - Qwen 3.6: examples/models/qwen36/index.md
     - Accelerators:
       - AMD: examples/accelerators/amd/index.md
       - TPU: examples/accelerators/tpu/index.md
       - Tenstorrent: examples/accelerators/tenstorrent/index.md
-    - Models:
-      - Qwen 3.6: examples/models/qwen36/index.md
     - Blog:
       - blog/index.md
       - Case studies: blog/case-studies.md

From 538b5afad64cb1e984615d0f634e858ac750b383 Mon Sep 17 00:00:00 2001
From: Andrey Cheptsov
Date: Fri, 24 Apr 2026 21:45:43 +0200
Subject: [PATCH 2/2] Refine DeepSeek V4 deployment notes

---
 examples/models/deepseek-v4/README.md | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/models/deepseek-v4/README.md b/examples/models/deepseek-v4/README.md
index 35525df6d..b36a34301 100644
--- a/examples/models/deepseek-v4/README.md
+++ b/examples/models/deepseek-v4/README.md
@@ -47,6 +47,11 @@ commands:
 port: 30000
 model: deepseek-ai/DeepSeek-V4-Pro
 
+volumes:
+  - instance_path: /root/.cache
+    path: /root/.cache
+    optional: true
+
 resources:
   gpu: B200:8
   shm_size: 32GB
@@ -137,12 +142,10 @@
 
 ## Deployment notes
 
-- Use `lmsysorg/sglang:deepseek-v4-blackwell` for `B200:8`.
 - The first startup can take several minutes while the model loads and SGLang
-  finishes CUDA graph capture.
-- On container backends such as Vast.ai, avoid `instance_path` cache volumes in
-  this service config.
-- The endpoint is OpenAI-compatible and served on port `30000`.
+  finishes initialization.
+- The optional `/root/.cache` instance volume helps reuse the model cache on
+  backends that support instance volumes.
 
 ## What's next?