From 970c864349c4035c12d1cabde56ae2ceef536c6c Mon Sep 17 00:00:00 2001
From: Andrey Cheptsov
Date: Fri, 24 Apr 2026 21:36:53 +0200
Subject: [PATCH 1/2] Add DeepSeek V4 model docs

---
 docs/examples.md                          |  11 ++
 docs/examples/models/deepseek-v4/index.md |   0
 examples/inference/sglang/README.md       |   3 +
 examples/models/deepseek-v4/README.md     | 151 ++++++++++++++++++++++
 mkdocs.yml                                |   5 +-
 5 files changed, 168 insertions(+), 2 deletions(-)
 create mode 100644 docs/examples/models/deepseek-v4/index.md
 create mode 100644 examples/models/deepseek-v4/README.md

diff --git a/docs/examples.md b/docs/examples.md
index b3e3e0d42..5770425b9 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -188,6 +188,17 @@ hide:
 ## Models
 
+    <a href="/examples/models/deepseek-v4">
+        <h3>
+            DeepSeek V4
+        </h3>
+
+        <p>
+            Deploy DeepSeek V4 with SGLang on B200:8
+        </p>
+    </a>
+
diff --git a/docs/examples/models/deepseek-v4/index.md b/docs/examples/models/deepseek-v4/index.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md
index ddf49fd31..c3f7c267c 100644
--- a/examples/inference/sglang/README.md
+++ b/examples/inference/sglang/README.md
@@ -8,6 +8,9 @@ description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs
 This example shows how to deploy `Qwen/Qwen3.6-27B` using
 [SGLang](https://github.com/sgl-project/sglang) and `dstack`.
 
+> For a `DeepSeek-V4-Pro` deployment on `B200:8`, see the
+> [DeepSeek V4](../../models/deepseek-v4/index.md) model page.
+
 ## Apply a configuration
 
 Here's an example of a service that deploys
diff --git a/examples/models/deepseek-v4/README.md b/examples/models/deepseek-v4/README.md
new file mode 100644
index 000000000..35525df6d
--- /dev/null
+++ b/examples/models/deepseek-v4/README.md
@@ -0,0 +1,151 @@
+---
+title: DeepSeek V4
+description: Deploying DeepSeek-V4-Pro using SGLang on NVIDIA B200:8
+---
+
+# DeepSeek V4
+
+This example shows how to deploy `deepseek-ai/DeepSeek-V4-Pro` as a
+[service](https://dstack.ai/docs/services) using
+[SGLang](https://github.com/sgl-project/sglang) and `dstack`.
+
+## Apply a configuration
+
+Save the following configuration as `deepseek-v4.dstack.yml`.
+ +```yaml +type: service +name: deepseek-v4 + +image: lmsysorg/sglang:deepseek-v4-blackwell + +env: + - HF_TOKEN + - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 + +commands: + - | + sglang serve \ + --trust-remote-code \ + --model-path deepseek-ai/DeepSeek-V4-Pro \ + --tp 8 \ + --dp 8 \ + --enable-dp-attention \ + --moe-a2a-backend deepep \ + --mem-fraction-static 0.82 \ + --cuda-graph-max-bs 64 \ + --max-running-requests 256 \ + --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' \ + --tool-call-parser deepseekv4 \ + --reasoning-parser deepseek-v4 \ + --host 0.0.0.0 \ + --port 30000 + +port: 30000 +model: deepseek-ai/DeepSeek-V4-Pro + +resources: + gpu: B200:8 + shm_size: 32GB + disk: 2TB.. +``` + +
+

This configuration follows the single-node Blackwell recipe for
`DeepSeek-V4-Pro`, sized for `8 x NVIDIA B200`.

Export your Hugging Face token and apply the configuration with
[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md).
+

```shell
$ export HF_TOKEN=<your HF token>
$ dstack apply -f deepseek-v4.dstack.yml
```
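While the weights download and the server warms up, you can follow startup progress with `dstack logs`; the run name matches the `name` field in the configuration:

```shell
$ dstack logs deepseek-v4
```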
+

If no gateway is created, the service endpoint will be available at
`<dstack server URL>/proxy/services/<project name>/<run name>/`.
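As a quick sanity check, the OpenAI-compatible server should also answer on the `/v1/models` route through the same proxy path. The example below assumes the default server address `127.0.0.1:3000`, project `main`, and the run name `deepseek-v4` from the configuration:

```shell
curl http://127.0.0.1:3000/proxy/services/main/deepseek-v4/v1/models \
  -H 'Authorization: Bearer <dstack token>'
```

A chat completion request then looks like this: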
+ +```shell +curl http://127.0.0.1:3000/proxy/services/main/deepseek-v4/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <dstack token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "deepseek-ai/DeepSeek-V4-Pro", + "messages": [ + { + "role": "user", + "content": "What is 15% of 240? Reply with just the number." + } + ], + "temperature": 0, + "max_tokens": 32 + }' +``` + +
+ +## Reasoning mode + +To separate the model's reasoning into `reasoning_content`, keep +`--reasoning-parser deepseek-v4` in the server command and send +`chat_template_kwargs` in the request body. + +For raw HTTP requests, `chat_template_kwargs` and `separate_reasoning` must be +top-level JSON fields. + +
+ +```shell +curl http://127.0.0.1:3000/proxy/services/main/deepseek-v4/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <dstack token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "deepseek-ai/DeepSeek-V4-Pro", + "messages": [ + { + "role": "user", + "content": "Solve step by step: If 3x + 5 = 20, what is x?" + } + ], + "temperature": 0, + "max_tokens": 256, + "chat_template_kwargs": { + "thinking": true + }, + "separate_reasoning": true + }' +``` + +
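To inspect just the reasoning trace and the final answer, the same request can be piped through `jq` (assuming `jq` is installed on the client):

```shell
curl -s http://127.0.0.1:3000/proxy/services/main/deepseek-v4/v1/chat/completions \
  -X POST \
  -H 'Authorization: Bearer <dstack token>' \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "deepseek-ai/DeepSeek-V4-Pro",
    "messages": [
      {"role": "user", "content": "Solve step by step: If 3x + 5 = 20, what is x?"}
    ],
    "temperature": 0,
    "max_tokens": 256,
    "chat_template_kwargs": {"thinking": true},
    "separate_reasoning": true
  }' | jq '.choices[0].message | {reasoning_content, content}'
```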
+

This returns both:

- `reasoning_content`: a separate reasoning trace
- `content`: the final user-visible answer

## Deployment notes

- Use `lmsysorg/sglang:deepseek-v4-blackwell` for `B200:8`.
- The first startup can take several minutes while the model loads and SGLang
  finishes CUDA graph capture.
- On container backends such as Vast.ai, avoid `instance_path` cache volumes in
  this service config.
- The endpoint is OpenAI-compatible and served on port `30000`.

## What's next?

1. Read the [DeepSeek-V4-Pro model card](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro)
2. Read the [DeepSeek-V4 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4)
3. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) and [vLLM](https://dstack.ai/examples/inference/vllm/) examples
diff --git a/mkdocs.yml b/mkdocs.yml
index 1baa53015..1b75f0ebe 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -306,12 +306,13 @@ nav:
     - vLLM: examples/inference/vllm/index.md
     - NIM: examples/inference/nim/index.md
     - TensorRT-LLM: examples/inference/trtllm/index.md
+    - Models:
+      - DeepSeek V4: examples/models/deepseek-v4/index.md
+      - Qwen 3.6: examples/models/qwen36/index.md
     - Accelerators:
       - AMD: examples/accelerators/amd/index.md
       - TPU: examples/accelerators/tpu/index.md
       - Tenstorrent: examples/accelerators/tenstorrent/index.md
-    - Models:
-      - Qwen 3.6: examples/models/qwen36/index.md
     - Blog:
       - blog/index.md
       - Case studies: blog/case-studies.md

From 538b5afad64cb1e984615d0f634e858ac750b383 Mon Sep 17 00:00:00 2001
From: Andrey Cheptsov
Date: Fri, 24 Apr 2026 21:45:43 +0200
Subject: [PATCH 2/2] Refine DeepSeek V4 deployment notes

---
 examples/models/deepseek-v4/README.md | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/models/deepseek-v4/README.md b/examples/models/deepseek-v4/README.md
index 35525df6d..b36a34301 100644
--- a/examples/models/deepseek-v4/README.md
+++ b/examples/models/deepseek-v4/README.md
@@ -47,6 +47,11 @@ commands:
 port: 30000
 model: deepseek-ai/DeepSeek-V4-Pro
 
+volumes:
+  - instance_path: /root/.cache
+    path: /root/.cache
+    optional: true
+
 resources:
   gpu: B200:8
   shm_size: 32GB
@@ -137,12 +142,10 @@
 
 ## Deployment notes
 
-- Use `lmsysorg/sglang:deepseek-v4-blackwell` for `B200:8`.
 - The first startup can take several minutes while the model loads and SGLang
-  finishes CUDA graph capture.
-- On container backends such as Vast.ai, avoid `instance_path` cache volumes in
-  this service config.
-- The endpoint is OpenAI-compatible and served on port `30000`.
+  finishes initialization.
+- The optional `/root/.cache` instance volume helps reuse the model cache on
+  backends that support instance volumes.
 
 ## What's next?