+
+ ```yaml
+ type: service
+ name: tgi
+
+ image: ghcr.io/huggingface/tgi-gaudi:2.3.1
+ env:
+ - HF_TOKEN
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+ - PORT=8000
+ - OMPI_MCA_btl_vader_single_copy_mechanism=none
+ - TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN=true
+ - PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+ - MAX_TOTAL_TOKENS=2048
+ - BATCH_BUCKET_SIZE=256
+ - PREFILL_BATCH_BUCKET_SIZE=4
+ - PAD_SEQUENCE_TO_MULTIPLE_OF=64
+ - ENABLE_HPU_GRAPH=true
+ - LIMIT_HPU_GRAPH=true
+ - USE_FLASH_ATTENTION=true
+ - FLASH_ATTENTION_RECOMPUTE=true
+ commands:
+ - text-generation-launcher
+ --sharded true
+ --num-shard $DSTACK_GPUS_NUM
+ --max-input-length 1024
+ --max-total-tokens 2048
+ --max-batch-prefill-tokens 4096
+ --max-batch-total-tokens 524288
+ --max-waiting-tokens 7
+ --waiting-served-ratio 1.2
+ --max-concurrent-requests 512
+ port: 8000
+ model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+
+ resources:
+ gpu: gaudi2:8
+
+ # Uncomment to cache downloaded models
+ #volumes:
+ # - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
+ ```
+
+
+
+=== "vLLM"
+
+
+
+ ```yaml
+ type: service
+ name: deepseek-r1-gaudi
+
+ image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+ env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+ - HABANA_VISIBLE_DEVICES=all
+ - OMPI_MCA_btl_vader_single_copy_mechanism=none
+ commands:
+ - git clone https://github.com/HabanaAI/vllm-fork.git
+ - cd vllm-fork
+ - git checkout habana_main
+ - pip install -r requirements-hpu.txt
+ - python setup.py develop
+ - vllm serve $MODEL_ID
+ --tensor-parallel-size 8
+ --trust-remote-code
+ --download-dir /data
+ port: 8000
+ model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+
+
+ resources:
+ gpu: gaudi2:8
+
+ # Uncomment to cache downloaded models
+ #volumes:
+ # - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
+ ```
+
+
+
+## Fine-tuning
+
+Below is an example of LoRA fine-tuning of [`DeepSeek-R1-Distill-Qwen-7B` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B){:target="_blank"}
+using [Optimum for Intel Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-habana){:target="_blank"}
+and [DeepSpeed :material-arrow-top-right-thin:{ .external }](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide){:target="_blank"} with
+the [`lvwerra/stack-exchange-paired` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/lvwerra/stack-exchange-paired){:target="_blank"} dataset.
+
+
+
+```yaml
+type: task
+name: trl-train
+
+image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+commands:
+ - pip install --upgrade-strategy eager optimum[habana]
+ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
+ - git clone https://github.com/huggingface/optimum-habana.git
+ - cd optimum-habana/examples/trl
+ - pip install -r requirements.txt
+ - pip install wandb
+ - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size $DSTACK_GPUS_NUM --use_deepspeed sft.py
+ --model_name_or_path $MODEL_ID
+ --dataset_name "lvwerra/stack-exchange-paired"
+ --deepspeed ../language-modeling/llama2_ds_zero3_config.json
+ --output_dir="./sft"
+ --do_train
+ --max_steps=500
+ --logging_steps=10
+ --save_steps=100
+ --per_device_train_batch_size=1
+ --per_device_eval_batch_size=1
+ --gradient_accumulation_steps=2
+ --learning_rate=1e-4
+ --lr_scheduler_type="cosine"
+ --warmup_steps=100
+ --weight_decay=0.05
+ --optim="paged_adamw_32bit"
+ --lora_target_modules "q_proj" "v_proj"
+ --bf16
+ --remove_unused_columns=False
+ --run_name="sft_deepseek_70"
+ --report_to="wandb"
+ --use_habana
+ --use_lazy_mode
+
+resources:
+ gpu: gaudi2:8
+```
+
+
+
+To fine-tune `DeepSeek-R1-Distill-Llama-70B` on eight Gaudi 2 accelerators,
+you can partially offload parameters to CPU memory via the DeepSpeed configuration file.
+For more details, refer to [parameter offloading](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeedzerooffloadparamconfig).
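+
+As a sketch, parameter offloading is enabled in the `zero_optimization` section of the
+DeepSpeed config (such as the `llama2_ds_zero3_config.json` referenced above). The snippet
+below shows only the relevant fields and is illustrative rather than a complete config:
+
+```json
+{
+  "zero_optimization": {
+    "stage": 3,
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    }
+  }
+}
+```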
+
+## Applying a configuration
+
+Once the configuration is ready, run `dstack apply -f <configuration path>`.
+
+
+
+```shell
+$ dstack apply -f examples/deployment/vllm/.dstack.yml
+
+ # BACKEND REGION RESOURCES SPOT PRICE
+ 1 ssh remote 152xCPU,1007GB,8xGaudi2:96GB yes $0 idle
+
+Submit a new run? [y/n]: y
+
+Provisioning...
+---> 100%
+```
+
+
+
+## Source code
+
+The source code for this example can be found in
+[`examples/llms/deepseek/tgi/intel` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/tgi/intel){:target="_blank"},
+[`examples/llms/deepseek/vllm/intel` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/vllm/intel){:target="_blank"} and
+[`examples/llms/deepseek/trl/intel` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/trl/intel){:target="_blank"}.
+
+!!! info "What's next?"
+ 1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services).
+    2. See also the [Intel Gaudi Documentation :material-arrow-top-right-thin:{ .external }](https://docs.habana.ai/en/latest/index.html){:target="_blank"}, [vLLM Inference with Gaudi :material-arrow-top-right-thin:{ .external }](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/vLLM_Inference.html){:target="_blank"},
+    and [Optimum for Gaudi examples :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-habana/blob/main/examples/trl/README.md){:target="_blank"}.
diff --git a/examples/deployment/nim/.dstack.yml b/examples/deployment/nim/.dstack.yml
index 3fba16d7f..ba1b702a5 100644
--- a/examples/deployment/nim/.dstack.yml
+++ b/examples/deployment/nim/.dstack.yml
@@ -1,7 +1,7 @@
type: service
-name: llama31
+name: qwen-nim
-image: nvcr.io/nim/meta/llama-3.1-8b-instruct:latest
+image: nvcr.io/nim/qwen/qwen-2.5-7b-instruct:latest
env:
- NGC_API_KEY
- NIM_MAX_MODEL_LEN=4096
@@ -10,16 +10,18 @@ registry_auth:
password: ${{ env.NGC_API_KEY }}
port: 8000
# Register the model
-model: meta/llama-3.1-8b-instruct
+model: qwen/qwen-2.5-7b-instruct
# Uncomment to leverage spot instances
#spot_policy: auto
# Cache downloaded models
volumes:
- - /root/.cache/nim:/opt/nim/.cache
+ - instance_path: /root/.cache/nim
+ path: /opt/nim/.cache
+ optional: true
resources:
gpu: 24GB
# Uncomment if using multiple GPUs
- #shm_size: 24GB
+ shm_size: 16GB
diff --git a/examples/llms/deepseek/README.md b/examples/llms/deepseek/README.md
new file mode 100644
index 000000000..2e6f27150
--- /dev/null
+++ b/examples/llms/deepseek/README.md
@@ -0,0 +1,607 @@
+# DeepSeek
+
+This example walks you through how to deploy and
+train [DeepSeek :material-arrow-top-right-thin:{ .external }](https://huggingface.co/deepseek-ai){:target="_blank"}
+models with `dstack`.
+
+> We use DeepSeek-R1 distilled models and DeepSeek-V2-Lite, a 16B model with the same architecture as DeepSeek-R1 (671B). DeepSeek-V2-Lite retains MLA and DeepSeekMoE but requires less memory, making it ideal for testing and fine-tuning on smaller GPUs.
+
+??? info "Prerequisites"
+    Once `dstack` is [installed](https://dstack.ai/docs/installation), go ahead and clone the repo, then run `dstack init`.
+
+
+
+ ```shell
+ $ git clone https://github.com/dstackai/dstack
+ $ cd dstack
+ $ dstack init
+ ```
+
+
+## Deployment
+
+### AMD
+
+Here's an example of a service that deploys `DeepSeek-R1-Distill-Llama-70B` using [SGLang :material-arrow-top-right-thin:{ .external }](https://github.com/sgl-project/sglang){:target="_blank"} or [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm){:target="_blank"} on an AMD `MI300X` GPU. The configurations below also support `DeepSeek-V2-Lite`.
+
+=== "SGLang"
+
+
+
+ ```yaml
+ type: service
+ name: deepseek-r1-amd
+
+ image: lmsysorg/sglang:v0.4.1.post4-rocm620
+ env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+ commands:
+ - python3 -m sglang.launch_server
+ --model-path $MODEL_ID
+ --port 8000
+ --trust-remote-code
+
+ port: 8000
+ model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+
+ resources:
+ gpu: MI300X
+ disk: 300Gb
+
+ ```
+
+
+=== "vLLM"
+
+
+
+ ```yaml
+ type: service
+ name: deepseek-r1-amd
+
+ image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+ env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+ - MAX_MODEL_LEN=126432
+ commands:
+ - vllm serve $MODEL_ID
+ --max-model-len $MAX_MODEL_LEN
+ --trust-remote-code
+ port: 8000
+
+ model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+
+ resources:
+ gpu: MI300X
+ disk: 300Gb
+ ```
+
+
+Note that when running `DeepSeek-R1-Distill-Llama-70B` with `vLLM` on a 192GB GPU, the context size must be limited to 126,432 tokens to fit into memory.
+
+### Intel Gaudi
+
+Here's an example of a service that deploys `DeepSeek-R1-Distill-Llama-70B`
+using [TGI on Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/tgi-gaudi){:target="_blank"}
+or [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/HabanaAI/vllm-fork){:target="_blank"} (Gaudi fork) on Intel Gaudi 2.
+
+> Neither [TGI on Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/tgi-gaudi){:target="_blank"}
+> nor [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/HabanaAI/vllm-fork){:target="_blank"} supports `DeepSeek-V2-Lite`.
+> See [this TGI issue :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/tgi-gaudi/issues/271)
+> and [this vLLM issue :material-arrow-top-right-thin:{ .external }](https://github.com/HabanaAI/vllm-fork/issues/809#issuecomment-2652454824).
+
+=== "TGI"
+
+
+ ```yaml
+ type: service
+
+ name: tgi
+
+ image: ghcr.io/huggingface/tgi-gaudi:2.3.1
+
+ auth: false
+ port: 8000
+
+ model: DeepSeek-R1-Distill-Llama-70B
+
+ env:
+ - HF_TOKEN
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+ - PORT=8000
+ - OMPI_MCA_btl_vader_single_copy_mechanism=none
+ - TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN=true
+ - PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+ - MAX_TOTAL_TOKENS=2048
+ - BATCH_BUCKET_SIZE=256
+ - PREFILL_BATCH_BUCKET_SIZE=4
+ - PAD_SEQUENCE_TO_MULTIPLE_OF=64
+ - ENABLE_HPU_GRAPH=true
+ - LIMIT_HPU_GRAPH=true
+ - USE_FLASH_ATTENTION=true
+ - FLASH_ATTENTION_RECOMPUTE=true
+
+ commands:
+ - text-generation-launcher
+ --sharded true
+ --num-shard 8
+ --max-input-length 1024
+ --max-total-tokens 2048
+ --max-batch-prefill-tokens 4096
+ --max-batch-total-tokens 524288
+ --max-waiting-tokens 7
+ --waiting-served-ratio 1.2
+ --max-concurrent-requests 512
+
+ resources:
+ gpu: Gaudi2:8
+ ```
+
+
+=== "vLLM"
+
+
+ ```yaml
+ type: service
+ name: deepseek-r1-gaudi
+
+ image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+
+
+ env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+ - HABANA_VISIBLE_DEVICES=all
+ - OMPI_MCA_btl_vader_single_copy_mechanism=none
+
+ commands:
+ - git clone https://github.com/HabanaAI/vllm-fork.git
+ - cd vllm-fork
+ - git checkout habana_main
+ - pip install -r requirements-hpu.txt
+ - python setup.py develop
+ - vllm serve $MODEL_ID
+ --tensor-parallel-size 8
+ --trust-remote-code
+ --download-dir /data
+
+ port: 8000
+ ```
+
+
+### NVIDIA
+
+Here's an example of a service that deploys `DeepSeek-R1-Distill-Llama-8B`
+using [SGLang :material-arrow-top-right-thin:{ .external }](https://github.com/sgl-project/sglang){:target="_blank"}
+or [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm){:target="_blank"} on NVIDIA GPUs.
+Both SGLang and vLLM also support `DeepSeek-V2-Lite`.
+
+=== "SGLang"
+
+
+ ```yaml
+ type: service
+ name: deepseek-r1-nvidia
+
+ image: lmsysorg/sglang:latest
+ env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+ commands:
+ - python3 -m sglang.launch_server
+ --model-path $MODEL_ID
+ --port 8000
+ --trust-remote-code
+
+ port: 8000
+ model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+
+ resources:
+ gpu: 24GB
+ ```
+
+
+=== "vLLM"
+
+
+ ```yaml
+ type: service
+ name: deepseek-r1-nvidia
+
+ image: vllm/vllm-openai:latest
+ env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+ - MAX_MODEL_LEN=4096
+ commands:
+ - vllm serve $MODEL_ID
+ --max-model-len $MAX_MODEL_LEN
+ port: 8000
+ model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+
+ resources:
+ gpu: 24GB
+ ```
+
+
+Note that to run `DeepSeek-R1-Distill-Llama-8B` with `vLLM` on a 24GB GPU, the context size must be limited to 4096 tokens to fit into memory.
+
+> Running `DeepSeek-V2-Lite` with `vLLM` requires a 40GB GPU, while running it with SGLang requires an
+> 80GB GPU. For more details on SGLang's memory requirements, refer to
+> this [issue](https://github.com/sgl-project/sglang/issues/3451).
+
+### Memory requirements
+
+Below are approximate memory requirements for loading model weights (excluding context and CUDA/ROCm kernel reservations).
+
+| Model | Size | FP16 | FP8 | INT4 |
+|-----------------------------|----------|--------|--------|--------|
+| `DeepSeek-R1`               | **671B** | 1.35TB | 671GB  | 40GB   |
+| `DeepSeek-R1-Distill-Llama` | **70B**  | 161GB  | 80.5GB | 40GB   |
+| `DeepSeek-R1-Distill-Qwen` | **32B** | 74GB | 37GB | 18.5GB |
+| `DeepSeek-V2-Lite` | **16B** | 35GB | 17.5GB | 8.75GB |
+| `DeepSeek-R1-Distill-Qwen` | **14B** | 32GB | 16GB | 8GB |
+| `DeepSeek-R1-Distill-Llama` | **8B** | 18GB | 9GB | 4.5GB |
+| `DeepSeek-R1-Distill-Qwen` | **7B** | 16GB | 8GB | 4GB |
+
+For example, the FP8 version of DeepSeek-R1 671B fits on a single node of MI300X with eight 192GB GPUs, a single node of
+H200 with eight 141GB GPUs, or a single node of Intel Gaudi 2 with eight 96GB GPUs.
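+
+As a quick sanity check of the table, the sketch below estimates whether the model
+weights alone fit into a node's aggregate GPU memory. It ignores the KV cache and
+runtime buffers, so treat it as a lower bound rather than a sizing tool.
+
+```python
+# Illustrative only: weight memory for DeepSeek-R1 671B at FP8 (1 byte/param)
+# versus the aggregate memory of an eight-GPU node.
+def weights_gb(params_billions: float, bytes_per_param: float) -> float:
+    return params_billions * bytes_per_param  # 1B params at 1 byte ~= 1GB
+
+for node, gpu_gb in [("MI300X", 192), ("H200", 141), ("Gaudi2", 96)]:
+    capacity = 8 * gpu_gb
+    fits = weights_gb(671, 1.0) <= capacity
+    print(f"{node}: {capacity}GB total, fits={fits}")
+```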
+
+### Applying the configuration
+
+To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
+
+
+
+```shell
+$ dstack apply -f examples/llms/deepseek/sglang/amd/.dstack.yml
+
+ # BACKEND REGION RESOURCES SPOT PRICE
+ 1 runpod EU-RO-1 24xCPU, 283GB, 1xMI300X (192GB) no $2.49
+
+Submit the run deepseek-r1-amd? [y/n]: y
+
+Provisioning...
+---> 100%
+```
+
+
+Once the service is up, the model will be available via the OpenAI-compatible endpoint
+at `<dstack server URL>/proxy/models/<project name>/`.
+
+
+
+```shell
+curl http://127.0.0.1:3000/proxy/models/main/chat/completions \
+ -X POST \
+ -H 'Authorization: Bearer <dstack token>' \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+ "messages": [
+ {
+ "role": "system",
+ "content": "You are a helpful assistant."
+ },
+ {
+ "role": "user",
+ "content": "What is Deep Learning?"
+ }
+ ],
+ "stream": true,
+ "max_tokens": 512
+ }'
+```
+
+
+When a [gateway](https://dstack.ai/docs/concepts/gateways.md) is configured, the OpenAI-compatible endpoint
+is available at `https://gateway.<gateway domain>/`.
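+
+Since the endpoint is OpenAI-compatible, any OpenAI client should work against it. The
+sketch below uses the `openai` Python SDK (assumed installed via `pip install openai`);
+the `gateway.example.com` domain and the `DSTACK_TOKEN` variable are placeholders for
+your own gateway domain and dstack token.
+
+```python
+import os
+
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="https://gateway.example.com",  # hypothetical gateway domain
+    api_key=os.environ["DSTACK_TOKEN"],      # your dstack user token
+)
+
+resp = client.chat.completions.create(
+    model="deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    messages=[{"role": "user", "content": "What is Deep Learning?"}],
+    max_tokens=512,
+)
+print(resp.choices[0].message.content)
+```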
+
+## Fine-tuning
+
+### AMD
+
+Here are examples of LoRA fine-tuning of `DeepSeek-V2-Lite` and GRPO fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` on an `MI300X` GPU using Hugging Face's [TRL :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/trl){:target="_blank"}.
+
+=== "LoRA"
+
+
+
+ ```yaml
+ type: task
+ name: trl-train
+
+ image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
+
+ env:
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+ - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite
+ - ACCELERATE_USE_FSDP=False
+ commands:
+ - git clone https://github.com/huggingface/peft.git
+ - pip install trl
+ - pip install "numpy<2"
+ - pip install peft
+ - pip install wandb
+ - cd peft/examples/sft
+ - python train.py
+ --seed 100
+ --model_name_or_path "deepseek-ai/DeepSeek-V2-Lite"
+ --dataset_name "smangrul/ultrachat-10k-chatml"
+ --chat_template_format "chatml"
+ --add_special_tokens False
+ --append_concat_token False
+ --splits "train,test"
+ --max_seq_len 512
+ --num_train_epochs 1
+ --logging_steps 5
+ --log_level "info"
+ --logging_strategy "steps"
+ --eval_strategy "epoch"
+ --save_strategy "epoch"
+ --hub_private_repo True
+ --hub_strategy "every_save"
+ --packing True
+ --learning_rate 1e-4
+ --lr_scheduler_type "cosine"
+ --weight_decay 1e-4
+ --warmup_ratio 0.0
+ --max_grad_norm 1.0
+ --output_dir "deepseek-sft-lora"
+ --per_device_train_batch_size 8
+ --per_device_eval_batch_size 8
+ --gradient_accumulation_steps 4
+ --gradient_checkpointing True
+ --use_reentrant True
+ --dataset_text_field "content"
+ --use_peft_lora True
+ --lora_r 16
+ --lora_alpha 16
+ --lora_dropout 0.05
+ --lora_target_modules "all-linear"
+
+ resources:
+ gpu: MI300X
+ disk: 150GB
+ ```
+
+
+=== "GRPO"
+
+
+ ```yaml
+ type: task
+ name: trl-train-grpo
+
+ image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
+
+ env:
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+ commands:
+ - pip install trl
+ - pip install datasets
+    # NumPy < 2 is required for the SciPy installation on AMD.
+ - pip install "numpy<2"
+ - mkdir -p grpo_example
+ - cp examples/llms/deepseek/trl/amd/grpo_train.py grpo_example/grpo_train.py
+ - cd grpo_example
+ - python grpo_train.py
+ --model_name_or_path $MODEL_ID
+ --dataset_name trl-lib/tldr
+ --per_device_train_batch_size 2
+ --logging_steps 25
+ --output_dir Deepseek-Distill-Qwen-1.5B-GRPO
+ --trust_remote_code
+
+ resources:
+ gpu: MI300X
+ disk: 150GB
+ ```
+
+
+Note that GRPO fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` consumes up to 135GB of VRAM.
+
+### Intel Gaudi
+
+Here is an example of LoRA fine-tuning of `DeepSeek-R1-Distill-Qwen-7B` on Intel Gaudi 2 GPUs using
+Hugging Face's [Optimum for Intel Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-habana){:target="_blank"}
+and [DeepSpeed :material-arrow-top-right-thin:{ .external }](https://github.com/deepspeedai/DeepSpeed){:target="_blank"}. Both also support LoRA
+fine-tuning of `DeepSeek-V2-Lite` with the same configuration as below.
+
+=== "LoRA"
+
+
+ ```yaml
+ type: task
+ name: trl-train
+
+ image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0
+
+ env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+ commands:
+ - pip install --upgrade-strategy eager optimum[habana]
+ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
+ - git clone https://github.com/huggingface/optimum-habana.git
+ - cd optimum-habana/examples/trl
+ - pip install -r requirements.txt
+ - pip install wandb
+ - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py
+ --model_name_or_path $MODEL_ID
+ --dataset_name "lvwerra/stack-exchange-paired"
+ --deepspeed ../language-modeling/llama2_ds_zero3_config.json
+ --output_dir="./sft"
+ --do_train
+ --max_steps=500
+ --logging_steps=10
+ --save_steps=100
+ --per_device_train_batch_size=1
+ --per_device_eval_batch_size=1
+ --gradient_accumulation_steps=2
+ --learning_rate=1e-4
+ --lr_scheduler_type="cosine"
+ --warmup_steps=100
+ --weight_decay=0.05
+ --optim="paged_adamw_32bit"
+ --lora_target_modules "q_proj" "v_proj"
+ --bf16
+ --remove_unused_columns=False
+ --run_name="sft_deepseek_70"
+ --report_to="wandb"
+ --use_habana
+ --use_lazy_mode
+
+ resources:
+ gpu: gaudi2:8
+ ```
+
+
+
+
+### NVIDIA
+
+Here are examples of LoRA fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` and QLoRA fine-tuning of `DeepSeek-V2-Lite`
+on NVIDIA GPUs using Hugging Face's [TRL :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/trl){:target="_blank"} library.
+
+=== "LoRA"
+
+
+ ```yaml
+ type: task
+ name: trl-train
+
+ python: "3.10"
+
+ env:
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+ commands:
+ - git clone https://github.com/huggingface/trl.git
+ - pip install trl
+ - pip install peft
+ - pip install wandb
+ - cd trl/trl/scripts
+ - python sft.py
+ --model_name_or_path $MODEL_ID
+ --dataset_name trl-lib/Capybara
+ --learning_rate 2.0e-4
+ --num_train_epochs 1
+ --packing
+ --per_device_train_batch_size 2
+ --gradient_accumulation_steps 8
+ --gradient_checkpointing
+ --logging_steps 25
+ --eval_strategy steps
+ --eval_steps 100
+ --use_peft
+ --lora_r 32
+ --lora_alpha 16
+ --report_to wandb
+ --output_dir DeepSeek-R1-Distill-Qwen-1.5B-SFT
+
+ resources:
+ gpu: 24GB
+ ```
+
+
+=== "QLoRA"
+
+
+ ```yaml
+ type: task
+ name: trl-train-deepseek-v2
+
+ python: "3.10"
+ nvcc: true
+ env:
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+ - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite
+ - ACCELERATE_USE_FSDP=False
+ commands:
+ - git clone https://github.com/huggingface/peft.git
+ - pip install trl
+ - pip install peft
+ - pip install wandb
+ - pip install bitsandbytes
+ - cd peft/examples/sft
+ - python train.py
+ --seed 100
+ --model_name_or_path "deepseek-ai/DeepSeek-V2-Lite"
+ --dataset_name "smangrul/ultrachat-10k-chatml"
+ --chat_template_format "chatml"
+ --add_special_tokens False
+ --append_concat_token False
+ --splits "train,test"
+ --max_seq_len 512
+ --num_train_epochs 1
+ --logging_steps 5
+ --log_level "info"
+ --logging_strategy "steps"
+ --eval_strategy "epoch"
+ --save_strategy "epoch"
+ --hub_private_repo True
+ --hub_strategy "every_save"
+ --bf16 True
+ --packing True
+ --learning_rate 1e-4
+ --lr_scheduler_type "cosine"
+ --weight_decay 1e-4
+ --warmup_ratio 0.0
+ --max_grad_norm 1.0
+      --output_dir "deepseek-sft-lora"
+ --per_device_train_batch_size 8
+ --per_device_eval_batch_size 8
+ --gradient_accumulation_steps 4
+ --gradient_checkpointing True
+ --use_reentrant True
+ --dataset_text_field "content"
+ --use_peft_lora True
+ --lora_r 16
+ --lora_alpha 16
+ --lora_dropout 0.05
+ --lora_target_modules "all-linear"
+ --use_4bit_quantization True
+ --use_nested_quant True
+ --bnb_4bit_compute_dtype "bfloat16"
+
+ resources:
+ # Consumes ~25GB of vRAM for QLoRA fine-tuning deepseek-ai/DeepSeek-V2-Lite
+ gpu: 48GB
+ ```
+
+
+### Memory requirements
+
+| Model | Size | Full fine-tuning | LoRA | QLoRA |
+|-----------------------------|----------|------------------|-------|-------|
+| `DeepSeek-R1`               | **671B** | 10.5TB           | 1.4TB | 442GB |
+| `DeepSeek-R1-Distill-Llama` | **70B** | 1.09TB | 151GB | 46GB |
+| `DeepSeek-R1-Distill-Qwen` | **32B** | 512GB | 70GB | 21GB |
+| `DeepSeek-V2-Lite` | **16B** | 256GB | 35GB | 11GB |
+| `DeepSeek-R1-Distill-Qwen` | **14B** | 224GB | 30GB | 9GB |
+| `DeepSeek-R1-Distill-Llama` | **8B** | 128GB | 17GB | 5GB |
+| `DeepSeek-R1-Distill-Qwen` | **7B** | 112GB | 15GB | 4GB |
+| `DeepSeek-R1-Distill-Qwen` | **1.5B** | 24GB | 3.2GB | 1GB |
+
+The memory requirements assume low-rank update matrices are 1% of model parameters. In practice, a 7B model with QLoRA
+needs 7–10GB due to intermediate hidden states.
+
+| Fine-tuning type | Calculation |
+|------------------|--------------------------------------------------|
+| Full fine-tuning | 671B × 16 bytes = 10.48TB |
+| LoRA | 671B × 2 bytes + 1% of 671B × 16 bytes = 1.41TB |
+| QLoRA (4-bit)    | 671B × 0.5 bytes + 1% of 671B × 16 bytes = 442GB |
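+
+The same arithmetic can be reproduced with a short sketch. It is illustrative only and
+follows the assumptions above: 16 bytes per parameter for full fine-tuning, frozen base
+weights plus trainable adapters at 1% of parameters, and a 1024 divisor for GB-to-TB.
+
+```python
+def full_ft_gb(params_billions: float) -> float:
+    # Weights, gradients, and optimizer state at ~16 bytes per parameter.
+    return params_billions * 16
+
+def lora_gb(params_billions: float, base_bytes: float = 2.0) -> float:
+    # Frozen base weights plus ~1% trainable parameters at 16 bytes each.
+    return params_billions * base_bytes + 0.01 * params_billions * 16
+
+print(f"Full FT: {full_ft_gb(671) / 1024:.2f}TB")        # ~10.48TB
+print(f"LoRA:    {lora_gb(671) / 1024:.2f}TB")           # ~1.4TB
+print(f"QLoRA:   {lora_gb(671, base_bytes=0.5):.0f}GB")  # ~443GB
+```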
+
+## Source code
+
+The source code for this example can be found in
+[`examples/llms/deepseek` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek){:target="_blank"}.
+
+!!! info "What's next?"
+ 1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks),
+ [services](https://dstack.ai/docs/services), and [protips](https://dstack.ai/docs/protips).
+
diff --git a/examples/llms/deepseek/sglang/amd/.dstack.yml b/examples/llms/deepseek/sglang/amd/.dstack.yml
new file mode 100644
index 000000000..99a19bfee
--- /dev/null
+++ b/examples/llms/deepseek/sglang/amd/.dstack.yml
@@ -0,0 +1,18 @@
+type: service
+name: deepseek-r1-amd
+
+image: lmsysorg/sglang:v0.4.1.post4-rocm620
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+commands:
+ - python3 -m sglang.launch_server
+ --model-path $MODEL_ID
+ --port 8000
+ --trust-remote-code
+
+port: 8000
+model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+
+resources:
+ gpu: mi300x
+ disk: 300Gb
diff --git a/examples/llms/deepseek/sglang/amd/deepseek_v2_lite.dstack.yml b/examples/llms/deepseek/sglang/amd/deepseek_v2_lite.dstack.yml
new file mode 100644
index 000000000..01ef71a6b
--- /dev/null
+++ b/examples/llms/deepseek/sglang/amd/deepseek_v2_lite.dstack.yml
@@ -0,0 +1,18 @@
+type: service
+name: deepseek-v2-lite-amd
+
+image: lmsysorg/sglang:v0.4.1.post4-rocm620
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite
+commands:
+ - python3 -m sglang.launch_server
+ --model-path $MODEL_ID
+ --port 8000
+ --trust-remote-code
+
+port: 8000
+model: deepseek-ai/DeepSeek-V2-Lite
+
+resources:
+ gpu: mi300x
+ disk: 150Gb
diff --git a/examples/llms/deepseek/sglang/nvidia/.dstack.yml b/examples/llms/deepseek/sglang/nvidia/.dstack.yml
new file mode 100644
index 000000000..d1c92b64d
--- /dev/null
+++ b/examples/llms/deepseek/sglang/nvidia/.dstack.yml
@@ -0,0 +1,18 @@
+type: service
+name: deepseek-r1-nvidia
+
+image: lmsysorg/sglang:latest
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+commands:
+ - python3 -m sglang.launch_server
+ --model-path $MODEL_ID
+ --port 8000
+ --trust-remote-code
+
+port: 8000
+
+model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+
+resources:
+ gpu: 24GB
diff --git a/examples/llms/deepseek/sglang/nvidia/deepseek_v2_lite.dstack.yml b/examples/llms/deepseek/sglang/nvidia/deepseek_v2_lite.dstack.yml
new file mode 100644
index 000000000..8c0adaa41
--- /dev/null
+++ b/examples/llms/deepseek/sglang/nvidia/deepseek_v2_lite.dstack.yml
@@ -0,0 +1,19 @@
+# Not Working https://github.com/sgl-project/sglang/issues/3451
+type: service
+name: deepseek-v2-lite-nvidia
+
+image: lmsysorg/sglang:latest
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite
+commands:
+ - python3 -m sglang.launch_server
+ --model-path $MODEL_ID
+ --port 8000
+ --trust-remote-code
+
+port: 8000
+
+model: deepseek-ai/DeepSeek-V2-Lite
+
+resources:
+ gpu: 80GB
diff --git a/examples/llms/deepseek/tgi/intel/.dstack.yml b/examples/llms/deepseek/tgi/intel/.dstack.yml
new file mode 100644
index 000000000..16d083092
--- /dev/null
+++ b/examples/llms/deepseek/tgi/intel/.dstack.yml
@@ -0,0 +1,45 @@
+type: service
+
+name: tgi
+
+image: ghcr.io/huggingface/tgi-gaudi:2.3.1
+
+auth: false
+port: 8000
+
+model: DeepSeek-R1-Distill-Llama-70B
+
+env:
+ - HF_TOKEN
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+ - PORT=8000
+ - OMPI_MCA_btl_vader_single_copy_mechanism=none
+ - TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN=true
+ - PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+ - MAX_TOTAL_TOKENS=2048
+ - BATCH_BUCKET_SIZE=256
+ - PREFILL_BATCH_BUCKET_SIZE=4
+ - PAD_SEQUENCE_TO_MULTIPLE_OF=64
+ - ENABLE_HPU_GRAPH=true
+ - LIMIT_HPU_GRAPH=true
+ - USE_FLASH_ATTENTION=true
+ - FLASH_ATTENTION_RECOMPUTE=true
+
+commands:
+ - text-generation-launcher
+ --sharded true
+ --num-shard 8
+ --max-input-length 1024
+ --max-total-tokens 2048
+ --max-batch-prefill-tokens 4096
+ --max-batch-total-tokens 524288
+ --max-waiting-tokens 7
+ --waiting-served-ratio 1.2
+ --max-concurrent-requests 512
+
+resources:
+ gpu: Gaudi2:8
+
+# Uncomment to cache downloaded models
+#volumes:
+# - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
diff --git a/examples/llms/deepseek/trl/amd/.dstack.yml b/examples/llms/deepseek/trl/amd/.dstack.yml
new file mode 100644
index 000000000..fe3dbc31a
--- /dev/null
+++ b/examples/llms/deepseek/trl/amd/.dstack.yml
@@ -0,0 +1,41 @@
+type: task
+# The name is optional, if not specified, generated randomly
+name: trl-train
+
+image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
+
+# Required environment variables
+env:
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+
+# Commands of the task
+commands:
+ - git clone https://github.com/huggingface/trl.git
+ - pip install trl
+ - pip install "numpy<2"
+ - pip install peft
+ - pip install wandb
+ - cd trl/trl/scripts
+ - python sft.py
+ --model_name_or_path $MODEL_ID
+ --dataset_name trl-lib/Capybara
+ --learning_rate 2.0e-4
+ --num_train_epochs 1
+ --packing
+ --per_device_train_batch_size 2
+ --gradient_accumulation_steps 8
+ --gradient_checkpointing
+ --logging_steps 25
+ --eval_strategy steps
+ --eval_steps 100
+ --use_peft
+ --lora_r 32
+ --lora_alpha 16
+ --report_to wandb
+ --output_dir DeepSeek-R1-Distill-Qwen-1.5B-SFT
+
+resources:
+ gpu: MI300X
+ disk: 150GB
diff --git a/examples/llms/deepseek/trl/amd/deepseek_v2.dstack.yml b/examples/llms/deepseek/trl/amd/deepseek_v2.dstack.yml
new file mode 100644
index 000000000..4c719dcd5
--- /dev/null
+++ b/examples/llms/deepseek/trl/amd/deepseek_v2.dstack.yml
@@ -0,0 +1,59 @@
+type: task
+# The name is optional, if not specified, generated randomly
+name: trl-train
+
+image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
+
+# Required environment variables
+env:
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+ - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite
+ - ACCELERATE_USE_FSDP=False
+# Commands of the task
+commands:
+ - git clone https://github.com/huggingface/peft.git
+ - pip install trl
+ - pip install "numpy<2"
+ - pip install peft
+ - pip install wandb
+ - cd peft/examples/sft
+ - python train.py
+ --seed 100
+ --model_name_or_path "deepseek-ai/DeepSeek-V2-Lite"
+ --dataset_name "smangrul/ultrachat-10k-chatml"
+ --chat_template_format "chatml"
+ --add_special_tokens False
+ --append_concat_token False
+ --splits "train,test"
+ --max_seq_len 512
+ --num_train_epochs 1
+ --logging_steps 5
+ --log_level "info"
+ --logging_strategy "steps"
+ --eval_strategy "epoch"
+ --save_strategy "epoch"
+ --hub_private_repo True
+ --hub_strategy "every_save"
+ --packing True
+ --learning_rate 1e-4
+ --lr_scheduler_type "cosine"
+ --weight_decay 1e-4
+ --warmup_ratio 0.0
+ --max_grad_norm 1.0
+ --output_dir "deepseek-sft-lora"
+ --per_device_train_batch_size 8
+ --per_device_eval_batch_size 8
+ --gradient_accumulation_steps 4
+ --gradient_checkpointing True
+ --use_reentrant True
+ --dataset_text_field "content"
+ --use_peft_lora True
+ --lora_r 16
+ --lora_alpha 16
+ --lora_dropout 0.05
+ --lora_target_modules "all-linear"
+
+resources:
+ gpu: MI300X
+ disk: 150GB
diff --git a/examples/llms/deepseek/trl/amd/grpo.dstack.yml b/examples/llms/deepseek/trl/amd/grpo.dstack.yml
new file mode 100644
index 000000000..f866bb1ca
--- /dev/null
+++ b/examples/llms/deepseek/trl/amd/grpo.dstack.yml
@@ -0,0 +1,32 @@
+type: task
+# The name is optional, if not specified, generated randomly
+name: trl-train-grpo
+
+image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
+
+# Required environment variables
+env:
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+# Commands of the task
+commands:
+ - pip install trl
+ - pip install datasets
+  # NumPy < 2 is required for the SciPy installation on AMD.
+ - pip install "numpy<2"
+ - mkdir -p grpo_example
+ - cp examples/llms/deepseek/trl/amd/grpo_train.py grpo_example/grpo_train.py
+ - cd grpo_example
+ - python grpo_train.py
+ --model_name_or_path $MODEL_ID
+ --dataset_name trl-lib/tldr
+ --per_device_train_batch_size 2
+ --logging_steps 25
+ --output_dir Deepseek-Distill-Qwen-1.5B-GRPO
+ --trust_remote_code
+
+# GRPO fine-tuning of DeepSeek-R1-Distill-Qwen-1.5B consumes 70% of VRAM
+resources:
+ gpu: MI300X
+ disk: 150GB
diff --git a/examples/llms/deepseek/trl/amd/grpo_train.py b/examples/llms/deepseek/trl/amd/grpo_train.py
new file mode 100644
index 000000000..ab59291de
--- /dev/null
+++ b/examples/llms/deepseek/trl/amd/grpo_train.py
@@ -0,0 +1,60 @@
+import argparse
+
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM
+from trl import GRPOConfig, GRPOTrainer
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="Train a model using GRPOTrainer.")
+ parser.add_argument(
+ "--model_name_or_path",
+ type=str,
+ required=True,
+ help="Path to the model or model identifier from huggingface.co/models",
+ )
+ parser.add_argument(
+ "--dataset_name", type=str, required=True, help="Name of the dataset to use"
+ )
+ parser.add_argument(
+ "--per_device_train_batch_size",
+ type=int,
+ default=1,
+ help="Batch size per device for training",
+ )
+ parser.add_argument("--logging_steps", type=int, default=10, help="Logging steps interval")
+ parser.add_argument(
+ "--output_dir", type=str, default="output", help="Output directory for the trained model"
+ )
+ parser.add_argument(
+ "--trust_remote_code", action="store_true", help="Trust remote code when loading the model"
+ )
+ return parser.parse_args()
+
+
+def reward_len(completions, **kwargs):
+    # Toy reward: completions whose length is closer to 20 characters score higher.
+    return [-abs(20 - len(completion)) for completion in completions]
+
+
+def main():
+ args = parse_args()
+
+ dataset = load_dataset(args.dataset_name, split="train")
+ training_args = GRPOConfig(
+ output_dir=args.output_dir,
+ logging_steps=args.logging_steps,
+ per_device_train_batch_size=args.per_device_train_batch_size,
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+ args.model_name_or_path, trust_remote_code=args.trust_remote_code
+ )
+ trainer = GRPOTrainer(
+ model=model, reward_funcs=reward_len, args=training_args, train_dataset=dataset
+ )
+
+ trainer.train()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/llms/deepseek/trl/intel/.dstack.yml b/examples/llms/deepseek/trl/intel/.dstack.yml
new file mode 100644
index 000000000..9963e4844
--- /dev/null
+++ b/examples/llms/deepseek/trl/intel/.dstack.yml
@@ -0,0 +1,46 @@
+type: task
+# The name is optional, if not specified, generated randomly
+name: trl-train
+
+image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0
+
+# Required environment variables
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+# Commands of the task
+commands:
+ - pip install --upgrade-strategy eager optimum[habana]
+ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
+ - git clone https://github.com/huggingface/optimum-habana.git
+ - cd optimum-habana/examples/trl
+ - pip install -r requirements.txt
+ - pip install wandb
+ - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py
+ --model_name_or_path $MODEL_ID
+ --dataset_name "lvwerra/stack-exchange-paired"
+ --deepspeed ../language-modeling/llama2_ds_zero3_config.json
+ --output_dir="./sft"
+ --do_train
+ --max_steps=500
+ --logging_steps=10
+ --save_steps=100
+ --per_device_train_batch_size=1
+ --per_device_eval_batch_size=1
+ --gradient_accumulation_steps=2
+ --learning_rate=1e-4
+ --lr_scheduler_type="cosine"
+ --warmup_steps=100
+ --weight_decay=0.05
+ --optim="paged_adamw_32bit"
+ --lora_target_modules "q_proj" "v_proj"
+ --bf16
+ --remove_unused_columns=False
+ --run_name="sft_deepseek_70"
+ --report_to="wandb"
+ --use_habana
+ --use_lazy_mode
+
+resources:
+ gpu: gaudi2:8
diff --git a/examples/llms/deepseek/trl/intel/deepseek_v2.dstack.yml b/examples/llms/deepseek/trl/intel/deepseek_v2.dstack.yml
new file mode 100644
index 000000000..7aa13d677
--- /dev/null
+++ b/examples/llms/deepseek/trl/intel/deepseek_v2.dstack.yml
@@ -0,0 +1,45 @@
+type: task
+# The name is optional, if not specified, generated randomly
+name: trl-train-deepseek-v2-lite
+
+image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0
+
+# Required environment variables
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+# Commands of the task
+commands:
+ - pip install git+https://github.com/huggingface/optimum-habana.git
+ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
+ - git clone https://github.com/huggingface/optimum-habana.git
+ - cd optimum-habana/examples/trl
+ - pip install -r requirements.txt
+ - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py
+ --model_name_or_path $MODEL_ID
+ --dataset_name "lvwerra/stack-exchange-paired"
+ --deepspeed ../language-modeling/llama2_ds_zero3_config.json
+ --output_dir="./sft"
+ --do_train
+ --max_steps=500
+ --logging_steps=10
+ --save_steps=100
+ --per_device_train_batch_size=1
+ --per_device_eval_batch_size=1
+ --gradient_accumulation_steps=2
+ --learning_rate=1e-4
+ --lr_scheduler_type="cosine"
+ --warmup_steps=100
+ --weight_decay=0.05
+ --optim="paged_adamw_32bit"
+ --lora_target_modules "q_proj" "v_proj"
+ --bf16
+ --remove_unused_columns=False
+ --run_name="sft_deepseek_v2lite"
+ --report_to="wandb"
+ --use_habana
+ --use_lazy_mode
+
+resources:
+ gpu: gaudi2:8
diff --git a/examples/llms/deepseek/trl/nvidia/.dstack.yml b/examples/llms/deepseek/trl/nvidia/.dstack.yml
new file mode 100644
index 000000000..09444040c
--- /dev/null
+++ b/examples/llms/deepseek/trl/nvidia/.dstack.yml
@@ -0,0 +1,38 @@
+type: task
+# The name is optional, if not specified, generated randomly
+name: trl-train
+
+python: "3.10"
+
+# Required environment variables
+env:
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+# Commands of the task
+commands:
+ - git clone https://github.com/huggingface/trl.git
+ - pip install trl
+ - pip install peft
+ - pip install wandb
+ - cd trl/trl/scripts
+ - python sft.py
+ --model_name_or_path $MODEL_ID
+ --dataset_name trl-lib/Capybara
+ --learning_rate 2.0e-4
+ --num_train_epochs 1
+ --packing
+ --per_device_train_batch_size 2
+ --gradient_accumulation_steps 8
+ --gradient_checkpointing
+ --logging_steps 25
+ --eval_strategy steps
+ --eval_steps 100
+ --use_peft
+ --lora_r 32
+ --lora_alpha 16
+ --report_to wandb
+ --output_dir DeepSeek-R1-Distill-Qwen-1.5B-SFT
+
+resources:
+ gpu: 24GB
diff --git a/examples/llms/deepseek/trl/nvidia/deepseek_v2.dstack.yml b/examples/llms/deepseek/trl/nvidia/deepseek_v2.dstack.yml
new file mode 100644
index 000000000..9c6850270
--- /dev/null
+++ b/examples/llms/deepseek/trl/nvidia/deepseek_v2.dstack.yml
@@ -0,0 +1,64 @@
+type: task
+# The name is optional, if not specified, generated randomly
+name: trl-train-deepseek-v2
+
+python: "3.10"
+
+nvcc: true
+# Required environment variables
+env:
+ - WANDB_API_KEY
+ - WANDB_PROJECT
+ - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite
+ - ACCELERATE_USE_FSDP=False
+# Commands of the task
+commands:
+ - git clone https://github.com/huggingface/peft.git
+ - pip install trl
+ - pip install peft
+ - pip install wandb
+ - pip install bitsandbytes
+ - cd peft/examples/sft
+ - python train.py
+ --seed 100
+ --model_name_or_path "deepseek-ai/DeepSeek-V2-Lite"
+ --dataset_name "smangrul/ultrachat-10k-chatml"
+ --chat_template_format "chatml"
+ --add_special_tokens False
+ --append_concat_token False
+ --splits "train,test"
+ --max_seq_len 512
+ --num_train_epochs 1
+ --logging_steps 5
+ --log_level "info"
+ --logging_strategy "steps"
+ --eval_strategy "epoch"
+ --save_strategy "epoch"
+ --hub_private_repo True
+ --hub_strategy "every_save"
+ --bf16 True
+ --packing True
+ --learning_rate 1e-4
+ --lr_scheduler_type "cosine"
+ --weight_decay 1e-4
+ --warmup_ratio 0.0
+ --max_grad_norm 1.0
+ --output_dir "deepseek-sft-lora"
+ --per_device_train_batch_size 8
+ --per_device_eval_batch_size 8
+ --gradient_accumulation_steps 4
+ --gradient_checkpointing True
+ --use_reentrant True
+ --dataset_text_field "content"
+ --use_peft_lora True
+ --lora_r 16
+ --lora_alpha 16
+ --lora_dropout 0.05
+ --lora_target_modules "all-linear"
+ --use_4bit_quantization True
+ --use_nested_quant True
+ --bnb_4bit_compute_dtype "bfloat16"
+
+resources:
+ # Consumes ~25GB of vRAM for QLoRA fine-tuning deepseek-ai/DeepSeek-V2-Lite
+ gpu: 48GB
diff --git a/examples/llms/deepseek/vllm/amd/.dstack.yml b/examples/llms/deepseek/vllm/amd/.dstack.yml
new file mode 100644
index 000000000..23bfb033c
--- /dev/null
+++ b/examples/llms/deepseek/vllm/amd/.dstack.yml
@@ -0,0 +1,19 @@
+type: service
+name: deepseek-r1-amd
+
+image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+ - MAX_MODEL_LEN=126432
+commands:
+ - vllm serve $MODEL_ID
+ --max-model-len $MAX_MODEL_LEN
+ --trust-remote-code
+port: 8000
+
+model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+
+
+resources:
+ gpu: mi300x
+ disk: 300Gb
diff --git a/examples/llms/deepseek/vllm/amd/deepseek_v2_lite.dstack.yml b/examples/llms/deepseek/vllm/amd/deepseek_v2_lite.dstack.yml
new file mode 100644
index 000000000..8937e9526
--- /dev/null
+++ b/examples/llms/deepseek/vllm/amd/deepseek_v2_lite.dstack.yml
@@ -0,0 +1,18 @@
+type: service
+name: deepseek-v2-lite-amd
+
+image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite
+commands:
+ - vllm serve $MODEL_ID
+ --trust-remote-code
+
+port: 8000
+
+model: deepseek-ai/DeepSeek-V2-Lite
+
+
+resources:
+ gpu: mi300x
+ disk: 150Gb
diff --git a/examples/llms/deepseek/vllm/intel/.dstack.yml b/examples/llms/deepseek/vllm/intel/.dstack.yml
new file mode 100644
index 000000000..d28a0152d
--- /dev/null
+++ b/examples/llms/deepseek/vllm/intel/.dstack.yml
@@ -0,0 +1,31 @@
+type: service
+name: deepseek-r1-gaudi
+
+image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+ - HABANA_VISIBLE_DEVICES=all
+ - OMPI_MCA_btl_vader_single_copy_mechanism=none
+
+commands:
+ - git clone https://github.com/HabanaAI/vllm-fork.git
+ - cd vllm-fork
+ - git checkout habana_main
+ - pip install -r requirements-hpu.txt
+ - python setup.py develop
+ - vllm serve $MODEL_ID
+ --tensor-parallel-size 8
+ --trust-remote-code
+ --download-dir /data
+
+port: 8000
+
+model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+
+resources:
+ gpu: gaudi2:8
+
+# Uncomment to cache downloaded models
+#volumes:
+# - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
diff --git a/examples/llms/deepseek/vllm/nvidia/.dstack.yml b/examples/llms/deepseek/vllm/nvidia/.dstack.yml
new file mode 100644
index 000000000..e623b182c
--- /dev/null
+++ b/examples/llms/deepseek/vllm/nvidia/.dstack.yml
@@ -0,0 +1,17 @@
+type: service
+name: deepseek-r1-nvidia
+
+image: vllm/vllm-openai:latest
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+ - MAX_MODEL_LEN=4096
+commands:
+ - vllm serve $MODEL_ID
+ --max-model-len $MAX_MODEL_LEN
+
+port: 8000
+
+model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+
+resources:
+ gpu: 24GB
diff --git a/examples/llms/deepseek/vllm/nvidia/deepseek_v2_lite.dstack.yml b/examples/llms/deepseek/vllm/nvidia/deepseek_v2_lite.dstack.yml
new file mode 100644
index 000000000..06e78f379
--- /dev/null
+++ b/examples/llms/deepseek/vllm/nvidia/deepseek_v2_lite.dstack.yml
@@ -0,0 +1,19 @@
+type: service
+name: deepseek-v2-lite-nvidia
+
+image: vllm/vllm-openai:latest
+env:
+ - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite
+ - MAX_MODEL_LEN=4096
+commands:
+ - vllm serve $MODEL_ID
+ --max-model-len $MAX_MODEL_LEN
+ --tensor-parallel-size $DSTACK_GPUS_NUM
+ --trust-remote-code
+
+port: 8000
+
+model: deepseek-ai/DeepSeek-V2-Lite
+
+resources:
+ gpu: 48GB
diff --git a/examples/llms/llama31/README.md b/examples/llms/llama31/README.md
index a345d56b0..fb754e4fc 100644
--- a/examples/llms/llama31/README.md
+++ b/examples/llms/llama31/README.md
@@ -181,7 +181,7 @@ Provisioning...