From 4702770392edab2ad1db4665d476848bfca4ee04 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Thu, 23 Jan 2025 19:33:06 +0545 Subject: [PATCH] Add Deepseek and Intel Examples Add deepseek-r1 examples Add deepseek-r1 examples Fix newline check Add README for Deepseek examples Add Trl Deepseek Examples Add GRPO example Update Deepseek Example README.md with Deepseek_v2 and Training Examples Add Intel Deepseek Examples Update Deepseek and Intel README Minor Update in Docs for Intel Fix default example [Docs] Minor changes to Intel Gaudi and DeepSeek examples Add Deepseek_v2 example for AMD and updated README [Docs] Minor changes to Intel Gaudi and DeepSeek examples --- docs/assets/stylesheets/extra.css | 3 +- docs/examples.md | 22 + docs/examples/accelerators/intel/index.md | 0 docs/examples/llms/deepseek/index.md | 0 examples/accelerators/intel/README.md | 188 ++++++ examples/deployment/nim/.dstack.yml | 12 +- examples/llms/deepseek/README.md | 607 ++++++++++++++++++ examples/llms/deepseek/sglang/amd/.dstack.yml | 18 + .../sglang/amd/deepseek_v2_lite.dstack.yml | 18 + .../llms/deepseek/sglang/nvidia/.dstack.yml | 18 + .../sglang/nvidia/deepseek_v2_lite.dstack.yml | 19 + examples/llms/deepseek/tgi/intel/.dstack.yml | 45 ++ examples/llms/deepseek/trl/amd/.dstack.yml | 41 ++ .../deepseek/trl/amd/deepseek_v2.dstack.yml | 59 ++ .../llms/deepseek/trl/amd/grpo.dstack.yml | 32 + examples/llms/deepseek/trl/amd/grpo_train.py | 60 ++ examples/llms/deepseek/trl/intel/.dstack.yml | 46 ++ .../deepseek/trl/intel/deepseek_v2.dstack.yml | 45 ++ examples/llms/deepseek/trl/nvidia/.dstack.yml | 38 ++ .../trl/nvidia/deepseek_v2.dstack.yml | 64 ++ examples/llms/deepseek/vllm/amd/.dstack.yml | 19 + .../vllm/amd/deepseek_v2_lite.dstack.yml | 18 + examples/llms/deepseek/vllm/intel/.dstack.yml | 31 + .../llms/deepseek/vllm/nvidia/.dstack.yml | 17 + .../vllm/nvidia/deepseek_v2_lite.dstack.yml | 19 + examples/llms/llama31/README.md | 2 +- mkdocs.yml | 4 +- 27 files changed, 1437 insertions(+), 8 deletions(-) create mode 100644 docs/examples/accelerators/intel/index.md create mode 100644 docs/examples/llms/deepseek/index.md create mode 100644 examples/accelerators/intel/README.md create mode 100644 examples/llms/deepseek/README.md create mode 100644 examples/llms/deepseek/sglang/amd/.dstack.yml create mode 100644 examples/llms/deepseek/sglang/amd/deepseek_v2_lite.dstack.yml create mode 100644 examples/llms/deepseek/sglang/nvidia/.dstack.yml create mode 100644 examples/llms/deepseek/sglang/nvidia/deepseek_v2_lite.dstack.yml create mode 100644 examples/llms/deepseek/tgi/intel/.dstack.yml create mode 100644 examples/llms/deepseek/trl/amd/.dstack.yml create mode 100644 examples/llms/deepseek/trl/amd/deepseek_v2.dstack.yml create mode 100644 examples/llms/deepseek/trl/amd/grpo.dstack.yml create mode 100644 examples/llms/deepseek/trl/amd/grpo_train.py create mode 100644 examples/llms/deepseek/trl/intel/.dstack.yml create mode 100644 examples/llms/deepseek/trl/intel/deepseek_v2.dstack.yml create mode 100644 examples/llms/deepseek/trl/nvidia/.dstack.yml create mode 100644 examples/llms/deepseek/trl/nvidia/deepseek_v2.dstack.yml create mode 100644 examples/llms/deepseek/vllm/amd/.dstack.yml create mode 100644 examples/llms/deepseek/vllm/amd/deepseek_v2_lite.dstack.yml create mode 100644 examples/llms/deepseek/vllm/intel/.dstack.yml create mode 100644 examples/llms/deepseek/vllm/nvidia/.dstack.yml create mode 100644 examples/llms/deepseek/vllm/nvidia/deepseek_v2_lite.dstack.yml diff --git a/docs/assets/stylesheets/extra.css 
b/docs/assets/stylesheets/extra.css index 066612e90..3519ec22e 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -24,7 +24,8 @@ } } -[dir=ltr] .md-typeset :is(.admonition,details) pre, [dir=ltr] .md-typeset :is(.admonition,details) :is(.admonition,details) { +[dir=ltr] .md-typeset :is(.admonition,details) pre, +[dir=ltr] .md-typeset :is(.admonition,details) :is(.admonition,details, .termy) { margin-left: 32px; } diff --git a/docs/examples.md b/docs/examples.md index c380b98bf..41a748500 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -83,6 +83,18 @@ hide:

+ +

+ Intel Gaudi +

+ +

+          Deploy and fine-tune LLMs on Intel Gaudi

+
+ +

@@ -98,6 +110,16 @@ hide: ## LLMs
+ +

+ Deepseek +

+ +

+ Deploy and train Deepseek models +

+

diff --git a/docs/examples/accelerators/intel/index.md b/docs/examples/accelerators/intel/index.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/examples/llms/deepseek/index.md b/docs/examples/llms/deepseek/index.md new file mode 100644 index 000000000..e69de29bb diff --git a/examples/accelerators/intel/README.md b/examples/accelerators/intel/README.md new file mode 100644 index 000000000..531daf29b --- /dev/null +++ b/examples/accelerators/intel/README.md @@ -0,0 +1,188 @@ +# Intel Gaudi + +`dstack` supports running dev environments, tasks, and services on Intel Gaudi GPUs via +[SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh). + +## Deployment + +Serving frameworks like vLLM and TGI have Intel Gaudi support. Here's an example of +a service that deploys +[`DeepSeek-R1-Distill-Llama-70B` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B){:target="_blank"} +using [TGI on Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/tgi-gaudi){:target="_blank"} +and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/HabanaAI/vllm-fork){:target="_blank"}. + +=== "TGI" +
+ + ```yaml + type: service + name: tgi + + image: ghcr.io/huggingface/tgi-gaudi:2.3.1 + env: + - HF_TOKEN + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B + - PORT=8000 + - OMPI_MCA_btl_vader_single_copy_mechanism=none + - TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN=true + - PT_HPU_ENABLE_LAZY_COLLECTIVES=true + - MAX_TOTAL_TOKENS=2048 + - BATCH_BUCKET_SIZE=256 + - PREFILL_BATCH_BUCKET_SIZE=4 + - PAD_SEQUENCE_TO_MULTIPLE_OF=64 + - ENABLE_HPU_GRAPH=true + - LIMIT_HPU_GRAPH=true + - USE_FLASH_ATTENTION=true + - FLASH_ATTENTION_RECOMPUTE=true + commands: + - text-generation-launcher + --sharded true + --num-shard $DSTACK_GPUS_NUM + --max-input-length 1024 + --max-total-tokens 2048 + --max-batch-prefill-tokens 4096 + --max-batch-total-tokens 524288 + --max-waiting-tokens 7 + --waiting-served-ratio 1.2 + --max-concurrent-requests 512 + port: 8000 + model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + + resources: + gpu: gaudi2:8 + + # Uncomment to cache downloaded models + #volumes: + # - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub + ``` + +
+ +=== "vLLM" + +
+ + ```yaml + type: service + name: deepseek-r1-gaudi + + image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B + - HABANA_VISIBLE_DEVICES=all + - OMPI_MCA_btl_vader_single_copy_mechanism=none + commands: + - git clone https://github.com/HabanaAI/vllm-fork.git + - cd vllm-fork + - git checkout habana_main + - pip install -r requirements-hpu.txt + - python setup.py develop + - vllm serve $MODEL_ID + --tensor-parallel-size 8 + --trust-remote-code + --download-dir /data + port: 8000 + model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + + + resources: + gpu: gaudi2:8 + + # Uncomment to cache downloaded models + #volumes: + # - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub + ``` +
+ + +## Fine-tuning + +Below is an example of LoRA fine-tuning of [`DeepSeek-R1-Distill-Qwen-7B` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B){:target="_blank"} +using [Optimum for Intel Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-habana){:target="_blank"} +and [DeepSpeed :material-arrow-top-right-thin:{ .external }](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide){:target="_blank"} with +the [`lvwerra/stack-exchange-paired` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/lvwerra/stack-exchange-paired){:target="_blank"} dataset. + +
+ +```yaml +type: task +name: trl-train + +image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 +env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B + - WANDB_API_KEY + - WANDB_PROJECT +commands: + - pip install --upgrade-strategy eager optimum[habana] + - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 + - git clone https://github.com/huggingface/optimum-habana.git + - cd optimum-habana/examples/trl + - pip install -r requirements.txt + - pip install wandb + - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size $DSTACK_GPUS_NUM --use_deepspeed sft.py + --model_name_or_path $MODEL_ID + --dataset_name "lvwerra/stack-exchange-paired" + --deepspeed ../language-modeling/llama2_ds_zero3_config.json + --output_dir="./sft" + --do_train + --max_steps=500 + --logging_steps=10 + --save_steps=100 + --per_device_train_batch_size=1 + --per_device_eval_batch_size=1 + --gradient_accumulation_steps=2 + --learning_rate=1e-4 + --lr_scheduler_type="cosine" + --warmup_steps=100 + --weight_decay=0.05 + --optim="paged_adamw_32bit" + --lora_target_modules "q_proj" "v_proj" + --bf16 + --remove_unused_columns=False + --run_name="sft_deepseek_70" + --report_to="wandb" + --use_habana + --use_lazy_mode + +resources: + gpu: gaudi2:8 +``` + +
+
+To fine-tune `DeepSeek-R1-Distill-Llama-70B` on eight Gaudi 2 accelerators,
+you can partially offload parameters to CPU memory using the DeepSpeed configuration file.
+For more details, refer to [parameter offloading](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeedzerooffloadparamconfig).
+
+## Applying a configuration
+
+Once the configuration is ready, run `dstack apply -f `.
+
+
+```shell
+$ dstack apply -f examples/llms/deepseek/vllm/intel/.dstack.yml
+
+ # BACKEND REGION RESOURCES SPOT PRICE
+ 1 ssh remote 152xCPU,1007GB,8xGaudi2:96GB yes $0 idle
+
+Submit a new run? [y/n]: y
+
+Provisioning...
+---> 100%
+```
+
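+Once the service is up, you can also query its OpenAI-compatible endpoint from Python. Below is a minimal sketch using the
+`openai` package; it assumes the `dstack` server runs at `http://127.0.0.1:3000`, the project is named `main`, and the
+`DSTACK_TOKEN` environment variable holds your `dstack` user token. Adjust these values for your setup, and make sure the
+`model` argument matches the `model` property of your configuration.
+
+```python
+import os
+
+from openai import OpenAI
+
+# The dstack proxy exposes the service at <server URL>/proxy/models/<project name>.
+client = OpenAI(
+    base_url="http://127.0.0.1:3000/proxy/models/main",
+    api_key=os.environ["DSTACK_TOKEN"],
+)
+
+response = client.chat.completions.create(
+    model="deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is Deep Learning?"},
+    ],
+    max_tokens=512,
+)
+print(response.choices[0].message.content)
+```
+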
+ +## Source code + +The source-code of this example can be found in +[`examples/llms/deepseek/tgi/intel` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/tgi/intel){:target="_blank"}, +[`examples/llms/deepseek/vllm/intel` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/vllm/intel){:target="_blank"} and +[`examples/llms/deepseek/trl/intel` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/trl/intel){:target="_blank"}. + +!!! info "What's next?" + 1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services). + 2. See also [Intel Gaudi Documentation :material-arrow-top-right-thin:{ .external }](https://docs.habana.ai/en/latest/index.html), [vLLM Inference with Gaudi](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/vLLM_Inference.html) + and [Optimum for Gaudi examples :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-habana/blob/main/examples/trl/README.md). diff --git a/examples/deployment/nim/.dstack.yml b/examples/deployment/nim/.dstack.yml index 3fba16d7f..ba1b702a5 100644 --- a/examples/deployment/nim/.dstack.yml +++ b/examples/deployment/nim/.dstack.yml @@ -1,7 +1,7 @@ type: service -name: llama31 +name: qwen-nim -image: nvcr.io/nim/meta/llama-3.1-8b-instruct:latest +image: nvcr.io/nim/qwen/qwen-2.5-7b-instruct:latest env: - NGC_API_KEY - NIM_MAX_MODEL_LEN=4096 @@ -10,16 +10,18 @@ registry_auth: password: ${{ env.NGC_API_KEY }} port: 8000 # Register the model -model: meta/llama-3.1-8b-instruct +model: qwen/qwen-2.5-7b-instruct # Uncomment to leverage spot instances #spot_policy: auto # Cache downloaded models volumes: - - /root/.cache/nim:/opt/nim/.cache + - instance_path: /root/.cache/nim + path: /opt/nim/.cache + optional: true resources: gpu: 24GB # Uncomment if using multiple GPUs - #shm_size: 24GB + shm_size: 16GB diff --git a/examples/llms/deepseek/README.md b/examples/llms/deepseek/README.md new file mode 100644 index 000000000..2e6f27150 --- /dev/null +++ b/examples/llms/deepseek/README.md @@ -0,0 +1,607 @@ +# Deepseek + +This example walks you through how to deploy and +train [Deepseek :material-arrow-top-right-thin:{ .external }](https://huggingface.co/deepseek-ai){:target="_blank"} +models with `dstack`. + +> We used Deepseek-R1 distilled models and Deepseek-V2-Lite, a 16B model with the same architecture as Deepseek-R1 (671B). Deepseek-V2-Lite retains MLA and DeepSeekMoE but requires less memory, making it ideal for testing and fine-tuning on smaller GPUs. + +??? info "Prerequisites" + Once `dstack` is [installed](https://dstack.ai/docs/installation), go ahead clone the repo, and run `dstack init`. + +
+ + ```shell + $ git clone https://github.com/dstackai/dstack + $ cd dstack + $ dstack init + ``` +
+
+
+## Deployment
+
+### AMD
+
+Here's an example of a service that deploys `Deepseek-R1-Distill-Llama-70B` using [SGLang :material-arrow-top-right-thin:{ .external }](https://github.com/sgl-project/sglang){:target="_blank"} and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm){:target="_blank"} on an AMD `MI300X` GPU. The configurations below also support `Deepseek-V2-Lite`.
+
+=== "SGLang"
+
+ + ```yaml + type: service + name: deepseek-r1-amd + + image: lmsysorg/sglang:v0.4.1.post4-rocm620 + env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B + commands: + - python3 -m sglang.launch_server + --model-path $MODEL_ID + --port 8000 + --trust-remote-code + + port: 8000 + model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + + resources: + gpu: MI300X + disk: 300Gb + + ``` +
+ +=== "vLLM" + +
+ + ```yaml + type: service + name: deepseek-r1-amd + + image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 + env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B + - MAX_MODEL_LEN=126432 + commands: + - vllm serve $MODEL_ID + --max-model-len $MAX_MODEL_LEN + --trust-remote-code + port: 8000 + + model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + + resources: + gpu: MI300X + disk: 300Gb + ``` +
+
+Note that when using `Deepseek-R1-Distill-Llama-70B` with `vLLM` on a 192GB GPU, we must limit the context size to 126432 tokens to fit into memory.
+
+### Intel Gaudi
+
+Here's an example of a service that deploys `Deepseek-R1-Distill-Llama-70B`
+using [TGI on Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/tgi-gaudi){:target="_blank"}
+and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/HabanaAI/vllm-fork){:target="_blank"} (Gaudi fork) on Intel Gaudi 2.
+
+> Neither [TGI on Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/tgi-gaudi){:target="_blank"}
+> nor [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/HabanaAI/vllm-fork){:target="_blank"} supports `Deepseek-V2-Lite`;
+> see [this :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/tgi-gaudi/issues/271)
+> and [this :material-arrow-top-right-thin:{ .external }](https://github.com/HabanaAI/vllm-fork/issues/809#issuecomment-2652454824) issue.
+
+=== "TGI"
+
+ ```yaml + type: service + + name: tgi + + image: ghcr.io/huggingface/tgi-gaudi:2.3.1 + + auth: false + port: 8000 + + model: DeepSeek-R1-Distill-Llama-70B + + env: + - HF_TOKEN + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B + - PORT=8000 + - OMPI_MCA_btl_vader_single_copy_mechanism=none + - TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN=true + - PT_HPU_ENABLE_LAZY_COLLECTIVES=true + - MAX_TOTAL_TOKENS=2048 + - BATCH_BUCKET_SIZE=256 + - PREFILL_BATCH_BUCKET_SIZE=4 + - PAD_SEQUENCE_TO_MULTIPLE_OF=64 + - ENABLE_HPU_GRAPH=true + - LIMIT_HPU_GRAPH=true + - USE_FLASH_ATTENTION=true + - FLASH_ATTENTION_RECOMPUTE=true + + commands: + - text-generation-launcher + --sharded true + --num-shard 8 + --max-input-length 1024 + --max-total-tokens 2048 + --max-batch-prefill-tokens 4096 + --max-batch-total-tokens 524288 + --max-waiting-tokens 7 + --waiting-served-ratio 1.2 + --max-concurrent-requests 512 + + resources: + gpu: Gaudi2:8 + ``` +
+ +=== "vLLM" + +
+ ```yaml + type: service + name: deepseek-r1-gaudi + + image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + + + env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B + - HABANA_VISIBLE_DEVICES=all + - OMPI_MCA_btl_vader_single_copy_mechanism=none + + commands: + - git clone https://github.com/HabanaAI/vllm-fork.git + - cd vllm-fork + - git checkout habana_main + - pip install -r requirements-hpu.txt + - python setup.py develop + - vllm serve $MODEL_ID + --tensor-parallel-size 8 + --trust-remote-code + --download-dir /data + + port: 8000 + ``` +
+ +### NVIDIA + +Here's an example of a service that deploys `Deepseek-R1-Distill-Llama-8B` +using [SGLang :material-arrow-top-right-thin:{ .external }](https://github.com/sgl-project/sglang){:target="_blank"} +and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm){:target="_blank"} with NVIDIA GPUs. +Both SGLang and vLLM also support `Deepseek-V2-Lite`. + +=== "SGLang" +
+ + ```yaml + type: service + name: deepseek-r1-nvidia + + image: lmsysorg/sglang:latest + env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B + commands: + - python3 -m sglang.launch_server + --model-path $MODEL_ID + --port 8000 + --trust-remote-code + + port: 8000 + model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B + + resources: + gpu: 24GB + ``` +
+ +=== "vLLM" +
+ + ```yaml + type: service + name: deepseek-r1-nvidia + + image: vllm/vllm-openai:latest + env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B + - MAX_MODEL_LEN=4096 + commands: + - vllm serve $MODEL_ID + --max-model-len $MAX_MODEL_LEN + port: 8000 + model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B + + resources: + gpu: 24GB + ``` +
+
+Note that to run `Deepseek-R1-Distill-Llama-8B` with `vLLM` on a 24GB GPU, we must limit the context size to 4096 tokens to fit into memory.
+
+> To run `Deepseek-V2-Lite` with `vLLM`, we must use a 40GB GPU, and to run `Deepseek-V2-Lite` with SGLang, we must use
+> an 80GB GPU. For more details on SGLang's memory requirements, you can refer to
+> this [issue](https://github.com/sgl-project/sglang/issues/3451).
+
+### Memory requirements
+
+Approximate memory requirements for loading the model (excluding context and CUDA/ROCm kernel reservations):
+
+| Model | Size | FP16 | FP8 | INT4 |
+|-----------------------------|----------|--------|--------|--------|
+| `Deepseek-R1` | **671B** | 1.35TB | 671GB | 336GB |
+| `DeepSeek-R1-Distill-Llama` | **70B** | 161GB | 80.5GB | 40GB |
+| `DeepSeek-R1-Distill-Qwen` | **32B** | 74GB | 37GB | 18.5GB |
+| `DeepSeek-V2-Lite` | **16B** | 35GB | 17.5GB | 8.75GB |
+| `DeepSeek-R1-Distill-Qwen` | **14B** | 32GB | 16GB | 8GB |
+| `DeepSeek-R1-Distill-Llama` | **8B** | 18GB | 9GB | 4.5GB |
+| `DeepSeek-R1-Distill-Qwen` | **7B** | 16GB | 8GB | 4GB |
+
+For example, the FP8 version of Deepseek-R1 671B fits on a single node of MI300X with eight 192GB GPUs, a single node of
+H200 with eight 141GB GPUs, or a single node of Intel Gaudi 2 with eight 96GB GPUs (a quick back-of-the-envelope check of these figures is sketched below).
+
+### Applying the configuration
+
+To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
+
+ +```shell +$ dstack apply -f examples/llms/deepseek/sglang/amd/.dstack.yml + + # BACKEND REGION RESOURCES SPOT PRICE + 1 runpod EU-RO-1 24xCPU, 283GB, 1xMI300X (192GB) no $2.49 + +Submit the run deepseek-r1-amd? [y/n]: y + +Provisioning... +---> 100% +``` +
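+As a rough sanity check of the memory-requirements table above, weight memory can be estimated as the parameter count
+times the bytes per parameter (about 2 for FP16, 1 for FP8, and 0.5 for INT4), ignoring the KV cache and runtime
+overhead. A short sketch of that arithmetic, as an approximation rather than a measurement:
+
+```python
+# Approximate bytes per parameter at each precision.
+BYTES_PER_PARAM = {"fp16": 2.0, "fp8": 1.0, "int4": 0.5}
+
+
+def weights_gb(params_billion: float, precision: str) -> float:
+    """Rough weight-only memory estimate in GB (no KV cache, no kernel reservations)."""
+    return params_billion * BYTES_PER_PARAM[precision]
+
+
+# Total accelerator memory of a single eight-accelerator node, in GB.
+nodes = {"8x MI300X": 8 * 192, "8x H200": 8 * 141, "8x Gaudi 2": 8 * 96}
+
+needed = weights_gb(671, "fp8")  # ~671GB for DeepSeek-R1 671B in FP8
+for name, capacity in nodes.items():
+    print(f"{name}: needs ~{needed:.0f}GB of {capacity}GB, {'fits' if needed <= capacity else 'does not fit'}")
+```
+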
+ +Once the service is up, the model will be available via the OpenAI-compatible endpoint +at `/proxy/models//`. + +
+ +```shell +curl http://127.0.0.1:3000/proxy/models/main/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <dstack token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "What is Deep Learning?" + } + ], + "stream": true, + "max_tokens": 512 + }' +``` +
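+The same request can be sent from Python using the `openai` package. This is a minimal sketch; the server URL, the
+`main` project name, and the `DSTACK_TOKEN` environment variable are assumptions, so substitute the values for your own
+server and project:
+
+```python
+import os
+
+from openai import OpenAI
+
+# The dstack proxy exposes the service at <server URL>/proxy/models/<project name>.
+client = OpenAI(
+    base_url="http://127.0.0.1:3000/proxy/models/main",
+    api_key=os.environ["DSTACK_TOKEN"],
+)
+
+# Stream the completion, mirroring the "stream": true option in the curl example above.
+stream = client.chat.completions.create(
+    model="deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is Deep Learning?"},
+    ],
+    max_tokens=512,
+    stream=True,
+)
+
+for chunk in stream:
+    if chunk.choices and chunk.choices[0].delta.content:
+        print(chunk.choices[0].delta.content, end="", flush=True)
+```
+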
+
+When a [gateway](https://dstack.ai/docs/concepts/gateways.md) is configured, the OpenAI-compatible endpoint
+is available at `https://gateway./`.
+
+## Fine-tuning
+
+### AMD
+
+Here are examples of LoRA fine-tuning of `Deepseek-V2-Lite` and GRPO fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` on an `MI300X` GPU using Hugging Face's [TRL :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/trl){:target="_blank"}.
+
+=== "LoRA"
+
+ + ```yaml + type: task + name: trl-train + + image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 + + env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite + - ACCELERATE_USE_FSDP=False + commands: + - git clone https://github.com/huggingface/peft.git + - pip install trl + - pip install "numpy<2" + - pip install peft + - pip install wandb + - cd peft/examples/sft + - python train.py + --seed 100 + --model_name_or_path "deepseek-ai/DeepSeek-V2-Lite" + --dataset_name "smangrul/ultrachat-10k-chatml" + --chat_template_format "chatml" + --add_special_tokens False + --append_concat_token False + --splits "train,test" + --max_seq_len 512 + --num_train_epochs 1 + --logging_steps 5 + --log_level "info" + --logging_strategy "steps" + --eval_strategy "epoch" + --save_strategy "epoch" + --hub_private_repo True + --hub_strategy "every_save" + --packing True + --learning_rate 1e-4 + --lr_scheduler_type "cosine" + --weight_decay 1e-4 + --warmup_ratio 0.0 + --max_grad_norm 1.0 + --output_dir "deepseek-sft-lora" + --per_device_train_batch_size 8 + --per_device_eval_batch_size 8 + --gradient_accumulation_steps 4 + --gradient_checkpointing True + --use_reentrant True + --dataset_text_field "content" + --use_peft_lora True + --lora_r 16 + --lora_alpha 16 + --lora_dropout 0.05 + --lora_target_modules "all-linear" + + resources: + gpu: MI300X + disk: 150GB + ``` +
+ +=== "GRPO" + +
+ ```yaml + type: task + name: trl-train-grpo + + image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 + + env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + commands: + - pip install trl + - pip install datasets + # numPy version less than 2 is required for the scipy installation with AMD. + - pip install "numpy<2" + - mkdir -p grpo_example + - cp examples/llms/deepseek/trl/amd/grpo_train.py grpo_example/grpo_train.py + - cd grpo_example + - python grpo_train.py + --model_name_or_path $MODEL_ID + --dataset_name trl-lib/tldr + --per_device_train_batch_size 2 + --logging_steps 25 + --output_dir Deepseek-Distill-Qwen-1.5B-GRPO + --trust_remote_code + + resources: + gpu: MI300X + disk: 150GB + ``` +
+
+Note that `GRPO` fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` consumes up to 135GB of VRAM.
+
+### Intel Gaudi
+
+Here is an example of LoRA fine-tuning of `DeepSeek-R1-Distill-Qwen-7B` on Intel Gaudi 2 GPUs using
+Hugging Face's [Optimum for Intel Gaudi :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-habana){:target="_blank"}
+and [DeepSpeed :material-arrow-top-right-thin:{ .external }](https://github.com/deepspeedai/DeepSpeed){:target="_blank"}. Both also support `LoRA`
+fine-tuning of `Deepseek-V2-Lite` with the same configuration as below.
+
+=== "LoRA"
+
+ ```yaml + type: task + name: trl-train + + image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 + + env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B + - WANDB_API_KEY + - WANDB_PROJECT + commands: + - pip install --upgrade-strategy eager optimum[habana] + - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 + - git clone https://github.com/huggingface/optimum-habana.git + - cd optimum-habana/examples/trl + - pip install -r requirements.txt + - pip install wandb + - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py + --model_name_or_path $MODEL_ID + --dataset_name "lvwerra/stack-exchange-paired" + --deepspeed ../language-modeling/llama2_ds_zero3_config.json + --output_dir="./sft" + --do_train + --max_steps=500 + --logging_steps=10 + --save_steps=100 + --per_device_train_batch_size=1 + --per_device_eval_batch_size=1 + --gradient_accumulation_steps=2 + --learning_rate=1e-4 + --lr_scheduler_type="cosine" + --warmup_steps=100 + --weight_decay=0.05 + --optim="paged_adamw_32bit" + --lora_target_modules "q_proj" "v_proj" + --bf16 + --remove_unused_columns=False + --run_name="sft_deepseek_70" + --report_to="wandb" + --use_habana + --use_lazy_mode + + resources: + gpu: gaudi2:8 + ``` + +
+
+
+
+### NVIDIA
+
+Here are examples of LoRA fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` and QLoRA fine-tuning of `DeepSeek-V2-Lite`
+on NVIDIA GPUs using Hugging Face's [TRL :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/trl){:target="_blank"} library.
+
+=== "LoRA"
+ + ```yaml + type: task + name: trl-train + + python: "3.10" + + env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + commands: + - git clone https://github.com/huggingface/trl.git + - pip install trl + - pip install peft + - pip install wandb + - cd trl/trl/scripts + - python sft.py + --model_name_or_path $MODEL_ID + --dataset_name trl-lib/Capybara + --learning_rate 2.0e-4 + --num_train_epochs 1 + --packing + --per_device_train_batch_size 2 + --gradient_accumulation_steps 8 + --gradient_checkpointing + --logging_steps 25 + --eval_strategy steps + --eval_steps 100 + --use_peft + --lora_r 32 + --lora_alpha 16 + --report_to wandb + --output_dir DeepSeek-R1-Distill-Qwen-1.5B-SFT + + resources: + gpu: 24GB + ``` +
+ +=== "QLoRA" +
+ + ```yaml + type: task + name: trl-train-deepseek-v2 + + python: "3.10" + nvcc: true + env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite + - ACCELERATE_USE_FSDP=False + commands: + - git clone https://github.com/huggingface/peft.git + - pip install trl + - pip install peft + - pip install wandb + - pip install bitsandbytes + - cd peft/examples/sft + - python train.py + --seed 100 + --model_name_or_path "deepseek-ai/DeepSeek-V2-Lite" + --dataset_name "smangrul/ultrachat-10k-chatml" + --chat_template_format "chatml" + --add_special_tokens False + --append_concat_token False + --splits "train,test" + --max_seq_len 512 + --num_train_epochs 1 + --logging_steps 5 + --log_level "info" + --logging_strategy "steps" + --eval_strategy "epoch" + --save_strategy "epoch" + --hub_private_repo True + --hub_strategy "every_save" + --bf16 True + --packing True + --learning_rate 1e-4 + --lr_scheduler_type "cosine" + --weight_decay 1e-4 + --warmup_ratio 0.0 + --max_grad_norm 1.0 + --output_dir "mistral-sft-lora" + --per_device_train_batch_size 8 + --per_device_eval_batch_size 8 + --gradient_accumulation_steps 4 + --gradient_checkpointing True + --use_reentrant True + --dataset_text_field "content" + --use_peft_lora True + --lora_r 16 + --lora_alpha 16 + --lora_dropout 0.05 + --lora_target_modules "all-linear" + --use_4bit_quantization True + --use_nested_quant True + --bnb_4bit_compute_dtype "bfloat16" + + resources: + # Consumes ~25GB of vRAM for QLoRA fine-tuning deepseek-ai/DeepSeek-V2-Lite + gpu: 48GB + ``` +
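+The LoRA and QLoRA estimates in the memory-requirements table below follow a simple heuristic: frozen base weights at
+roughly 2 bytes (LoRA) or 0.5 bytes (QLoRA, 4-bit) per parameter, plus about 16 bytes per trainable low-rank parameter,
+with the low-rank update matrices assumed to be about 1% of the model parameters. A short sketch of that arithmetic, as
+an approximation rather than a measurement:
+
+```python
+def finetune_memory_gb(params_billion: float, method: str, lora_fraction: float = 0.01) -> float:
+    """Rough fine-tuning memory estimate in GB, following the 1%-of-parameters LoRA heuristic."""
+    if method == "full":
+        # Weights, gradients, and optimizer states: ~16 bytes per parameter.
+        return params_billion * 16
+    frozen_bytes = {"lora": 2.0, "qlora": 0.5}[method]
+    # Frozen base weights plus ~16 bytes per trainable low-rank parameter.
+    return params_billion * frozen_bytes + params_billion * lora_fraction * 16
+
+
+print(finetune_memory_gb(671, "full"))   # ~10700 GB, roughly 10.5TB
+print(finetune_memory_gb(671, "lora"))   # ~1450 GB, roughly 1.4TB
+print(finetune_memory_gb(671, "qlora"))  # ~443 GB
+```
+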
+ +### Memory requirements + +| Model | Size | Full fine-tuning | LoRA | QLoRA | +|-----------------------------|----------|------------------|-------|-------| +| `Deepseek-R1` | **671B** | 10.5TB | 1.4TB | 442GB | +| `DeepSeek-R1-Distill-Llama` | **70B** | 1.09TB | 151GB | 46GB | +| `DeepSeek-R1-Distill-Qwen` | **32B** | 512GB | 70GB | 21GB | +| `DeepSeek-V2-Lite` | **16B** | 256GB | 35GB | 11GB | +| `DeepSeek-R1-Distill-Qwen` | **14B** | 224GB | 30GB | 9GB | +| `DeepSeek-R1-Distill-Llama` | **8B** | 128GB | 17GB | 5GB | +| `DeepSeek-R1-Distill-Qwen` | **7B** | 112GB | 15GB | 4GB | +| `DeepSeek-R1-Distill-Qwen` | **1.5B** | 24GB | 3.2GB | 1GB | + +The memory requirements assume low-rank update matrices are 1% of model parameters. In practice, a 7B model with QLoRA +needs 7–10GB due to intermediate hidden states. + +| Fine-tuning type | Calculation | +|------------------|--------------------------------------------------| +| Full fine-tuning | 671B × 16 bytes = 10.48TB | +| LoRA | 671B × 2 bytes + 1% of 671B × 16 bytes = 1.41TB | +| QLoRA(4-bit) | 671B × 0.5 bytes + 1% of 671B × 16 bytes = 442GB | + +## Source code + +The source-code of this example can be found in +[`examples/llms/deepseek` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek). + +!!! info "What's next?" + 1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), + [services](https://dstack.ai/docs/services), and [protips](https://dstack.ai/docs/protips). + diff --git a/examples/llms/deepseek/sglang/amd/.dstack.yml b/examples/llms/deepseek/sglang/amd/.dstack.yml new file mode 100644 index 000000000..99a19bfee --- /dev/null +++ b/examples/llms/deepseek/sglang/amd/.dstack.yml @@ -0,0 +1,18 @@ +type: service +name: deepseek-r1-amd + +image: lmsysorg/sglang:v0.4.1.post4-rocm620 +env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B +commands: + - python3 -m sglang.launch_server + --model-path $MODEL_ID + --port 8000 + --trust-remote-code + +port: 8000 +model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + +resources: + gpu: mi300x + disk: 300Gb diff --git a/examples/llms/deepseek/sglang/amd/deepseek_v2_lite.dstack.yml b/examples/llms/deepseek/sglang/amd/deepseek_v2_lite.dstack.yml new file mode 100644 index 000000000..01ef71a6b --- /dev/null +++ b/examples/llms/deepseek/sglang/amd/deepseek_v2_lite.dstack.yml @@ -0,0 +1,18 @@ +type: service +name: deepseek-v2-lite-amd + +image: lmsysorg/sglang:v0.4.1.post4-rocm620 +env: + - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite +commands: + - python3 -m sglang.launch_server + --model-path $MODEL_ID + --port 8000 + --trust-remote-code + +port: 8000 +model: deepseek-ai/DeepSeek-V2-Lite + +resources: + gpu: mi300x + disk: 150Gb diff --git a/examples/llms/deepseek/sglang/nvidia/.dstack.yml b/examples/llms/deepseek/sglang/nvidia/.dstack.yml new file mode 100644 index 000000000..d1c92b64d --- /dev/null +++ b/examples/llms/deepseek/sglang/nvidia/.dstack.yml @@ -0,0 +1,18 @@ +type: service +name: deepseek-r1-nvidia + +image: lmsysorg/sglang:latest +env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B +commands: + - python3 -m sglang.launch_server + --model-path $MODEL_ID + --port 8000 + --trust-remote-code + +port: 8000 + +model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B + +resources: + gpu: 24GB diff --git a/examples/llms/deepseek/sglang/nvidia/deepseek_v2_lite.dstack.yml b/examples/llms/deepseek/sglang/nvidia/deepseek_v2_lite.dstack.yml new file mode 100644 index 
000000000..8c0adaa41 --- /dev/null +++ b/examples/llms/deepseek/sglang/nvidia/deepseek_v2_lite.dstack.yml @@ -0,0 +1,19 @@ +# Not Working https://github.com/sgl-project/sglang/issues/3451 +type: service +name: deepseek-v2-lite-nvidia + +image: lmsysorg/sglang:latest +env: + - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite +commands: + - python3 -m sglang.launch_server + --model-path $MODEL_ID + --port 8000 + --trust-remote-code + +port: 8000 + +model: deepseek-ai/DeepSeek-V2-Lite + +resources: + gpu: 80GB diff --git a/examples/llms/deepseek/tgi/intel/.dstack.yml b/examples/llms/deepseek/tgi/intel/.dstack.yml new file mode 100644 index 000000000..16d083092 --- /dev/null +++ b/examples/llms/deepseek/tgi/intel/.dstack.yml @@ -0,0 +1,45 @@ +type: service + +name: tgi + +image: ghcr.io/huggingface/tgi-gaudi:2.3.1 + +auth: false +port: 8000 + +model: DeepSeek-R1-Distill-Llama-70B + +env: + - HF_TOKEN + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B + - PORT=8000 + - OMPI_MCA_btl_vader_single_copy_mechanism=none + - TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN=true + - PT_HPU_ENABLE_LAZY_COLLECTIVES=true + - MAX_TOTAL_TOKENS=2048 + - BATCH_BUCKET_SIZE=256 + - PREFILL_BATCH_BUCKET_SIZE=4 + - PAD_SEQUENCE_TO_MULTIPLE_OF=64 + - ENABLE_HPU_GRAPH=true + - LIMIT_HPU_GRAPH=true + - USE_FLASH_ATTENTION=true + - FLASH_ATTENTION_RECOMPUTE=true + +commands: + - text-generation-launcher + --sharded true + --num-shard 8 + --max-input-length 1024 + --max-total-tokens 2048 + --max-batch-prefill-tokens 4096 + --max-batch-total-tokens 524288 + --max-waiting-tokens 7 + --waiting-served-ratio 1.2 + --max-concurrent-requests 512 + +resources: + gpu: Gaudi2:8 + +# Uncomment to cache downloaded models +#volumes: +# - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub diff --git a/examples/llms/deepseek/trl/amd/.dstack.yml b/examples/llms/deepseek/trl/amd/.dstack.yml new file mode 100644 index 000000000..fe3dbc31a --- /dev/null +++ b/examples/llms/deepseek/trl/amd/.dstack.yml @@ -0,0 +1,41 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-train + +image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 + +# Required environment variables +env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + +# Commands of the task +commands: + - git clone https://github.com/huggingface/trl.git + - pip install trl + - pip install "numpy<2" + - pip install peft + - pip install wandb + - cd trl/trl/scripts + - python sft.py + --model_name_or_path $MODEL_ID + --dataset_name trl-lib/Capybara + --learning_rate 2.0e-4 + --num_train_epochs 1 + --packing + --per_device_train_batch_size 2 + --gradient_accumulation_steps 8 + --gradient_checkpointing + --logging_steps 25 + --eval_strategy steps + --eval_steps 100 + --use_peft + --lora_r 32 + --lora_alpha 16 + --report_to wandb + --output_dir DeepSeek-R1-Distill-Qwen-1.5B-SFT + +resources: + gpu: MI300X + disk: 150GB diff --git a/examples/llms/deepseek/trl/amd/deepseek_v2.dstack.yml b/examples/llms/deepseek/trl/amd/deepseek_v2.dstack.yml new file mode 100644 index 000000000..4c719dcd5 --- /dev/null +++ b/examples/llms/deepseek/trl/amd/deepseek_v2.dstack.yml @@ -0,0 +1,59 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-train + +image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 + +# Required environment variables +env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite + - ACCELERATE_USE_FSDP=False +# Commands of the 
task +commands: + - git clone https://github.com/huggingface/peft.git + - pip install trl + - pip install "numpy<2" + - pip install peft + - pip install wandb + - cd peft/examples/sft + - python train.py + --seed 100 + --model_name_or_path "deepseek-ai/DeepSeek-V2-Lite" + --dataset_name "smangrul/ultrachat-10k-chatml" + --chat_template_format "chatml" + --add_special_tokens False + --append_concat_token False + --splits "train,test" + --max_seq_len 512 + --num_train_epochs 1 + --logging_steps 5 + --log_level "info" + --logging_strategy "steps" + --eval_strategy "epoch" + --save_strategy "epoch" + --hub_private_repo True + --hub_strategy "every_save" + --packing True + --learning_rate 1e-4 + --lr_scheduler_type "cosine" + --weight_decay 1e-4 + --warmup_ratio 0.0 + --max_grad_norm 1.0 + --output_dir "deepseek-sft-lora" + --per_device_train_batch_size 8 + --per_device_eval_batch_size 8 + --gradient_accumulation_steps 4 + --gradient_checkpointing True + --use_reentrant True + --dataset_text_field "content" + --use_peft_lora True + --lora_r 16 + --lora_alpha 16 + --lora_dropout 0.05 + --lora_target_modules "all-linear" + +resources: + gpu: MI300X + disk: 150GB diff --git a/examples/llms/deepseek/trl/amd/grpo.dstack.yml b/examples/llms/deepseek/trl/amd/grpo.dstack.yml new file mode 100644 index 000000000..f866bb1ca --- /dev/null +++ b/examples/llms/deepseek/trl/amd/grpo.dstack.yml @@ -0,0 +1,32 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-train-grpo + +image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 + +# Required environment variables +env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +# Commands of the task +commands: + - pip install trl + - pip install datasets + # numpy version less than 2 is required for the scipy installation with AMD. 
+ - pip install "numpy<2" + - mkdir -p grpo_example + - cp examples/llms/deepseek/trl/amd/grpo_train.py grpo_example/grpo_train.py + - cd grpo_example + - python grpo_train.py + --model_name_or_path $MODEL_ID + --dataset_name trl-lib/tldr + --per_device_train_batch_size 2 + --logging_steps 25 + --output_dir Deepseek-Distill-Qwen-1.5B-GRPO + --trust_remote_code + +# GRPO fine-tuning of DeepSeek-R1-Distill-Qwen-1.5B consumes 70% of VRAM +resources: + gpu: MI300X + disk: 150GB diff --git a/examples/llms/deepseek/trl/amd/grpo_train.py b/examples/llms/deepseek/trl/amd/grpo_train.py new file mode 100644 index 000000000..ab59291de --- /dev/null +++ b/examples/llms/deepseek/trl/amd/grpo_train.py @@ -0,0 +1,60 @@ +import argparse + +from datasets import load_dataset +from transformers import AutoModelForCausalLM +from trl import GRPOConfig, GRPOTrainer + + +def parse_args(): + parser = argparse.ArgumentParser(description="Train a model using GRPOTrainer.") + parser.add_argument( + "--model_name_or_path", + type=str, + required=True, + help="Path to the model or model identifier from huggingface.co/models", + ) + parser.add_argument( + "--dataset_name", type=str, required=True, help="Name of the dataset to use" + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=1, + help="Batch size per device for training", + ) + parser.add_argument("--logging_steps", type=int, default=10, help="Logging steps interval") + parser.add_argument( + "--output_dir", type=str, default="output", help="Output directory for the trained model" + ) + parser.add_argument( + "--trust_remote_code", action="store_true", help="Trust remote code when loading the model" + ) + return parser.parse_args() + + +def reward_len(completions, **kwargs): + return [abs(20 - len(completion)) for completion in completions] + + +def main(): + args = parse_args() + + dataset = load_dataset(args.dataset_name, split="train") + training_args = GRPOConfig( + output_dir=args.output_dir, + logging_steps=args.logging_steps, + per_device_train_batch_size=args.per_device_train_batch_size, + ) + + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, trust_remote_code=args.trust_remote_code + ) + trainer = GRPOTrainer( + model=model, reward_funcs=reward_len, args=training_args, train_dataset=dataset + ) + + trainer.train() + + +if __name__ == "__main__": + main() diff --git a/examples/llms/deepseek/trl/intel/.dstack.yml b/examples/llms/deepseek/trl/intel/.dstack.yml new file mode 100644 index 000000000..9963e4844 --- /dev/null +++ b/examples/llms/deepseek/trl/intel/.dstack.yml @@ -0,0 +1,46 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-train + +image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 + +# Required environment variables +env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B + - WANDB_API_KEY + - WANDB_PROJECT +# Commands of the task +commands: + - pip install --upgrade-strategy eager optimum[habana] + - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 + - git clone https://github.com/huggingface/optimum-habana.git + - cd optimum-habana/examples/trl + - pip install -r requirements.txt + - pip install wandb + - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py + --model_name_or_path $MODEL_ID + --dataset_name "lvwerra/stack-exchange-paired" + --deepspeed ../language-modeling/llama2_ds_zero3_config.json + --output_dir="./sft" + --do_train + 
--max_steps=500 + --logging_steps=10 + --save_steps=100 + --per_device_train_batch_size=1 + --per_device_eval_batch_size=1 + --gradient_accumulation_steps=2 + --learning_rate=1e-4 + --lr_scheduler_type="cosine" + --warmup_steps=100 + --weight_decay=0.05 + --optim="paged_adamw_32bit" + --lora_target_modules "q_proj" "v_proj" + --bf16 + --remove_unused_columns=False + --run_name="sft_deepseek_70" + --report_to="wandb" + --use_habana + --use_lazy_mode + +resources: + gpu: gaudi2:8 diff --git a/examples/llms/deepseek/trl/intel/deepseek_v2.dstack.yml b/examples/llms/deepseek/trl/intel/deepseek_v2.dstack.yml new file mode 100644 index 000000000..7aa13d677 --- /dev/null +++ b/examples/llms/deepseek/trl/intel/deepseek_v2.dstack.yml @@ -0,0 +1,45 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-train-deepseek-v2-lite + +image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 + +# Required environment variables +env: + - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite + - WANDB_API_KEY + - WANDB_PROJECT +# Commands of the task +commands: + - pip install git+https://github.com/huggingface/optimum-habana.git + - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 + - git clone https://github.com/huggingface/optimum-habana.git + - cd optimum-habana/examples/trl + - pip install -r requirements.txt + - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py + --model_name_or_path $MODEL_ID + --dataset_name "lvwerra/stack-exchange-paired" + --deepspeed ../language-modeling/llama2_ds_zero3_config.json + --output_dir="./sft" + --do_train + --max_steps=500 + --logging_steps=10 + --save_steps=100 + --per_device_train_batch_size=1 + --per_device_eval_batch_size=1 + --gradient_accumulation_steps=2 + --learning_rate=1e-4 + --lr_scheduler_type="cosine" + --warmup_steps=100 + --weight_decay=0.05 + --optim="paged_adamw_32bit" + --lora_target_modules "q_proj" "v_proj" + --bf16 + --remove_unused_columns=False + --run_name="sft_deepseek_v2lite" + --report_to="wandb" + --use_habana + --use_lazy_mode + +resources: + gpu: gaudi2:8 diff --git a/examples/llms/deepseek/trl/nvidia/.dstack.yml b/examples/llms/deepseek/trl/nvidia/.dstack.yml new file mode 100644 index 000000000..09444040c --- /dev/null +++ b/examples/llms/deepseek/trl/nvidia/.dstack.yml @@ -0,0 +1,38 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-train + +python: "3.10" + +# Required environment variables +env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +# Commands of the task +commands: + - git clone https://github.com/huggingface/trl.git + - pip install trl + - pip install peft + - pip install wandb + - cd trl/trl/scripts + - python sft.py + --model_name_or_path $MODEL_ID + --dataset_name trl-lib/Capybara + --learning_rate 2.0e-4 + --num_train_epochs 1 + --packing + --per_device_train_batch_size 2 + --gradient_accumulation_steps 8 + --gradient_checkpointing + --logging_steps 25 + --eval_strategy steps + --eval_steps 100 + --use_peft + --lora_r 32 + --lora_alpha 16 + --report_to wandb + --output_dir DeepSeek-R1-Distill-Qwen-1.5B-SFT + +resources: + gpu: 24GB diff --git a/examples/llms/deepseek/trl/nvidia/deepseek_v2.dstack.yml b/examples/llms/deepseek/trl/nvidia/deepseek_v2.dstack.yml new file mode 100644 index 000000000..9c6850270 --- /dev/null +++ b/examples/llms/deepseek/trl/nvidia/deepseek_v2.dstack.yml @@ -0,0 +1,64 @@ +type: task +# The 
name is optional, if not specified, generated randomly +name: trl-train-deepseek-v2 + +python: "3.10" + +nvcc: true +# Required environment variables +env: + - WANDB_API_KEY + - WANDB_PROJECT + - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite + - ACCELERATE_USE_FSDP=False +# Commands of the task +commands: + - git clone https://github.com/huggingface/peft.git + - pip install trl + - pip install peft + - pip install wandb + - pip install bitsandbytes + - cd peft/examples/sft + - python train.py + --seed 100 + --model_name_or_path "deepseek-ai/DeepSeek-V2-Lite" + --dataset_name "smangrul/ultrachat-10k-chatml" + --chat_template_format "chatml" + --add_special_tokens False + --append_concat_token False + --splits "train,test" + --max_seq_len 512 + --num_train_epochs 1 + --logging_steps 5 + --log_level "info" + --logging_strategy "steps" + --eval_strategy "epoch" + --save_strategy "epoch" + --hub_private_repo True + --hub_strategy "every_save" + --bf16 True + --packing True + --learning_rate 1e-4 + --lr_scheduler_type "cosine" + --weight_decay 1e-4 + --warmup_ratio 0.0 + --max_grad_norm 1.0 + --output_dir "deepseek-sft-lora" + --per_device_train_batch_size 8 + --per_device_eval_batch_size 8 + --gradient_accumulation_steps 4 + --gradient_checkpointing True + --use_reentrant True + --dataset_text_field "content" + --use_peft_lora True + --lora_r 16 + --lora_alpha 16 + --lora_dropout 0.05 + --lora_target_modules "all-linear" + --use_4bit_quantization True + --use_nested_quant True + --bnb_4bit_compute_dtype "bfloat16" + +resources: + # Consumes ~25GB of vRAM for QLoRA fine-tuning deepseek-ai/DeepSeek-V2-Lite + gpu: 48GB diff --git a/examples/llms/deepseek/vllm/amd/.dstack.yml b/examples/llms/deepseek/vllm/amd/.dstack.yml new file mode 100644 index 000000000..23bfb033c --- /dev/null +++ b/examples/llms/deepseek/vllm/amd/.dstack.yml @@ -0,0 +1,19 @@ +type: service +name: deepseek-r1-amd + +image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 +env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B + - MAX_MODEL_LEN=126432 +commands: + - vllm serve $MODEL_ID + --max-model-len $MAX_MODEL_LEN + --trust-remote-code +port: 8000 + +model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + + +resources: + gpu: mi300x + disk: 300Gb diff --git a/examples/llms/deepseek/vllm/amd/deepseek_v2_lite.dstack.yml b/examples/llms/deepseek/vllm/amd/deepseek_v2_lite.dstack.yml new file mode 100644 index 000000000..8937e9526 --- /dev/null +++ b/examples/llms/deepseek/vllm/amd/deepseek_v2_lite.dstack.yml @@ -0,0 +1,18 @@ +type: service +name: deepseek-v2-lite-amd + +image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 +env: + - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite +commands: + - vllm serve $MODEL_ID + --trust-remote-code + +port: 8000 + +model: deepseek-ai/DeepSeek-V2-Lite + + +resources: + gpu: mi300x + disk: 150Gb diff --git a/examples/llms/deepseek/vllm/intel/.dstack.yml b/examples/llms/deepseek/vllm/intel/.dstack.yml new file mode 100644 index 000000000..d28a0152d --- /dev/null +++ b/examples/llms/deepseek/vllm/intel/.dstack.yml @@ -0,0 +1,31 @@ +type: service +name: deepseek-r1-gaudi + +image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + +env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B + - HABANA_VISIBLE_DEVICES=all + - OMPI_MCA_btl_vader_single_copy_mechanism=none + +commands: + - git clone https://github.com/HabanaAI/vllm-fork.git + - cd vllm-fork + - git checkout habana_main + - pip install -r requirements-hpu.txt + - python setup.py develop + - vllm 
serve $MODEL_ID + --tensor-parallel-size 8 + --trust-remote-code + --download-dir /data + +port: 8000 + +model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + +resources: + gpu: gaudi2:8 + +# Uncomment to cache downloaded models +#volumes: +# - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub diff --git a/examples/llms/deepseek/vllm/nvidia/.dstack.yml b/examples/llms/deepseek/vllm/nvidia/.dstack.yml new file mode 100644 index 000000000..e623b182c --- /dev/null +++ b/examples/llms/deepseek/vllm/nvidia/.dstack.yml @@ -0,0 +1,17 @@ +type: service +name: deepseek-r1-nvidia + +image: vllm/vllm-openai:latest +env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B + - MAX_MODEL_LEN=4096 +commands: + - vllm serve $MODEL_ID + --max-model-len $MAX_MODEL_LEN + +port: 8000 + +model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B + +resources: + gpu: 24GB diff --git a/examples/llms/deepseek/vllm/nvidia/deepseek_v2_lite.dstack.yml b/examples/llms/deepseek/vllm/nvidia/deepseek_v2_lite.dstack.yml new file mode 100644 index 000000000..06e78f379 --- /dev/null +++ b/examples/llms/deepseek/vllm/nvidia/deepseek_v2_lite.dstack.yml @@ -0,0 +1,19 @@ +type: service +name: deepseek-v2-lite-nvidia + +image: vllm/vllm-openai:latest +env: + - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite + - MAX_MODEL_LEN=4096 +commands: + - vllm serve $MODEL_ID + --max-model-len $MAX_MODEL_LEN + --tensor-parallel-size $DSTACK_GPUS_NUM + --trust-remote-code + +port: 8000 + +model: deepseek-ai/DeepSeek-V2-Lite + +resources: + gpu: 48GB diff --git a/examples/llms/llama31/README.md b/examples/llms/llama31/README.md index a345d56b0..fb754e4fc 100644 --- a/examples/llms/llama31/README.md +++ b/examples/llms/llama31/README.md @@ -181,7 +181,7 @@ Provisioning...

Once the service is up, the model will be available via the OpenAI-compatible endpoint -at `/proxy/models//. +at `/proxy/models//`.
diff --git a/mkdocs.yml b/mkdocs.yml index 77d47e3ef..961097f02 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -265,13 +265,15 @@ nav: - TRL: examples/fine-tuning/trl/index.md - Accelerators: - AMD: examples/accelerators/amd/index.md + - Intel Gaudi: examples/accelerators/intel/index.md - TPU: examples/accelerators/tpu/index.md - LLMs: + - Deepseek: examples/llms/deepseek/index.md - Llama 3.1: examples/llms/llama31/index.md - Llama 3.2: examples/llms/llama32/index.md - Misc: - Docker Compose: examples/misc/docker-compose/index.md - - Community: community.md +# - Community: community.md - Partners: partners.md - Blog: - blog/index.md