diff --git a/contributing/DOCS.md b/contributing/DOCS.md index a885c5f51e..4fcc04d6d1 100644 --- a/contributing/DOCS.md +++ b/contributing/DOCS.md @@ -52,6 +52,19 @@ uv run mkdocs build -s The documentation uses a custom build system with MkDocs hooks to generate various files dynamically. +### Disable flags + +Use these in `.envrc` to disable expensive docs regeneration, especially during `mkdocs serve` auto-reload. Set any of them to disable the corresponding artifact. + +```shell +export DSTACK_DOCS_DISABLE_EXAMPLES=1 +export DSTACK_DOCS_DISABLE_LLM_TXT=1 +export DSTACK_DOCS_DISABLE_CLI_REFERENCE=1 +export DSTACK_DOCS_DISABLE_YAML_SCHEMAS=1 +export DSTACK_DOCS_DISABLE_OPENAPI_REFERENCE=1 +export DSTACK_DOCS_DISABLE_REST_PLUGIN_SPEC_REFERENCE=1 +``` + ### Build hooks The build process is customized via hooks in `scripts/docs/hooks.py`: diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index fcde5e2e73..cb2d68e55d 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -1615,7 +1615,8 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { .md-typeset.md-banner__inner a { color: var(--md-default-bg-color); /* border-bottom: 1.5px dotted; */ - font-weight: 500; + /* font-weight: 500; */ + font-size: 0.75rem; } .md-typeset.md-banner__inner .md-banner__button svg { diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index 1eb63dd01e..685b793bc9 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -1093,6 +1093,5 @@ The rolling deployment stops when all replicas are updated or when a new deploym 1. Read about [dev environments](dev-environments.md) and [tasks](tasks.md) 2. Learn how to manage [fleets](fleets.md) 3. See how to set up [gateways](gateways.md) - 4. Check the [TGI](../../examples/inference/tgi/index.md), - [vLLM](../../examples/inference/vllm/index.md), and + 4. 
Check the [vLLM](../../examples/inference/vllm/index.md) and [NIM](../../examples/inference/nim/index.md) examples diff --git a/docs/docs/index.md b/docs/docs/index.md index 8afc24fdb5..4edaaee798 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -16,11 +16,11 @@ It streamlines development, training, and inference, and is compatible with any -#### 1. Set up the server +### Set up the server > Before using `dstack`, ensure you've [installed](installation.md) the server, or signed up for [dstack Sky](https://sky.dstack.ai). -#### 2. Define configurations +### Define configurations `dstack` supports the following configurations: @@ -32,7 +32,7 @@ It streamlines development, training, and inference, and is compatible with any Configuration can be defined as YAML files within your repo. -#### 3. Apply configurations +### Apply configurations Apply the configuration either via the `dstack apply` CLI command (or through a programmatic API.) diff --git a/docs/docs/installation.md b/docs/docs/installation.md index d555f0873d..0ff4f624a8 100644 --- a/docs/docs/installation.md +++ b/docs/docs/installation.md @@ -177,6 +177,8 @@ Once the server is up, you can access it via the `dstack` CLI. ### Configure the project +When the server is started, by default, it creates the `main` project and the `admin` user. + To point the CLI to the `dstack` server, configure it with the server address, user token, and project name: @@ -195,6 +197,12 @@ Configuration is updated at ~/.dstack/config.yml This configuration is stored in `~/.dstack/config.yml`. +Later, you can create additional projects and users. + +### Use CLI or API + +Once the project is configured, you can use the `dstack` CLI or API. + ## Install agent skills Install [`dstack` skills](https://skills.sh/dstackai/dstack/dstack) to help AI agents use the CLI and edit configuration files. 
@@ -207,6 +215,8 @@ $ npx skills add dstackai/dstack +### Use agents + AI agents like Claude, Codex, and Cursor can now create and manage fleets and submit workloads on your behalf.
@@ -233,10 +243,9 @@ $
-!!! info "Feedback" - We're actively improving Skills and would love your feedback in [GitHub issues](https://github.com/dstackai/dstack/issues). +We're actively improving Skills and would love your feedback in [GitHub issues](https://github.com/dstackai/dstack/issues). !!! info "What's next?" 1. See [Backends](concepts/backends.md) 2. Follow [Quickstart](quickstart.md) - 3. Check the [server deployment](guides/server-deployment.md) guide + 3. Check the [Server deployment](guides/server-deployment.md) guide diff --git a/docs/docs/reference/dstack.yml/service.md b/docs/docs/reference/dstack.yml/service.md index 59411a540d..8aba6f827e 100644 --- a/docs/docs/reference/dstack.yml/service.md +++ b/docs/docs/reference/dstack.yml/service.md @@ -20,51 +20,6 @@ The `service` configuration type allows running [services](../../concepts/servic type: required: true -=== "TGI" - - > TGI provides an OpenAI-compatible API starting with version 1.4.0, - so models served by TGI can be defined with `format: openai` too. - - #SCHEMA# dstack.api.TGIChatModel - overrides: - show_root_heading: false - type: - required: true - - ??? info "Chat template" - - By default, `dstack` loads the [chat template](https://huggingface.co/docs/transformers/main/en/chat_templating) - from the model's repository. If it is not present there, manual configuration is required. 
- - ```yaml - type: service - - image: ghcr.io/huggingface/text-generation-inference:latest - env: - - MODEL_ID=TheBloke/Llama-2-13B-chat-GPTQ - commands: - - text-generation-launcher --port 8000 --trust-remote-code --quantize gptq - port: 8000 - - resources: - gpu: 80GB - - # Enable the OpenAI-compatible endpoint - model: - type: chat - name: TheBloke/Llama-2-13B-chat-GPTQ - format: tgi - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' }}{% endif %}{% endfor %}" - eos_token: "" - ``` - - Please note that model mapping is an experimental feature with the following limitations: - - 1. Doesn't work if your `chat_template` uses `bos_token`. As a workaround, replace `bos_token` inside `chat_template` with the token content itself. - 2. Doesn't work if `eos_token` is defined in the model repository as a dictionary. As a workaround, set `eos_token` manually, as shown in the example above (see Chat template). - - If you encounter any ofther issues, please make sure to file a - [GitHub issue](https://github.com/dstackai/dstack/issues/new/choose). 
### `scaling` diff --git a/docs/examples.md b/docs/examples.md index e57a41cf52..cbecf2435e 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -3,16 +3,16 @@ title: Examples description: Collection of examples for training, inference, and clusters #template: examples.html hide: - - navigation -# - toc - - footer +# - navigation + - toc +# - footer --- - + --> ## Single-node training @@ -165,15 +165,6 @@ hide: Deploy Llama 3.1 with vLLM

- -

- TGI -

-

- Deploy Llama 4 with TGI -

-

@@ -219,17 +210,6 @@ hide:

- -

- Intel Gaudi -

- -

- Deploy and fine-tune LLMs on Intel Gaudi -

-
-

diff --git a/docs/examples/inference/tgi/index.md b/docs/examples/inference/tgi/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/overrides/main.html b/docs/overrides/main.html index f35bbf5401..3ae52c2be3 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -191,7 +191,7 @@ {% endblock %} {% block announce %} -Let agents manage infra and orchestrate workloads +Infrastructure orchestration is an agent skill {% endblock %} {% block footer %} diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md index 9dfe364410..3f6b4966b1 100644 --- a/examples/accelerators/amd/README.md +++ b/examples/accelerators/amd/README.md @@ -1,6 +1,6 @@ --- title: AMD -description: Deploying and fine-tuning models on AMD MI300X GPUs using TGI, vLLM, and Axolotl +description: Deploying and fine-tuning models on AMD MI300X GPUs using vLLM, TRL, and Axolotl --- # AMD @@ -11,41 +11,8 @@ with on-prem AMD GPUs or configuring a backend that offers AMD GPUs such as the ## Deployment -Most serving frameworks including vLLM and TGI have AMD support. Here's an example of a [service](https://dstack.ai/docs/services) that deploys -Llama 3.1 70B in FP16 using [TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd) and [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html). - -=== "TGI" - -
- - ```yaml - type: service - name: amd-service-tgi - - # Using the official TGI's ROCm Docker image - image: ghcr.io/huggingface/text-generation-inference:sha-a379d55-rocm - - env: - - HF_TOKEN - - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct - - TRUST_REMOTE_CODE=true - - ROCM_USE_FLASH_ATTN_V2_TRITON=true - commands: - - text-generation-launcher --port 8000 - port: 8000 - # Register the model - model: meta-llama/Meta-Llama-3.1-70B-Instruct - - # Uncomment to leverage spot instances - #spot_policy: auto - - resources: - gpu: MI300X - disk: 150GB - ``` - -
- +vLLM supports AMD GPUs. Here's an example of a [service](https://dstack.ai/docs/services) that deploys +Llama 3.1 70B in FP16 using [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html). === "vLLM" @@ -97,6 +64,7 @@ Llama 3.1 70B in FP16 using [TGI](https://huggingface.co/docs/text-generation-in gpu: MI300X disk: 200GB ``` + Note, maximum size of vLLM’s `KV cache` is 126192, consequently we must set `MAX_MODEL_LEN` to 126192. Adding `/opt/conda/envs/py_3.10/bin` to PATH ensures we use the Python 3.10 environment necessary for the pre-built binaries compiled specifically for this version. @@ -244,15 +212,13 @@ $ dstack apply -f examples/inference/vllm/amd/.dstack.yml ## Source code The source-code of this example can be found in -[`examples/inference/tgi/amd`](https://github.com/dstackai/dstack/blob/master/examples/inference/tgi/amd), [`examples/inference/vllm/amd`](https://github.com/dstackai/dstack/blob/master/examples/inference/vllm/amd), [`examples/single-node-training/axolotl/amd`](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/axolotl/amd) and [`examples/single-node-training/trl/amd`](https://github.com/dstackai/dstack/blob/master/examples/single-node-training/trl/amd) ## What's next? -1. Browse [TGI](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/deploy-your-model.html#serving-using-hugging-face-tgi), - [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-rocm), +1. 
Browse [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-rocm), [Axolotl](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl), [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html) and [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes) diff --git a/examples/accelerators/intel/README.md b/examples/accelerators/intel/README.md deleted file mode 100644 index 0e2a629f2f..0000000000 --- a/examples/accelerators/intel/README.md +++ /dev/null @@ -1,193 +0,0 @@ ---- -title: Intel Gaudi -description: Deploying and fine-tuning models on Intel Gaudi accelerators using TGI, vLLM, and Optimum ---- - -# Intel Gaudi - -`dstack` supports running dev environments, tasks, and services on Intel Gaudi GPUs via -[SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh-fleets). - -## Deployment - -Serving frameworks like vLLM and TGI have Intel Gaudi support. Here's an example of -a service that deploys -[`DeepSeek-R1-Distill-Llama-70B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) -using [TGI on Gaudi](https://github.com/huggingface/tgi-gaudi) -and [vLLM](https://github.com/HabanaAI/vllm-fork). - -=== "TGI" -
- - ```yaml - type: service - name: tgi - - image: ghcr.io/huggingface/tgi-gaudi:2.3.1 - env: - - HF_TOKEN - - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B - - PORT=8000 - - OMPI_MCA_btl_vader_single_copy_mechanism=none - - TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN=true - - PT_HPU_ENABLE_LAZY_COLLECTIVES=true - - MAX_TOTAL_TOKENS=2048 - - BATCH_BUCKET_SIZE=256 - - PREFILL_BATCH_BUCKET_SIZE=4 - - PAD_SEQUENCE_TO_MULTIPLE_OF=64 - - ENABLE_HPU_GRAPH=true - - LIMIT_HPU_GRAPH=true - - USE_FLASH_ATTENTION=true - - FLASH_ATTENTION_RECOMPUTE=true - commands: - - text-generation-launcher - --sharded true - --num-shard $DSTACK_GPUS_NUM - --max-input-length 1024 - --max-total-tokens 2048 - --max-batch-prefill-tokens 4096 - --max-batch-total-tokens 524288 - --max-waiting-tokens 7 - --waiting-served-ratio 1.2 - --max-concurrent-requests 512 - port: 8000 - model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B - - resources: - gpu: gaudi2:8 - - # Uncomment to cache downloaded models - #volumes: - # - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub - ``` - -
- -=== "vLLM" - -
- - ```yaml - type: service - name: deepseek-r1-gaudi - - image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - env: - - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B - - HABANA_VISIBLE_DEVICES=all - - OMPI_MCA_btl_vader_single_copy_mechanism=none - commands: - - git clone https://github.com/HabanaAI/vllm-fork.git - - cd vllm-fork - - git checkout habana_main - - pip install -r requirements-hpu.txt - - python setup.py develop - - vllm serve $MODEL_ID - --tensor-parallel-size 8 - --trust-remote-code - --download-dir /data - port: 8000 - model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B - - - resources: - gpu: gaudi2:8 - - # Uncomment to cache downloaded models - #volumes: - # - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub - ``` -
- - -## Fine-tuning - -Below is an example of LoRA fine-tuning of [`DeepSeek-R1-Distill-Qwen-7B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) -using [Optimum for Intel Gaudi](https://github.com/huggingface/optimum-habana) -and [DeepSpeed](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide) with -the [`lvwerra/stack-exchange-paired`](https://huggingface.co/datasets/lvwerra/stack-exchange-paired) dataset. - -
- -```yaml -type: task -name: trl-train - -image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 -env: - - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B - - WANDB_API_KEY - - WANDB_PROJECT -commands: - - pip install --upgrade-strategy eager optimum[habana] - - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 - - git clone https://github.com/huggingface/optimum-habana.git - - cd optimum-habana/examples/trl - - pip install -r requirements.txt - - pip install wandb - - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size $DSTACK_GPUS_NUM --use_deepspeed sft.py - --model_name_or_path $MODEL_ID - --dataset_name "lvwerra/stack-exchange-paired" - --deepspeed ../language-modeling/llama2_ds_zero3_config.json - --output_dir="./sft" - --do_train - --max_steps=500 - --logging_steps=10 - --save_steps=100 - --per_device_train_batch_size=1 - --per_device_eval_batch_size=1 - --gradient_accumulation_steps=2 - --learning_rate=1e-4 - --lr_scheduler_type="cosine" - --warmup_steps=100 - --weight_decay=0.05 - --optim="paged_adamw_32bit" - --lora_target_modules "q_proj" "v_proj" - --bf16 - --remove_unused_columns=False - --run_name="sft_deepseek_70" - --report_to="wandb" - --use_habana - --use_lazy_mode - -resources: - gpu: gaudi2:8 -``` - -
- -To finetune `DeepSeek-R1-Distill-Llama-70B` with eight Gaudi 2, -you can partially offload parameters to CPU memory using the Deepspeed configuration file. -For more details, refer to [parameter offloading](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeedzerooffloadparamconfig). - -## Applying a configuration - -Once the configuration is ready, run `dstack apply -f `. - -
- -```shell -$ dstack apply -f examples/inference/vllm/.dstack.yml - - # BACKEND REGION RESOURCES SPOT PRICE - 1 ssh remote 152xCPU,1007GB,8xGaudi2:96GB yes $0 idle - -Submit a new run? [y/n]: y - -Provisioning... ----> 100% -``` - -
- -## Source code - -The source-code of this example can be found in -[`examples/llms/deepseek/tgi/intel`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/tgi/intel), -[`examples/llms/deepseek/vllm/intel`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/vllm/intel) and -[`examples/llms/deepseek/trl/intel`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/trl/intel). - -!!! info "What's next?" - 1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services). - 2. See also [Intel Gaudi Documentation](https://docs.habana.ai/en/latest/index.html), [vLLM Inference with Gaudi](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/vLLM_Inference.html) - and [Optimum for Gaudi examples](https://github.com/huggingface/optimum-habana/blob/main/examples/trl/README.md). diff --git a/examples/inference/tgi/.dstack.yml b/examples/inference/tgi/.dstack.yml deleted file mode 100644 index 67fe1179d4..0000000000 --- a/examples/inference/tgi/.dstack.yml +++ /dev/null @@ -1,32 +0,0 @@ -type: service -name: llama4-scout - -image: ghcr.io/huggingface/text-generation-inference:latest - -env: - - HF_TOKEN - - MODEL_ID=meta-llama/Llama-4-Scout-17B-16E-Instruct - - MAX_INPUT_LENGTH=8192 - - MAX_TOTAL_TOKENS=16384 - # max_batch_prefill_tokens must be >= max_input_tokens - - MAX_BATCH_PREFILL_TOKENS=8192 -commands: - # Activate the virtual environment at /usr/src/.venv/ - # as required by TGI's latest image. - - . /usr/src/.venv/bin/activate - - NUM_SHARD=$DSTACK_GPUS_NUM text-generation-launcher - -port: 80 -# Register the model -model: meta-llama/Llama-4-Scout-17B-16E-Instruct - -# Uncomment to leverage spot instances -#spot_policy: auto - -# Uncomment to cache downloaded models -#volumes: -# - /data:/data - -resources: - gpu: H200:2 - disk: 500GB.. 
diff --git a/examples/inference/tgi/README.md b/examples/inference/tgi/README.md deleted file mode 100644 index 08a1de74db..0000000000 --- a/examples/inference/tgi/README.md +++ /dev/null @@ -1,124 +0,0 @@ ---- -title: HuggingFace TGI -description: Deploying Llama 4 Scout using HuggingFace Text Generation Inference ---- - -# HuggingFace TGI - -This example shows how to deploy Llama 4 Scout with `dstack` using [HuggingFace TGI](https://huggingface.co/docs/text-generation-inference/en/index). - -??? info "Prerequisites" - Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples. - -
- - ```shell - $ git clone https://github.com/dstackai/dstack - $ cd dstack - ``` - -
- -## Deployment - -Here's an example of a service that deploys [`Llama-4-Scout-17B-16E-Instruct`](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) using TGI. - -
- -```yaml -type: service -name: llama4-scout - -image: ghcr.io/huggingface/text-generation-inference:latest - -env: - - HF_TOKEN - - MODEL_ID=meta-llama/Llama-4-Scout-17B-16E-Instruct - - MAX_INPUT_LENGTH=8192 - - MAX_TOTAL_TOKENS=16384 - # max_batch_prefill_tokens must be >= max_input_tokens - - MAX_BATCH_PREFILL_TOKENS=8192 -commands: - # Activate the virtual environment at /usr/src/.venv/ - # as required by TGI's latest image. - - . /usr/src/.venv/bin/activate - - NUM_SHARD=$DSTACK_GPUS_NUM text-generation-launcher - -port: 80 -# Register the model -model: meta-llama/Llama-4-Scout-17B-16E-Instruct - -# Uncomment to leverage spot instances -#spot_policy: auto - -# Uncomment to cache downloaded models -#volumes: -# - /data:/data - -resources: - gpu: H200:2 - disk: 500GB.. -``` -
- -### Running a configuration - -To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command. - -
- -```shell -$ HF_TOKEN=... -$ dstack apply -f examples/inference/tgi/.dstack.yml - - # BACKEND REGION RESOURCES SPOT PRICE - 1 vastai is-iceland 48xCPU, 128GB, 2xH200 (140GB) no $7.87 - 2 runpod EU-SE-1 40xCPU, 128GB, 2xH200 (140GB) no $7.98 - -Submit the run llama4-scout? [y/n]: y - -Provisioning... ----> 100% -``` -
- -If no gateway is created, the service endpoint will be available at `/proxy/services///`. - -
- -```shell -$ curl http://127.0.0.1:3000/proxy/services/main/llama4-scout/v1/chat/completions \ - -X POST \ - -H 'Authorization: Bearer <dstack token>' \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "What is Deep Learning?" - } - ], - "max_tokens": 128 - }' -``` - -
- -When a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured, the service endpoint will be available at `https://llama4-scout./`. - -## Source code - -The source-code of this example can be found in -[`examples/inference/tgi`](https://github.com/dstackai/dstack/blob/master/examples/inference/tgi). - -## What's next? - -1. Check [services](https://dstack.ai/docs/services) -2. Browse the [Llama](https://dstack.ai/examples/llms/llama/), [vLLM](https://dstack.ai/examples/inference/vllm/), [SgLang](https://dstack.ai/examples/inference/sglang/) and [NIM](https://dstack.ai/examples/inference/nim/) examples -3. See also [AMD](https://dstack.ai/examples/accelerators/amd/) and - [TPU](https://dstack.ai/examples/accelerators/tpu/) diff --git a/examples/inference/tgi/amd/.dstack.yml b/examples/inference/tgi/amd/.dstack.yml deleted file mode 100644 index 46c2239688..0000000000 --- a/examples/inference/tgi/amd/.dstack.yml +++ /dev/null @@ -1,21 +0,0 @@ -type: service -name: amd-service-tgi - -image: ghcr.io/huggingface/text-generation-inference:sha-a379d55-rocm -env: - - HF_TOKEN - - ROCM_USE_FLASH_ATTN_V2_TRITON=true - - TRUST_REMOTE_CODE=true - - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct -commands: - - text-generation-launcher --port 8000 -port: 8000 -# Register the model -model: meta-llama/Meta-Llama-3.1-70B-Instruct - -# Uncomment to leverage spot instances -#spot_policy: auto - -resources: - gpu: MI300X - disk: 150GB diff --git a/examples/inference/tgi/tpu/.dstack.yml b/examples/inference/tgi/tpu/.dstack.yml deleted file mode 100644 index 42ba5ab7fd..0000000000 --- a/examples/inference/tgi/tpu/.dstack.yml +++ /dev/null @@ -1,27 +0,0 @@ -type: service -# The name is optional, if not specified, generated randomly -name: llama31-service-optimum-tpu - -# Using a Docker image with a fix instead of the official one -# More details at https://github.com/huggingface/optimum-tpu/pull/92 -image: dstackai/optimum-tpu:llama31 -# Required environment variables 
-env: - - HF_TOKEN - - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct - - MAX_TOTAL_TOKENS=4096 - - MAX_BATCH_PREFILL_TOKENS=4095 -commands: - - text-generation-launcher --port 8000 -port: 8000 -model: - format: tgi - type: chat - name: meta-llama/Meta-Llama-3.1-8B-Instruct - -# Uncomment to leverage spot instances -#spot_policy: auto - -resources: - # Required resources - gpu: v5litepod-4 diff --git a/examples/inference/vllm/README.md b/examples/inference/vllm/README.md index 7af4e97989..ce77e31782 100644 --- a/examples/inference/vllm/README.md +++ b/examples/inference/vllm/README.md @@ -116,7 +116,7 @@ The source-code of this example can be found in ## What's next? 1. Check [services](https://dstack.ai/docs/services) -2. Browse the [Llama 3.1](https://dstack.ai/examples/llms/llama31/), [TGI](https://dstack.ai/examples/inference/tgi/) - and [NIM](https://dstack.ai/examples/inference/nim/) examples +2. Browse the [Llama 3.1](https://dstack.ai/examples/llms/llama31/) and + [NIM](https://dstack.ai/examples/inference/nim/) examples 3. See also [AMD](https://dstack.ai/examples/accelerators/amd/) and [TPU](https://dstack.ai/examples/accelerators/tpu/) diff --git a/examples/llms/deepseek/README.md b/examples/llms/deepseek/README.md index 41d73e9e99..ae467891fc 100644 --- a/examples/llms/deepseek/README.md +++ b/examples/llms/deepseek/README.md @@ -78,95 +78,6 @@ Here's an example of a service that deploys `Deepseek-R1-Distill-Llama-70B` usin Note, when using `Deepseek-R1-Distill-Llama-70B` with `vLLM` with a 192GB GPU, we must limit the context size to 126432 tokens to fit the memory. -### Intel Gaudi - -Here's an example of a service that deploys `Deepseek-R1-Distill-Llama-70B` -using [TGI on Gaudi](https://github.com/huggingface/tgi-gaudi) -and [vLLM](https://github.com/HabanaAI/vllm-fork) (Gaudi fork) with Intel Gaudi 2. 
- -> Both [TGI on Gaudi](https://github.com/huggingface/tgi-gaudi) -> and [vLLM](https://github.com/HabanaAI/vllm-fork) do not support `Deepseek-V2-Lite`. -> See [this](https://github.com/huggingface/tgi-gaudi/issues/271) -> and [this](https://github.com/HabanaAI/vllm-fork/issues/809#issuecomment-2652454824) issues. - -=== "TGI" - -
- ```yaml - type: service - - name: tgi - - image: ghcr.io/huggingface/tgi-gaudi:2.3.1 - - auth: false - port: 8000 - - model: DeepSeek-R1-Distill-Llama-70B - - env: - - HF_TOKEN - - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B - - PORT=8000 - - OMPI_MCA_btl_vader_single_copy_mechanism=none - - TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN=true - - PT_HPU_ENABLE_LAZY_COLLECTIVES=true - - MAX_TOTAL_TOKENS=2048 - - BATCH_BUCKET_SIZE=256 - - PREFILL_BATCH_BUCKET_SIZE=4 - - PAD_SEQUENCE_TO_MULTIPLE_OF=64 - - ENABLE_HPU_GRAPH=true - - LIMIT_HPU_GRAPH=true - - USE_FLASH_ATTENTION=true - - FLASH_ATTENTION_RECOMPUTE=true - - commands: - - text-generation-launcher - --sharded true - --num-shard 8 - --max-input-length 1024 - --max-total-tokens 2048 - --max-batch-prefill-tokens 4096 - --max-batch-total-tokens 524288 - --max-waiting-tokens 7 - --waiting-served-ratio 1.2 - --max-concurrent-requests 512 - - resources: - gpu: Gaudi2:8 - ``` -
- -=== "vLLM" - -
- ```yaml - type: service - name: deepseek-r1-gaudi - - image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - - - env: - - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B - - HABANA_VISIBLE_DEVICES=all - - OMPI_MCA_btl_vader_single_copy_mechanism=none - - commands: - - git clone https://github.com/HabanaAI/vllm-fork.git - - cd vllm-fork - - git checkout habana_main - - pip install -r requirements-hpu.txt - - python setup.py develop - - vllm serve $MODEL_ID - --tensor-parallel-size 8 - --trust-remote-code - --download-dir /data - - port: 8000 - ``` -
- ### NVIDIA Here's an example of a service that deploys `Deepseek-R1-Distill-Llama-8B` @@ -241,7 +152,7 @@ Approximate memory requirements for loading the model (excluding context and CUD | `DeepSeek-R1-Distill-Qwen` | **7B** | 16GB | 8GB | 4GB | -For example, the FP8 version of Deepseek-R1 671B fits on a single node of MI300X with eight 192GB GPUs, a single node of -H200 with eight 141GB GPUs, or a single node of Intel Gaudi2 with eight 96GB GPUs. +For example, the FP8 version of Deepseek-R1 671B fits on a single node of MI300X with eight 192GB GPUs or a single node of +H200 with eight 141GB GPUs. ### Applying the configuration @@ -400,65 +311,6 @@ Here are the examples of LoRA fine-tuning of `Deepseek-V2-Lite` and GRPO fine-tu Note, the `GRPO` fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` consumes up to 135GB of VRAM. -### Intel Gaudi - -Here is an example of LoRA fine-tuning of `DeepSeek-R1-Distill-Qwen-7B` on Intel Gaudi 2 GPUs using -HuggingFace's [Optimum for Intel Gaudi](https://github.com/huggingface/optimum-habana) -and [DeepSpeed](https://github.com/deepspeedai/DeepSpeed). Both also support `LoRA` -fine-tuning of `Deepseek-V2-Lite` with same configuration as below. - -=== "LoRA" - -
- ```yaml - type: task - name: trl-train - - image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 - - env: - - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B - - WANDB_API_KEY - - WANDB_PROJECT - commands: - - pip install --upgrade-strategy eager optimum[habana] - - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 - - git clone https://github.com/huggingface/optimum-habana.git - - cd optimum-habana/examples/trl - - pip install -r requirements.txt - - pip install wandb - - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py - --model_name_or_path $MODEL_ID - --dataset_name "lvwerra/stack-exchange-paired" - --deepspeed ../language-modeling/llama2_ds_zero3_config.json - --output_dir="./sft" - --do_train - --max_steps=500 - --logging_steps=10 - --save_steps=100 - --per_device_train_batch_size=1 - --per_device_eval_batch_size=1 - --gradient_accumulation_steps=2 - --learning_rate=1e-4 - --lr_scheduler_type="cosine" - --warmup_steps=100 - --weight_decay=0.05 - --optim="paged_adamw_32bit" - --lora_target_modules "q_proj" "v_proj" - --bf16 - --remove_unused_columns=False - --run_name="sft_deepseek_70" - --report_to="wandb" - --use_habana - --use_lazy_mode - - resources: - gpu: gaudi2:8 - ``` - -
- - ### NVIDIA Here are examples of LoRA fine-tuning of `DeepSeek-R1-Distill-Qwen-1.5B` and QLoRA fine-tuning of `DeepSeek-V2-Lite` diff --git a/examples/llms/deepseek/tgi/intel/.dstack.yml b/examples/llms/deepseek/tgi/intel/.dstack.yml deleted file mode 100644 index 16d0830924..0000000000 --- a/examples/llms/deepseek/tgi/intel/.dstack.yml +++ /dev/null @@ -1,45 +0,0 @@ -type: service - -name: tgi - -image: ghcr.io/huggingface/tgi-gaudi:2.3.1 - -auth: false -port: 8000 - -model: DeepSeek-R1-Distill-Llama-70B - -env: - - HF_TOKEN - - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B - - PORT=8000 - - OMPI_MCA_btl_vader_single_copy_mechanism=none - - TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN=true - - PT_HPU_ENABLE_LAZY_COLLECTIVES=true - - MAX_TOTAL_TOKENS=2048 - - BATCH_BUCKET_SIZE=256 - - PREFILL_BATCH_BUCKET_SIZE=4 - - PAD_SEQUENCE_TO_MULTIPLE_OF=64 - - ENABLE_HPU_GRAPH=true - - LIMIT_HPU_GRAPH=true - - USE_FLASH_ATTENTION=true - - FLASH_ATTENTION_RECOMPUTE=true - -commands: - - text-generation-launcher - --sharded true - --num-shard 8 - --max-input-length 1024 - --max-total-tokens 2048 - --max-batch-prefill-tokens 4096 - --max-batch-total-tokens 524288 - --max-waiting-tokens 7 - --waiting-served-ratio 1.2 - --max-concurrent-requests 512 - -resources: - gpu: Gaudi2:8 - -# Uncomment to cache downloaded models -#volumes: -# - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub diff --git a/examples/llms/deepseek/trl/intel/.dstack.yml b/examples/llms/deepseek/trl/intel/.dstack.yml deleted file mode 100644 index 9963e48445..0000000000 --- a/examples/llms/deepseek/trl/intel/.dstack.yml +++ /dev/null @@ -1,46 +0,0 @@ -type: task -# The name is optional, if not specified, generated randomly -name: trl-train - -image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 - -# Required environment variables -env: - - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B - - WANDB_API_KEY - - WANDB_PROJECT -# Commands of the task -commands: - - 
pip install --upgrade-strategy eager optimum[habana] - - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 - - git clone https://github.com/huggingface/optimum-habana.git - - cd optimum-habana/examples/trl - - pip install -r requirements.txt - - pip install wandb - - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py - --model_name_or_path $MODEL_ID - --dataset_name "lvwerra/stack-exchange-paired" - --deepspeed ../language-modeling/llama2_ds_zero3_config.json - --output_dir="./sft" - --do_train - --max_steps=500 - --logging_steps=10 - --save_steps=100 - --per_device_train_batch_size=1 - --per_device_eval_batch_size=1 - --gradient_accumulation_steps=2 - --learning_rate=1e-4 - --lr_scheduler_type="cosine" - --warmup_steps=100 - --weight_decay=0.05 - --optim="paged_adamw_32bit" - --lora_target_modules "q_proj" "v_proj" - --bf16 - --remove_unused_columns=False - --run_name="sft_deepseek_70" - --report_to="wandb" - --use_habana - --use_lazy_mode - -resources: - gpu: gaudi2:8 diff --git a/examples/llms/deepseek/trl/intel/deepseek_v2.dstack.yml b/examples/llms/deepseek/trl/intel/deepseek_v2.dstack.yml deleted file mode 100644 index 7aa13d677a..0000000000 --- a/examples/llms/deepseek/trl/intel/deepseek_v2.dstack.yml +++ /dev/null @@ -1,45 +0,0 @@ -type: task -# The name is optional, if not specified, generated randomly -name: trl-train-deepseek-v2-lite - -image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 - -# Required environment variables -env: - - MODEL_ID=deepseek-ai/DeepSeek-V2-Lite - - WANDB_API_KEY - - WANDB_PROJECT -# Commands of the task -commands: - - pip install git+https://github.com/huggingface/optimum-habana.git - - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 - - git clone https://github.com/huggingface/optimum-habana.git - - cd optimum-habana/examples/trl - - pip install -r requirements.txt - - 
DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py - --model_name_or_path $MODEL_ID - --dataset_name "lvwerra/stack-exchange-paired" - --deepspeed ../language-modeling/llama2_ds_zero3_config.json - --output_dir="./sft" - --do_train - --max_steps=500 - --logging_steps=10 - --save_steps=100 - --per_device_train_batch_size=1 - --per_device_eval_batch_size=1 - --gradient_accumulation_steps=2 - --learning_rate=1e-4 - --lr_scheduler_type="cosine" - --warmup_steps=100 - --weight_decay=0.05 - --optim="paged_adamw_32bit" - --lora_target_modules "q_proj" "v_proj" - --bf16 - --remove_unused_columns=False - --run_name="sft_deepseek_v2lite" - --report_to="wandb" - --use_habana - --use_lazy_mode - -resources: - gpu: gaudi2:8 diff --git a/examples/llms/deepseek/vllm/intel/.dstack.yml b/examples/llms/deepseek/vllm/intel/.dstack.yml deleted file mode 100644 index d28a0152d8..0000000000 --- a/examples/llms/deepseek/vllm/intel/.dstack.yml +++ /dev/null @@ -1,31 +0,0 @@ -type: service -name: deepseek-r1-gaudi - -image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest - -env: - - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B - - HABANA_VISIBLE_DEVICES=all - - OMPI_MCA_btl_vader_single_copy_mechanism=none - -commands: - - git clone https://github.com/HabanaAI/vllm-fork.git - - cd vllm-fork - - git checkout habana_main - - pip install -r requirements-hpu.txt - - python setup.py develop - - vllm serve $MODEL_ID - --tensor-parallel-size 8 - --trust-remote-code - --download-dir /data - -port: 8000 - -model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B - -resources: - gpu: gaudi2:8 - -# Uncomment to cache downloaded models -#volumes: -# - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub diff --git a/mkdocs.yml b/mkdocs.yml index 3fcc531f2b..34437b1799 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -105,7 +105,6 @@ plugins: "docs/examples/accelerators/amd/index.md": 
"examples/accelerators/amd/index.md" "docs/examples/deployment/nim/index.md": "examples/inference/nim/index.md" "docs/examples/deployment/vllm/index.md": "examples/inference/vllm/index.md" - "docs/examples/deployment/tgi/index.md": "examples/inference/tgi/index.md" "backends.md": "docs/concepts/backends.md" "blog/monitoring-gpu-usage.md": "blog/posts/dstack-metrics.md" "blog/inactive-dev-environments-auto-shutdown.md": "blog/posts/inactivity-duration.md" @@ -122,7 +121,6 @@ plugins: "examples/distributed-training/rccl-tests/index.md": "examples/clusters/nccl-rccl-tests/index.md" "examples/deployment/nim/index.md": "examples/inference/nim/index.md" "examples/deployment/vllm/index.md": "examples/inference/vllm/index.md" - "examples/deployment/tgi/index.md": "examples/inference/tgi/index.md" "examples/deployment/sglang/index.md": "examples/inference/sglang/index.md" "examples/deployment/trtllm/index.md": "examples/inference/trtllm/index.md" "examples/fine-tuning/trl/index.md": "examples/single-node-training/trl/index.md" @@ -304,13 +302,11 @@ nav: - Inference: - SGLang: examples/inference/sglang/index.md - vLLM: examples/inference/vllm/index.md - - TGI: examples/inference/tgi/index.md - NIM: examples/inference/nim/index.md - TensorRT-LLM: examples/inference/trtllm/index.md - Accelerators: - AMD: examples/accelerators/amd/index.md - TPU: examples/accelerators/tpu/index.md - - Intel Gaudi: examples/accelerators/intel/index.md - Tenstorrent: examples/accelerators/tenstorrent/index.md - Models: - Wan2.2: examples/models/wan22/index.md diff --git a/scripts/docs/gen_cli_reference.py b/scripts/docs/gen_cli_reference.py index 04db41df49..b72f48d1f8 100644 --- a/scripts/docs/gen_cli_reference.py +++ b/scripts/docs/gen_cli_reference.py @@ -22,9 +22,6 @@ DISABLE_ENV = "DSTACK_DOCS_DISABLE_CLI_REFERENCE" -logger.info("Generating CLI reference...") - - @cache # TODO make caching work def call_dstack(command: str) -> str: return 
subprocess.check_output(shlex.split(command)).decode() @@ -59,8 +56,10 @@ def process_file(file: File): def main(): if os.environ.get(DISABLE_ENV): - logger.warning(f"CLI reference generation is disabled: {DISABLE_ENV} is set") + logger.warning("CLI reference generation is disabled") exit() + + logger.info("Generating CLI reference...") # Sequential processing take > 10s with concurrent.futures.ThreadPoolExecutor() as pool: futures = [] diff --git a/scripts/docs/gen_openapi_reference.py b/scripts/docs/gen_openapi_reference.py index bb3a3d42f7..847bf74c46 100644 --- a/scripts/docs/gen_openapi_reference.py +++ b/scripts/docs/gen_openapi_reference.py @@ -3,11 +3,20 @@ """ import json +import logging +import os from pathlib import Path from dstack._internal.server.main import app from dstack._internal.settings import DSTACK_VERSION +disable_env = "DSTACK_DOCS_DISABLE_OPENAPI_REFERENCE" +if os.environ.get(disable_env): + logging.getLogger("mkdocs.plugins.dstack.openapi").warning( + "OpenAPI reference generation is disabled" + ) + exit(0) + app.title = "OpenAPI Spec" app.servers = [ {"url": "http://localhost:3000", "description": "Local server"}, diff --git a/scripts/docs/gen_rest_plugin_spec_reference.py b/scripts/docs/gen_rest_plugin_spec_reference.py index 6d9fa93c87..bfc5018dc8 100644 --- a/scripts/docs/gen_rest_plugin_spec_reference.py +++ b/scripts/docs/gen_rest_plugin_spec_reference.py @@ -4,11 +4,16 @@ import json import logging +import os from pathlib import Path from dstack._internal.settings import DSTACK_VERSION logger = logging.getLogger("mkdocs.plugins.dstack.rest_plugin_schema") +disable_env = "DSTACK_DOCS_DISABLE_REST_PLUGIN_SPEC_REFERENCE" +if os.environ.get(disable_env): + logger.warning("REST plugin spec reference generation is disabled") + exit(0) try: from example_plugin_server.main import app diff --git a/scripts/docs/hooks.py b/scripts/docs/hooks.py index 4530172d1a..ce5b3740bf 100644 --- a/scripts/docs/hooks.py +++ b/scripts/docs/hooks.py @@ -15,6 
+15,8 @@ WELL_KNOWN_SKILLS_DIR = ".well-known/skills" SKILL_PATH = ("skills", "dstack", "SKILL.md") DISABLE_EXAMPLES_ENV = "DSTACK_DOCS_DISABLE_EXAMPLES" +DISABLE_LLM_TXT_ENV = "DSTACK_DOCS_DISABLE_LLM_TXT" +DISABLE_YAML_SCHEMAS_ENV = "DSTACK_DOCS_DISABLE_YAML_SCHEMAS" SCHEMA_REFERENCE_PREFIX = "docs/reference/" @@ -35,6 +37,8 @@ def _get_schema_expanded_content(rel_path, config, src_path=None): """Return expanded markdown for reference/**/*.md that contain #SCHEMA#, else None. If src_path is given (e.g. from on_post_build loop), read from it; else build path from config. """ + if os.environ.get(DISABLE_YAML_SCHEMAS_ENV): + return None if not rel_path.startswith(SCHEMA_REFERENCE_PREFIX) or not rel_path.endswith(".md"): log.debug(f"Skipping {rel_path}: not in {SCHEMA_REFERENCE_PREFIX} or not .md") return None @@ -88,6 +92,16 @@ def on_page_read_source(page, config): return None +def on_config(config): + if os.environ.get(DISABLE_EXAMPLES_ENV): + log.warning("Examples documentation is disabled") + if os.environ.get(DISABLE_YAML_SCHEMAS_ENV): + log.warning("YAML schema reference generation is disabled") + if os.environ.get(DISABLE_LLM_TXT_ENV): + log.warning("llms.txt generation is disabled") + return config + + def on_page_context(context, page, config, nav): """Override edit_url only for example stubs so Edit points to the README; other pages use theme default from edit_uri.""" repo_url = (config.get("repo_url") or "").rstrip("/") @@ -204,6 +218,9 @@ def _write_well_known_skills(config, site_dir): def _generate_llms_files(config, site_dir): """Generate llms.txt and llms-full.txt using external script.""" + if os.environ.get(DISABLE_LLM_TXT_ENV): + return + repo_root = os.path.dirname(config["config_file_path"]) # Import and run the generator