From b27f99ccb8e5068bec1d7ad1a2c1d976a9457318 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Wed, 6 May 2026 17:01:03 +0200 Subject: [PATCH 1/7] Inline example source code into docs pages Drop the mkdocs hook that materialized examples//README.md into docs/examples//index.md stubs at build time. Move the 20 navigated example READMEs directly into docs/examples/.md (flat layout, no per-example subdirectory) and delete the parallel .dstack.yml configs since their content is already inline in the markdown. The two GCP NCCL test yamls that were only referenced via dead "Source code" admonitions are now inlined into their respective tabs. Within the moved pages, convert absolute https://dstack.ai/(docs|examples) links to relative .md paths so mkdocs strict mode validates them. Non-navigated examples (misc/, llms/, server-deployment/, plugins/, single-node-training/{qlora,optimum-tpu}, the AMD subdirs, etc.) are left untouched for a later pass. --- contributing/DOCS.md | 30 +--- ...d-kubernetes-2024-recap-and-whats-ahead.md | 2 +- docs/blog/posts/changelog-07-25.md | 2 +- docs/blog/posts/gpu-health-checks.md | 2 +- docs/blog/posts/mpi.md | 2 +- docs/blog/posts/nebius-in-dstack-sky.md | 4 +- docs/docs/concepts/fleets.md | 8 +- docs/docs/concepts/gateways.md | 2 +- docs/docs/concepts/services.md | 4 +- .../examples/accelerators/amd.md | 18 +-- docs/examples/accelerators/amd/index.md | 0 .../examples/accelerators/tenstorrent.md | 28 ++-- .../accelerators/tenstorrent/index.md | 0 .../examples/accelerators/tpu.md | 14 +- docs/examples/accelerators/tpu/index.md | 0 .../examples/clusters/aws.md | 19 ++- docs/examples/clusters/aws/index.md | 0 .../examples/clusters/crusoe.md | 14 +- docs/examples/clusters/crusoe/index.md | 0 .../examples/clusters/gcp.md | 136 +++++++++++++++--- docs/examples/clusters/gcp/index.md | 0 .../examples/clusters/lambda.md | 16 +-- docs/examples/clusters/lambda/index.md | 0 .../examples/clusters/nccl-rccl-tests.md | 16 +-- .../clusters/nccl-rccl-tests/index.md | 0 .../examples/clusters/nebius.md | 12 +- docs/examples/clusters/nebius/index.md | 0 .../examples/distributed-training/axolotl.md | 18 +-- .../distributed-training/axolotl/index.md | 0 .../distributed-training/ray-ragen.md | 13 +- .../distributed-training/ray-ragen/index.md | 0 .../examples/distributed-training/trl.md | 16 +-- .../distributed-training/trl/index.md | 0 .../examples/inference/nim.md | 12 +- docs/examples/inference/nim/index.md | 0 .../examples/inference/sglang.md | 16 +-- docs/examples/inference/sglang/index.md | 0 .../examples/inference/trtllm.md | 10 +- docs/examples/inference/trtllm/index.md | 0 .../examples/inference/vllm.md | 16 +-- docs/examples/inference/vllm/index.md | 0 .../examples/models/deepseek-v4.md | 6 +- docs/examples/models/deepseek-v4/index.md | 0 .../examples/models/qwen36.md | 10 +- docs/examples/models/qwen36/index.md | 0 .../examples/single-node-training/axolotl.md | 16 +-- .../single-node-training/axolotl/index.md | 0 .../examples/single-node-training/trl.md | 14 +- .../single-node-training/trl/index.md | 0 examples/accelerators/tenstorrent/.dstack.yml | 9 -- .../tt-inference-server.dstack.yml | 24 ---- .../tenstorrent/tt-smi.dstack.yml | 10 -- examples/clusters/aws/fleet.dstack.yml | 8 -- examples/clusters/gcp/a3-fleet.dstack.yml | 7 - examples/clusters/gcp/a3high-fleet.dstack.yml | 7 - .../clusters/gcp/a3high-nccl-tests.dstack.yml | 37 ----- .../clusters/gcp/a3mega-nccl-tests.dstack.yml | 50 ------- examples/clusters/gcp/a4-fleet.dstack.yml | 13 -- 
.../nccl-rccl-tests/nccl-tests.dstack.yml | 29 ---- .../nccl-rccl-tests/rccl-tests.dstack.yml | 44 ------ .../distributed-training/axolotl/.dstack.yml | 49 ------- .../axolotl/fleet.dstack.yml | 9 -- .../ray-ragen/.dstack.yml | 39 ----- .../ray-ragen/fleet.dstack.yml | 9 -- .../trl/deepspeed.dstack.yml | 52 ------- .../distributed-training/trl/fleet.dstack.yml | 9 -- .../distributed-training/trl/fsdp.dstack.yml | 52 ------- .../single-node-training/axolotl/.dstack.yml | 28 ---- .../single-node-training/trl/train.dstack.yml | 54 ------- mkdocs.yml | 101 ++++++------- scripts/docs/gen_examples.py | 31 ---- scripts/docs/gen_llms_files.py | 14 +- scripts/docs/hooks.py | 68 ++------- 73 files changed, 339 insertions(+), 890 deletions(-) rename examples/accelerators/amd/README.md => docs/examples/accelerators/amd.md (91%) delete mode 100644 docs/examples/accelerators/amd/index.md rename examples/accelerators/tenstorrent/README.md => docs/examples/accelerators/tenstorrent.md (81%) delete mode 100644 docs/examples/accelerators/tenstorrent/index.md rename examples/accelerators/tpu/README.md => docs/examples/accelerators/tpu.md (94%) delete mode 100644 docs/examples/accelerators/tpu/index.md rename examples/clusters/aws/README.md => docs/examples/clusters/aws.md (84%) delete mode 100644 docs/examples/clusters/aws/index.md rename examples/clusters/crusoe/README.md => docs/examples/clusters/crusoe.md (88%) delete mode 100644 docs/examples/clusters/crusoe/index.md rename examples/clusters/gcp/README.md => docs/examples/clusters/gcp.md (76%) delete mode 100644 docs/examples/clusters/gcp/index.md rename examples/clusters/lambda/README.md => docs/examples/clusters/lambda.md (84%) delete mode 100644 docs/examples/clusters/lambda/index.md rename examples/clusters/nccl-rccl-tests/README.md => docs/examples/clusters/nccl-rccl-tests.md (82%) delete mode 100644 docs/examples/clusters/nccl-rccl-tests/index.md rename examples/clusters/nebius/README.md => docs/examples/clusters/nebius.md (90%) delete mode 100644 docs/examples/clusters/nebius/index.md rename examples/distributed-training/axolotl/README.md => docs/examples/distributed-training/axolotl.md (75%) delete mode 100644 docs/examples/distributed-training/axolotl/index.md rename examples/distributed-training/ray-ragen/README.md => docs/examples/distributed-training/ray-ragen.md (86%) delete mode 100644 docs/examples/distributed-training/ray-ragen/index.md rename examples/distributed-training/trl/README.md => docs/examples/distributed-training/trl.md (83%) delete mode 100644 docs/examples/distributed-training/trl/index.md rename examples/inference/nim/README.md => docs/examples/inference/nim.md (80%) delete mode 100644 docs/examples/inference/nim/index.md rename examples/inference/sglang/README.md => docs/examples/inference/sglang.md (89%) delete mode 100644 docs/examples/inference/sglang/index.md rename examples/inference/trtllm/README.md => docs/examples/inference/trtllm.md (83%) delete mode 100644 docs/examples/inference/trtllm/index.md rename examples/inference/vllm/README.md => docs/examples/inference/vllm.md (77%) delete mode 100644 docs/examples/inference/vllm/index.md rename examples/models/deepseek-v4/README.md => docs/examples/models/deepseek-v4.md (93%) delete mode 100644 docs/examples/models/deepseek-v4/index.md rename examples/models/qwen36/README.md => docs/examples/models/qwen36.md (91%) delete mode 100644 docs/examples/models/qwen36/index.md rename examples/single-node-training/axolotl/README.md => docs/examples/single-node-training/axolotl.md 
(78%) delete mode 100644 docs/examples/single-node-training/axolotl/index.md rename examples/single-node-training/trl/README.md => docs/examples/single-node-training/trl.md (83%) delete mode 100644 docs/examples/single-node-training/trl/index.md delete mode 100644 examples/accelerators/tenstorrent/.dstack.yml delete mode 100644 examples/accelerators/tenstorrent/tt-inference-server.dstack.yml delete mode 100644 examples/accelerators/tenstorrent/tt-smi.dstack.yml delete mode 100644 examples/clusters/aws/fleet.dstack.yml delete mode 100644 examples/clusters/gcp/a3-fleet.dstack.yml delete mode 100644 examples/clusters/gcp/a3high-fleet.dstack.yml delete mode 100644 examples/clusters/gcp/a3high-nccl-tests.dstack.yml delete mode 100644 examples/clusters/gcp/a3mega-nccl-tests.dstack.yml delete mode 100644 examples/clusters/gcp/a4-fleet.dstack.yml delete mode 100644 examples/clusters/nccl-rccl-tests/nccl-tests.dstack.yml delete mode 100644 examples/clusters/nccl-rccl-tests/rccl-tests.dstack.yml delete mode 100644 examples/distributed-training/axolotl/.dstack.yml delete mode 100644 examples/distributed-training/axolotl/fleet.dstack.yml delete mode 100644 examples/distributed-training/ray-ragen/.dstack.yml delete mode 100644 examples/distributed-training/ray-ragen/fleet.dstack.yml delete mode 100644 examples/distributed-training/trl/deepspeed.dstack.yml delete mode 100644 examples/distributed-training/trl/fleet.dstack.yml delete mode 100644 examples/distributed-training/trl/fsdp.dstack.yml delete mode 100644 examples/single-node-training/axolotl/.dstack.yml delete mode 100644 examples/single-node-training/trl/train.dstack.yml delete mode 100644 scripts/docs/gen_examples.py diff --git a/contributing/DOCS.md b/contributing/DOCS.md index 4fcc04d6d1..ce545803a4 100644 --- a/contributing/DOCS.md +++ b/contributing/DOCS.md @@ -39,7 +39,7 @@ uv run pre-commit install To preview the documentation, run the follow command: ```shell -uv run mkdocs serve -w examples -s +uv run mkdocs serve -s ``` If you want to build static files, you can use the following command: @@ -57,7 +57,6 @@ The documentation uses a custom build system with MkDocs hooks to generate vario Use these in `.envrc` to disable expensive docs regeneration, especially during `mkdocs serve` auto-reload. Set any of them to disable the corresponding artifact. ```shell -export DSTACK_DOCS_DISABLE_EXAMPLES=1 export DSTACK_DOCS_DISABLE_LLM_TXT=1 export DSTACK_DOCS_DISABLE_CLI_REFERENCE=1 export DSTACK_DOCS_DISABLE_YAML_SCHEMAS=1 @@ -69,19 +68,11 @@ export DSTACK_DOCS_DISABLE_REST_PLUGIN_SPEC_REFERENCE=1 The build process is customized via hooks in `scripts/docs/hooks.py`: -#### 1. Example materialization - -Example pages like `examples/single-node-training/trl/index.md` are stubs that reference `README.md` files in the repository root: -- **Stub location**: `docs/examples/single-node-training/trl/index.md` -- **Content source**: `examples/single-node-training/trl/README.md` - -During the build, the hook reads the README content and uses it for rendering the HTML page. - -#### 2. Schema reference expansion +#### 1. Schema reference expansion Files in `docs/reference/**/*.md` can use `#SCHEMA#` placeholders that are expanded with generated schema documentation during the build. -#### 3. llms.txt generation +#### 2. 
llms.txt generation Two files are generated for LLM consumption: @@ -108,9 +99,9 @@ description: Short description of what this page covers --- ``` -For examples, add frontmatter to the `README.md` files in the repository root (e.g., `examples/single-node-training/trl/README.md`). +For examples, add frontmatter to the page files (e.g., `docs/examples/single-node-training/trl.md`). -#### 4. Skills discovery +#### 3. Skills discovery The build creates `.well-known/skills/` directory structure for skills discovery: - Reads `skills/dstack/SKILL.md` @@ -129,18 +120,11 @@ docs/ │ ├── concepts/ # Concept pages │ ├── guides/ # How-to guides │ └── reference/ # API reference (schema expansion) -├── examples/ # Example stub files (index.md) +├── examples/ # Example pages (inline source code) │ └── single-node-training/ -│ └── trl/ -│ └── index.md # Stub referencing root README +│ └── trl.md # Page content with frontmatter └── overrides/ # Theme customization -examples/ # Example content (repository root) -└── single-node-training/ - └── trl/ - ├── README.md # Actual content with frontmatter - └── train.dstack.yml - scripts/docs/ ├── hooks.py # MkDocs build hooks ├── gen_llms_files.py # llms.txt generation diff --git a/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md b/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md index 9d32f336b0..8980c984f1 100644 --- a/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md +++ b/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md @@ -104,7 +104,7 @@ efficient manner. ### NVIDIA -NVIDIA remains the top accelerator supported by `dstack`. Recently, we introduced a [NIM example](../../examples/inference/nim/index.md) +NVIDIA remains the top accelerator supported by `dstack`. Recently, we introduced a [NIM example](../../examples/inference/nim.md) for model deployment, and we continue to enhance support for the rest of NVIDIA's ecosystem. ### AMD diff --git a/docs/blog/posts/changelog-07-25.md b/docs/blog/posts/changelog-07-25.md index a065ef37c7..50c8ff032a 100644 --- a/docs/blog/posts/changelog-07-25.md +++ b/docs/blog/posts/changelog-07-25.md @@ -144,7 +144,7 @@ resources: #### AWS EFA -EFA is a network interface for EC2 that enables low-latency, high-bandwidth communication between nodes—crucial for scaling distributed deep learning. With `dstack`, EFA is automatically enabled when using supported instance types in fleets. Check out our [example](../../examples/clusters/aws/index.md) +EFA is a network interface for EC2 that enables low-latency, high-bandwidth communication between nodes—crucial for scaling distributed deep learning. With `dstack`, EFA is automatically enabled when using supported instance types in fleets. Check out our [example](../../examples/clusters/aws.md) #### Default Docker images diff --git a/docs/blog/posts/gpu-health-checks.md b/docs/blog/posts/gpu-health-checks.md index 9b074023c4..1fe89e1d1d 100644 --- a/docs/blog/posts/gpu-health-checks.md +++ b/docs/blog/posts/gpu-health-checks.md @@ -51,7 +51,7 @@ A healthy instance is ready for workloads. A warning means you should monitor it This release focuses on passive checks using DCGM background health checks. These run continuously and do not interrupt workloads. -For active checks today, you can run [NCCL/RCCL tests](../../examples/clusters/nccl-rccl-tests/index.md) as a [distributed task](../../docs/concepts/tasks.md#distributed-tasks) to verify GPU-to-GPU communication and bandwidth across a fleet. 
Active tests like these can reveal network or interconnect issues that passive monitoring might miss. More built-in support for active diagnostics is planned. +For active checks today, you can run [NCCL/RCCL tests](../../examples/clusters/nccl-rccl-tests.md) as a [distributed task](../../docs/concepts/tasks.md#distributed-tasks) to verify GPU-to-GPU communication and bandwidth across a fleet. Active tests like these can reveal network or interconnect issues that passive monitoring might miss. More built-in support for active diagnostics is planned. ## Supported backends diff --git a/docs/blog/posts/mpi.md b/docs/blog/posts/mpi.md index 713059f2f7..37cd0dc7bf 100644 --- a/docs/blog/posts/mpi.md +++ b/docs/blog/posts/mpi.md @@ -100,5 +100,5 @@ as well as use MPI for other tasks. !!! info "What's next?" 1. Learn more about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 2. Check the [NCCL/RCCL tests](../../examples/clusters/nccl-rccl-tests/index.md) example + 2. Check the [NCCL/RCCL tests](../../examples/clusters/nccl-rccl-tests.md) example 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/nebius-in-dstack-sky.md b/docs/blog/posts/nebius-in-dstack-sky.md index dd1617d290..823576f377 100644 --- a/docs/blog/posts/nebius-in-dstack-sky.md +++ b/docs/blog/posts/nebius-in-dstack-sky.md @@ -104,7 +104,7 @@ $ dstack apply -f my-cluster.dstack.yml Once the fleet is ready, you can run [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks). `dstack` automatically configures drivers, networking, and fast GPU-to-GPU interconnect. -To learn more, see the [clusters](../../examples/clusters/nebius/index.md) guide. +To learn more, see the [clusters](../../examples/clusters/nebius.md) guide. With Nebius joining `dstack` Sky, users can now run on-demand and spot GPUs and clusters directly through the marketplace—gaining access to the same production grade infrastrucure Nebius customers use for frontier-scale training, without needing a separate Nebius account. @@ -124,4 +124,4 @@ Our goal is to give teams maximum flexibility while removing the complexity of m 4. Explore [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 5. Read the [clusters](../../examples/clusters/nebius/index.md) guide + 5. Read the [clusters](../../examples/clusters/nebius.md) guide diff --git a/docs/docs/concepts/fleets.md b/docs/docs/concepts/fleets.md index b927e94d4f..685392bd80 100644 --- a/docs/docs/concepts/fleets.md +++ b/docs/docs/concepts/fleets.md @@ -164,22 +164,22 @@ This property ensures that instances are interconnected. This is required for ru === "AWS" On AWS, `dstack` requires `public_ips` to be set to `false` in the backend configuration. - Refer to the [AWS](../../examples/clusters/aws/index.md) example for more details. + Refer to the [AWS](../../examples/clusters/aws.md) example for more details. === "GCP" On GCP, you may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration. - Refer to the [GCP](../../examples/clusters/gcp/index.md) examples for more details. + Refer to the [GCP](../../examples/clusters/gcp.md) examples for more details. 
=== "Nebius" On [Nebius](https://docs.nebius.com/compute/clusters/gpu), `dstack` automatically configures InfiniBand networking if it is supported by the selected instance type. === "Crusoe" On [Crusoe](https://docs.crusoecloud.com/networking/infiniband/managing-infiniband-networks), `dstack` automatically configures InfiniBand networking if it is supported by the selected instance type. - Refer to the [Crusoe](../../examples/clusters/crusoe/index.md#vms) example for more details. + Refer to the [Crusoe](../../examples/clusters/crusoe.md#vms) example for more details. === "Kubernetes" If the Kubernetes cluster has interconnect configured, `dstack` can use it without additional setup. - See the [Lambda](../../examples/clusters/lambda/index.md#kubernetes) or [Crusoe](../../examples/clusters/crusoe/index.md#kubernetes) examples. + See the [Lambda](../../examples/clusters/lambda.md#kubernetes) or [Crusoe](../../examples/clusters/crusoe.md#kubernetes) examples. > See the [Clusters](../../examples.md#clusters) examples. diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md index 29209124d5..53374aa53d 100644 --- a/docs/docs/concepts/gateways.md +++ b/docs/docs/concepts/gateways.md @@ -97,7 +97,7 @@ router: -If you configure the `sglang` router, [services](../concepts/services.md) can run either [standard SGLang workers](../../examples/inference/sglang/index.md) or [Prefill-Decode workers](../../examples/inference/sglang/index.md#pd-disaggregation) (aka PD disaggregation). +If you configure the `sglang` router, [services](../concepts/services.md) can run either [standard SGLang workers](../../examples/inference/sglang.md) or [Prefill-Decode workers](../../examples/inference/sglang.md#pd-disaggregation) (aka PD disaggregation). !!! note "PD disaggregation" To run services with PD disaggregation see [SGLang PD disaggregation](https://dstack.ai/examples/inference/sglang/#pd-disaggregation). diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index 9969e565ab..1923aa0655 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -1288,5 +1288,5 @@ The rolling deployment stops when all replicas are updated or when a new deploym 1. Read about [dev environments](dev-environments.md) and [tasks](tasks.md) 2. Learn how to manage [fleets](fleets.md) 3. See how to set up [gateways](gateways.md) - 4. Check the [vLLM](../../examples/inference/vllm/index.md) and - [NIM](../../examples/inference/nim/index.md) examples + 4. Check the [vLLM](../../examples/inference/vllm.md) and + [NIM](../../examples/inference/nim.md) examples diff --git a/examples/accelerators/amd/README.md b/docs/examples/accelerators/amd.md similarity index 91% rename from examples/accelerators/amd/README.md rename to docs/examples/accelerators/amd.md index b35b29c1c9..5c0c306ce8 100644 --- a/examples/accelerators/amd/README.md +++ b/docs/examples/accelerators/amd.md @@ -6,12 +6,12 @@ description: Deploying and fine-tuning models on AMD MI300X GPUs using SGLang, v # AMD `dstack` supports running dev environments, tasks, and services on AMD GPUs. -You can do that by setting up an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-fleets) +You can do that by setting up an [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets) with on-prem AMD GPUs or configuring a backend that offers AMD GPUs such as the `runpod` backend. 
## Deployment -Here are examples of a [service](https://dstack.ai/docs/services) that deploy +Here are examples of a [service](../../docs/concepts/services.md) that deploy `Qwen/Qwen3.6-27B` on AMD MI300X GPUs using [SGLang](https://github.com/sgl-project/sglang) and [vLLM](https://docs.vllm.ai/en/latest/). @@ -101,7 +101,7 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by ## Fine-tuning > If you're planning multi-node AMD training, validate cluster networking first -with the [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) +with the [NCCL/RCCL tests](../clusters/nccl-rccl-tests.md) example. === "TRL" @@ -230,14 +230,14 @@ $ dstack apply -f ## What's next? -1. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) - and [vLLM](https://dstack.ai/examples/inference/vllm/) examples, plus +1. Browse the dedicated [SGLang](../inference/sglang.md) + and [vLLM](../inference/vllm.md) examples, plus [Axolotl](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl), [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html), and [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes) 2. For multi-node training, run - [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) + [NCCL/RCCL tests](../clusters/nccl-rccl-tests.md) to validate AMD cluster networking. -3. Check [dev environments](https://dstack.ai/docs/dev-environments), - [tasks](https://dstack.ai/docs/tasks), and - [services](https://dstack.ai/docs/services). +3. Check [dev environments](../../docs/concepts/dev-environments.md), + [tasks](../../docs/concepts/tasks.md), and + [services](../../docs/concepts/services.md). diff --git a/docs/examples/accelerators/amd/index.md b/docs/examples/accelerators/amd/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/accelerators/tenstorrent/README.md b/docs/examples/accelerators/tenstorrent.md similarity index 81% rename from examples/accelerators/tenstorrent/README.md rename to docs/examples/accelerators/tenstorrent.md index 4edc463f67..65005fd3a4 100644 --- a/examples/accelerators/tenstorrent/README.md +++ b/docs/examples/accelerators/tenstorrent.md @@ -10,11 +10,11 @@ description: Running dev environments, tasks, and services on Tenstorrent Wormho ??? info "SSH fleets" -
+
```yaml type: fleet - name: wormwhole-fleet + name: tt-fleet ssh_config: user: root @@ -34,15 +34,15 @@ description: Running dev environments, tasks, and services on Tenstorrent Wormho
```bash - $ dstack apply -f examples/acceleators/tenstorrent/fleet.dstack.yml + $ dstack apply -f tt-fleet.dstack.yml - FLEET RESOURCES PRICE STATUS CREATED - wormwhole-fleet cpu=12 mem=32GB disk=243GB n150:12GB $0 idle 18 sec ago + FLEET RESOURCES PRICE STATUS CREATED + tt-fleet cpu=12 mem=32GB disk=243GB n150:12GB $0 idle 18 sec ago ```
- For more details on fleet configuration, refer to [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh-fleets). + For more details on fleet configuration, refer to [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets). ## Services @@ -50,7 +50,7 @@ Here's an example of a service that deploys [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B) using [Tenstorrent Inference Service](https://github.com/tenstorrent/tt-inference-server). -
+
```yaml type: service @@ -86,7 +86,7 @@ Go ahead and run configuration using `dstack apply`:
```bash - $ dstack apply -f examples/acceleators/tenstorrent/tt-inference-server.dstack.yml + $ dstack apply -f service.dstack.yml ```
@@ -123,16 +123,16 @@ Additionally, the model is available via `dstack`'s control plane UI:
 
 ![](https://dstack.ai/static-assets/static-assets/images/dstack-tenstorrent-model-ui.png){ width=800 }
 
-When a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured, the service endpoint
+When a [gateway](../../docs/concepts/gateways.md) is configured, the service endpoint
 is available at `https://./`.
 
-> Services support many options, including authentication, auto-scaling policies, etc. To learn more, refer to [Services](https://dstack.ai/docs/concepts/services).
+> Services support many options, including authentication, auto-scaling policies, etc. To learn more, refer to [Services](../../docs/concepts/services.md).
 
 ## Tasks
 
 Below is a task that simply runs `tt-smi -s`. Tasks can be used for training, fine-tuning, batch inference, or anything else.
 
-
+
```yaml type: task @@ -159,13 +159,13 @@ resources:
-> Tasks support many options, including multi-node configuration, max duration, etc. To learn more, refer to [Tasks](https://dstack.ai/docs/concepts/tasks).
+> Tasks support many options, including multi-node configuration, max duration, etc. To learn more, refer to [Tasks](../../docs/concepts/tasks.md).
 
 ## Dev environments
 
 Below is an example of a dev environment configuration. It can be used to provision a dev environment that can be accessed via your desktop IDE.
 
-
+
 ```yaml
 type: dev-environment
 
@@ -191,7 +191,7 @@ If you run it via `dstack apply`, it will output the URL to access it via your d
 
 ![](https://dstack.ai/static-assets/static-assets/images/dstack-tenstorrent-cursor.png){ width=800 }
 
-> Dev nevironments support many options, including inactivity and max duration, IDE configuration, etc. To learn more, refer to [Dev environments](https://dstack.ai/docs/concepts/tasks).
+> Dev environments support many options, including inactivity and max duration, IDE configuration, etc. To learn more, refer to [Dev environments](../../docs/concepts/dev-environments.md).
 
 ??? info "Feedback"
     Found a bug, or want to request a feature? File it in the [issue tracker](https://github.com/dstackai/dstack/issues),
diff --git a/examples/accelerators/tpu/README.md b/docs/examples/accelerators/tpu.md
similarity index 94%
rename from examples/accelerators/tpu/README.md
rename to docs/examples/accelerators/tpu.md
index 53f31b93bd..92640a4835 100644
--- a/examples/accelerators/tpu/README.md
+++ b/docs/examples/accelerators/tpu.md
@@ -7,7 +7,7 @@ description: Deploying and fine-tuning models on Google Cloud TPUs using Optimum
 If you've configured the `gcp` backend in `dstack`, you can run dev environments, tasks, and services on [TPUs](https://cloud.google.com/tpu/docs/intro-to-tpu). Choose a TPU instance by specifying the TPU version and the number of cores (e.g. `v5litepod-8`) in the `gpu` property under `resources`,
-or request TPUs by specifying `tpu` as `vendor` ([see examples](https://dstack.ai/docs/guides/protips/#gpu)).
+or request TPUs by specifying `tpu` as `vendor` ([see examples](../../docs/guides/protips.md#gpu)).
 
 Below are a few examples on using TPUs for deployment and fine-tuning.
 
@@ -18,18 +18,18 @@ Below are a few examples on using TPUs for deployment and fine-tuning.
 
 !!! info "TPU storage"
     By default, each TPU VM contains a 100GB boot disk and its size cannot be changed.
-    If you need more storage, attach additional disks using [Volumes](https://dstack.ai/docs/concepts/volumes/).
+    If you need more storage, attach additional disks using [Volumes](../../docs/concepts/volumes.md).
 
 ## Deployment
 
 Many serving frameworks including vLLM and TGI have TPU support.
-Here's an example of a [service](https://dstack.ai/docs/services) that deploys Llama 3.1 8B using
+Here's an example of a [service](../../docs/concepts/services.md) that deploys Llama 3.1 8B using
 [Optimum TPU](https://github.com/huggingface/optimum-tpu)
 and [vLLM](https://github.com/vllm-project/vllm).
 
 === "Optimum TPU"
 
-
+
```yaml type: service @@ -61,7 +61,7 @@ and [vLLM](https://github.com/vllm-project/vllm). the official Docker image can be used. === "vLLM" -
+
```yaml type: service @@ -189,5 +189,5 @@ Note, `v5litepod` is optimized for fine-tuning transformer-based models. Each co 1. Browse [Optimum TPU](https://github.com/huggingface/optimum-tpu), [Optimum TPU TGI](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference) and [vLLM](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html). -2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), - [services](https://dstack.ai/docs/services), and [fleets](https://dstack.ai/docs/concepts/fleets). +2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), + [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md). diff --git a/docs/examples/accelerators/tpu/index.md b/docs/examples/accelerators/tpu/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/clusters/aws/README.md b/docs/examples/clusters/aws.md similarity index 84% rename from examples/clusters/aws/README.md rename to docs/examples/clusters/aws.md index b6319e214e..688af91e0e 100644 --- a/examples/clusters/aws/README.md +++ b/docs/examples/clusters/aws.md @@ -42,7 +42,7 @@ projects: Once your backend is ready, define a fleet configuration. -
+
```yaml type: fleet @@ -62,7 +62,7 @@ Provision the fleet with `dstack apply`:
```shell -$ dstack apply -f examples/clusters/aws/efa-fleet.dstack.yml +$ dstack apply -f efa-fleet.dstack.yml Provisioning... ---> 100% @@ -96,7 +96,7 @@ Provisioning... To confirm that EFA is working, run NCCL tests: -
+
```yaml type: task @@ -135,7 +135,7 @@ Run it with `dstack apply`:
```shell -$ dstack apply -f examples/clusters/nccl-tests/.dstack.yml +$ dstack apply -f nccl-tests.dstack.yml Provisioning... ---> 100% @@ -150,7 +150,7 @@ Provisioning... Here’s an example using `torchrun` for a simple multi-node PyTorch job: -
+
```yaml type: task @@ -186,7 +186,7 @@ Provision and launch it via `dstack apply`.
```shell -$ dstack apply -f examples/distributed-training/torchrun/.dstack.yml +$ dstack apply -f train-distrib.dstack.yml Provisioning... ---> 100% @@ -197,7 +197,6 @@ Provisioning... Instead of setting `python`, you can specify your own Docker image using `image`. Make sure that the image is properly configured for EFA. !!! info "What's next" - 1. Learn more about [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks) - 2. Check [dev environments](https://dstack.ai/docs/concepts/dev-environments), - [services](https://dstack.ai/docs/concepts/services), and [fleets](https://dstack.ai/docs/concepts/fleets) - 3. Read the [Clusters](https://dstack.ai/docs/guides/clusters) guide + 1. Learn more about [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks) and [cluster placement](../../docs/concepts/fleets.md#cluster-placement) + 2. Check [dev environments](../../docs/concepts/dev-environments.md), + [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) diff --git a/docs/examples/clusters/aws/index.md b/docs/examples/clusters/aws/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/clusters/crusoe/README.md b/docs/examples/clusters/crusoe.md similarity index 88% rename from examples/clusters/crusoe/README.md rename to docs/examples/clusters/crusoe.md index ed416ae3e7..2a9c108ec6 100644 --- a/examples/clusters/crusoe/README.md +++ b/docs/examples/clusters/crusoe.md @@ -67,7 +67,7 @@ $ dstack apply -f crusoe-fleet.dstack.yml This will automatically create an IB partition and provision instances with InfiniBand networking. -Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). +Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). > If you want instances to be provisioned on demand, you can set `nodes` to `0..2`. In this case, `dstack` will create instances only when you run workloads. @@ -84,7 +84,7 @@ Once the fleet is created, you can run [dev environments](https://dstack.ai/docs ### Configure the backend -Follow the standard instructions for setting up a [`kubernetes`](https://dstack.ai/docs/concepts/backends/#kubernetes) backend: +Follow the standard instructions for setting up a [`kubernetes`](../../docs/concepts/backends.md#kubernetes) backend:
@@ -133,15 +133,15 @@ $ dstack apply -f crusoe-fleet.dstack.yml
-Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). +Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). ## NCCL tests -Use a [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) that runs NCCL tests to validate cluster network bandwidth. +Use a [distributed task](../../docs/concepts/tasks.md#distributed-tasks) that runs NCCL tests to validate cluster network bandwidth. === "VMs" - With the Crusoe backend, HPC-X and NCCL topology files are pre-installed on the host VM image. Mount them into the container via [instance volumes](https://dstack.ai/docs/concepts/volumes#instance-volumes). + With the Crusoe backend, HPC-X and NCCL topology files are pre-installed on the host VM image. Mount them into the container via [instance volumes](../../docs/concepts/volumes.md#instance-volumes).
@@ -275,6 +275,6 @@ $ dstack apply -f crusoe-nccl-tests.dstack.yml ## What's next -1. Learn about [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), [services](https://dstack.ai/docs/concepts/services) -2. Check out [backends](https://dstack.ai/docs/concepts/backends#crusoe-cloud) and [fleets](https://dstack.ai/docs/concepts/fleets#cloud-fleets) +1. Learn about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md) +2. Check out [backends](../../docs/concepts/backends.md#crusoe-cloud) and [fleets](../../docs/concepts/fleets.md#cloud-fleets) 3. Check the docs on [Crusoe's networking](https://docs.crusoecloud.com/networking/infiniband/) and ["Crusoe Managed" Kubernetes](https://docs.crusoecloud.com/orchestration/cmk/index.html) diff --git a/docs/examples/clusters/crusoe/index.md b/docs/examples/clusters/crusoe/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/clusters/gcp/README.md b/docs/examples/clusters/gcp.md similarity index 76% rename from examples/clusters/gcp/README.md rename to docs/examples/clusters/gcp.md index a4610235b2..b0f0393200 100644 --- a/examples/clusters/gcp/README.md +++ b/docs/examples/clusters/gcp.md @@ -191,7 +191,7 @@ Once you've configured the `gcp` backend, create the fleet configuration: === "A4" -
+
```yaml type: fleet @@ -220,7 +220,7 @@ Once you've configured the `gcp` backend, create the fleet configuration:
```shell - $ dstack apply -f examples/clusters/gcp/a4-fleet.dstack.yml + $ dstack apply -f a4-fleet.dstack.yml Provisioning... ---> 100% @@ -257,7 +257,7 @@ Once you've configured the `gcp` backend, create the fleet configuration:
```shell - $ dstack apply -f examples/clusters/gcp/a3mega-fleet.dstack.yml + $ dstack apply -f a3mega-fleet.dstack.yml FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED a3mega-fleet 1 gcp (europe-west4) H100:80GB:8 $22.1525 (spot) idle 9 mins ago @@ -273,7 +273,7 @@ Once you've configured the `gcp` backend, create the fleet configuration: === "A3 High/Edge" -
+
```yaml type: fleet @@ -296,7 +296,7 @@ Once you've configured the `gcp` backend, create the fleet configuration:
 ```shell
-    $ dstack apply -f examples/clusters/gcp/a3high-fleet.dstack.yml
+    $ dstack apply -f a3high-fleet.dstack.yml
 
     FLEET         INSTANCE  BACKEND             GPU          PRICE            STATUS  CREATED
     a3high-fleet  1         gcp (europe-west4)  H100:80GB:8  $20.5688 (spot)  idle    9 mins ago
@@ -324,7 +324,7 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt
 
```shell - $ dstack apply -f examples/clusters/nccl-tests/.dstack.yml + $ dstack apply -f nccl-tests.dstack.yml Provisioning... ---> 100% @@ -351,15 +351,70 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt
=== "A3 Mega" - !!! info "Source code" - The source code of the task can be found at [examples/clusters/gcp/a3mega-nccl-tests.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/gcp/a3mega-nccl-tests.dstack.yml). + +
+ + ```yaml + type: task + name: nccl-tests + nodes: 2 + image: nvcr.io/nvidia/pytorch:24.04-py3 + entrypoint: "bash -c" # Need to use bash instead of default dash for nccl-env-profile.sh + commands: + - | + # Setup TCPXO NCCL env variables + NCCL_LIB_DIR="/var/lib/tcpxo/lib64" + source ${NCCL_LIB_DIR}/nccl-env-profile-ll128.sh + export NCCL_FASTRAK_CTRL_DEV=enp0s12 + export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 + export NCCL_SOCKET_IFNAME=enp0s12 + export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY="/dev/aperture_devices" + export LD_LIBRARY_PATH="${NCCL_LIB_DIR}:${LD_LIBRARY_PATH}" + # Build NCCL Tests + git clone https://github.com/NVIDIA/nccl-tests.git + cd nccl-tests + MPI=1 CC=mpicc CXX=mpicxx make -j + cd build + # We use FIFO for inter-node communication + FIFO=/tmp/dstack_job + if [ ${DSTACK_NODE_RANK} -eq 0 ]; then + sleep 10 + echo "${DSTACK_NODES_IPS}" > hostfile + MPIRUN='mpirun --allow-run-as-root --hostfile hostfile' + # Wait for other nodes + while true; do + if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then + break + fi + echo 'Waiting for nodes...' + sleep 5 + done + # Run NCCL Tests + ${MPIRUN} \ + -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \ + --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 \ + $(env | awk -F= '{print "-x", $1}' | xargs) \ + ./all_gather_perf -b 8M -e 8G -f 2 -g 1 -w 5 --iters 200 -c 0; + # Notify nodes the job is done + ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}" + else + mkfifo ${FIFO} + # Wait for a message from the first node + cat ${FIFO} + fi + spot_policy: auto + resources: + shm_size: 16GB + ``` + +
Pass the configuration to `dstack apply`:
```shell - $ dstack apply -f examples/clusters/gcp/a3mega-nccl-tests.dstack.yml + $ dstack apply -f nccl-tests.dstack.yml nccl-tests provisioning completed (running) nThread 1 nGpus 1 minBytes 8388608 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 200 agg iters: 1 validation: 0 graph: 0 @@ -385,15 +440,57 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt
=== "A3 High/Edge" - !!! info "Source code" - The source code of the task can be found at [examples/clusters/nccl-tests/.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/nccl-tests/.dstack.yml). - + +
+ + ```yaml + type: task + name: nccl-tests + nodes: 2 + image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx + commands: + - | + export NCCL_DEBUG=INFO + export LD_LIBRARY_PATH=/usr/local/tcpx/lib64:$LD_LIBRARY_PATH + # We use FIFO for inter-node communication + FIFO=/tmp/dstack_job + if [ ${DSTACK_NODE_RANK} -eq 0 ]; then + mkdir -p /scripts/hostfiles2 + : > /scripts/hostfiles2/hostfile8 + for ip in ${DSTACK_NODES_IPS}; do + echo "${ip} slots=${DSTACK_GPUS_PER_NODE}" >> /scripts/hostfiles2/hostfile8 + done + MPIRUN='mpirun --allow-run-as-root --hostfile /scripts/hostfiles2/hostfile8' + # Wait for other nodes + while true; do + if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then + break + fi + echo 'Waiting for nodes...' + sleep 5 + done + # Run NCCL Tests + NCCL_GPUDIRECTTCPX_FORCE_ACK=0 /scripts/run-allgather.sh 8 eth1,eth2,eth3,eth4 8M 8GB 2 + # Notify nodes the job is done + ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}" + else + mkfifo ${FIFO} + # Wait for a message from the first node + cat ${FIFO} + fi + spot_policy: auto + resources: + shm_size: 16GB + ``` + +
+ Pass the configuration to `dstack apply`:
```shell - $ dstack apply -f examples/clusters/gcp/a3high-nccl-tests.dstack.yml + $ dstack apply -f nccl-tests.dstack.yml nccl-tests provisioning completed (running) nThread 1 nGpus 1 minBytes 8388608 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 200 agg iters: 1 validation: 0 graph: 0 @@ -418,16 +515,13 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt
- !!! info "Source code" - The source code of the task can be found at [examples/clusters/gcp/a3high-nccl-tests.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/gcp/a3high-nccl-tests.dstack.yml). - ### Distributed training === "A4" - You can use the standard [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) example to run distributed training on A4 instances. + You can use the standard [distributed task](../../docs/concepts/tasks.md#distributed-tasks) example to run distributed training on A4 instances. === "A3 Mega" - You can use the standard [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) example to run distributed training on A3 Mega instances. To enable GPUDirect-TCPX, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning: + You can use the standard [distributed task](../../docs/concepts/tasks.md#distributed-tasks) example to run distributed training on A3 Mega instances. To enable GPUDirect-TCPX, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning: ```shell # ... @@ -446,7 +540,7 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt ``` === "A3 High/Edge" - You can use the standard [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) example to run distributed training on A3 High/Edge instances. To enable GPUDirect-TCPX0, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning: + You can use the standard [distributed task](../../docs/concepts/tasks.md#distributed-tasks) example to run distributed training on A3 High/Edge instances. To enable GPUDirect-TCPX0, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning: ```shell # ... @@ -483,6 +577,6 @@ In addition to distributed training, you can of course run regular tasks, dev en ## What's new -1. Learn about [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), [services](https://dstack.ai/docs/concepts/services) -2. Read the [Clusters](https://dstack.ai/docs/guides/clusters) guide +1. Learn about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md) +2. Read about [cluster placement](../../docs/concepts/fleets.md#cluster-placement) 3. 
Check GCP's docs on using [A4](https://docs.cloud.google.com/compute/docs/gpus/create-gpu-vm-a3u-a4), and [A3 Mega/High/Edge](https://docs.cloud.google.com/compute/docs/gpus/gpudirect) instances diff --git a/docs/examples/clusters/gcp/index.md b/docs/examples/clusters/gcp/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/clusters/lambda/README.md b/docs/examples/clusters/lambda.md similarity index 84% rename from examples/clusters/lambda/README.md rename to docs/examples/clusters/lambda.md index 07fb0ce926..e66e74573a 100644 --- a/examples/clusters/lambda/README.md +++ b/docs/examples/clusters/lambda.md @@ -19,7 +19,7 @@ description: Setting up Lambda clusters using Kubernetes or 1-Click Clusters wit ### Configure the backend -Follow the standard instructions for setting up a [Kubernetes](https://dstack.ai/docs/concepts/backends/#kubernetes) backend: +Follow the standard instructions for setting up a [Kubernetes](../../docs/concepts/backends.md#kubernetes) backend:
@@ -68,11 +68,11 @@ $ dstack apply -f lambda-fleet.dstack.yml
-Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services).
+Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md).
 
 ## 1-Click Clusters
 
-Another way to work with Lambda clusters is through [1CC](https://lambda.ai/1-click-clusters). While `dstack` supports automated cluster provisioning via [VM-based backends](https://dstack.ai/docs/concepts/backends#vm-based), there is currently no programmatic way to provision Lambda 1CCs. As a result, to use a 1CC cluster with `dstack`, you must use [SSH fleets](https://dstack.ai/docs/concepts/fleets).
+Another way to work with Lambda clusters is through [1CC](https://lambda.ai/1-click-clusters). While `dstack` supports automated cluster provisioning via [VM-based backends](../../docs/concepts/backends.md#vm-based), there is currently no programmatic way to provision Lambda 1CCs. As a result, to use a 1CC cluster with `dstack`, you must use [SSH fleets](../../docs/concepts/fleets.md).
 
 ### Prerequisites
 
@@ -80,7 +80,7 @@ Another way to work with Lambda clusters is through [1CC](https://lambda.ai/1-cl
 
 ### Create a fleet
 
-Follow the standard instructions for setting up an [SSH fleet](https://dstack.ai/docs/concepts/fleets/#ssh-fleets):
+Follow the standard instructions for setting up an [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets):
 
@@ -116,11 +116,11 @@ $ dstack apply -f lambda-fleet.dstack.yml
-Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). +Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). ## Run tasks -To run tasks on a cluster, you must use [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-task). +To run tasks on a cluster, you must use [distributed tasks](../../docs/concepts/tasks.md#distributed-task). ### Run NCCL tests @@ -213,6 +213,6 @@ Provisioning... ## What's next -1. Learn about [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), [services](https://dstack.ai/docs/concepts/services) -2. Read the [Kuberentes](https://dstack.ai/docs/guides/kubernetes), and [Clusters](https://dstack.ai/docs/guides/clusters) guides +1. Learn about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md) +2. Read about the [Kubernetes backend](../../docs/concepts/backends.md#kubernetes) and [cluster placement](../../docs/concepts/fleets.md#cluster-placement) 3. Check Lambda's docs on [Kubernetes](https://docs.lambda.ai/public-cloud/1-click-clusters/managed-kubernetes/#accessing-mk8s) and [1CC](https://docs.lambda.ai/public-cloud/1-click-clusters/) diff --git a/docs/examples/clusters/lambda/index.md b/docs/examples/clusters/lambda/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/clusters/nccl-rccl-tests/README.md b/docs/examples/clusters/nccl-rccl-tests.md similarity index 82% rename from examples/clusters/nccl-rccl-tests/README.md rename to docs/examples/clusters/nccl-rccl-tests.md index a9cadd82e8..4c565d8c68 100644 --- a/examples/clusters/nccl-rccl-tests/README.md +++ b/docs/examples/clusters/nccl-rccl-tests.md @@ -5,10 +5,10 @@ description: Running NCCL and RCCL tests to validate cluster network bandwidth # NCCL/RCCL tests -This example shows how to run [NCCL](https://github.com/NVIDIA/nccl-tests) or [RCCL](https://github.com/ROCm/rccl-tests) tests on a cluster using [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks). +This example shows how to run [NCCL](https://github.com/NVIDIA/nccl-tests) or [RCCL](https://github.com/ROCm/rccl-tests) tests on a cluster using [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks). !!! info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#cluster-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../docs/concepts/fleets.md#cluster-placement) or an [SSH fleet](../../docs/concepts/fleets.md#ssh-placement)). ## Running as a task @@ -16,7 +16,7 @@ Here's an example of a task that runs AllReduce test on 2 nodes, each with 4 GPU === "NCCL tests" -
+
```yaml type: task @@ -59,7 +59,7 @@ Here's an example of a task that runs AllReduce test on 2 nodes, each with 4 GPU === "RCCL tests" -
+
```yaml type: task @@ -120,12 +120,12 @@ Here's an example of a task that runs AllReduce test on 2 nodes, each with 4 GPU ### Apply a configuration -To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply/) command. +To run a configuration, use the [`dstack apply`](../../docs/reference/cli/dstack/apply.md) command.
```shell -$ dstack apply -f examples/clusters/nccl-rccl-tests/nccl-tests.dstack.yml +$ dstack apply -f nccl-tests.dstack.yml # BACKEND REGION INSTANCE RESOURCES SPOT PRICE 1 aws us-east-1 g4dn.12xlarge 48xCPU, 192GB, 4xT4 (16GB), 100.0GB (disk) no $3.912 @@ -139,5 +139,5 @@ Submit the run nccl-tests? [y/n]: y ## What's next? -1. Check [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), - [services](https://dstack.ai/docsconcepts/services), and [fleets](https://dstack.ai/docs/concepts/fleets). +1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), + [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md). diff --git a/docs/examples/clusters/nccl-rccl-tests/index.md b/docs/examples/clusters/nccl-rccl-tests/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/clusters/nebius/README.md b/docs/examples/clusters/nebius.md similarity index 90% rename from examples/clusters/nebius/README.md rename to docs/examples/clusters/nebius.md index 9f8bd349a0..6986a10ab5 100644 --- a/examples/clusters/nebius/README.md +++ b/docs/examples/clusters/nebius.md @@ -75,7 +75,7 @@ $ dstack apply -f nebius-fleet.dstack.yml This will automatically create a Nebius cluster and provision instances. -Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). +Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). > If you want instances to be provisioned on demand, you can set `nodes` to `0..2`. In this case, `dstack` will create instances only when you run workloads. @@ -107,7 +107,7 @@ $ nebius mk8s cluster get-credentials --id <cluster id> --external ### Configure a backend -Follow the standard instructions for setting up a [`kubernetes`](https://dstack.ai/docs/concepts/backends/#kubernetes) backend: +Follow the standard instructions for setting up a [`kubernetes`](../../docs/concepts/backends.md#kubernetes) backend:
@@ -154,11 +154,11 @@ $ dstack apply -f nebius-fleet.dstack.yml
-Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). +Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). ## NCCL tests -Use a [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) to run NCCL tests and validate the cluster’s network bandwidth. +Use a [distributed task](../../docs/concepts/tasks.md#distributed-tasks) to run NCCL tests and validate the cluster’s network bandwidth.
@@ -252,6 +252,6 @@ nccl-tests provisioning completed (running) ## What's next -1. Learn about [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), [services](https://dstack.ai/docs/concepts/services) -2. Check out [backends](https://dstack.ai/docs/concepts/backends) and [fleets](https://dstack.ai/docs/concepts/fleets) +1. Learn about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md) +2. Check out [backends](../../docs/concepts/backends.md) and [fleets](../../docs/concepts/fleets.md) 3. Read Nebius' docs on [networking for VMs](https://docs.nebius.com/compute/clusters/gpu) and the [managed Kubernetes service](https://docs.nebius.com/kubernetes). diff --git a/docs/examples/clusters/nebius/index.md b/docs/examples/clusters/nebius/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/distributed-training/axolotl/README.md b/docs/examples/distributed-training/axolotl.md similarity index 75% rename from examples/distributed-training/axolotl/README.md rename to docs/examples/distributed-training/axolotl.md index cd7be95e4c..c2e04d3fc6 100644 --- a/examples/distributed-training/axolotl/README.md +++ b/docs/examples/distributed-training/axolotl.md @@ -5,16 +5,16 @@ description: Distributed fine-tuning with Axolotl and FSDP across multiple nodes # Axolotl -This example walks you through how to run distributed fine-tune using [Axolotl](https://github.com/axolotl-ai-cloud/axolotl) and [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks). +This example walks you through how to run distributed fine-tune using [Axolotl](https://github.com/axolotl-ai-cloud/axolotl) and [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks). !!! info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#cluster-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../docs/concepts/fleets.md#cluster-placement) or an [SSH fleet](../../docs/concepts/fleets.md#ssh-placement)). ## Define a configuration Once the fleet is created, define a distributed task configuration. Here's an example of distributed `QLORA` task using `FSDP`. -
+
```yaml type: task @@ -72,7 +72,7 @@ volumes: ### Apply the configuration -To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command. +To run a configuration, use the [`dstack apply`](../../docs/reference/cli/dstack/apply.md) command.
@@ -81,13 +81,13 @@ $ HF_TOKEN=... $ WANDB_API_KEY=... $ WANDB_PROJECT=... $ HUB_MODEL_ID=... -$ dstack apply -f examples/distributed-training/trl/fsdp.dstack.yml +$ dstack apply -f train-distrib.dstack.yml # BACKEND RESOURCES INSTANCE TYPE PRICE 1 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle 2 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle -Submit the run trl-train-fsdp-distrib? [y/n]: y +Submit the run axolotl-multi-node-qlora-llama3-70b? [y/n]: y Provisioning... ---> 100% @@ -95,6 +95,6 @@ Provisioning...
!!! info "What's next?" - 1. Read the [clusters](https://dstack.ai/docs/guides/clusters) guide - 2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), - [services](https://dstack.ai/docs/concepts/services), and [fleets](https://dstack.ai/docs/concepts/fleets) + 1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), + [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) + 2. Read about [cluster placement](../../docs/concepts/fleets.md#cluster-placement) diff --git a/docs/examples/distributed-training/axolotl/index.md b/docs/examples/distributed-training/axolotl/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/distributed-training/ray-ragen/README.md b/docs/examples/distributed-training/ray-ragen.md similarity index 86% rename from examples/distributed-training/ray-ragen/README.md rename to docs/examples/distributed-training/ray-ragen.md index f7bd80d5c2..e3194b2b3a 100644 --- a/examples/distributed-training/ray-ragen/README.md +++ b/docs/examples/distributed-training/ray-ragen.md @@ -11,7 +11,7 @@ to fine-tune an agent on multiple nodes. Under the hood `RAGEN` uses [verl](https://github.com/volcengine/verl) for Reinforcement Learning and [Ray](https://docs.ray.io/en/latest/) for distributed training. !!! info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#cluster-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../docs/concepts/fleets.md#cluster-placement) or an [SSH fleet](../../docs/concepts/fleets.md#ssh-placement)). ## Run a Ray cluster @@ -19,11 +19,11 @@ If you want to use Ray with `dstack`, you have to first run a Ray cluster. The task below runs a Ray cluster on an existing fleet: -
+
```yaml type: task -name: ray-ragen-cluster +name: ray-cluster nodes: 2 @@ -76,7 +76,7 @@ Now, if you run this task via `dstack apply`, it will automatically forward the
```shell -$ dstack apply -f examples/distributed-training/ray-ragen/.dstack.yml +$ dstack apply -f ray-cluster.dstack.yml ```
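Once the cluster task is provisioned, jobs are submitted against the forwarded Ray dashboard port (8265 in the configuration above). A minimal sketch, assuming the run is attached locally and an illustrative `train.py` entrypoint:

```shell
# Forward the Ray dashboard / job API port of the running task to localhost
$ dstack attach ray-cluster

# Submit a job to the forwarded Ray head node (the entrypoint name is illustrative)
$ ray job submit --address http://localhost:8265 -- python train.py
```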
@@ -130,6 +130,5 @@ $ ray job submit \ Using Ray via `dstack` is a powerful way to get access to the rich Ray ecosystem while benefiting from `dstack`'s provisioning capabilities. !!! info "What's next" - 1. Check the [Clusters](https://dstack.ai/docs/guides/clusters) guide - 2. Read about [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks) and [fleets](https://dstack.ai/docs/concepts/fleets) - 3. Browse Ray's [docs](https://docs.ray.io/en/latest/train/examples.html) for other examples. + 1. Read about [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks), [fleets](../../docs/concepts/fleets.md), and [cluster placement](../../docs/concepts/fleets.md#cluster-placement) + 2. Browse Ray's [docs](https://docs.ray.io/en/latest/train/examples.html) for other examples. diff --git a/docs/examples/distributed-training/ray-ragen/index.md b/docs/examples/distributed-training/ray-ragen/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/distributed-training/trl/README.md b/docs/examples/distributed-training/trl.md similarity index 83% rename from examples/distributed-training/trl/README.md rename to docs/examples/distributed-training/trl.md index 47d3f6f888..3a25c04b48 100644 --- a/examples/distributed-training/trl/README.md +++ b/docs/examples/distributed-training/trl.md @@ -8,7 +8,7 @@ description: Distributed fine-tuning with TRL, Accelerate, and DeepSpeed This example walks you through how to run distributed fine-tuning using [TRL](https://github.com/huggingface/trl), [Accelerate](https://github.com/huggingface/accelerate), and [DeepSpeed](https://github.com/deepspeedai/DeepSpeed). !!! info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#cluster-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../docs/concepts/fleets.md#cluster-placement) or an [SSH fleet](../../docs/concepts/fleets.md#ssh-placement)). ## Define a configuration @@ -16,7 +16,7 @@ Once the fleet is created, define a distributed task configuration. Here's an ex === "FSDP" -
+
```yaml type: task name: trl-train-fsdp-distrib @@ -73,7 +73,7 @@ Once the fleet is created, define a distributed task configuration. Here's an ex === "DeepSpeed ZeRO-3" -
+
```yaml type: task name: trl-train-deepspeed-distrib @@ -133,7 +133,7 @@ Once the fleet is created, define a distributed task configuration. Here's an ex ### Apply the configuration -To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command. +To run a configuration, use the [`dstack apply`](../../docs/reference/cli/dstack/apply.md) command.
@@ -141,7 +141,7 @@ To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/referenc $ HF_TOKEN=... $ WANDB_API_KEY=... $ HUB_MODEL_ID=... -$ dstack apply -f examples/distributed-training/trl/fsdp.dstack.yml +$ dstack apply -f train-distrib.dstack.yml # BACKEND RESOURCES INSTANCE TYPE PRICE 1 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle @@ -155,6 +155,6 @@ Provisioning...
!!! info "What's next?" - 1. Read the [clusters](https://dstack.ai/docs/guides/clusters) guide - 2. Check [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), - [services](https://dstack.ai/docs/concepts/services), and [fleets](https://dstack.ai/docs/concepts/fleets) + 1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), + [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) + 2. Read about [cluster placement](../../docs/concepts/fleets.md#cluster-placement) diff --git a/docs/examples/distributed-training/trl/index.md b/docs/examples/distributed-training/trl/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/inference/nim/README.md b/docs/examples/inference/nim.md similarity index 80% rename from examples/inference/nim/README.md rename to docs/examples/inference/nim.md index 680c51f498..263baa2737 100644 --- a/examples/inference/nim/README.md +++ b/docs/examples/inference/nim.md @@ -8,7 +8,7 @@ description: Deploying Nemotron-3-Super-120B-A12B using NVIDIA NIM This example shows how to deploy Nemotron-3-Super-120B-A12B using [NVIDIA NIM](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html) and `dstack`. ??? info "Prerequisites" - Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples. + Once `dstack` is [installed](../../docs/installation.md), clone the repo with examples.
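The clone commands themselves fall outside this hunk; a typical sequence, with the working directory assumed, looks like:

```shell
# Clone the dstack repository to get the example configurations
$ git clone https://github.com/dstackai/dstack
$ cd dstack
```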
@@ -23,7 +23,7 @@ This example shows how to deploy Nemotron-3-Super-120B-A12B using [NVIDIA NIM](h Here's an example of a service that deploys Nemotron-3-Super-120B-A12B using NIM. -
+
```yaml type: service @@ -54,13 +54,13 @@ resources: ### Running a configuration -Save the configuration above as `nemotron120.dstack.yml`, then use the -[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command. +Save the configuration above as `service.dstack.yml`, then use the +[`dstack apply`](../../docs/reference/cli/dstack/apply.md) command.
```shell $ NGC_API_KEY=... -$ dstack apply -f nemotron120.dstack.yml +$ dstack apply -f service.dstack.yml ```
@@ -91,9 +91,9 @@ $ curl http://127.0.0.1:3000/proxy/services/main/nemotron120/v1/chat/completions
-When a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured, the service endpoint will be available at `https://nemotron120./`. +When a [gateway](../../docs/concepts/gateways.md) is configured, the service endpoint will be available at `https://nemotron120./`. ## What's next? -1. Check [services](https://dstack.ai/docs/services) +1. Check [services](../../docs/concepts/services.md) 2. Browse the [Nemotron-3-Super-120B-A12B model page](https://build.nvidia.com/nvidia/nemotron-3-super-120b-a12b) diff --git a/docs/examples/inference/nim/index.md b/docs/examples/inference/nim/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/inference/sglang/README.md b/docs/examples/inference/sglang.md similarity index 89% rename from examples/inference/sglang/README.md rename to docs/examples/inference/sglang.md index 3f2694c655..feda39a46d 100644 --- a/examples/inference/sglang/README.md +++ b/docs/examples/inference/sglang.md @@ -9,7 +9,7 @@ This example shows how to deploy `Qwen/Qwen3.6-27B` using [SGLang](https://github.com/sgl-project/sglang) and `dstack`. > For a `DeepSeek-V4-Pro` deployment on `B200:8`, see the -[DeepSeek V4](../../models/deepseek-v4/index.md) model page. +[DeepSeek V4](../models/deepseek-v4.md) model page. ## Apply a configuration @@ -18,7 +18,7 @@ Here's an example of a service that deploys === "NVIDIA" -
+
```yaml type: service @@ -53,7 +53,7 @@ Here's an example of a service that deploys === "AMD" -
+
```yaml type: service @@ -94,13 +94,13 @@ guidance: a pinned ROCm image, tensor parallelism across all four GPUs, and the standard `qwen3` reasoning parser without extra ROCm-specific tuning flags. The first startup on MI300X can take longer while SGLang compiles ROCm kernels. -Save one of the configurations above as `qwen36.dstack.yml`, then use the -[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command. +Save one of the configurations above as `service.dstack.yml`, then use the +[`dstack apply`](../../docs/reference/cli/dstack/apply.md) command.
```shell -$ dstack apply -f qwen36.dstack.yml +$ dstack apply -f service.dstack.yml ```
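Once the service is running, it can be queried through the server's OpenAI-compatible proxy; a sketch, assuming a local server at `127.0.0.1:3000`, the `main` project, and the model name from the configuration above (an `Authorization: Bearer <dstack token>` header may also be required):

```shell
$ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{
          "model": "Qwen/Qwen3.6-27B",
          "messages": [{"role": "user", "content": "What is dstack?"}]
        }'
```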
@@ -132,7 +132,7 @@ Qwen3.6 uses thinking mode by default. To disable thinking, pass `"chat_template_kwargs": {"enable_thinking": false}` in the request body. To enable tool calling, add `--tool-call-parser qwen3_coder` to the serve command. -> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. +> If a [gateway](../../docs/concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. ## Configuration options @@ -221,5 +221,5 @@ Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics ## What's next? -1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways) +1. Read about [services](../../docs/concepts/services.md) and [gateways](../../docs/concepts/gateways.md) 2. Browse the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) and the [SGLang server arguments reference](https://docs.sglang.ai/advanced_features/server_arguments.html) diff --git a/docs/examples/inference/sglang/index.md b/docs/examples/inference/sglang/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/inference/trtllm/README.md b/docs/examples/inference/trtllm.md similarity index 83% rename from examples/inference/trtllm/README.md rename to docs/examples/inference/trtllm.md index ae3666d225..8f95cefc63 100644 --- a/examples/inference/trtllm/README.md +++ b/docs/examples/inference/trtllm.md @@ -13,7 +13,7 @@ This example shows how to deploy `nvidia/Qwen3-235B-A22B-FP8` using Here's an example of a service that deploys `nvidia/Qwen3-235B-A22B-FP8` using TensorRT-LLM. -
+
```yaml type: service @@ -53,12 +53,12 @@ resources: ```
-Apply it with [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md): +Apply it with [`dstack apply`](../../docs/reference/cli/dstack/apply.md):
```shell -$ dstack apply -f qwen235.dstack.yml +$ dstack apply -f service.dstack.yml ```
@@ -90,10 +90,10 @@ $ curl http://127.0.0.1:3000/proxy/services/main/qwen235/v1/chat/completions \
-When a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured, the service endpoint will be available at `https://qwen235./`. +When a [gateway](../../docs/concepts/gateways.md) is configured, the service endpoint will be available at `https://qwen235./`. ## What's next? -1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways) +1. Read about [services](../../docs/concepts/services.md) and [gateways](../../docs/concepts/gateways.md) 2. Browse the [TensorRT-LLM deployment guides](https://nvidia.github.io/TensorRT-LLM/deployment-guide/index.html) and the [Qwen3 deployment guide](https://nvidia.github.io/TensorRT-LLM/deployment-guide/deployment-guide-for-qwen3-on-trtllm.html) 3. See the [`trtllm-serve` reference](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve/trtllm-serve.html) diff --git a/docs/examples/inference/trtllm/index.md b/docs/examples/inference/trtllm/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/inference/vllm/README.md b/docs/examples/inference/vllm.md similarity index 77% rename from examples/inference/vllm/README.md rename to docs/examples/inference/vllm.md index 75d6add9be..4ac880defc 100644 --- a/examples/inference/vllm/README.md +++ b/docs/examples/inference/vllm.md @@ -15,7 +15,7 @@ Here's an example of a service that deploys === "NVIDIA" -
+
```yaml type: service @@ -49,7 +49,7 @@ Here's an example of a service that deploys === "AMD" -
+
```yaml type: service @@ -88,13 +88,13 @@ Qwen3.6-27B is a multimodal model. For text-only workloads, add `--language-model-only` to free more memory for the KV cache. To enable tool calling, add `--enable-auto-tool-choice --tool-call-parser qwen3_coder`. -Save one of the configurations above as `qwen36.dstack.yml`, then use the -[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command. +Save one of the configurations above as `service.dstack.yml`, then use the +[`dstack apply`](../../docs/reference/cli/dstack/apply.md) command.
```shell -$ dstack apply -f qwen36.dstack.yml +$ dstack apply -f service.dstack.yml ```
@@ -122,9 +122,9 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
-> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. +> If a [gateway](../../docs/concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. ## What's next? -1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways) -2. Browse the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) and the [SGLang](https://dstack.ai/examples/inference/sglang/) example +1. Read about [services](../../docs/concepts/services.md) and [gateways](../../docs/concepts/gateways.md) +2. Browse the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) and the [SGLang](../inference/sglang.md) example diff --git a/docs/examples/inference/vllm/index.md b/docs/examples/inference/vllm/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/models/deepseek-v4/README.md b/docs/examples/models/deepseek-v4.md similarity index 93% rename from examples/models/deepseek-v4/README.md rename to docs/examples/models/deepseek-v4.md index b36a343018..7efd9977e8 100644 --- a/examples/models/deepseek-v4/README.md +++ b/docs/examples/models/deepseek-v4.md @@ -6,7 +6,7 @@ description: Deploying DeepSeek-V4-Pro using SGLang on NVIDIA B200:8 # DeepSeek V4 This example shows how to deploy `deepseek-ai/DeepSeek-V4-Pro` as a -[service](https://dstack.ai/docs/services) using +[service](../../docs/concepts/services.md) using [SGLang](https://github.com/sgl-project/sglang) and `dstack`. ## Apply a configuration @@ -64,7 +64,7 @@ This configuration uses the single-node Blackwell `DeepSeek-V4-Pro` recipe shape for `8 x NVIDIA B200`. Export your Hugging Face token and apply the configuration with -[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md). +[`dstack apply`](../../docs/reference/cli/dstack/apply.md).
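That step follows the same pattern as the other example pages; a sketch, with the configuration file name assumed:

```shell
$ HF_TOKEN=...
$ dstack apply -f deepseek-v4.dstack.yml
```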
@@ -151,4 +151,4 @@ This returns both: 1. Read the [DeepSeek-V4-Pro model card](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro) 2. Read the [DeepSeek-V4 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4) -3. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) and [vLLM](https://dstack.ai/examples/inference/vllm/) examples +3. Browse the dedicated [SGLang](../inference/sglang.md) and [vLLM](../inference/vllm.md) examples diff --git a/docs/examples/models/deepseek-v4/index.md b/docs/examples/models/deepseek-v4/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/models/qwen36/README.md b/docs/examples/models/qwen36.md similarity index 91% rename from examples/models/qwen36/README.md rename to docs/examples/models/qwen36.md index bc92271b27..3723e36fa0 100644 --- a/examples/models/qwen36/README.md +++ b/docs/examples/models/qwen36.md @@ -6,7 +6,7 @@ description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs # Qwen 3.6 This example shows how to deploy `Qwen/Qwen3.6-27B` as a -[service](https://dstack.ai/docs/services) using +[service](../../docs/concepts/services.md) using [SGLang](https://github.com/sgl-project/sglang) and `dstack`. ## Apply a configuration @@ -92,7 +92,7 @@ The NVIDIA and AMD configurations above use pinned SGLang images and the same straightforward 4-GPU layout used across the Qwen 3.6 docs and examples. Apply the configuration with -[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md). +[`dstack apply`](../../docs/reference/cli/dstack/apply.md).
@@ -162,7 +162,7 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ 1. Read the [Qwen/Qwen3.6-27B model card](https://huggingface.co/Qwen/Qwen3.6-27B) 2. Read the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) 3. Read the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) -4. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) - and [vLLM](https://dstack.ai/examples/inference/vllm/) examples -5. Check the [AMD](https://dstack.ai/examples/accelerators/amd/) example for +4. Browse the dedicated [SGLang](../inference/sglang.md) + and [vLLM](../inference/vllm.md) examples +5. Check the [AMD](../accelerators/amd.md) example for more AMD deployment and training configurations diff --git a/docs/examples/models/qwen36/index.md b/docs/examples/models/qwen36/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/single-node-training/axolotl/README.md b/docs/examples/single-node-training/axolotl.md similarity index 78% rename from examples/single-node-training/axolotl/README.md rename to docs/examples/single-node-training/axolotl.md index 7781139e0b..3ab19d0502 100644 --- a/examples/single-node-training/axolotl/README.md +++ b/docs/examples/single-node-training/axolotl.md @@ -8,7 +8,7 @@ description: Fine-tuning models with Axolotl using FSDP and QLoRA This example shows how to use [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) with `dstack` to fine-tune 4-bit Quantized `Llama-4-Scout-17B-16E` using SFT with FSDP and QLoRA. ??? info "Prerequisites" - Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples. + Once `dstack` is [installed](../../docs/installation.md), clone the repo with examples.
@@ -25,7 +25,7 @@ Axolotl reads the model, QLoRA, and dataset arguments, as well as trainer config Below is a task configuration that does fine-tuning. -
+
```yaml type: task @@ -63,7 +63,7 @@ resources: The task uses Axolotl's Docker image, where Axolotl is already pre-installed. !!! info "AMD" - The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](https://dstack.ai/examples/accelerators/amd#axolotl). + The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](../accelerators/amd.md#axolotl). ## Run the configuration @@ -77,7 +77,7 @@ $ HF_TOKEN=... $ WANDB_API_KEY=... $ WANDB_PROJECT=... $ HUB_MODEL_ID=... -$ dstack apply -f examples/single-node-training/axolotl/.dstack.yml +$ dstack apply -f train.dstack.yml # BACKEND RESOURCES INSTANCE TYPE PRICE 1 vastai (cz-czechia) cpu=64 mem=128GB H100:80GB:2 18794506 $3.8907 @@ -94,7 +94,7 @@ Provisioning... ## What's next? -1. Browse the [Axolotl distributed training](https://dstack.ai/docs/examples/distributed-training/axolotl) example -2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), - [services](https://dstack.ai/docs/services), [fleets](https://dstack.ai/docs/concepts/fleets) -3. See the [AMD](https://dstack.ai/examples/accelerators/amd#axolotl) example +1. Browse the [Axolotl distributed training](../distributed-training/axolotl.md) example +2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), + [services](../../docs/concepts/services.md), [fleets](../../docs/concepts/fleets.md) +3. See the [AMD](../accelerators/amd.md#axolotl) example diff --git a/docs/examples/single-node-training/axolotl/index.md b/docs/examples/single-node-training/axolotl/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/single-node-training/trl/README.md b/docs/examples/single-node-training/trl.md similarity index 83% rename from examples/single-node-training/trl/README.md rename to docs/examples/single-node-training/trl.md index 82dca87a98..7295055259 100644 --- a/examples/single-node-training/trl/README.md +++ b/docs/examples/single-node-training/trl.md @@ -11,7 +11,7 @@ This example walks you through how to use [TRL](https://github.com/huggingface/t Below is a task configuration that does fine-tuning. -
+
```yaml type: task @@ -74,7 +74,7 @@ resources: Change the `resources` property to specify more GPUs. !!! info "AMD" - The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](https://dstack.ai/examples/accelerators/amd#trl). + The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](../accelerators/amd.md#trl). ??? info "DeepSpeed" For more memory-efficient use of multiple GPUs, consider using DeepSpeed and ZeRO Stage 3. @@ -93,7 +93,7 @@ cloud resources and run the configuration. $ HF_TOKEN=... $ WANDB_API_KEY=... $ HUB_MODEL_ID=... -$ dstack apply -f examples/single-node-training/trl/train.dstack.yml +$ dstack apply -f train.dstack.yml # BACKEND RESOURCES INSTANCE TYPE PRICE 1 vastai (cz-czechia) cpu=64 mem=128GB H100:80GB:2 18794506 $3.8907 @@ -110,7 +110,7 @@ Provisioning... ## What's next? -1. Browse the [TRL distributed training](https://dstack.ai/docs/examples/distributed-training/trl) example -2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), - [services](https://dstack.ai/docs/services), and [fleets](https://dstack.ai/docs/fleets) -3. See the [AMD](https://dstack.ai/examples/accelerators/amd#trl) example +1. Browse the [TRL distributed training](../distributed-training/trl.md) example +2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), + [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) +3. See the [AMD](../accelerators/amd.md#trl) example diff --git a/docs/examples/single-node-training/trl/index.md b/docs/examples/single-node-training/trl/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/accelerators/tenstorrent/.dstack.yml b/examples/accelerators/tenstorrent/.dstack.yml deleted file mode 100644 index 6e3319a001..0000000000 --- a/examples/accelerators/tenstorrent/.dstack.yml +++ /dev/null @@ -1,9 +0,0 @@ -type: dev-environment -name: cursor - -image: dstackai/tt-smi:latest - -ide: cursor - -resources: - gpu: n150:1 diff --git a/examples/accelerators/tenstorrent/tt-inference-server.dstack.yml b/examples/accelerators/tenstorrent/tt-inference-server.dstack.yml deleted file mode 100644 index 6f1815ead1..0000000000 --- a/examples/accelerators/tenstorrent/tt-inference-server.dstack.yml +++ /dev/null @@ -1,24 +0,0 @@ -type: service -name: tt-inference-server - -env: - - HF_TOKEN - - HF_MODEL_REPO_ID=meta-llama/Llama-3.2-1B-Instruct -image: ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64:0.0.4-v0.56.0-rc47-e2e0002ac7dc -commands: - - | - . 
${PYTHON_ENV_DIR}/bin/activate - pip install "huggingface_hub[cli]" - export LLAMA_DIR="/data/models--$(echo "$HF_MODEL_REPO_ID" | sed 's/\//--/g')/" - huggingface-cli download $HF_MODEL_REPO_ID --local-dir $LLAMA_DIR - python /home/container_app_user/app/src/run_vllm_api_server.py -port: 7000 - -model: meta-llama/Llama-3.2-1B-Instruct - -# Cache downloaded model -volumes: - - /mnt/data/tt-inference-server/data:/data - -resources: - gpu: n150:1 diff --git a/examples/accelerators/tenstorrent/tt-smi.dstack.yml b/examples/accelerators/tenstorrent/tt-smi.dstack.yml deleted file mode 100644 index b9478cb166..0000000000 --- a/examples/accelerators/tenstorrent/tt-smi.dstack.yml +++ /dev/null @@ -1,10 +0,0 @@ -type: task -name: tt-smi - -image: dstackai/tt-smi:latest - -commands: - - tt-smi -s - -resources: - gpu: n150:1 diff --git a/examples/clusters/aws/fleet.dstack.yml b/examples/clusters/aws/fleet.dstack.yml deleted file mode 100644 index 9914c3df1f..0000000000 --- a/examples/clusters/aws/fleet.dstack.yml +++ /dev/null @@ -1,8 +0,0 @@ -type: fleet -name: my-efa-fleet - -nodes: 2 -placement: cluster - -resources: - gpu: H100:8 diff --git a/examples/clusters/gcp/a3-fleet.dstack.yml b/examples/clusters/gcp/a3-fleet.dstack.yml deleted file mode 100644 index 483877068d..0000000000 --- a/examples/clusters/gcp/a3-fleet.dstack.yml +++ /dev/null @@ -1,7 +0,0 @@ -type: fleet -name: a3mega-cluster -nodes: 2 -placement: cluster -instance_types: - - a3-megagpu-8g -spot_policy: auto diff --git a/examples/clusters/gcp/a3high-fleet.dstack.yml b/examples/clusters/gcp/a3high-fleet.dstack.yml deleted file mode 100644 index e9f0a9dbc2..0000000000 --- a/examples/clusters/gcp/a3high-fleet.dstack.yml +++ /dev/null @@ -1,7 +0,0 @@ -type: fleet -name: a3high-cluster -nodes: 2 -placement: cluster -instance_types: - - a3-highgpu-8g -spot_policy: auto diff --git a/examples/clusters/gcp/a3high-nccl-tests.dstack.yml b/examples/clusters/gcp/a3high-nccl-tests.dstack.yml deleted file mode 100644 index 6cacbdf54b..0000000000 --- a/examples/clusters/gcp/a3high-nccl-tests.dstack.yml +++ /dev/null @@ -1,37 +0,0 @@ -type: task -name: nccl-tests -nodes: 2 -image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx -commands: - - | - export NCCL_DEBUG=INFO - export LD_LIBRARY_PATH=/usr/local/tcpx/lib64:$LD_LIBRARY_PATH - # We use FIFO for inter-node communication - FIFO=/tmp/dstack_job - if [ ${DSTACK_NODE_RANK} -eq 0 ]; then - mkdir -p /scripts/hostfiles2 - : > /scripts/hostfiles2/hostfile8 - for ip in ${DSTACK_NODES_IPS}; do - echo "${ip} slots=${DSTACK_GPUS_PER_NODE}" >> /scripts/hostfiles2/hostfile8 - done - MPIRUN='mpirun --allow-run-as-root --hostfile /scripts/hostfiles2/hostfile8' - # Wait for other nodes - while true; do - if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then - break - fi - echo 'Waiting for nodes...' 
- sleep 5 - done - # Run NCCL Tests - NCCL_GPUDIRECTTCPX_FORCE_ACK=0 /scripts/run-allgather.sh 8 eth1,eth2,eth3,eth4 8M 8GB 2 - # Notify nodes the job is done - ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}" - else - mkfifo ${FIFO} - # Wait for a message from the first node - cat ${FIFO} - fi -spot_policy: auto -resources: - shm_size: 16GB diff --git a/examples/clusters/gcp/a3mega-nccl-tests.dstack.yml b/examples/clusters/gcp/a3mega-nccl-tests.dstack.yml deleted file mode 100644 index 8c7e49d3f9..0000000000 --- a/examples/clusters/gcp/a3mega-nccl-tests.dstack.yml +++ /dev/null @@ -1,50 +0,0 @@ -type: task -name: nccl-tests -nodes: 2 -image: nvcr.io/nvidia/pytorch:24.04-py3 -entrypoint: "bash -c" # Need to use bash instead of default dash for nccl-env-profile.sh -commands: - - | - # Setup TCPXO NCCL env variables - NCCL_LIB_DIR="/var/lib/tcpxo/lib64" - source ${NCCL_LIB_DIR}/nccl-env-profile-ll128.sh - export NCCL_FASTRAK_CTRL_DEV=enp0s12 - export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 - export NCCL_SOCKET_IFNAME=enp0s12 - export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY="/dev/aperture_devices" - export LD_LIBRARY_PATH="${NCCL_LIB_DIR}:${LD_LIBRARY_PATH}" - # Build NCCL Tests - git clone https://github.com/NVIDIA/nccl-tests.git - cd nccl-tests - MPI=1 CC=mpicc CXX=mpicxx make -j - cd build - # We use FIFO for inter-node communication - FIFO=/tmp/dstack_job - if [ ${DSTACK_NODE_RANK} -eq 0 ]; then - sleep 10 - echo "${DSTACK_NODES_IPS}" > hostfile - MPIRUN='mpirun --allow-run-as-root --hostfile hostfile' - # Wait for other nodes - while true; do - if ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 true >/dev/null 2>&1; then - break - fi - echo 'Waiting for nodes...' - sleep 5 - done - # Run NCCL Tests - ${MPIRUN} \ - -n ${DSTACK_GPUS_NUM} -N ${DSTACK_GPUS_PER_NODE} \ - --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 \ - $(env | awk -F= '{print "-x", $1}' | xargs) \ - ./all_gather_perf -b 8M -e 8G -f 2 -g 1 -w 5 --iters 200 -c 0; - # Notify nodes the job is done - ${MPIRUN} -n ${DSTACK_NODES_NUM} -N 1 sh -c "echo done > ${FIFO}" - else - mkfifo ${FIFO} - # Wait for a message from the first node - cat ${FIFO} - fi -spot_policy: auto -resources: - shm_size: 16GB diff --git a/examples/clusters/gcp/a4-fleet.dstack.yml b/examples/clusters/gcp/a4-fleet.dstack.yml deleted file mode 100644 index ac97e22def..0000000000 --- a/examples/clusters/gcp/a4-fleet.dstack.yml +++ /dev/null @@ -1,13 +0,0 @@ -type: fleet -name: a4-cluster - -nodes: 2 -placement: cluster - -# Specify the zone where you have configured the RoCE VPC -availability_zones: [us-west2-c] -backends: [gcp] -spot_policy: auto - -resources: - gpu: B200:8 diff --git a/examples/clusters/nccl-rccl-tests/nccl-tests.dstack.yml b/examples/clusters/nccl-rccl-tests/nccl-tests.dstack.yml deleted file mode 100644 index 4232e60a9e..0000000000 --- a/examples/clusters/nccl-rccl-tests/nccl-tests.dstack.yml +++ /dev/null @@ -1,29 +0,0 @@ -type: task -name: nccl-tests - -nodes: 2 -startup_order: workers-first -stop_criteria: master-done - -env: - - NCCL_DEBUG=INFO -commands: - - | - if [ $DSTACK_NODE_RANK -eq 0 ]; then - mpirun \ - --allow-run-as-root \ - --hostfile $DSTACK_MPI_HOSTFILE \ - -n $DSTACK_GPUS_NUM \ - -N $DSTACK_GPUS_PER_NODE \ - --bind-to none \ - /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 - else - sleep infinity - fi - -# Uncomment if the `kubernetes` backend requires it for `/dev/infiniband` access -#privileged: true - -resources: - gpu: nvidia:1..8 - shm_size: 16GB diff 
--git a/examples/clusters/nccl-rccl-tests/rccl-tests.dstack.yml b/examples/clusters/nccl-rccl-tests/rccl-tests.dstack.yml deleted file mode 100644 index 5beb1cd3ee..0000000000 --- a/examples/clusters/nccl-rccl-tests/rccl-tests.dstack.yml +++ /dev/null @@ -1,44 +0,0 @@ -type: task -name: rccl-tests - -nodes: 2 -startup_order: workers-first -stop_criteria: master-done - -# Mount the system libraries folder from the host -volumes: - - /usr/local/lib:/mnt/lib - -image: rocm/dev-ubuntu-22.04:6.4-complete -env: - - NCCL_DEBUG=INFO - - OPEN_MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi -commands: - # Setup MPI and build RCCL tests - - apt-get install -y git libopenmpi-dev openmpi-bin - - git clone https://github.com/ROCm/rccl-tests.git - - cd rccl-tests - - make MPI=1 MPI_HOME=$OPEN_MPI_HOME - - # Preload the RoCE driver library from the host (for Broadcom driver compatibility) - - export LD_PRELOAD=/mnt/lib/libbnxt_re-rdmav34.so - - # Run RCCL tests via MPI - - | - if [ $DSTACK_NODE_RANK -eq 0 ]; then - mpirun --allow-run-as-root \ - --hostfile $DSTACK_MPI_HOSTFILE \ - -n $DSTACK_GPUS_NUM \ - -N $DSTACK_GPUS_PER_NODE \ - --mca btl_tcp_if_include ens41np0 \ - -x LD_PRELOAD \ - -x NCCL_IB_HCA=mlx5_0/1,bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 \ - -x NCCL_IB_GID_INDEX=3 \ - -x NCCL_IB_DISABLE=0 \ - ./build/all_reduce_perf -b 8M -e 8G -f 2 -g 1 -w 5 --iters 20 -c 0; - else - sleep infinity - fi - -resources: - gpu: MI300X:8 diff --git a/examples/distributed-training/axolotl/.dstack.yml b/examples/distributed-training/axolotl/.dstack.yml deleted file mode 100644 index 6192c689d0..0000000000 --- a/examples/distributed-training/axolotl/.dstack.yml +++ /dev/null @@ -1,49 +0,0 @@ -type: task -name: axolotl-multi-node-qlora-llama3-70b - -# Size of the cluster -nodes: 2 - -# The axolotlai/axolotl:main-latest image does not include InfiniBand or RDMA libraries, so we need to use the NGC container. -image: nvcr.io/nvidia/pytorch:25.01-py3 -# Required environment variables -env: - - HF_TOKEN - - WANDB_API_KEY - - WANDB_PROJECT - - HUB_MODEL_ID - - NCCL_DEBUG=INFO - - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 - - ACCELERATE_LOG_LEVEL=info -# Commands of the task -commands: - # Replacing the default Torch and FlashAttention in the NCG container with Axolotl-compatible versions. - # The preinstalled versions are incompatible with Axolotl. - - pip uninstall -y torch flash-attn - - pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/test/cu124 - - pip install --no-build-isolation axolotl[flash-attn,deepspeed] - - wget https://raw.githubusercontent.com/huggingface/trl/main/examples/accelerate_configs/fsdp1.yaml - - wget https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/qlora-fsdp-70b.yaml - # Axolotl includes hf-xet version 1.1.0, which fails during downloads. Replacing it with the latest version (1.1.2). 
- - pip uninstall -y hf-xet - - pip install hf-xet --no-cache-dir - - | - accelerate launch \ - --config_file=fsdp1.yaml \ - -m axolotl.cli.train qlora-fsdp-70b.yaml \ - --hub-model-id $HUB_MODEL_ID \ - --output-dir /checkpoints/qlora-llama3-70b \ - --wandb-project $DSTACK_RUN_NAME \ - --wandb-name $WANDB_NAME \ - --main_process_ip=$DSTACK_MASTER_NODE_IP \ - --main_process_port=8008 \ - --machine_rank=$DSTACK_NODE_RANK \ - --num_processes=$DSTACK_GPUS_NUM \ - --num_machines=$DSTACK_NODES_NUM - -resources: - gpu: 80GB:8 - shm_size: 128GB - -volumes: - - /checkpoints:/checkpoints diff --git a/examples/distributed-training/axolotl/fleet.dstack.yml b/examples/distributed-training/axolotl/fleet.dstack.yml deleted file mode 100644 index a522642091..0000000000 --- a/examples/distributed-training/axolotl/fleet.dstack.yml +++ /dev/null @@ -1,9 +0,0 @@ -type: fleet -name: axolotl-fleet - -nodes: 2 -placement: cluster - -resources: - gpu: 80GB:8 - shm_size: 128GB diff --git a/examples/distributed-training/ray-ragen/.dstack.yml b/examples/distributed-training/ray-ragen/.dstack.yml deleted file mode 100644 index 8dabde9e04..0000000000 --- a/examples/distributed-training/ray-ragen/.dstack.yml +++ /dev/null @@ -1,39 +0,0 @@ -type: task -name: ray-ragen-cluster - -nodes: 2 - -env: -- WANDB_API_KEY -image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.2 -commands: - - wget -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - - bash miniconda.sh -b -p /workflow/miniconda - - eval "$(/workflow/miniconda/bin/conda shell.bash hook)" - - git clone https://github.com/RAGEN-AI/RAGEN.git - - cd RAGEN - - bash scripts/setup_ragen.sh - - conda activate ragen - - cd verl - - pip install --no-deps -e . - - pip install hf_transfer hf_xet - - pip uninstall -y ray - - pip install -U "ray[default]" - - | - if [ $DSTACK_NODE_RANK = 0 ]; then - ray start --head --port=6379; - else - ray start --address=$DSTACK_MASTER_NODE_IP:6379 - fi - -# Expose Ray dashboard port -ports: - - 8265 - -resources: - gpu: 80GB:8 - shm_size: 128GB - -# Save checkpoints on the instance -volumes: - - /checkpoints:/checkpoints diff --git a/examples/distributed-training/ray-ragen/fleet.dstack.yml b/examples/distributed-training/ray-ragen/fleet.dstack.yml deleted file mode 100644 index 04cd389254..0000000000 --- a/examples/distributed-training/ray-ragen/fleet.dstack.yml +++ /dev/null @@ -1,9 +0,0 @@ -type: fleet -name: ray-ragen-cluster-fleet - -nodes: 2 -placement: cluster - -resources: - gpu: 80GB:8 - shm_size: 128GB diff --git a/examples/distributed-training/trl/deepspeed.dstack.yml b/examples/distributed-training/trl/deepspeed.dstack.yml deleted file mode 100644 index 972351f6ac..0000000000 --- a/examples/distributed-training/trl/deepspeed.dstack.yml +++ /dev/null @@ -1,52 +0,0 @@ -type: task -name: trl-train-deepspeed-distrib - -# Size of the cluster -nodes: 2 - -image: nvcr.io/nvidia/pytorch:25.01-py3 - -# Required environment variables -env: - - HF_TOKEN - - WANDB_API_KEY - - HUB_MODEL_ID - - MODEL_ID=meta-llama/Llama-3.1-8B - - ACCELERATE_LOG_LEVEL=info -# Commands of the task -commands: - - pip install transformers bitsandbytes peft wandb deepspeed - - git clone https://github.com/huggingface/trl - - cd trl - - pip install . 
- - | - accelerate launch \ - --config_file=examples/accelerate_configs/deepspeed_zero3.yaml \ - --main_process_ip=$DSTACK_MASTER_NODE_IP \ - --main_process_port=8008 \ - --machine_rank=$DSTACK_NODE_RANK \ - --num_processes=$DSTACK_GPUS_NUM \ - --num_machines=$DSTACK_NODES_NUM \ - trl/scripts/sft.py \ - --model_name $MODEL_ID \ - --dataset_name OpenAssistant/oasst_top1_2023-08-25 \ - --dataset_text_field="text" \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 4 \ - --learning_rate 2e-4 \ - --report_to wandb \ - --bf16 \ - --max_seq_length 1024 \ - --attn_implementation flash_attention_2 \ - --logging_steps=10 \ - --output_dir /checkpoints/llama31-ft \ - --hub_model_id $HUB_MODEL_ID \ - --torch_dtype bfloat16 - -resources: - gpu: 80GB:8 - shm_size: 128GB - -volumes: - - /checkpoints:/checkpoints diff --git a/examples/distributed-training/trl/fleet.dstack.yml b/examples/distributed-training/trl/fleet.dstack.yml deleted file mode 100644 index 1275794e8c..0000000000 --- a/examples/distributed-training/trl/fleet.dstack.yml +++ /dev/null @@ -1,9 +0,0 @@ -type: fleet -name: trl-train-fleet - -nodes: 2 -placement: cluster - -resources: - gpu: 80GB:8 - shm_size: 128GB diff --git a/examples/distributed-training/trl/fsdp.dstack.yml b/examples/distributed-training/trl/fsdp.dstack.yml deleted file mode 100644 index b00104033b..0000000000 --- a/examples/distributed-training/trl/fsdp.dstack.yml +++ /dev/null @@ -1,52 +0,0 @@ -type: task -name: trl-train-fsdp-distrib - -# Size of the cluster -nodes: 2 - -image: nvcr.io/nvidia/pytorch:25.01-py3 - -# Required environment variables -env: - - HF_TOKEN - - WANDB_API_KEY - - HUB_MODEL_ID - - MODEL_ID=meta-llama/Llama-3.1-8B - - ACCELERATE_LOG_LEVEL=info -# Commands of the task -commands: - - pip install transformers bitsandbytes peft wandb - - git clone https://github.com/huggingface/trl - - cd trl - - pip install . 
- - | - accelerate launch \ - --config_file=examples/accelerate_configs/fsdp1.yaml \ - --main_process_ip=$DSTACK_MASTER_NODE_IP \ - --main_process_port=8008 \ - --machine_rank=$DSTACK_NODE_RANK \ - --num_processes=$DSTACK_GPUS_NUM \ - --num_machines=$DSTACK_NODES_NUM \ - trl/scripts/sft.py \ - --model_name $MODEL_ID \ - --dataset_name OpenAssistant/oasst_top1_2023-08-25 \ - --dataset_text_field="text" \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 4 \ - --learning_rate 2e-4 \ - --report_to wandb \ - --bf16 \ - --max_seq_length 1024 \ - --attn_implementation flash_attention_2 \ - --logging_steps=10 \ - --output_dir /checkpoints/llama31-ft \ - --hub_model_id $HUB_MODEL_ID \ - --torch_dtype bfloat16 - -resources: - gpu: 80GB:8 - shm_size: 128GB - -volumes: - - /checkpoints:/checkpoints diff --git a/examples/single-node-training/axolotl/.dstack.yml b/examples/single-node-training/axolotl/.dstack.yml deleted file mode 100644 index dd28618904..0000000000 --- a/examples/single-node-training/axolotl/.dstack.yml +++ /dev/null @@ -1,28 +0,0 @@ -type: task -# The name is optional, if not specified, generated randomly -name: axolotl-nvidia-llama-scout-train - -# Using the official Axolotl's Docker image -image: axolotlai/axolotl:main-latest - -# Required environment variables -env: - - HF_TOKEN - - WANDB_API_KEY - - WANDB_PROJECT - - HUB_MODEL_ID -# Commands of the task -commands: - - wget https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml - - | - axolotl train scout-qlora-flexattn-fsdp2.yaml \ - --wandb-project $WANDB_PROJECT \ - --wandb-name $DSTACK_RUN_NAME \ - --hub-model-id $HUB_MODEL_ID - -resources: - # Four GPU (required by FSDP) - gpu: H100:4 - # Shared memory size for inter-process communication - shm_size: 64GB - disk: 500GB.. diff --git a/examples/single-node-training/trl/train.dstack.yml b/examples/single-node-training/trl/train.dstack.yml deleted file mode 100644 index 9b24ae6131..0000000000 --- a/examples/single-node-training/trl/train.dstack.yml +++ /dev/null @@ -1,54 +0,0 @@ -type: task -# The name is optional, if not specified, generated randomly -name: trl-train - -python: 3.12 - -# Required environment variables -env: - - HF_TOKEN - - WANDB_API_KEY - - HUB_MODEL_ID - - ACCELERATE_LOG_LEVEL=info -# Commands of the task -commands: - # Pin torch==2.6.0 to avoid building Flash Attention from source. - # Prebuilt Flash Attention wheels are not available for the latest torch==2.7.0. - - uv pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 - - uv pip install transformers bitsandbytes peft wandb - - uv pip install flash_attn --no-build-isolation - - git clone https://github.com/huggingface/trl - - cd trl - - uv pip install . 
- - | - accelerate launch \ - --config_file=examples/accelerate_configs/multi_gpu.yaml \ - --num_processes $DSTACK_GPUS_PER_NODE \ - trl/scripts/sft.py \ - --model_name meta-llama/Meta-Llama-3.1-8B \ - --dataset_name OpenAssistant/oasst_top1_2023-08-25 \ - --dataset_text_field="text" \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 4 \ - --learning_rate 2e-4 \ - --report_to wandb \ - --bf16 \ - --max_seq_length 1024 \ - --lora_r 16 \ - --lora_alpha 32 \ - --lora_target_modules q_proj k_proj v_proj o_proj \ - --load_in_4bit \ - --use_peft \ - --attn_implementation "flash_attention_2" \ - --logging_steps=10 \ - --output_dir models/llama31 \ - --hub_model_id peterschmidt85/FineLlama-3.1-8B -resources: - gpu: - # 24GB or more VRAM - memory: 24GB.. - # One or more GPU - count: 1.. - # Shared memory (for multi-gpu) - shm_size: 24GB diff --git a/mkdocs.yml b/mkdocs.yml index 1b75f0ebe5..6c82dd15a5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -98,41 +98,41 @@ plugins: "docs/tasks.md": "docs/concepts/tasks.md" "docs/services.md": "docs/concepts/services.md" "docs/fleets.md": "docs/concepts/fleets.md" - "docs/examples/llms/llama31.md": "examples/inference/vllm/index.md" - "docs/examples/llms/llama32.md": "examples/inference/vllm/index.md" - "docs/examples/llms/qwen36.md": "examples/models/qwen36/index.md" - "examples/llms/llama31/index.md": "examples/inference/vllm/index.md" - "examples/llms/llama32/index.md": "examples/inference/vllm/index.md" - "examples/llms/qwen36/index.md": "examples/models/qwen36/index.md" - "docs/examples/accelerators/amd/index.md": "examples/accelerators/amd/index.md" - "docs/examples/deployment/nim/index.md": "examples/inference/nim/index.md" - "docs/examples/deployment/vllm/index.md": "examples/inference/vllm/index.md" + "docs/examples/llms/llama31.md": "examples/inference/vllm.md" + "docs/examples/llms/llama32.md": "examples/inference/vllm.md" + "docs/examples/llms/qwen36.md": "examples/models/qwen36.md" + "examples/llms/llama31/index.md": "examples/inference/vllm.md" + "examples/llms/llama32/index.md": "examples/inference/vllm.md" + "examples/llms/qwen36/index.md": "examples/models/qwen36.md" + "docs/examples/accelerators/amd/index.md": "examples/accelerators/amd.md" + "docs/examples/deployment/nim/index.md": "examples/inference/nim.md" + "docs/examples/deployment/vllm/index.md": "examples/inference/vllm.md" "backends.md": "docs/concepts/backends.md" "blog/monitoring-gpu-usage.md": "blog/posts/dstack-metrics.md" "blog/inactive-dev-environments-auto-shutdown.md": "blog/posts/inactivity-duration.md" "blog/data-centers-and-private-clouds.md": "blog/posts/gpu-blocks-and-proxy-jump.md" - "blog/distributed-training-with-aws-efa.md": "examples/clusters/aws/index.md" + "blog/distributed-training-with-aws-efa.md": "examples/clusters/aws.md" "blog/dstack-stats.md": "blog/posts/dstack-metrics.md" "docs/guides/metrics.md": "docs/concepts/metrics.md" "docs/guides/monitoring.md": "docs/concepts/metrics.md" "blog/nvidia-and-amd-on-vultr.md.md": "blog/posts/nvidia-and-amd-on-vultr.md" - "examples/misc/nccl-tests/index.md": "examples/clusters/nccl-rccl-tests/index.md" - "examples/misc/a3high-clusters/index.md": "examples/clusters/gcp/index.md" - "examples/misc/a3mega-clusters/index.md": "examples/clusters/gcp/index.md" - "examples/distributed-training/nccl-tests/index.md": "examples/clusters/nccl-rccl-tests/index.md" - "examples/distributed-training/rccl-tests/index.md": "examples/clusters/nccl-rccl-tests/index.md" - 
"examples/deployment/nim/index.md": "examples/inference/nim/index.md" - "examples/deployment/vllm/index.md": "examples/inference/vllm/index.md" - "examples/deployment/sglang/index.md": "examples/inference/sglang/index.md" - "examples/deployment/trtllm/index.md": "examples/inference/trtllm/index.md" - "examples/fine-tuning/trl/index.md": "examples/single-node-training/trl/index.md" - "examples/fine-tuning/axolotl/index.md": "examples/single-node-training/axolotl/index.md" - "blog/efa.md": "examples/clusters/aws/index.md" + "examples/misc/nccl-tests/index.md": "examples/clusters/nccl-rccl-tests.md" + "examples/misc/a3high-clusters/index.md": "examples/clusters/gcp.md" + "examples/misc/a3mega-clusters/index.md": "examples/clusters/gcp.md" + "examples/distributed-training/nccl-tests/index.md": "examples/clusters/nccl-rccl-tests.md" + "examples/distributed-training/rccl-tests/index.md": "examples/clusters/nccl-rccl-tests.md" + "examples/deployment/nim/index.md": "examples/inference/nim.md" + "examples/deployment/vllm/index.md": "examples/inference/vllm.md" + "examples/deployment/sglang/index.md": "examples/inference/sglang.md" + "examples/deployment/trtllm/index.md": "examples/inference/trtllm.md" + "examples/fine-tuning/trl/index.md": "examples/single-node-training/trl.md" + "examples/fine-tuning/axolotl/index.md": "examples/single-node-training/axolotl.md" + "blog/efa.md": "examples/clusters/aws.md" "docs/concepts/repos.md": "docs/concepts/dev-environments.md#repos" - "examples/clusters/a3high/index.md": "examples/clusters/gcp/index.md" - "examples/clusters/a3mega/index.md": "examples/clusters/gcp/index.md" - "examples/clusters/a4/index.md": "examples/clusters/gcp/index.md" - "examples/clusters/efa/index.md": "examples/clusters/aws/index.md" + "examples/clusters/a3high/index.md": "examples/clusters/gcp.md" + "examples/clusters/a3mega/index.md": "examples/clusters/gcp.md" + "examples/clusters/a4/index.md": "examples/clusters/gcp.md" + "examples/clusters/efa/index.md": "examples/clusters/aws.md" "docs/guides/migration.md": "docs/guides/upgrade.md" "docs/reference/api/rest/index.md": "docs/reference/api/http/index.md" - typeset @@ -242,10 +242,11 @@ nav: - Guides: - Server deployment: docs/guides/server-deployment.md - Troubleshooting: docs/guides/troubleshooting.md - - Protips: docs/guides/protips.md - - Upgrade: docs/guides/upgrade.md - - Migration: - - Slurm: docs/guides/migration/slurm.md + - More: + - Protips: docs/guides/protips.md + - Upgrade: docs/guides/upgrade.md + - Migration: + - Slurm: docs/guides/migration/slurm.md - Reference: - .dstack.yml: - dev-environment: docs/reference/dstack.yml/dev-environment.md @@ -288,31 +289,31 @@ nav: - Examples: - examples.md - Single-node training: - - TRL: examples/single-node-training/trl/index.md - - Axolotl: examples/single-node-training/axolotl/index.md + - TRL: examples/single-node-training/trl.md + - Axolotl: examples/single-node-training/axolotl.md - Distributed training: - - TRL: examples/distributed-training/trl/index.md - - Axolotl: examples/distributed-training/axolotl/index.md - - Ray+RAGEN: examples/distributed-training/ray-ragen/index.md + - TRL: examples/distributed-training/trl.md + - Axolotl: examples/distributed-training/axolotl.md + - Ray+RAGEN: examples/distributed-training/ray-ragen.md - Clusters: - - AWS: examples/clusters/aws/index.md - - GCP: examples/clusters/gcp/index.md - - Lambda: examples/clusters/lambda/index.md - - Crusoe: examples/clusters/crusoe/index.md - - Nebius: examples/clusters/nebius/index.md - - NCCL/RCCL 
tests: examples/clusters/nccl-rccl-tests/index.md + - AWS: examples/clusters/aws.md + - GCP: examples/clusters/gcp.md + - Lambda: examples/clusters/lambda.md + - Crusoe: examples/clusters/crusoe.md + - Nebius: examples/clusters/nebius.md + - NCCL/RCCL tests: examples/clusters/nccl-rccl-tests.md - Inference: - - SGLang: examples/inference/sglang/index.md - - vLLM: examples/inference/vllm/index.md - - NIM: examples/inference/nim/index.md - - TensorRT-LLM: examples/inference/trtllm/index.md + - SGLang: examples/inference/sglang.md + - vLLM: examples/inference/vllm.md + - NIM: examples/inference/nim.md + - TensorRT-LLM: examples/inference/trtllm.md - Models: - - DeepSeek V4: examples/models/deepseek-v4/index.md - - Qwen 3.6: examples/models/qwen36/index.md + - DeepSeek V4: examples/models/deepseek-v4.md + - Qwen 3.6: examples/models/qwen36.md - Accelerators: - - AMD: examples/accelerators/amd/index.md - - TPU: examples/accelerators/tpu/index.md - - Tenstorrent: examples/accelerators/tenstorrent/index.md + - AMD: examples/accelerators/amd.md + - TPU: examples/accelerators/tpu.md + - Tenstorrent: examples/accelerators/tenstorrent.md - Blog: - blog/index.md - Case studies: blog/case-studies.md diff --git a/scripts/docs/gen_examples.py b/scripts/docs/gen_examples.py deleted file mode 100644 index 364ac7dfea..0000000000 --- a/scripts/docs/gen_examples.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Copies examples/**/README.md files as docs/examples/**/index.md -""" - -import logging -import os -from fnmatch import fnmatch -from pathlib import Path - -import mkdocs_gen_files -from mkdocs.structure.files import File - -FILE_PATTERN = "examples/**/index.md" -logger = logging.getLogger("mkdocs.plugins.dstack.examples") - -disable_env = "DSTACK_DOCS_DISABLE_EXAMPLES" -if os.environ.get(disable_env): - logger.warning(f"Examples generation is disabled: {disable_env} is set") - exit() - -logger.info("Generating examples documentation...") - -file: File -for file in mkdocs_gen_files.files: - if not fnmatch(file.src_uri, FILE_PATTERN): - continue - p = (Path(file.src_dir).parent / file.src_uri).parent / "README.md" - with open(p, "r") as f: - text = f.read() - with mkdocs_gen_files.open(file.src_uri, "w") as f: - f.write(text) diff --git a/scripts/docs/gen_llms_files.py b/scripts/docs/gen_llms_files.py index a7eee100ce..3e1b9ec6d0 100644 --- a/scripts/docs/gen_llms_files.py +++ b/scripts/docs/gen_llms_files.py @@ -31,18 +31,11 @@ def read_frontmatter(file_path: Path) -> Dict[str, Any]: return {} -def get_page_info(page_path: str, docs_dir: Path, repo_root: Path) -> Optional[Dict[str, str]]: +def get_page_info(page_path: str, docs_dir: Path) -> Optional[Dict[str, str]]: """Get title and description for a page from its frontmatter.""" # page_path is relative to docs_dir full_path = docs_dir / page_path - # For examples/**/index.md, read from README.md at repo root (same logic as hooks.py) - if page_path.startswith("examples/") and page_path.endswith("index.md"): - example_dir = Path(page_path).parent - readme_path = repo_root / example_dir / "README.md" - if readme_path.exists(): - full_path = readme_path - if not full_path.exists(): return None @@ -67,7 +60,6 @@ def parse_mkdocs_nav(mkdocs_config: Dict[str, Any], repo_root: str) -> List[Dict # Get docs_dir from config docs_dir = Path(repo_root) / mkdocs_config.get("docs_dir", "docs") - repo_root_path = Path(repo_root) def extract_pages(content_list): """Recursively extract all pages from a section's content, including nested subsections.""" @@ -75,7 +67,7 @@ def 
extract_pages(content_list): for item in content_list: if isinstance(item, str): # Plain string path like "examples.md" - page_info = get_page_info(item, docs_dir, repo_root_path) + page_info = get_page_info(item, docs_dir) if page_info: items.append( { @@ -89,7 +81,7 @@ def extract_pages(content_list): for title, path in item.items(): if isinstance(path, str): # Page with title - page_info = get_page_info(path, docs_dir, repo_root_path) + page_info = get_page_info(path, docs_dir) if page_info: items.append( { diff --git a/scripts/docs/hooks.py b/scripts/docs/hooks.py index ce5b3740bf..7e202c2587 100644 --- a/scripts/docs/hooks.py +++ b/scripts/docs/hooks.py @@ -14,7 +14,6 @@ WELL_KNOWN_SKILLS_DIR = ".well-known/skills" SKILL_PATH = ("skills", "dstack", "SKILL.md") -DISABLE_EXAMPLES_ENV = "DSTACK_DOCS_DISABLE_EXAMPLES" DISABLE_LLM_TXT_ENV = "DSTACK_DOCS_DISABLE_LLM_TXT" DISABLE_YAML_SCHEMAS_ENV = "DSTACK_DOCS_DISABLE_YAML_SCHEMAS" SCHEMA_REFERENCE_PREFIX = "docs/reference/" @@ -64,28 +63,9 @@ def _get_schema_expanded_content(rel_path, config, src_path=None): return _expand_schema_references(text) -def _get_materialized_content(rel_path, config): - """Return README content for examples/**/index.md stubs, else None.""" - if os.environ.get(DISABLE_EXAMPLES_ENV): - return None - - if rel_path.startswith("examples/") and rel_path.endswith("index.md"): - repo_root = os.path.dirname(config["config_file_path"]) - example_dir = os.path.dirname(rel_path) - readme_path = os.path.join(repo_root, example_dir, "README.md") - - if os.path.isfile(readme_path): - with open(readme_path, "r", encoding="utf-8") as f: - return f.read() - return None - - def on_page_read_source(page, config): - """Use README content for example stubs and expanded schema for reference docs when rendering HTML.""" + """Use expanded schema content for reference docs when rendering HTML.""" rel_path = page.file.src_uri - content = _get_materialized_content(rel_path, config) - if content is not None: - return content content = _get_schema_expanded_content(rel_path, config) if content is not None: return content @@ -93,8 +73,6 @@ def on_page_read_source(page, config): def on_config(config): - if os.environ.get(DISABLE_EXAMPLES_ENV): - log.warning("Examples documentation is disabled") if os.environ.get(DISABLE_YAML_SCHEMAS_ENV): log.warning("YAML schema reference generation is disabled") if os.environ.get(DISABLE_LLM_TXT_ENV): @@ -102,24 +80,6 @@ def on_config(config): return config -def on_page_context(context, page, config, nav): - """Override edit_url only for example stubs so Edit points to the README; other pages use theme default from edit_uri.""" - repo_url = (config.get("repo_url") or "").rstrip("/") - edit_uri = (config.get("edit_uri") or "edit/master/docs/").strip("/") - if not repo_url: - return context - # edit_uri is e.g. 
"edit/master/docs" -> branch is second segment - edit_parts = edit_uri.split("/") - branch = edit_parts[1] if len(edit_parts) >= 2 else "master" - - rel_path = page.file.src_uri - if rel_path.startswith("examples/") and rel_path.endswith("index.md"): - example_dir = os.path.dirname(rel_path) - page.edit_url = f"{repo_url}/edit/{branch}/{example_dir}/README.md" - - return context - - def on_post_build(config): """Copy .md files to site (raw) and write .well-known/skills index.""" site_dir = config["site_dir"] @@ -143,27 +103,17 @@ def on_post_build(config): src_path = os.path.join(root, file) rel_path = os.path.relpath(src_path, docs_dir).replace(os.sep, "/") - content = _get_materialized_content(rel_path, config) - - if content: - clean_name = os.path.dirname(rel_path) + ".md" - dest_path = os.path.join(site_dir, clean_name) - os.makedirs(os.path.dirname(dest_path), exist_ok=True) + content = _get_schema_expanded_content(rel_path, config, src_path=src_path) + dest_path = os.path.join(site_dir, rel_path) + os.makedirs(os.path.dirname(dest_path), exist_ok=True) + if content is not None: + # Write expanded schema content + log.info(f"Expanding schema references in {rel_path}") with open(dest_path, "w", encoding="utf-8") as f: f.write(content) else: - # Check if this is a schema reference file that needs expansion - content = _get_schema_expanded_content(rel_path, config, src_path=src_path) - dest_path = os.path.join(site_dir, rel_path) - os.makedirs(os.path.dirname(dest_path), exist_ok=True) - if content is not None: - # Write expanded schema content - log.info(f"Expanding schema references in {rel_path}") - with open(dest_path, "w", encoding="utf-8") as f: - f.write(content) - else: - # Just copy the file as-is - shutil.copy2(src_path, dest_path) + # Just copy the file as-is + shutil.copy2(src_path, dest_path) _write_well_known_skills(config, site_dir) _generate_llms_files(config, site_dir) From 8531602fb1edcc5b8e2e3a1dd0b3a5aedd3d2b69 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Wed, 6 May 2026 22:33:29 +0200 Subject: [PATCH 2/7] Move examples under /docs/, merge single and distributed training - Move docs/examples/ to docs/docs/examples/ so URLs become /docs/examples/... instead of /examples/.... The old /examples/// URLs continue to work via redirects, including the recently-published /docs/examples/ {single-node-training,distributed-training}/ paths. - Merge "Single-node training" and "Distributed training" example sections into a single "Training" section. TRL and Axolotl pages now contain both variants under top-level "Single-node training" and "Distributed training" H2 sections; Ray+RAGEN moves over unchanged. - Convert remaining absolute https://dstack.ai/(docs|examples)/... links to relative .md links throughout the moved example pages and the concept docs that point into them. Drop dead /docs/guides/{clusters,kubernetes} links (target pages were removed earlier) and replace with anchor links to the Kubernetes backend / cluster placement sections where appropriate. - Inline two GCP NCCL test yamls (a3mega-nccl-tests, a3high-nccl-tests) that were previously referenced via dead "Source code" admonitions. 
--- docs/assets/stylesheets/extra.css | 8 +- ...d-kubernetes-2024-recap-and-whats-ahead.md | 2 +- docs/blog/posts/changelog-07-25.md | 2 +- docs/blog/posts/ea-gtc25.md | 2 +- docs/blog/posts/gpu-health-checks.md | 4 +- docs/blog/posts/intel-gaudi.md | 2 +- docs/blog/posts/kubernetes-beta.md | 4 +- docs/blog/posts/mpi.md | 2 +- docs/blog/posts/nebius-in-dstack-sky.md | 4 +- docs/blog/posts/pd-disaggregation.md | 2 +- docs/blog/posts/toffee.md | 2 +- docs/docs/concepts/backends.md | 2 +- docs/docs/concepts/fleets.md | 15 +- docs/docs/concepts/gateways.md | 4 +- docs/docs/concepts/services.md | 4 +- docs/docs/concepts/tasks.md | 8 +- docs/{ => docs}/examples.md | 73 ++--- docs/{ => docs}/examples/accelerators/amd.md | 10 +- .../examples/accelerators/intel/index.md | 0 .../examples/accelerators/tenstorrent.md | 10 +- docs/{ => docs}/examples/accelerators/tpu.md | 10 +- docs/{ => docs}/examples/clusters/aws.md | 6 +- docs/{ => docs}/examples/clusters/crusoe.md | 14 +- docs/{ => docs}/examples/clusters/gcp.md | 10 +- docs/{ => docs}/examples/clusters/lambda.md | 16 +- .../examples/clusters/nccl-rccl-tests.md | 10 +- docs/{ => docs}/examples/clusters/nebius.md | 12 +- docs/{ => docs}/examples/inference/nim.md | 8 +- docs/{ => docs}/examples/inference/sglang.md | 6 +- docs/{ => docs}/examples/inference/trtllm.md | 6 +- docs/{ => docs}/examples/inference/vllm.md | 6 +- .../examples/llms/deepseek/index.md | 0 docs/{ => docs}/examples/llms/llama/index.md | 0 .../examples/misc/docker-compose/index.md | 0 .../{ => docs}/examples/models/deepseek-v4.md | 4 +- docs/{ => docs}/examples/models/qwen36.md | 4 +- .../{ => docs}/examples/models/wan22/index.md | 0 docs/docs/examples/training/axolotl.md | 185 ++++++++++++ .../examples/training}/ray-ragen.md | 4 +- docs/docs/examples/training/trl.md | 272 ++++++++++++++++++ docs/docs/guides/migration/slurm.md | 2 +- docs/docs/quickstart.md | 2 +- docs/examples/distributed-training/axolotl.md | 100 ------- docs/examples/distributed-training/trl.md | 160 ----------- docs/examples/single-node-training/axolotl.md | 100 ------- docs/examples/single-node-training/trl.md | 116 -------- docs/overrides/main.html | 9 +- mkdocs.yml | 132 +++++---- skills/dstack/SKILL.md | 6 +- 49 files changed, 670 insertions(+), 690 deletions(-) rename docs/{ => docs}/examples.md (67%) rename docs/{ => docs}/examples/accelerators/amd.md (95%) rename docs/{ => docs}/examples/accelerators/intel/index.md (100%) rename docs/{ => docs}/examples/accelerators/tenstorrent.md (94%) rename docs/{ => docs}/examples/accelerators/tpu.md (95%) rename docs/{ => docs}/examples/clusters/aws.md (92%) rename docs/{ => docs}/examples/clusters/crusoe.md (90%) rename docs/{ => docs}/examples/clusters/gcp.md (94%) rename docs/{ => docs}/examples/clusters/lambda.md (89%) rename docs/{ => docs}/examples/clusters/nccl-rccl-tests.md (89%) rename docs/{ => docs}/examples/clusters/nebius.md (92%) rename docs/{ => docs}/examples/inference/nim.md (84%) rename docs/{ => docs}/examples/inference/sglang.md (93%) rename docs/{ => docs}/examples/inference/trtllm.md (87%) rename docs/{ => docs}/examples/inference/vllm.md (88%) rename docs/{ => docs}/examples/llms/deepseek/index.md (100%) rename docs/{ => docs}/examples/llms/llama/index.md (100%) rename docs/{ => docs}/examples/misc/docker-compose/index.md (100%) rename docs/{ => docs}/examples/models/deepseek-v4.md (97%) rename docs/{ => docs}/examples/models/qwen36.md (97%) rename docs/{ => docs}/examples/models/wan22/index.md (100%) create mode 100644 
docs/docs/examples/training/axolotl.md rename docs/{examples/distributed-training => docs/examples/training}/ray-ragen.md (93%) create mode 100644 docs/docs/examples/training/trl.md delete mode 100644 docs/examples/distributed-training/axolotl.md delete mode 100644 docs/examples/distributed-training/trl.md delete mode 100644 docs/examples/single-node-training/axolotl.md delete mode 100644 docs/examples/single-node-training/trl.md diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index cb2d68e55d..e344baf9d0 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -1283,19 +1283,19 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { - .md-tabs__item:nth-child(7) { + .md-tabs__item:nth-child(6) { margin-left: auto; padding-right: 0.5rem; } - .md-tabs__item:nth-child(n+7) .md-tabs__link { + .md-tabs__item:nth-child(n+6) .md-tabs__link { visibility: hidden; width: 35px; display: inline-block; margin-top: 12px; } - .md-tabs__item:nth-child(n+7) .md-tabs__link:before { + .md-tabs__item:nth-child(n+6) .md-tabs__link:before { width: 38px; height: 38px; margin-top: 4px; @@ -1318,7 +1318,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { margin-right: -7px; } */ - .md-tabs__item:nth-child(7) .md-tabs__link:before { + .md-tabs__item:nth-child(6) .md-tabs__link:before { position: relative; content: ''; width: 34px; diff --git a/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md b/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md index 8980c984f1..fb43d7f3ea 100644 --- a/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md +++ b/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md @@ -104,7 +104,7 @@ efficient manner. ### NVIDIA -NVIDIA remains the top accelerator supported by `dstack`. Recently, we introduced a [NIM example](../../examples/inference/nim.md) +NVIDIA remains the top accelerator supported by `dstack`. Recently, we introduced a [NIM example](../../docs/examples/inference/nim.md) for model deployment, and we continue to enhance support for the rest of NVIDIA's ecosystem. ### AMD diff --git a/docs/blog/posts/changelog-07-25.md b/docs/blog/posts/changelog-07-25.md index 50c8ff032a..e231ac6a37 100644 --- a/docs/blog/posts/changelog-07-25.md +++ b/docs/blog/posts/changelog-07-25.md @@ -144,7 +144,7 @@ resources: #### AWS EFA -EFA is a network interface for EC2 that enables low-latency, high-bandwidth communication between nodes—crucial for scaling distributed deep learning. With `dstack`, EFA is automatically enabled when using supported instance types in fleets. Check out our [example](../../examples/clusters/aws.md) +EFA is a network interface for EC2 that enables low-latency, high-bandwidth communication between nodes—crucial for scaling distributed deep learning. With `dstack`, EFA is automatically enabled when using supported instance types in fleets. Check out our [example](../../docs/examples/clusters/aws.md) #### Default Docker images diff --git a/docs/blog/posts/ea-gtc25.md b/docs/blog/posts/ea-gtc25.md index 4a287a21ea..499c5402cb 100644 --- a/docs/blog/posts/ea-gtc25.md +++ b/docs/blog/posts/ea-gtc25.md @@ -85,4 +85,4 @@ By adopting tools that are cloud-agnostic and developer-friendly, EA has reduced !!! info "What's next?" 1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) 2. 
Follow [Quickstart](../../docs/quickstart.md) - 3. Browse [Examples](../../examples.md) + 3. Browse [Examples](../../docs/examples.md) diff --git a/docs/blog/posts/gpu-health-checks.md b/docs/blog/posts/gpu-health-checks.md index 1fe89e1d1d..84746ed90f 100644 --- a/docs/blog/posts/gpu-health-checks.md +++ b/docs/blog/posts/gpu-health-checks.md @@ -51,7 +51,7 @@ A healthy instance is ready for workloads. A warning means you should monitor it This release focuses on passive checks using DCGM background health checks. These run continuously and do not interrupt workloads. -For active checks today, you can run [NCCL/RCCL tests](../../examples/clusters/nccl-rccl-tests.md) as a [distributed task](../../docs/concepts/tasks.md#distributed-tasks) to verify GPU-to-GPU communication and bandwidth across a fleet. Active tests like these can reveal network or interconnect issues that passive monitoring might miss. More built-in support for active diagnostics is planned. +For active checks today, you can run [NCCL/RCCL tests](../../docs/examples/clusters/nccl-rccl-tests.md) as a [distributed task](../../docs/concepts/tasks.md#distributed-tasks) to verify GPU-to-GPU communication and bandwidth across a fleet. Active tests like these can reveal network or interconnect issues that passive monitoring might miss. More built-in support for active diagnostics is planned. ## Supported backends @@ -68,6 +68,6 @@ If you have experience with GPU reliability or ideas for automated recovery, joi !!! info "What's next?" 1. Check [Quickstart](../../docs/quickstart.md) - 2. Explore the [clusters](../../examples.md#clusters) examples + 2. Explore the [fleets](../../docs/concepts/fleets.md#cluster-placement) guide 3. Learn more about [metrics](../../docs/concepts/metrics.md) 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/intel-gaudi.md b/docs/blog/posts/intel-gaudi.md index 4ac0e67708..37b8c383b4 100644 --- a/docs/blog/posts/intel-gaudi.md +++ b/docs/blog/posts/intel-gaudi.md @@ -158,7 +158,7 @@ $ dstack apply -f examples/single-node-training/trl/intel/.dstack.yml -R `dstack` will automatically create containers according to the run configuration and execute them across the fleet. -> Explore our [examples](../../examples/accelerators/intel/index.md) to learn how to train and deploy large models on +> Explore our [examples](../../docs/examples/accelerators/intel/index.md) to learn how to train and deploy large models on > Intel Gaudi AI Accelerator. !!! info "Intel Tiber AI Cloud" diff --git a/docs/blog/posts/kubernetes-beta.md b/docs/blog/posts/kubernetes-beta.md index a00a429af3..64fb6117c5 100644 --- a/docs/blog/posts/kubernetes-beta.md +++ b/docs/blog/posts/kubernetes-beta.md @@ -284,7 +284,7 @@ Submit the run nccl-tests? [y/n]: y
-For more examples, explore the [distirbuted training](../../examples.md#distributed-training) section in the docs. +For more examples, explore the [training](../../docs/examples.md#training) section in the docs. ## FAQ @@ -311,5 +311,5 @@ Support for AMD GPUs is coming soon — our team is actively working on it right 2. Explore [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 3. Browse the [clusters](../../examples.md#clusters) examples + 3. Browse the [fleets](../../docs/concepts/fleets.md#cluster-placement) guide 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/mpi.md b/docs/blog/posts/mpi.md index 37cd0dc7bf..02152aad3b 100644 --- a/docs/blog/posts/mpi.md +++ b/docs/blog/posts/mpi.md @@ -100,5 +100,5 @@ as well as use MPI for other tasks. !!! info "What's next?" 1. Learn more about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 2. Check the [NCCL/RCCL tests](../../examples/clusters/nccl-rccl-tests.md) example + 2. Check the [NCCL/RCCL tests](../../docs/examples/clusters/nccl-rccl-tests.md) example 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/nebius-in-dstack-sky.md b/docs/blog/posts/nebius-in-dstack-sky.md index 823576f377..1f911f98d3 100644 --- a/docs/blog/posts/nebius-in-dstack-sky.md +++ b/docs/blog/posts/nebius-in-dstack-sky.md @@ -104,7 +104,7 @@ $ dstack apply -f my-cluster.dstack.yml Once the fleet is ready, you can run [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks). `dstack` automatically configures drivers, networking, and fast GPU-to-GPU interconnect. -To learn more, see the [clusters](../../examples/clusters/nebius.md) guide. +To learn more, see the [clusters](../../docs/examples/clusters/nebius.md) guide. With Nebius joining `dstack` Sky, users can now run on-demand and spot GPUs and clusters directly through the marketplace—gaining access to the same production grade infrastrucure Nebius customers use for frontier-scale training, without needing a separate Nebius account. @@ -124,4 +124,4 @@ Our goal is to give teams maximum flexibility while removing the complexity of m 4. Explore [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 5. Read the [clusters](../../examples/clusters/nebius.md) guide + 5. Read the [clusters](../../docs/examples/clusters/nebius.md) guide diff --git a/docs/blog/posts/pd-disaggregation.md b/docs/blog/posts/pd-disaggregation.md index e9f0bc0a7c..dd3f27c9e8 100644 --- a/docs/blog/posts/pd-disaggregation.md +++ b/docs/blog/posts/pd-disaggregation.md @@ -27,7 +27,7 @@ For inference, `dstack` provides a [services](../../docs/concepts/services.md) a > If you’re new to Prefill–Decode disaggregation, see the official [SGLang docs](https://docs.sglang.io/advanced_features/pd_disaggregation.html). !!! note "Deprecation notice" - Configuring the SGLang router in a gateway is deprecated and will be disallowed in a future release. To run router and workers as separate replica groups, see [SGLang PD disaggregation (router as replica group)](https://dstack.ai/examples/inference/sglang/#pd-disaggregation). 
+ Configuring the SGLang router in a gateway is deprecated and will be disallowed in a future release. To run router and workers as separate replica groups, see [SGLang PD disaggregation (router as replica group)](../../docs/examples/inference/sglang.md#pd-disaggregation). ## Services diff --git a/docs/blog/posts/toffee.md b/docs/blog/posts/toffee.md index 190ecf8c27..512218c1bb 100644 --- a/docs/blog/posts/toffee.md +++ b/docs/blog/posts/toffee.md @@ -85,4 +85,4 @@ As Toffee’s user base and model footprint grew, investing further in home-grow !!! info "What's next?" 1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) 2. Follow [Quickstart](../../docs/quickstart.md) - 3. Browse [Examples](../../examples.md) + 3. Browse [Examples](../../docs/examples.md) diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index 5bded6ba03..2f6186c6be 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -1188,7 +1188,7 @@ projects: This applies to offers shown in `dstack apply` (run plans), during provisioning, and in `dstack offer`. Unlike other backends, offers for the `kubernetes` backend always reflect the lower limit of the range. -> To learn more, see the [Lambda](../../examples/clusters/lambda/#kubernetes) and [Crusoe](../../examples/clusters/crusoe/#kubernetes) examples. +> To learn more, see the [Lambda](../examples/clusters/lambda/#kubernetes) and [Crusoe](../examples/clusters/crusoe/#kubernetes) examples. ### Runpod diff --git a/docs/docs/concepts/fleets.md b/docs/docs/concepts/fleets.md index 685392bd80..22057c0709 100644 --- a/docs/docs/concepts/fleets.md +++ b/docs/docs/concepts/fleets.md @@ -164,24 +164,22 @@ This property ensures that instances are interconnected. This is required for ru === "AWS" On AWS, `dstack` requires `public_ips` to be set to `false` in the backend configuration. - Refer to the [AWS](../../examples/clusters/aws.md) example for more details. + Refer to the [AWS](../examples/clusters/aws.md) example for more details. === "GCP" On GCP, you may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration. - Refer to the [GCP](../../examples/clusters/gcp.md) examples for more details. + Refer to the [GCP](../examples/clusters/gcp.md) examples for more details. === "Nebius" On [Nebius](https://docs.nebius.com/compute/clusters/gpu), `dstack` automatically configures InfiniBand networking if it is supported by the selected instance type. === "Crusoe" On [Crusoe](https://docs.crusoecloud.com/networking/infiniband/managing-infiniband-networks), `dstack` automatically configures InfiniBand networking if it is supported by the selected instance type. - Refer to the [Crusoe](../../examples/clusters/crusoe.md#vms) example for more details. + Refer to the [Crusoe](../examples/clusters/crusoe.md#vms) example for more details. === "Kubernetes" If the Kubernetes cluster has interconnect configured, `dstack` can use it without additional setup. - See the [Lambda](../../examples/clusters/lambda.md#kubernetes) or [Crusoe](../../examples/clusters/crusoe.md#kubernetes) examples. - - > See the [Clusters](../../examples.md#clusters) examples. + See the [Lambda](../examples/clusters/lambda.md#kubernetes) or [Crusoe](../examples/clusters/crusoe.md#kubernetes) examples. @@ -211,6 +209,9 @@ This property ensures that instances are interconnected. This is required for ru +!!! 
info "Examples" + See the cluster examples for [AWS](../examples/clusters/aws.md), [GCP](../examples/clusters/gcp.md), [Lambda](../examples/clusters/lambda.md), [Crusoe](../examples/clusters/crusoe.md), [Nebius](../examples/clusters/nebius.md), and [NCCL/RCCL tests](../examples/clusters/nccl-rccl-tests.md). + ### Nodes The `nodes` property is supported only by backend fleets and specifies how many nodes `dstack` must or can provision. @@ -537,4 +538,4 @@ Use `--group-by gpu,backend` to aggregate offers. 2. Read about [Backends](backends.md) guide 3. Learn how to [export fleets](exports.md) to other projects 4. Explore the [`.dstack.yml` reference](../reference/dstack.yml/fleet.md) - 5. See the [Clusters](../../examples.md#clusters) example + 5. See the cluster examples for [AWS](../examples/clusters/aws.md), [GCP](../examples/clusters/gcp.md), [Lambda](../examples/clusters/lambda.md), [Crusoe](../examples/clusters/crusoe.md), [Nebius](../examples/clusters/nebius.md), and [NCCL/RCCL tests](../examples/clusters/nccl-rccl-tests.md) diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md index 53374aa53d..5e072a6966 100644 --- a/docs/docs/concepts/gateways.md +++ b/docs/docs/concepts/gateways.md @@ -97,10 +97,10 @@ router:
-If you configure the `sglang` router, [services](../concepts/services.md) can run either [standard SGLang workers](../../examples/inference/sglang.md) or [Prefill-Decode workers](../../examples/inference/sglang.md#pd-disaggregation) (aka PD disaggregation). +If you configure the `sglang` router, [services](../concepts/services.md) can run either [standard SGLang workers](../examples/inference/sglang.md) or [Prefill-Decode workers](../examples/inference/sglang.md#pd-disaggregation) (aka PD disaggregation). !!! note "PD disaggregation" - To run services with PD disaggregation see [SGLang PD disaggregation](https://dstack.ai/examples/inference/sglang/#pd-disaggregation). + To run services with PD disaggregation see [SGLang PD disaggregation](../examples/inference/sglang.md#pd-disaggregation). !!! note "Deprecation" Configuring the SGLang router in a gateway is deprecated and will be disallowed in a future release. diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index 1923aa0655..a0e0de7936 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -1288,5 +1288,5 @@ The rolling deployment stops when all replicas are updated or when a new deploym 1. Read about [dev environments](dev-environments.md) and [tasks](tasks.md) 2. Learn how to manage [fleets](fleets.md) 3. See how to set up [gateways](gateways.md) - 4. Check the [vLLM](../../examples/inference/vllm.md) and - [NIM](../../examples/inference/nim.md) examples + 4. Check the [vLLM](../examples/inference/vllm.md) and + [NIM](../examples/inference/nim.md) examples diff --git a/docs/docs/concepts/tasks.md b/docs/docs/concepts/tasks.md index dd4a83c62c..43eb8e80cb 100644 --- a/docs/docs/concepts/tasks.md +++ b/docs/docs/concepts/tasks.md @@ -150,8 +150,10 @@ Jobs on each node communicate using their private IP addresses. Use `DSTACK_MAST `dstack` is easy to use with `accelerate`, `torchrun`, Ray, Spark, and any other distributed frameworks. -> For detailed examples, see the [distributed training](../../examples.md#distributed-training) - and [clusters](../../examples.md#clusters) examples. +!!! info "Examples" + See the training examples for [TRL](../examples/training/trl.md#distributed-training), [Axolotl](../examples/training/axolotl.md#distributed-training), and [Ray+RAGEN](../examples/training/ray-ragen.md). + + See the cluster examples for [AWS](../examples/clusters/aws.md), [GCP](../examples/clusters/gcp.md), [Lambda](../examples/clusters/lambda.md), [Crusoe](../examples/clusters/crusoe.md), [Nebius](../examples/clusters/nebius.md), and [NCCL/RCCL tests](../examples/clusters/nccl-rccl-tests.md). ??? info "Network interface" Distributed frameworks usually detect the correct network interface automatically, @@ -877,4 +879,4 @@ via the [`spot_policy`](../reference/dstack.yml/task.md#spot_policy) property. I !!! info "What's next?" 1. Read about [dev environments](dev-environments.md) and [services](services.md) 2. Learn how to manage [fleets](fleets.md) - 3. Check the [Axolotl](/examples/single-node-training/axolotl) example + 3. Check the [Axolotl](../examples/training/axolotl.md) example diff --git a/docs/examples.md b/docs/docs/examples.md similarity index 67% rename from docs/examples.md rename to docs/docs/examples.md index 5770425b9d..59203cdf8f 100644 --- a/docs/examples.md +++ b/docs/docs/examples.md @@ -14,66 +14,39 @@ hide: } --> -## Single-node training +## Training - -## Distributed training - @@ -82,7 +55,7 @@ hide: ## Clusters
-

GCP @@ -92,7 +65,7 @@ hide: Set up GCP A4 and A3 clusters with optimized networking

-

AWS @@ -102,7 +75,7 @@ hide: Set up AWS EFA clusters with optimized networking

-

Lambda @@ -112,7 +85,7 @@ hide: Set up Lambda clusters with optimized networking

-

Crusoe @@ -122,7 +95,7 @@ hide: Set up Crusoe clusters with optimized networking

-

Nebius @@ -132,7 +105,7 @@ hide: Set up Nebius clusters with optimized networking

-

NCCL/RCCL tests @@ -147,7 +120,7 @@ hide: ## Inference
-

SGLang @@ -156,7 +129,7 @@ hide: Deploy Qwen3.6-27B with SGLang

-

vLLM @@ -165,7 +138,7 @@ hide: Deploy Qwen3.6-27B with vLLM

-

NIM @@ -174,7 +147,7 @@ hide: Deploy a DeepSeek distilled model with NIM

-

TensorRT-LLM @@ -188,7 +161,7 @@ hide: ## Models
-

DeepSeek V4 @@ -199,7 +172,7 @@ hide:

-

Qwen 3.6 @@ -214,7 +187,7 @@ hide: ## Accelerators
-

AMD @@ -225,7 +198,7 @@ hide:

-

TPU @@ -236,7 +209,7 @@ hide:

-

Tenstorrent diff --git a/docs/examples/accelerators/amd.md b/docs/docs/examples/accelerators/amd.md similarity index 95% rename from docs/examples/accelerators/amd.md rename to docs/docs/examples/accelerators/amd.md index 5c0c306ce8..26f255f280 100644 --- a/docs/examples/accelerators/amd.md +++ b/docs/docs/examples/accelerators/amd.md @@ -6,12 +6,12 @@ description: Deploying and fine-tuning models on AMD MI300X GPUs using SGLang, v # AMD `dstack` supports running dev environments, tasks, and services on AMD GPUs. -You can do that by setting up an [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets) +You can do that by setting up an [SSH fleet](../../concepts/fleets.md#ssh-fleets) with on-prem AMD GPUs or configuring a backend that offers AMD GPUs such as the `runpod` backend. ## Deployment -Here are examples of a [service](../../docs/concepts/services.md) that deploy +Here are examples of a [service](../../concepts/services.md) that deploy `Qwen/Qwen3.6-27B` on AMD MI300X GPUs using [SGLang](https://github.com/sgl-project/sglang) and [vLLM](https://docs.vllm.ai/en/latest/). @@ -238,6 +238,6 @@ $ dstack apply -f 2. For multi-node training, run [NCCL/RCCL tests](../clusters/nccl-rccl-tests.md) to validate AMD cluster networking. -3. Check [dev environments](../../docs/concepts/dev-environments.md), - [tasks](../../docs/concepts/tasks.md), and - [services](../../docs/concepts/services.md). +3. Check [dev environments](../../concepts/dev-environments.md), + [tasks](../../concepts/tasks.md), and + [services](../../concepts/services.md). diff --git a/docs/examples/accelerators/intel/index.md b/docs/docs/examples/accelerators/intel/index.md similarity index 100% rename from docs/examples/accelerators/intel/index.md rename to docs/docs/examples/accelerators/intel/index.md diff --git a/docs/examples/accelerators/tenstorrent.md b/docs/docs/examples/accelerators/tenstorrent.md similarity index 94% rename from docs/examples/accelerators/tenstorrent.md rename to docs/docs/examples/accelerators/tenstorrent.md index 65005fd3a4..8344ced187 100644 --- a/docs/examples/accelerators/tenstorrent.md +++ b/docs/docs/examples/accelerators/tenstorrent.md @@ -42,7 +42,7 @@ description: Running dev environments, tasks, and services on Tenstorrent Wormho

- For more details on fleet configuration, refer to [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets). + For more details on fleet configuration, refer to [SSH fleets](../../concepts/fleets.md#ssh-fleets). ## Services @@ -123,10 +123,10 @@ Additionally, the model is available via `dstack`'s control plane UI: ![](https://dstack.ai/static-assets/static-assets/images/dstack-tenstorrent-model-ui.png){ width=800 } -When a [gateway](../../docs/concepts/gateways.md) is configured, the service endpoint +When a [gateway](../../concepts/gateways.md) is configured, the service endpoint is available at `https://./`. -> Services support many options, including authentication, auto-scaling policies, etc. To learn more, refer to [Services](../../docs/concepts/services.md). +> Services support many options, including authentication, auto-scaling policies, etc. To learn more, refer to [Services](../../concepts/services.md). ## Tasks @@ -159,7 +159,7 @@ resources:

-> Tasks support many options, including multi-node configuration, max duration, etc. To learn more, refer to [Tasks](../../docs/concepts/tasks.md). +> Tasks support many options, including multi-node configuration, max duration, etc. To learn more, refer to [Tasks](../../concepts/tasks.md). ## Dev environments @@ -191,7 +191,7 @@ If you run it via `dstack apply`, it will output the URL to access it via your d ![](https://dstack.ai/static-assets/static-assets/images/dstack-tenstorrent-cursor.png){ width=800 } -> Dev nevironments support many options, including inactivity and max duration, IDE configuration, etc. To learn more, refer to [Dev environments](../../docs/concepts/tasks.md). +> Dev nevironments support many options, including inactivity and max duration, IDE configuration, etc. To learn more, refer to [Dev environments](../../concepts/tasks.md). ??? info "Feedback" Found a bug, or want to request a feature? File it in the [issue tracker](https://github.com/dstackai/dstack/issues), diff --git a/docs/examples/accelerators/tpu.md b/docs/docs/examples/accelerators/tpu.md similarity index 95% rename from docs/examples/accelerators/tpu.md rename to docs/docs/examples/accelerators/tpu.md index 92640a4835..8c4d1584bb 100644 --- a/docs/examples/accelerators/tpu.md +++ b/docs/docs/examples/accelerators/tpu.md @@ -7,7 +7,7 @@ description: Deploying and fine-tuning models on Google Cloud TPUs using Optimum If you've configured the `gcp` backend in `dstack`, you can run dev environments, tasks, and services on [TPUs](https://cloud.google.com/tpu/docs/intro-to-tpu). Choose a TPU instance by specifying the TPU version and the number of cores (e.g. `v5litepod-8`) in the `gpu` property under `resources`, -or request TPUs by specifying `tpu` as `vendor` ([see examples](../../docs/guides/protips.md#gpu)). +or request TPUs by specifying `tpu` as `vendor` ([see examples](../../guides/protips.md#gpu)). Below are a few examples on using TPUs for deployment and fine-tuning. @@ -18,12 +18,12 @@ Below are a few examples on using TPUs for deployment and fine-tuning. !!! info "TPU storage" By default, each TPU VM contains a 100GB boot disk and its size cannot be changed. - If you need more storage, attach additional disks using [Volumes](../../docs/concepts/volumes.md). + If you need more storage, attach additional disks using [Volumes](../../concepts/volumes.md). ## Deployment Many serving frameworks including vLLM and TGI have TPU support. -Here's an example of a [service](../../docs/concepts/services.md) that deploys Llama 3.1 8B using +Here's an example of a [service](../../concepts/services.md) that deploys Llama 3.1 8B using [Optimum TPU](https://github.com/huggingface/optimum-tpu) and [vLLM](https://github.com/vllm-project/vllm). @@ -189,5 +189,5 @@ Note, `v5litepod` is optimized for fine-tuning transformer-based models. Each co 1. Browse [Optimum TPU](https://github.com/huggingface/optimum-tpu), [Optimum TPU TGI](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference) and [vLLM](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html). -2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), - [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md). +2. Check [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), + [services](../../concepts/services.md), and [fleets](../../concepts/fleets.md). 
diff --git a/docs/examples/clusters/aws.md b/docs/docs/examples/clusters/aws.md similarity index 92% rename from docs/examples/clusters/aws.md rename to docs/docs/examples/clusters/aws.md index 688af91e0e..54e1cd667d 100644 --- a/docs/examples/clusters/aws.md +++ b/docs/docs/examples/clusters/aws.md @@ -197,6 +197,6 @@ Provisioning... Instead of setting `python`, you can specify your own Docker image using `image`. Make sure that the image is properly configured for EFA. !!! info "What's next" - 1. Learn more about [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks) and [cluster placement](../../docs/concepts/fleets.md#cluster-placement) - 2. Check [dev environments](../../docs/concepts/dev-environments.md), - [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) + 1. Learn more about [distributed tasks](../../concepts/tasks.md#distributed-tasks) and [cluster placement](../../concepts/fleets.md#cluster-placement) + 2. Check [dev environments](../../concepts/dev-environments.md), + [services](../../concepts/services.md), and [fleets](../../concepts/fleets.md) diff --git a/docs/examples/clusters/crusoe.md b/docs/docs/examples/clusters/crusoe.md similarity index 90% rename from docs/examples/clusters/crusoe.md rename to docs/docs/examples/clusters/crusoe.md index 2a9c108ec6..28901a8e3c 100644 --- a/docs/examples/clusters/crusoe.md +++ b/docs/docs/examples/clusters/crusoe.md @@ -67,7 +67,7 @@ $ dstack apply -f crusoe-fleet.dstack.yml This will automatically create an IB partition and provision instances with InfiniBand networking. -Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). > If you want instances to be provisioned on demand, you can set `nodes` to `0..2`. In this case, `dstack` will create instances only when you run workloads. @@ -84,7 +84,7 @@ Once the fleet is created, you can run [dev environments](../../docs/concepts/de ### Configure the backend -Follow the standard instructions for setting up a [`kubernetes`](../../docs/concepts/backends.md#kubernetes) backend: +Follow the standard instructions for setting up a [`kubernetes`](../../concepts/backends.md#kubernetes) backend:
@@ -133,15 +133,15 @@ $ dstack apply -f crusoe-fleet.dstack.yml
-Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). ## NCCL tests -Use a [distributed task](../../docs/concepts/tasks.md#distributed-tasks) that runs NCCL tests to validate cluster network bandwidth. +Use a [distributed task](../../concepts/tasks.md#distributed-tasks) that runs NCCL tests to validate cluster network bandwidth. === "VMs" - With the Crusoe backend, HPC-X and NCCL topology files are pre-installed on the host VM image. Mount them into the container via [instance volumes](../../docs/concepts/volumes.md#instance-volumes). + With the Crusoe backend, HPC-X and NCCL topology files are pre-installed on the host VM image. Mount them into the container via [instance volumes](../../concepts/volumes.md#instance-volumes).
@@ -275,6 +275,6 @@ $ dstack apply -f crusoe-nccl-tests.dstack.yml ## What's next -1. Learn about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md) -2. Check out [backends](../../docs/concepts/backends.md#crusoe-cloud) and [fleets](../../docs/concepts/fleets.md#cloud-fleets) +1. Learn about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), [services](../../concepts/services.md) +2. Check out [backends](../../concepts/backends.md#crusoe-cloud) and [fleets](../../concepts/fleets.md#cloud-fleets) 3. Check the docs on [Crusoe's networking](https://docs.crusoecloud.com/networking/infiniband/) and ["Crusoe Managed" Kubernetes](https://docs.crusoecloud.com/orchestration/cmk/index.html) diff --git a/docs/examples/clusters/gcp.md b/docs/docs/examples/clusters/gcp.md similarity index 94% rename from docs/examples/clusters/gcp.md rename to docs/docs/examples/clusters/gcp.md index b0f0393200..eb9ddef0c2 100644 --- a/docs/examples/clusters/gcp.md +++ b/docs/docs/examples/clusters/gcp.md @@ -518,10 +518,10 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt ### Distributed training === "A4" - You can use the standard [distributed task](../../docs/concepts/tasks.md#distributed-tasks) example to run distributed training on A4 instances. + You can use the standard [distributed task](../../concepts/tasks.md#distributed-tasks) example to run distributed training on A4 instances. === "A3 Mega" - You can use the standard [distributed task](../../docs/concepts/tasks.md#distributed-tasks) example to run distributed training on A3 Mega instances. To enable GPUDirect-TCPX, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning: + You can use the standard [distributed task](../../concepts/tasks.md#distributed-tasks) example to run distributed training on A3 Mega instances. To enable GPUDirect-TCPX, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning: ```shell # ... @@ -540,7 +540,7 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt ``` === "A3 High/Edge" - You can use the standard [distributed task](../../docs/concepts/tasks.md#distributed-tasks) example to run distributed training on A3 High/Edge instances. To enable GPUDirect-TCPX0, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning: + You can use the standard [distributed task](../../concepts/tasks.md#distributed-tasks) example to run distributed training on A3 High/Edge instances. To enable GPUDirect-TCPX0, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning: ```shell # ... 
@@ -577,6 +577,6 @@ In addition to distributed training, you can of course run regular tasks, dev en ## What's new -1. Learn about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md) -2. Read about [cluster placement](../../docs/concepts/fleets.md#cluster-placement) +1. Learn about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), [services](../../concepts/services.md) +2. Read about [cluster placement](../../concepts/fleets.md#cluster-placement) 3. Check GCP's docs on using [A4](https://docs.cloud.google.com/compute/docs/gpus/create-gpu-vm-a3u-a4), and [A3 Mega/High/Edge](https://docs.cloud.google.com/compute/docs/gpus/gpudirect) instances diff --git a/docs/examples/clusters/lambda.md b/docs/docs/examples/clusters/lambda.md similarity index 89% rename from docs/examples/clusters/lambda.md rename to docs/docs/examples/clusters/lambda.md index e66e74573a..1ebe35ce76 100644 --- a/docs/examples/clusters/lambda.md +++ b/docs/docs/examples/clusters/lambda.md @@ -19,7 +19,7 @@ description: Setting up Lambda clusters using Kubernetes or 1-Click Clusters wit ### Configure the backend -Follow the standard instructions for setting up a [Kubernetes](../../docs/concepts/backends.md#kubernetes) backend: +Follow the standard instructions for setting up a [Kubernetes](../../concepts/backends.md#kubernetes) backend:
@@ -68,11 +68,11 @@ $ dstack apply -f lambda-fleet.dstack.yml
-Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). ## 1-Click Clusters -Another way to work with Lambda clusters is through [1CC](https://lambda.ai/1-click-clusters). While `dstack` supports automated cluster provisioning via [VM-based backends](../../docs/concepts/backends.md#vm-based), there is currently no programmatic way to provision Lambda 1CCs. As a result, to use a 1CC cluster with `dstack`, you must use [SSH fleets](../../docs/concepts/fleets.md). +Another way to work with Lambda clusters is through [1CC](https://lambda.ai/1-click-clusters). While `dstack` supports automated cluster provisioning via [VM-based backends](../../concepts/backends.md#vm-based), there is currently no programmatic way to provision Lambda 1CCs. As a result, to use a 1CC cluster with `dstack`, you must use [SSH fleets](../../concepts/fleets.md). ### Prerequsisites @@ -80,7 +80,7 @@ Another way to work with Lambda clusters is through [1CC](https://lambda.ai/1-cl ### Create a fleet -Follow the standard instructions for setting up an [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets): +Follow the standard instructions for setting up an [SSH fleet](../../concepts/fleets.md#ssh-fleets):
@@ -116,11 +116,11 @@ $ dstack apply -f lambda-fleet.dstack.yml
-Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). ## Run tasks -To run tasks on a cluster, you must use [distributed tasks](../../docs/concepts/tasks.md#distributed-task). +To run tasks on a cluster, you must use [distributed tasks](../../concepts/tasks.md#distributed-task). ### Run NCCL tests @@ -213,6 +213,6 @@ Provisioning... ## What's next -1. Learn about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md) -2. Read about the [Kubernetes backend](../../docs/concepts/backends.md#kubernetes) and [cluster placement](../../docs/concepts/fleets.md#cluster-placement) +1. Learn about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), [services](../../concepts/services.md) +2. Read about the [Kubernetes backend](../../concepts/backends.md#kubernetes) and [cluster placement](../../concepts/fleets.md#cluster-placement) 3. Check Lambda's docs on [Kubernetes](https://docs.lambda.ai/public-cloud/1-click-clusters/managed-kubernetes/#accessing-mk8s) and [1CC](https://docs.lambda.ai/public-cloud/1-click-clusters/) diff --git a/docs/examples/clusters/nccl-rccl-tests.md b/docs/docs/examples/clusters/nccl-rccl-tests.md similarity index 89% rename from docs/examples/clusters/nccl-rccl-tests.md rename to docs/docs/examples/clusters/nccl-rccl-tests.md index 4c565d8c68..196f08d495 100644 --- a/docs/examples/clusters/nccl-rccl-tests.md +++ b/docs/docs/examples/clusters/nccl-rccl-tests.md @@ -5,10 +5,10 @@ description: Running NCCL and RCCL tests to validate cluster network bandwidth # NCCL/RCCL tests -This example shows how to run [NCCL](https://github.com/NVIDIA/nccl-tests) or [RCCL](https://github.com/ROCm/rccl-tests) tests on a cluster using [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks). +This example shows how to run [NCCL](https://github.com/NVIDIA/nccl-tests) or [RCCL](https://github.com/ROCm/rccl-tests) tests on a cluster using [distributed tasks](../../concepts/tasks.md#distributed-tasks). !!! info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../docs/concepts/fleets.md#cluster-placement) or an [SSH fleet](../../docs/concepts/fleets.md#ssh-placement)). + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../concepts/fleets.md#cluster-placement) or an [SSH fleet](../../concepts/fleets.md#ssh-placement)). ## Running as a task @@ -120,7 +120,7 @@ Here's an example of a task that runs AllReduce test on 2 nodes, each with 4 GPU ### Apply a configuration -To run a configuration, use the [`dstack apply`](../../docs/reference/cli/dstack/apply.md) command. +To run a configuration, use the [`dstack apply`](../../reference/cli/dstack/apply.md) command.
@@ -139,5 +139,5 @@ Submit the run nccl-tests? [y/n]: y ## What's next? -1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), - [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md). +1. Check [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), + [services](../../concepts/services.md), and [fleets](../../concepts/fleets.md). diff --git a/docs/examples/clusters/nebius.md b/docs/docs/examples/clusters/nebius.md similarity index 92% rename from docs/examples/clusters/nebius.md rename to docs/docs/examples/clusters/nebius.md index 6986a10ab5..20b1a47555 100644 --- a/docs/examples/clusters/nebius.md +++ b/docs/docs/examples/clusters/nebius.md @@ -75,7 +75,7 @@ $ dstack apply -f nebius-fleet.dstack.yml This will automatically create a Nebius cluster and provision instances. -Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). > If you want instances to be provisioned on demand, you can set `nodes` to `0..2`. In this case, `dstack` will create instances only when you run workloads. @@ -107,7 +107,7 @@ $ nebius mk8s cluster get-credentials --id <cluster id> --external ### Configure a backend -Follow the standard instructions for setting up a [`kubernetes`](../../docs/concepts/backends.md#kubernetes) backend: +Follow the standard instructions for setting up a [`kubernetes`](../../concepts/backends.md#kubernetes) backend:
@@ -154,11 +154,11 @@ $ dstack apply -f nebius-fleet.dstack.yml
-Once the fleet is created, you can run [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md). +Once the fleet is created, you can run [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md). ## NCCL tests -Use a [distributed task](../../docs/concepts/tasks.md#distributed-tasks) to run NCCL tests and validate the cluster’s network bandwidth. +Use a [distributed task](../../concepts/tasks.md#distributed-tasks) to run NCCL tests and validate the cluster’s network bandwidth.
@@ -252,6 +252,6 @@ nccl-tests provisioning completed (running) ## What's next -1. Learn about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md) -2. Check out [backends](../../docs/concepts/backends.md) and [fleets](../../docs/concepts/fleets.md) +1. Learn about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), [services](../../concepts/services.md) +2. Check out [backends](../../concepts/backends.md) and [fleets](../../concepts/fleets.md) 3. Read Nebius' docs on [networking for VMs](https://docs.nebius.com/compute/clusters/gpu) and the [managed Kubernetes service](https://docs.nebius.com/kubernetes). diff --git a/docs/examples/inference/nim.md b/docs/docs/examples/inference/nim.md similarity index 84% rename from docs/examples/inference/nim.md rename to docs/docs/examples/inference/nim.md index 263baa2737..f7d1c03edf 100644 --- a/docs/examples/inference/nim.md +++ b/docs/docs/examples/inference/nim.md @@ -8,7 +8,7 @@ description: Deploying Nemotron-3-Super-120B-A12B using NVIDIA NIM This example shows how to deploy Nemotron-3-Super-120B-A12B using [NVIDIA NIM](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html) and `dstack`. ??? info "Prerequisites" - Once `dstack` is [installed](../../docs/installation.md), clone the repo with examples. + Once `dstack` is [installed](../../installation.md), clone the repo with examples.
@@ -54,7 +54,7 @@ resources: ### Running a configuration Save the configuration above as `nemotron120.dstack.yml`, then use the -[`dstack apply`](../../docs/reference/cli/dstack/apply.md) command. +[`dstack apply`](../../reference/cli/dstack/apply.md) command.
@@ -91,9 +91,9 @@ $ curl http://127.0.0.1:3000/proxy/services/main/nemotron120/v1/chat/completions
-When a [gateway](../../docs/concepts/gateways.md) is configured, the service endpoint will be available at `https://nemotron120./`. +When a [gateway](../../concepts/gateways.md) is configured, the service endpoint will be available at `https://nemotron120./`. ## What's next? -1. Check [services](../../docs/concepts/services.md) +1. Check [services](../../concepts/services.md) 2. Browse the [Nemotron-3-Super-120B-A12B model page](https://build.nvidia.com/nvidia/nemotron-3-super-120b-a12b) diff --git a/docs/examples/inference/sglang.md b/docs/docs/examples/inference/sglang.md similarity index 93% rename from docs/examples/inference/sglang.md rename to docs/docs/examples/inference/sglang.md index feda39a46d..775dcedd48 100644 --- a/docs/examples/inference/sglang.md +++ b/docs/docs/examples/inference/sglang.md @@ -95,7 +95,7 @@ standard `qwen3` reasoning parser without extra ROCm-specific tuning flags. The first startup on MI300X can take longer while SGLang compiles ROCm kernels. Save one of the configurations above as `service.dstack.yml`, then use the -[`dstack apply`](../../docs/reference/cli/dstack/apply.md) command. +[`dstack apply`](../../reference/cli/dstack/apply.md) command.
@@ -132,7 +132,7 @@ Qwen3.6 uses thinking mode by default. To disable thinking, pass `"chat_template_kwargs": {"enable_thinking": false}` in the request body. To enable tool calling, add `--tool-call-parser qwen3_coder` to the serve command. -> If a [gateway](../../docs/concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. +> If a [gateway](../../concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. ## Configuration options @@ -221,5 +221,5 @@ Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics ## What's next? -1. Read about [services](../../docs/concepts/services.md) and [gateways](../../docs/concepts/gateways.md) +1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md) 2. Browse the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) and the [SGLang server arguments reference](https://docs.sglang.ai/advanced_features/server_arguments.html) diff --git a/docs/examples/inference/trtllm.md b/docs/docs/examples/inference/trtllm.md similarity index 87% rename from docs/examples/inference/trtllm.md rename to docs/docs/examples/inference/trtllm.md index 8f95cefc63..c058820b0a 100644 --- a/docs/examples/inference/trtllm.md +++ b/docs/docs/examples/inference/trtllm.md @@ -53,7 +53,7 @@ resources: ```
-Apply it with [`dstack apply`](../../docs/reference/cli/dstack/apply.md): +Apply it with [`dstack apply`](../../reference/cli/dstack/apply.md):
@@ -90,10 +90,10 @@ $ curl http://127.0.0.1:3000/proxy/services/main/qwen235/v1/chat/completions \
-When a [gateway](../../docs/concepts/gateways.md) is configured, the service endpoint will be available at `https://qwen235./`. +When a [gateway](../../concepts/gateways.md) is configured, the service endpoint will be available at `https://qwen235./`. ## What's next? -1. Read about [services](../../docs/concepts/services.md) and [gateways](../../docs/concepts/gateways.md) +1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md) 2. Browse the [TensorRT-LLM deployment guides](https://nvidia.github.io/TensorRT-LLM/deployment-guide/index.html) and the [Qwen3 deployment guide](https://nvidia.github.io/TensorRT-LLM/deployment-guide/deployment-guide-for-qwen3-on-trtllm.html) 3. See the [`trtllm-serve` reference](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve/trtllm-serve.html) diff --git a/docs/examples/inference/vllm.md b/docs/docs/examples/inference/vllm.md similarity index 88% rename from docs/examples/inference/vllm.md rename to docs/docs/examples/inference/vllm.md index 4ac880defc..b5b83c4664 100644 --- a/docs/examples/inference/vllm.md +++ b/docs/docs/examples/inference/vllm.md @@ -89,7 +89,7 @@ Qwen3.6-27B is a multimodal model. For text-only workloads, add calling, add `--enable-auto-tool-choice --tool-call-parser qwen3_coder`. Save one of the configurations above as `service.dstack.yml`, then use the -[`dstack apply`](../../docs/reference/cli/dstack/apply.md) command. +[`dstack apply`](../../reference/cli/dstack/apply.md) command.
@@ -122,9 +122,9 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
-> If a [gateway](../../docs/concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. +> If a [gateway](../../concepts/gateways.md) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. ## What's next? -1. Read about [services](../../docs/concepts/services.md) and [gateways](../../docs/concepts/gateways.md) +1. Read about [services](../../concepts/services.md) and [gateways](../../concepts/gateways.md) 2. Browse the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) and the [SGLang](../inference/sglang.md) example diff --git a/docs/examples/llms/deepseek/index.md b/docs/docs/examples/llms/deepseek/index.md similarity index 100% rename from docs/examples/llms/deepseek/index.md rename to docs/docs/examples/llms/deepseek/index.md diff --git a/docs/examples/llms/llama/index.md b/docs/docs/examples/llms/llama/index.md similarity index 100% rename from docs/examples/llms/llama/index.md rename to docs/docs/examples/llms/llama/index.md diff --git a/docs/examples/misc/docker-compose/index.md b/docs/docs/examples/misc/docker-compose/index.md similarity index 100% rename from docs/examples/misc/docker-compose/index.md rename to docs/docs/examples/misc/docker-compose/index.md diff --git a/docs/examples/models/deepseek-v4.md b/docs/docs/examples/models/deepseek-v4.md similarity index 97% rename from docs/examples/models/deepseek-v4.md rename to docs/docs/examples/models/deepseek-v4.md index 7efd9977e8..833e5163d7 100644 --- a/docs/examples/models/deepseek-v4.md +++ b/docs/docs/examples/models/deepseek-v4.md @@ -6,7 +6,7 @@ description: Deploying DeepSeek-V4-Pro using SGLang on NVIDIA B200:8 # DeepSeek V4 This example shows how to deploy `deepseek-ai/DeepSeek-V4-Pro` as a -[service](../../docs/concepts/services.md) using +[service](../../concepts/services.md) using [SGLang](https://github.com/sgl-project/sglang) and `dstack`. ## Apply a configuration @@ -64,7 +64,7 @@ This configuration uses the single-node Blackwell `DeepSeek-V4-Pro` recipe shape for `8 x NVIDIA B200`. Export your Hugging Face token and apply the configuration with -[`dstack apply`](../../docs/reference/cli/dstack/apply.md). +[`dstack apply`](../../reference/cli/dstack/apply.md).
diff --git a/docs/examples/models/qwen36.md b/docs/docs/examples/models/qwen36.md similarity index 97% rename from docs/examples/models/qwen36.md rename to docs/docs/examples/models/qwen36.md index 3723e36fa0..35ea72fd11 100644 --- a/docs/examples/models/qwen36.md +++ b/docs/docs/examples/models/qwen36.md @@ -6,7 +6,7 @@ description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs # Qwen 3.6 This example shows how to deploy `Qwen/Qwen3.6-27B` as a -[service](../../docs/concepts/services.md) using +[service](../../concepts/services.md) using [SGLang](https://github.com/sgl-project/sglang) and `dstack`. ## Apply a configuration @@ -92,7 +92,7 @@ The NVIDIA and AMD configurations above use pinned SGLang images and the same straightforward 4-GPU layout used across the Qwen 3.6 docs and examples. Apply the configuration with -[`dstack apply`](../../docs/reference/cli/dstack/apply.md). +[`dstack apply`](../../reference/cli/dstack/apply.md).
diff --git a/docs/examples/models/wan22/index.md b/docs/docs/examples/models/wan22/index.md similarity index 100% rename from docs/examples/models/wan22/index.md rename to docs/docs/examples/models/wan22/index.md diff --git a/docs/docs/examples/training/axolotl.md b/docs/docs/examples/training/axolotl.md new file mode 100644 index 0000000000..5266a86745 --- /dev/null +++ b/docs/docs/examples/training/axolotl.md @@ -0,0 +1,185 @@ +--- +title: Axolotl +description: Fine-tuning Llama models with Axolotl — single-node SFT with FSDP and QLoRA, or distributed across multiple nodes +--- + +# Axolotl + +This example shows how to use [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) with `dstack` to fine-tune Llama models — on a single node with SFT, FSDP, and QLoRA, or distributed across multiple nodes. + +## Single-node training + +This section walks through fine-tuning 4-bit quantized `Llama-4-Scout-17B-16E` using SFT with FSDP and QLoRA. + +### Define a configuration + +Axolotl reads the model, QLoRA, and dataset arguments, as well as trainer configuration from a [`scout-qlora-flexattn-fsdp2.yaml`](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml) file. The configuration uses 4-bit axolotl quantized version of `meta-llama/Llama-4-Scout-17B-16E`, requiring only ~43GB VRAM/GPU with 4K context length. + +Below is a task configuration that does fine-tuning. + +
+ +```yaml +type: task +# The name is optional, if not specified, generated randomly +name: axolotl-nvidia-llama-scout-train + +# Using the official Axolotl's Docker image +image: axolotlai/axolotl:main-latest + +# Required environment variables +env: + - HF_TOKEN + - WANDB_API_KEY + - WANDB_PROJECT + - HUB_MODEL_ID +# Commands of the task +commands: + - wget https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml + - | + axolotl train scout-qlora-flexattn-fsdp2.yaml \ + --wandb-project $WANDB_PROJECT \ + --wandb-name $DSTACK_RUN_NAME \ + --hub-model-id $HUB_MODEL_ID + +resources: + # Four GPU (required by FSDP) + gpu: H100:4 + # Shared memory size for inter-process communication + shm_size: 64GB + disk: 500GB.. +``` + +
+
+The task uses Axolotl's Docker image, which comes with Axolotl pre-installed.
+
+!!! info "AMD"
+    The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](../accelerators/amd.md#axolotl).
+
+### Run the configuration
+
+Once the configuration is ready, run `dstack apply -f <configuration file>`, and `dstack` will automatically provision the
+cloud resources and run the configuration.
+
+
+ +```shell +$ HF_TOKEN=... +$ WANDB_API_KEY=... +$ WANDB_PROJECT=... +$ HUB_MODEL_ID=... +$ dstack apply -f train.dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 vastai (cz-czechia) cpu=64 mem=128GB H100:80GB:2 18794506 $3.8907 + 2 vastai (us-texas) cpu=52 mem=64GB H100:80GB:2 20442365 $3.6926 + 3 vastai (fr-france) cpu=64 mem=96GB H100:80GB:2 20379984 $3.7389 + +Submit the run axolotl-nvidia-llama-scout-train? [y/n]: + +Provisioning... +---> 100% +``` + +
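+Once the run is submitted, you can also monitor it from another terminal. A minimal sketch with the `dstack` CLI (the run name is the one from the configuration above; `dstack ps` and `dstack logs` are assumed to be available in your CLI version):
+
+```shell
+$ dstack ps
+$ dstack logs axolotl-nvidia-llama-scout-train
+```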
+ +## Distributed training + +!!! info "Prerequisites" + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../concepts/fleets.md#cluster-placement) or an [SSH fleet](../../concepts/fleets.md#ssh-placement)). + +This section walks through running distributed fine-tuning of `Llama-3.1-70B` with QLoRA and FSDP across multiple nodes. + +### Define a configuration + +Once the fleet is created, define a distributed task configuration. Here's an example of a distributed `QLoRA` task using `FSDP`. + +
+
+```yaml
+type: task
+name: axolotl-multi-node-qlora-llama3-70b
+
+nodes: 2
+
+image: nvcr.io/nvidia/pytorch:25.01-py3
+
+env:
+  - HF_TOKEN
+  - WANDB_API_KEY
+  - WANDB_PROJECT
+  - HUB_MODEL_ID
+  - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  - NCCL_DEBUG=INFO
+  - ACCELERATE_LOG_LEVEL=info
+
+commands:
+  # Replacing the default Torch and FlashAttention in the NGC container with Axolotl-compatible versions.
+  # The preinstalled versions are incompatible with Axolotl.
+  - pip uninstall -y torch flash-attn
+  - pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/test/cu124
+  - pip install --no-build-isolation axolotl[flash-attn,deepspeed]
+  - wget https://raw.githubusercontent.com/huggingface/trl/main/examples/accelerate_configs/fsdp1.yaml
+  - wget https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/qlora-fsdp-70b.yaml
+  # Axolotl includes hf-xet version 1.1.0, which fails during downloads. Replacing it with the latest version (1.1.2).
+  - pip uninstall -y hf-xet
+  - pip install hf-xet --no-cache-dir
+  - |
+    accelerate launch \
+    --config_file=fsdp1.yaml \
+    -m axolotl.cli.train qlora-fsdp-70b.yaml \
+    --hub-model-id $HUB_MODEL_ID \
+    --output-dir /checkpoints/qlora-llama3-70b \
+    --wandb-project $WANDB_PROJECT \
+    --wandb-name $DSTACK_RUN_NAME \
+    --main_process_ip=$DSTACK_MASTER_NODE_IP \
+    --main_process_port=8008 \
+    --machine_rank=$DSTACK_NODE_RANK \
+    --num_processes=$DSTACK_GPUS_NUM \
+    --num_machines=$DSTACK_NODES_NUM
+
+resources:
+  gpu: 80GB:8
+  shm_size: 128GB
+
+volumes:
+  - /checkpoints:/checkpoints
+```
+
+
+ +!!! info "Docker image" + We are using `nvcr.io/nvidia/pytorch:25.01-py3` from NGC because it includes the necessary libraries and packages for RDMA and InfiniBand support. + +### Run the configuration + +To run a configuration, use the [`dstack apply`](../../reference/cli/dstack/apply.md) command. + +
+ +```shell +$ HF_TOKEN=... +$ WANDB_API_KEY=... +$ WANDB_PROJECT=... +$ HUB_MODEL_ID=... +$ dstack apply -f train-distrib.dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle + 2 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle + +Submit the run axolotl-multi-node-qlora-llama3-70b? [y/n]: y + +Provisioning... +---> 100% +``` + +
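+The prerequisites above assume a fleet with `placement` set to `cluster` already exists. If you still need to create one, below is a minimal sketch of such a fleet; the name, node count, and GPU size are placeholders, so adjust them to your hardware and backend before applying it ahead of the task:
+
+```shell
+$ cat > fleet.dstack.yml <<'EOF'
+type: fleet
+name: my-h100-cluster  # placeholder name
+nodes: 2
+placement: cluster
+resources:
+  gpu: 80GB:8
+EOF
+$ dstack apply -f fleet.dstack.yml
+```
+
+See the fleets documentation linked in the prerequisites for SSH fleets and additional options.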
+ +## What's next? + +1. Check [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), + [services](../../concepts/services.md), and [fleets](../../concepts/fleets.md) +2. Read about [cluster placement](../../concepts/fleets.md#cluster-placement) +3. See the [AMD](../accelerators/amd.md#axolotl) example diff --git a/docs/examples/distributed-training/ray-ragen.md b/docs/docs/examples/training/ray-ragen.md similarity index 93% rename from docs/examples/distributed-training/ray-ragen.md rename to docs/docs/examples/training/ray-ragen.md index e3194b2b3a..73e8749e83 100644 --- a/docs/examples/distributed-training/ray-ragen.md +++ b/docs/docs/examples/training/ray-ragen.md @@ -11,7 +11,7 @@ to fine-tune an agent on multiple nodes. Under the hood `RAGEN` uses [verl](https://github.com/volcengine/verl) for Reinforcement Learning and [Ray](https://docs.ray.io/en/latest/) for distributed training. !!! info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../docs/concepts/fleets.md#cluster-placement) or an [SSH fleet](../../docs/concepts/fleets.md#ssh-placement)). + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../concepts/fleets.md#cluster-placement) or an [SSH fleet](../../concepts/fleets.md#ssh-placement)). ## Run a Ray cluster @@ -130,5 +130,5 @@ $ ray job submit \ Using Ray via `dstack` is a powerful way to get access to the rich Ray ecosystem while benefiting from `dstack`'s provisioning capabilities. !!! info "What's next" - 1. Read about [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks), [fleets](../../docs/concepts/fleets.md), and [cluster placement](../../docs/concepts/fleets.md#cluster-placement) + 1. Read about [distributed tasks](../../concepts/tasks.md#distributed-tasks), [fleets](../../concepts/fleets.md), and [cluster placement](../../concepts/fleets.md#cluster-placement) 2. Browse Ray's [docs](https://docs.ray.io/en/latest/train/examples.html) for other examples. diff --git a/docs/docs/examples/training/trl.md b/docs/docs/examples/training/trl.md new file mode 100644 index 0000000000..ffeb3766f8 --- /dev/null +++ b/docs/docs/examples/training/trl.md @@ -0,0 +1,272 @@ +--- +title: TRL +description: Fine-tuning Llama with TRL — single-node SFT with QLoRA, or distributed across multiple nodes with FSDP and DeepSpeed +--- + +# TRL + +This example walks you through how to use [TRL](https://github.com/huggingface/trl) with `dstack` to fine-tune `Llama-3.1-8B` — on a single node with SFT and QLoRA, or distributed across multiple nodes with [Accelerate](https://github.com/huggingface/accelerate) and [DeepSpeed](https://github.com/deepspeedai/DeepSpeed). + +## Single-node training + +### Define a configuration + +Below is a task configuration that does fine-tuning. + +
+
+```yaml
+type: task
+name: trl-train
+
+python: 3.12
+# Ensure nvcc is installed (req. for Flash Attention)
+nvcc: true
+
+env:
+  - HF_TOKEN
+  - WANDB_API_KEY
+  - HUB_MODEL_ID
+commands:
+  # Pin torch==2.6.0 to avoid building Flash Attention from source.
+  # Prebuilt Flash Attention wheels are not available for the latest torch==2.7.0.
+  - uv pip install torch==2.6.0
+  - uv pip install transformers bitsandbytes peft wandb
+  - uv pip install flash_attn --no-build-isolation
+  - git clone https://github.com/huggingface/trl
+  - cd trl
+  - uv pip install .
+  - |
+    accelerate launch \
+    --config_file=examples/accelerate_configs/multi_gpu.yaml \
+    --num_processes $DSTACK_GPUS_PER_NODE \
+    trl/scripts/sft.py \
+    --model_name meta-llama/Meta-Llama-3.1-8B \
+    --dataset_name OpenAssistant/oasst_top1_2023-08-25 \
+    --dataset_text_field="text" \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 4 \
+    --learning_rate 2e-4 \
+    --report_to wandb \
+    --bf16 \
+    --max_seq_length 1024 \
+    --lora_r 16 \
+    --lora_alpha 32 \
+    --lora_target_modules q_proj k_proj v_proj o_proj \
+    --load_in_4bit \
+    --use_peft \
+    --attn_implementation "flash_attention_2" \
+    --logging_steps=10 \
+    --output_dir models/llama31 \
+    --hub_model_id $HUB_MODEL_ID
+
+resources:
+  gpu:
+    # 24GB or more VRAM
+    memory: 24GB..
+    # One or more GPUs
+    count: 1..
+  # Shared memory (for multi-gpu)
+  shm_size: 24GB
+```
+
+
+
+Change the `resources` property to specify more GPUs.
+
+!!! info "AMD"
+    The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](../accelerators/amd.md#trl).
+
+??? info "DeepSpeed"
+    For more memory-efficient use of multiple GPUs, consider using DeepSpeed and ZeRO Stage 3.
+
+    To do this, use the `examples/accelerate_configs/deepspeed_zero3.yaml` configuration file instead of
+    `examples/accelerate_configs/multi_gpu.yaml`.
+
+### Run the configuration
+
+Once the configuration is ready, run `dstack apply -f <configuration file>`, and `dstack` will automatically provision the
+cloud resources and run the configuration.
+
+
+ +```shell +$ HF_TOKEN=... +$ WANDB_API_KEY=... +$ HUB_MODEL_ID=... +$ dstack apply -f train.dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 vastai (cz-czechia) cpu=64 mem=128GB H100:80GB:2 18794506 $3.8907 + 2 vastai (us-texas) cpu=52 mem=64GB H100:80GB:2 20442365 $3.6926 + 3 vastai (fr-france) cpu=64 mem=96GB H100:80GB:2 20379984 $3.7389 + +Submit the run trl-train? [y/n]: + +Provisioning... +---> 100% +``` + +
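+When the run finishes, the fine-tuned model is pushed to the Hugging Face Hub repository set in `HUB_MODEL_ID`. An optional local check, sketched below, assumes the push completed and that the `huggingface_hub` CLI is installed and authenticated with a token that can read the repository:
+
+```shell
+$ export HUB_MODEL_ID=...
+$ huggingface-cli download "$HUB_MODEL_ID" --local-dir ./llama31-ft
+```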
+ +## Distributed training + +!!! info "Prerequisites" + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../concepts/fleets.md#cluster-placement) or an [SSH fleet](../../concepts/fleets.md#ssh-placement)). + +### Define a configuration + +Once the fleet is created, define a distributed task configuration. Here's an example using either FSDP or DeepSpeed ZeRO-3. + +=== "FSDP" + +
+ + ```yaml + type: task + name: trl-train-fsdp-distrib + + nodes: 2 + + image: nvcr.io/nvidia/pytorch:25.01-py3 + + env: + - HF_TOKEN + - ACCELERATE_LOG_LEVEL=info + - WANDB_API_KEY + - MODEL_ID=meta-llama/Llama-3.1-8B + - HUB_MODEL_ID + + commands: + - pip install transformers bitsandbytes peft wandb + - git clone https://github.com/huggingface/trl + - cd trl + - pip install . + - | + accelerate launch \ + --config_file=examples/accelerate_configs/fsdp1.yaml \ + --main_process_ip=$DSTACK_MASTER_NODE_IP \ + --main_process_port=8008 \ + --machine_rank=$DSTACK_NODE_RANK \ + --num_processes=$DSTACK_GPUS_NUM \ + --num_machines=$DSTACK_NODES_NUM \ + trl/scripts/sft.py \ + --model_name $MODEL_ID \ + --dataset_name OpenAssistant/oasst_top1_2023-08-25 \ + --dataset_text_field="text" \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --learning_rate 2e-4 \ + --report_to wandb \ + --bf16 \ + --max_seq_length 1024 \ + --attn_implementation flash_attention_2 \ + --logging_steps=10 \ + --output_dir /checkpoints/llama31-ft \ + --hub_model_id $HUB_MODEL_ID \ + --torch_dtype bfloat16 + + resources: + gpu: 80GB:8 + shm_size: 128GB + + volumes: + - /checkpoints:/checkpoints + ``` + +
+ +=== "DeepSpeed ZeRO-3" + +
+ + ```yaml + type: task + name: trl-train-deepspeed-distrib + + nodes: 2 + + image: nvcr.io/nvidia/pytorch:25.01-py3 + + env: + - HF_TOKEN + - WANDB_API_KEY + - HUB_MODEL_ID + - MODEL_ID=meta-llama/Llama-3.1-8B + - ACCELERATE_LOG_LEVEL=info + + commands: + - pip install transformers bitsandbytes peft wandb deepspeed + - git clone https://github.com/huggingface/trl + - cd trl + - pip install . + - | + accelerate launch \ + --config_file=examples/accelerate_configs/deepspeed_zero3.yaml \ + --main_process_ip=$DSTACK_MASTER_NODE_IP \ + --main_process_port=8008 \ + --machine_rank=$DSTACK_NODE_RANK \ + --num_processes=$DSTACK_GPUS_NUM \ + --num_machines=$DSTACK_NODES_NUM \ + trl/scripts/sft.py \ + --model_name $MODEL_ID \ + --dataset_name OpenAssistant/oasst_top1_2023-08-25 \ + --dataset_text_field="text" \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --learning_rate 2e-4 \ + --report_to wandb \ + --bf16 \ + --max_seq_length 1024 \ + --attn_implementation flash_attention_2 \ + --logging_steps=10 \ + --output_dir /checkpoints/llama31-ft \ + --hub_model_id $HUB_MODEL_ID \ + --torch_dtype bfloat16 + + resources: + gpu: 80GB:8 + shm_size: 128GB + + volumes: + - /checkpoints:/checkpoints + ``` + +
+ +!!! info "Docker image" + We are using `nvcr.io/nvidia/pytorch:25.01-py3` from NGC because it includes the necessary libraries and packages for RDMA and InfiniBand support. + +### Run the configuration + +To run a configuration, use the [`dstack apply`](../../reference/cli/dstack/apply.md) command. + +
+ +```shell +$ HF_TOKEN=... +$ WANDB_API_KEY=... +$ HUB_MODEL_ID=... +$ dstack apply -f train-distrib.dstack.yml + + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle + 2 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle + +Submit the run trl-train-fsdp-distrib? [y/n]: y + +Provisioning... +---> 100% +``` + +
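+Intermediate checkpoints land in the `/checkpoints` instance volume mapped in the configuration, and the final model is pushed to the repository set in `HUB_MODEL_ID`. If you need to interrupt the run, a sketch using the `dstack` CLI (the run name below matches the FSDP variant; use whichever name `dstack` printed for your submission):
+
+```shell
+$ dstack stop trl-train-fsdp-distrib
+```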
+ +## What's next? + +1. Check [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), + [services](../../concepts/services.md), and [fleets](../../concepts/fleets.md) +2. Read about [cluster placement](../../concepts/fleets.md#cluster-placement) +3. See the [AMD](../accelerators/amd.md#trl) example diff --git a/docs/docs/guides/migration/slurm.md b/docs/docs/guides/migration/slurm.md index 97b4546b58..2791075e8d 100644 --- a/docs/docs/guides/migration/slurm.md +++ b/docs/docs/guides/migration/slurm.md @@ -1847,4 +1847,4 @@ fi 1. Check out [Quickstart](../../quickstart.md) 2. Read about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md) -3. Browse the [examples](../../../examples.md) +3. Browse the [examples](../../examples.md) diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md index 80a98f79bf..da37d46ded 100644 --- a/docs/docs/quickstart.md +++ b/docs/docs/quickstart.md @@ -277,5 +277,5 @@ Something not working? See the [troubleshooting](guides/troubleshooting.md) guid !!! info "What's next?" 1. Read about [backends](concepts/backends.md), [dev environments](concepts/dev-environments.md), [tasks](concepts/tasks.md), [services](concepts/services.md), and [fleets](concepts/services.md) - 2. Browse [examples](../examples.md) + 2. Browse [examples](examples.md) 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/examples/distributed-training/axolotl.md b/docs/examples/distributed-training/axolotl.md deleted file mode 100644 index c2e04d3fc6..0000000000 --- a/docs/examples/distributed-training/axolotl.md +++ /dev/null @@ -1,100 +0,0 @@ ---- -title: Axolotl -description: Distributed fine-tuning with Axolotl and FSDP across multiple nodes ---- - -# Axolotl - -This example walks you through how to run distributed fine-tune using [Axolotl](https://github.com/axolotl-ai-cloud/axolotl) and [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks). - -!!! info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../docs/concepts/fleets.md#cluster-placement) or an [SSH fleet](../../docs/concepts/fleets.md#ssh-placement)). - -## Define a configuration - -Once the fleet is created, define a distributed task configuration. Here's an example of distributed `QLORA` task using `FSDP`. - -
- -```yaml -type: task -name: axolotl-multi-node-qlora-llama3-70b - -nodes: 2 - -image: nvcr.io/nvidia/pytorch:25.01-py3 - -env: - - HF_TOKEN - - WANDB_API_KEY - - WANDB_PROJECT - - HUB_MODEL_ID - - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 - - NCCL_DEBUG=INFO - - ACCELERATE_LOG_LEVEL=info - -commands: - # Replacing the default Torch and FlashAttention in the NCG container with Axolotl-compatible versions. - # The preinstalled versions are incompatible with Axolotl. - - pip uninstall -y torch flash-attn - - pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/test/cu124 - - pip install --no-build-isolation axolotl[flash-attn,deepspeed] - - wget https://raw.githubusercontent.com/huggingface/trl/main/examples/accelerate_configs/fsdp1.yaml - - wget https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/qlora-fsdp-70b.yaml - # Axolotl includes hf-xet version 1.1.0, which fails during downloads. Replacing it with the latest version (1.1.2). - - pip uninstall -y hf-xet - - pip install hf-xet --no-cache-dir - - | - accelerate launch \ - --config_file=fsdp1.yaml \ - -m axolotl.cli.train qlora-fsdp-70b.yaml \ - --hub-model-id $HUB_MODEL_ID \ - --output-dir /checkpoints/qlora-llama3-70b \ - --wandb-project $WANDB_PROJECT \ - --wandb-name $DSTACK_RUN_NAME \ - --main_process_ip=$DSTACK_MASTER_NODE_IP \ - --main_process_port=8008 \ - --machine_rank=$DSTACK_NODE_RANK \ - --num_processes=$DSTACK_GPUS_NUM \ - --num_machines=$DSTACK_NODES_NUM - -resources: - gpu: 80GB:8 - shm_size: 128GB - -volumes: - - /checkpoints:/checkpoints -``` -
- -!!! info "Docker image" - We are using `nvcr.io/nvidia/pytorch:25.01-py3` from NGC because it includes the necessary libraries and packages for RDMA and InfiniBand support. - -### Apply the configuration - -To run a configuration, use the [`dstack apply`](../../docs/reference/cli/dstack/apply.md) command. - -
- -```shell -$ HF_TOKEN=... -$ WANDB_API_KEY=... -$ WANDB_PROJECT=... -$ HUB_MODEL_ID=... -$ dstack apply -f train-distrib.dstack.yml - - # BACKEND RESOURCES INSTANCE TYPE PRICE - 1 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle - 2 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle - -Submit the run axolotl-multi-node-qlora-llama3-70b? [y/n]: y - -Provisioning... ----> 100% -``` -
- -!!! info "What's next?" - 1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), - [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 2. Read about [cluster placement](../../docs/concepts/fleets.md#cluster-placement) diff --git a/docs/examples/distributed-training/trl.md b/docs/examples/distributed-training/trl.md deleted file mode 100644 index 3a25c04b48..0000000000 --- a/docs/examples/distributed-training/trl.md +++ /dev/null @@ -1,160 +0,0 @@ ---- -title: TRL -description: Distributed fine-tuning with TRL, Accelerate, and DeepSpeed ---- - -# TRL - -This example walks you through how to run distributed fine-tune using [TRL](https://github.com/huggingface/trl), [Accelerate](https://github.com/huggingface/accelerate) and [Deepspeed](https://github.com/deepspeedai/DeepSpeed). - -!!! info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](../../docs/concepts/fleets.md#cluster-placement) or an [SSH fleet](../../docs/concepts/fleets.md#ssh-placement)). - -## Define a configuration - -Once the fleet is created, define a distributed task configuration. Here's an example of such a task. - -=== "FSDP" - -
- ```yaml - type: task - name: trl-train-fsdp-distrib - - nodes: 2 - - image: nvcr.io/nvidia/pytorch:25.01-py3 - - env: - - HF_TOKEN - - ACCELERATE_LOG_LEVEL=info - - WANDB_API_KEY - - MODEL_ID=meta-llama/Llama-3.1-8B - - HUB_MODEL_ID - - commands: - - pip install transformers bitsandbytes peft wandb - - git clone https://github.com/huggingface/trl - - cd trl - - pip install . - - | - accelerate launch \ - --config_file=examples/accelerate_configs/fsdp1.yaml \ - --main_process_ip=$DSTACK_MASTER_NODE_IP \ - --main_process_port=8008 \ - --machine_rank=$DSTACK_NODE_RANK \ - --num_processes=$DSTACK_GPUS_NUM \ - --num_machines=$DSTACK_NODES_NUM \ - trl/scripts/sft.py \ - --model_name $MODEL_ID \ - --dataset_name OpenAssistant/oasst_top1_2023-08-25 \ - --dataset_text_field="text" \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 4 \ - --learning_rate 2e-4 \ - --report_to wandb \ - --bf16 \ - --max_seq_length 1024 \ - --attn_implementation flash_attention_2 \ - --logging_steps=10 \ - --output_dir /checkpoints/llama31-ft \ - --hub_model_id $HUB_MODEL_ID \ - --torch_dtype bfloat16 - - resources: - gpu: 80GB:8 - shm_size: 128GB - - volumes: - - /checkpoints:/checkpoints - ``` -
- -=== "Deepseed ZeRO-3" - -
- ```yaml - type: task - name: trl-train-deepspeed-distrib - - nodes: 2 - - image: nvcr.io/nvidia/pytorch:25.01-py3 - - env: - - HF_TOKEN - - WANDB_API_KEY - - HUB_MODEL_ID - - MODEL_ID=meta-llama/Llama-3.1-8B - - ACCELERATE_LOG_LEVEL=info - - commands: - - pip install transformers bitsandbytes peft wandb deepspeed - - git clone https://github.com/huggingface/trl - - cd trl - - pip install . - - | - accelerate launch \ - --config_file=examples/accelerate_configs/deepspeed_zero3.yaml \ - --main_process_ip=$DSTACK_MASTER_NODE_IP \ - --main_process_port=8008 \ - --machine_rank=$DSTACK_NODE_RANK \ - --num_processes=$DSTACK_GPUS_NUM \ - --num_machines=$DSTACK_NODES_NUM \ - trl/scripts/sft.py \ - --model_name $MODEL_ID \ - --dataset_name OpenAssistant/oasst_top1_2023-08-25 \ - --dataset_text_field="text" \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 4 \ - --learning_rate 2e-4 \ - --report_to wandb \ - --bf16 \ - --max_seq_length 1024 \ - --attn_implementation flash_attention_2 \ - --logging_steps=10 \ - --output_dir /checkpoints/llama31-ft \ - --hub_model_id $HUB_MODEL_ID \ - --torch_dtype bfloat16 - - resources: - gpu: 80GB:8 - shm_size: 128GB - - volumes: - - /checkpoints:/checkpoints - ``` -
- -!!! info "Docker image" - We are using `nvcr.io/nvidia/pytorch:25.01-py3` from NGC because it includes the necessary libraries and packages for RDMA and InfiniBand support. - -### Apply the configuration - -To run a configuration, use the [`dstack apply`](../../docs/reference/cli/dstack/apply.md) command. - -
- -```shell -$ HF_TOKEN=... -$ WANDB_API_KEY=... -$ HUB_MODEL_ID=... -$ dstack apply -f train-distrib.dstack.yml - - # BACKEND RESOURCES INSTANCE TYPE PRICE - 1 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle - 2 ssh (remote) cpu=208 mem=1772GB H100:80GB:8 instance $0 idle - -Submit the run trl-train-fsdp-distrib? [y/n]: y - -Provisioning... ----> 100% -``` -
- -!!! info "What's next?" - 1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), - [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 2. Read about [cluster placement](../../docs/concepts/fleets.md#cluster-placement) diff --git a/docs/examples/single-node-training/axolotl.md b/docs/examples/single-node-training/axolotl.md deleted file mode 100644 index 3ab19d0502..0000000000 --- a/docs/examples/single-node-training/axolotl.md +++ /dev/null @@ -1,100 +0,0 @@ ---- -title: Axolotl -description: Fine-tuning models with Axolotl using FSDP and QLoRA ---- - -# Axolotl - -This example shows how to use [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) with `dstack` to fine-tune 4-bit Quantized `Llama-4-Scout-17B-16E` using SFT with FSDP and QLoRA. - -??? info "Prerequisites" - Once `dstack` is [installed](../../docs/installation.md), clone the repo with examples. - -
- - ```shell - $ git clone https://github.com/dstackai/dstack - $ cd dstack - ``` - -
- -## Define a configuration - -Axolotl reads the model, QLoRA, and dataset arguments, as well as trainer configuration from a [`scout-qlora-flexattn-fsdp2.yaml`](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml) file. The configuration uses 4-bit axolotl quantized version of `meta-llama/Llama-4-Scout-17B-16E`, requiring only ~43GB VRAM/GPU with 4K context length. - -Below is a task configuration that does fine-tuning. - -
- -```yaml -type: task -# The name is optional, if not specified, generated randomly -name: axolotl-nvidia-llama-scout-train - -# Using the official Axolotl's Docker image -image: axolotlai/axolotl:main-latest - -# Required environment variables -env: - - HF_TOKEN - - WANDB_API_KEY - - WANDB_PROJECT - - HUB_MODEL_ID -# Commands of the task -commands: - - wget https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml - - | - axolotl train scout-qlora-flexattn-fsdp2.yaml \ - --wandb-project $WANDB_PROJECT \ - --wandb-name $DSTACK_RUN_NAME \ - --hub-model-id $HUB_MODEL_ID - -resources: - # Four GPU (required by FSDP) - gpu: H100:4 - # Shared memory size for inter-process communication - shm_size: 64GB - disk: 500GB.. -``` - -
- -The task uses Axolotl's Docker image, where Axolotl is already pre-installed. - -!!! info "AMD" - The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](../accelerators/amd.md#axolotl). - -## Run the configuration - -Once the configuration is ready, run `dstack apply -f `, and `dstack` will automatically provision the -cloud resources and run the configuration. - -
- -```shell -$ HF_TOKEN=... -$ WANDB_API_KEY=... -$ WANDB_PROJECT=... -$ HUB_MODEL_ID=... -$ dstack apply -f train.dstack.yml - - # BACKEND RESOURCES INSTANCE TYPE PRICE - 1 vastai (cz-czechia) cpu=64 mem=128GB H100:80GB:2 18794506 $3.8907 - 2 vastai (us-texas) cpu=52 mem=64GB H100:80GB:2 20442365 $3.6926 - 3 vastai (fr-france) cpu=64 mem=96GB H100:80GB:2 20379984 $3.7389 - -Submit the run axolotl-nvidia-llama-scout-train? [y/n]: - -Provisioning... ----> 100% -``` - -
- -## What's next? - -1. Browse the [Axolotl distributed training](../distributed-training/axolotl.md) example -2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), - [services](../../docs/concepts/services.md), [fleets](../../docs/concepts/fleets.md) -3. See the [AMD](../accelerators/amd.md#axolotl) example diff --git a/docs/examples/single-node-training/trl.md b/docs/examples/single-node-training/trl.md deleted file mode 100644 index 7295055259..0000000000 --- a/docs/examples/single-node-training/trl.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -title: TRL -description: Fine-tuning Llama with TRL using SFT and QLoRA ---- - -# TRL - -This example walks you through how to use [TRL](https://github.com/huggingface/trl) to fine-tune `Llama-3.1-8B` with `dstack` using SFT with QLoRA. - -## Define a configuration - -Below is a task configuration that does fine-tuning. - -
- -```yaml -type: task -name: trl-train - -python: 3.12 -# Ensure nvcc is installed (req. for Flash Attention) -nvcc: true - -env: - - HF_TOKEN - - WANDB_API_KEY - - HUB_MODEL_ID -commands: - # Pin torch==2.6.0 to avoid building Flash Attention from source. - # Prebuilt Flash Attention wheels are not available for the latest torch==2.7.0. - - uv pip install torch==2.6.0 - - uv pip install transformers bitsandbytes peft wandb - - uv pip install flash_attn --no-build-isolation - - git clone https://github.com/huggingface/trl - - cd trl - - uv pip install . - - | - accelerate launch \ - --config_file=examples/accelerate_configs/multi_gpu.yaml \ - --num_processes $DSTACK_GPUS_PER_NODE \ - trl/scripts/sft.py \ - --model_name meta-llama/Meta-Llama-3.1-8B \ - --dataset_name OpenAssistant/oasst_top1_2023-08-25 \ - --dataset_text_field="text" \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 4 \ - --learning_rate 2e-4 \ - --report_to wandb \ - --bf16 \ - --max_seq_length 1024 \ - --lora_r 16 \ - --lora_alpha 32 \ - --lora_target_modules q_proj k_proj v_proj o_proj \ - --load_in_4bit \ - --use_peft \ - --attn_implementation "flash_attention_2" \ - --logging_steps=10 \ - --output_dir models/llama31 \ - --hub_model_id peterschmidt85/FineLlama-3.1-8B - -resources: - gpu: - # 24GB or more VRAM - memory: 24GB.. - # One or more GPU - count: 1.. - # Shared memory (for multi-gpu) - shm_size: 24GB -``` - -
- -Change the `resources` property to specify more GPUs. - -!!! info "AMD" - The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](../accelerators/amd.md#trl). - -??? info "DeepSpeed" - For more memory-efficient use of multiple GPUs, consider using DeepSpeed and ZeRO Stage 3. - - To do this, use the `examples/accelerate_configs/deepspeed_zero3.yaml` configuration file instead of - `examples/accelerate_configs/multi_gpu.yaml`. - -## Run the configuration - -Once the configuration is ready, run `dstack apply -f `, and `dstack` will automatically provision the -cloud resources and run the configuration. - -
- -```shell -$ HF_TOKEN=... -$ WANDB_API_KEY=... -$ HUB_MODEL_ID=... -$ dstack apply -f train.dstack.yml - - # BACKEND RESOURCES INSTANCE TYPE PRICE - 1 vastai (cz-czechia) cpu=64 mem=128GB H100:80GB:2 18794506 $3.8907 - 2 vastai (us-texas) cpu=52 mem=64GB H100:80GB:2 20442365 $3.6926 - 3 vastai (fr-france) cpu=64 mem=96GB H100:80GB:2 20379984 $3.7389 - -Submit the run trl-train? [y/n]: - -Provisioning... ----> 100% -``` - -
- -## What's next? - -1. Browse the [TRL distributed training](../distributed-training/trl.md) example -2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), - [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) -3. See the [AMD](../accelerators/amd.md#trl) example diff --git a/docs/overrides/main.html b/docs/overrides/main.html index 3ae52c2be3..805495b0b1 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -223,11 +223,10 @@