From f35f199e0f29836ffa24b273c527b8a8b7d85ea9 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Sat, 8 Mar 2025 17:37:55 +0100 Subject: [PATCH 1/2] [Blog] Using SSH fleets with TensorWave's private AMD cloud --- docs/blog/posts/amd-on-tensorwave.md | 243 +++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 docs/blog/posts/amd-on-tensorwave.md diff --git a/docs/blog/posts/amd-on-tensorwave.md b/docs/blog/posts/amd-on-tensorwave.md new file mode 100644 index 000000000..b3638a933 --- /dev/null +++ b/docs/blog/posts/amd-on-tensorwave.md @@ -0,0 +1,243 @@ +--- +title: Using SSH fleets with TensorWave's private AMD cloud +date: 2025-03-11 +description: "This tutorial walks you through how dstack can be used with TensorWave's private AMD cloud using SSH fleets." +slug: amd-on-tensorwave +image: https://github.com/dstackai/static-assets/blob/main/static-assets/images/dstack-tensorwave-v2.png?raw=true +categories: + - Fleets + - AMD + - Private clouds +--- + +# Using SSH fleets with TensorWave's private AMD cloud + +Since last month, when we introduced support for private clouds and data centers, it has become easier to use `dstack` +to orchestrate AI containers with any AI cloud vendor, whether they provide on-demand compute or reserved clusters. + +In this tutorial, we’ll walk you through how `dstack` can be used with +[TensorWave :material-arrow-top-right-thin:{ .external }](https://tensorwave.com/){:target="_blank"} using +[SSH fleets](../../docs/concepts/fleets.md#ssh). + + + + + +TensorWave is a cloud provider specializing in large-scale AMD GPU clusters for both +training and inference. + +Before following this tutorial, ensure you have access to a cluster. You’ll see the cluster and its nodes in your +TensorWave dashboard. + + + +## Creating a fleet + +??? info "Prerequisites" + Once `dstack` is [installed](https://dstack.ai/docs/installation), create a project repo folder and run `dstack init`. + +
+ + ```shell + $ mkdir tensorwave-demo && cd tensorwave-demo + $ dstack init + ``` + +
+ +Now, define an SSH fleet configuration by listing the IP addresses of each node in the cluster, +along with the SSH user and SSH key configured for each host. + +
+ +```yaml +type: fleet +name: my-tensorwave-fleet + +placement: cluster + +ssh_config: + user: dstack + identity_file: ~/.ssh/id_rsa + hosts: + - hostname: 64.139.222.107 + blocks: auto + - hostname: 64.139.222.108 + blocks: auto +``` + +
+ +You can set `blocks` to `auto` if you want to run concurrent workloads on each instance. +Otherwise, you can omit this property. + +Once the configuration is ready, apply it using `dstack apply`: + +
+ +```shell +$ dstack apply -f fleet.dstack.yml + +Provisioning... +---> 100% + + FLEET INSTANCE RESOURCES STATUS CREATED + my-tensorwave-fleet 0 8xMI300X (192GB) 0/8 busy 3 mins ago + 1 8xMI300X (192GB) 0/8 busy 3 mins ago + +``` + +
+ +`dstack` will automatically connect to each host, detect the hardware, install dependencies, and make them ready for +workloads. + +## Running workloads + +Once the fleet is created, you can use `dstack` to run workloads. + +### Dev environments + +A dev environment lets you access an instance through your desktop IDE. + +
+ +```yaml +type: dev-environment +name: vscode + +image: rocm/pytorch:rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.4.0 +ide: vscode + +resources: + gpu: MI300X:8 +``` + +
+ +Apply the configuration via [`dstack apply`](../../docs/reference/cli/dstack/apply.md): + +
+ +```shell +$ dstack apply -f .dstack.yml + +Submit the run `vscode`? [y/n]: y + +Launching `vscode`... +---> 100% + +To open in VS Code Desktop, use this link: + vscode://vscode-remote/ssh-remote+vscode/workflow +``` + +
+ +Open the link to access the dev environment using your desktop IDE. + +### Tasks + +A task allows you to schedule a job or run a web app. Tasks can be distributed and support port forwarding. + +Below is a distributed training task configuration: + +
+ +```yaml +type: task +name: train-distrib + +nodes: 2 + +image: rocm/pytorch:rocm6.3.3_ubuntu22.04_py3.10_pytorch_release_2.4.0 +commands: + - pip install torch + - export NCCL_IB_GID_INDEX=3 + - export NCCL_NET_GDR_LEVEL=0 + - torchrun --nproc_per_node=8 --nnodes=2 --node_rank=$DSTACK_NODE_RANK --master_port=29600 --master_addr=$DSTACK_MASTER_NODE_IP test/tensorwave/multinode.py 5000 50 + +resources: + gpu: MI300X:8 +``` + +
+ +Run the configuration via [`dstack apply`](../../docs/reference/cli/dstack/apply.md): + +
+ +```shell +$ dstack apply -f train.dstack.yml + +Submit the run `train-distrib`? [y/n]: y + +Provisioning `train-distrib`... +---> 100% +``` + +
+ +`dstack` automatically runs the container on each node while passing +[system environment variables](../../docs/concepts/tasks.md#system-environment-variables) +which you can use with `torchrun`, `accelerate`, or other distributed frameworks. + +### Services + +A service allows you to deploy a model or any web app as a scalable and secure endpoint. + +Create the following configuration file inside the repo: + +
+ +```yaml +type: service +name: deepseek-r1-sglang + +image: rocm/sglang-staging:20250212 +env: + - MODEL_ID=deepseek-ai/DeepSeek-R1 + - HSA_NO_SCRATCH_RECLAIM=1 +commands: + - python3 -m sglang.launch_server --model-path $MODEL_ID --port 8000 --tp 8 --trust-remote-code +port: 8000 +model: deepseek-ai/DeepSeek-R1 + +resources: + gpu: mi300x:8 + +volumes: + - /root/.cache/huggingface:/root/.cache/huggingface +``` + +
+ +Run the configuration via [`dstack apply`](../../docs/reference/cli/dstack/apply.md): + +
+ +```shell +$ dstack apply -f deepseek.dstack.yml + +Submit the run `deepseek-r1-sglang`? [y/n]: y + +Provisioning `deepseek-r1-sglang`... +---> 100% + +Service is published at: + http://localhost:3000/proxy/services/main/deepseek-r1-sglang/ +Model deepseek-ai/DeepSeek-R1 is published at: + http://localhost:3000/proxy/models/main/ +``` + +
+ +## See it in action + +Want to see how it works? Check out the video below: + + + +!!! info "What's next?" + 1. See [SSH fleets](../../docs/concepts/fleets.md#ssh) + 2. Read about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md) + 3. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd) From 8e1dadf064f9a2697721cb1f0b1842d27475d575 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 11 Mar 2025 04:06:28 +0100 Subject: [PATCH 2/2] [Blog] Using SSH fleets with TensorWave's private AMD cloud (updated Partners page) --- docs/assets/images/hotaisle-logo.svg | 4676 +----------------------- docs/assets/images/tensorwave-logo.svg | 10 + docs/assets/stylesheets/landing.css | 74 +- docs/overrides/main.html | 2 +- docs/partners.md | 134 +- 5 files changed, 164 insertions(+), 4732 deletions(-) create mode 100644 docs/assets/images/tensorwave-logo.svg diff --git a/docs/assets/images/hotaisle-logo.svg b/docs/assets/images/hotaisle-logo.svg index 87109624a..3aaea1930 100644 --- a/docs/assets/images/hotaisle-logo.svg +++ b/docs/assets/images/hotaisle-logo.svg @@ -1,4670 +1,8 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + diff --git a/docs/assets/images/tensorwave-logo.svg b/docs/assets/images/tensorwave-logo.svg new file mode 100644 index 000000000..93cbeb765 --- /dev/null +++ b/docs/assets/images/tensorwave-logo.svg @@ -0,0 +1,10 @@ + + + + + diff --git a/docs/assets/stylesheets/landing.css b/docs/assets/stylesheets/landing.css index b0ede2060..f9464adab 100644 --- a/docs/assets/stylesheets/landing.css +++ b/docs/assets/stylesheets/landing.css @@ -386,7 +386,7 @@ } .providers.tx-landing__highlights_grid { - grid-gap: 28px !important; + grid-gap: 20px !important; } #typed { @@ -397,15 +397,22 @@ .providers.tx-landing__highlights_grid .feature-cell h3 { align-content: center; - font-size: 1.1em; + font-size: 1em; font-weight: 600; padding-bottom: 0.05em; line-height: 25px; } .providers.tx-landing__highlights_grid .feature-cell { - row-gap: 18px; - padding: 26px 39px; + row-gap: 22px; + padding: 25px 30px; + aspect-ratio: 1.05; + + @media screen and (min-width: 76.1875em) { + &:nth-child(1) { + border-top-left-radius: 3px; + } + } } .tx-landing__highlights_grid .feature-cell { @@ -418,6 +425,54 @@ flex-direction: column; } +@media screen and (min-width: 76.1875em) { + .providers.tx-landing__highlights_grid .feature-cell { + border-radius: 0; + border-left: none; + border-bottom: none; + } + + .nvidia.providers.tx-landing__highlights_grid .feature-cell { + &:nth-child(1), &:nth-child(6), &:nth-child(11) { + border-left: 0.5px dotted rgba(0, 0, 0, 0.75); + } + + &:nth-child(n+7) { + border-bottom: 0.5px dotted rgba(0, 0, 0, 0.75); + } + + &:nth-child(5) { + border-top-right-radius: 3px; + } + + &:nth-child(5), &:nth-child(11) { + border-bottom-right-radius: 3px; + } + + &:nth-child(11) { + border-bottom-left-radius: 3px; + } + + &:nth-child(10) { + border-bottom-right-radius: 3px; + } + } +} + +:is(.amd).providers.tx-landing__highlights_grid .feature-cell { + 
&:nth-child(1) { + border-left: 0.5px dotted rgba(0, 0, 0, 0.75); + border-bottom-left-radius: 3px; + } + + border-bottom: 0.5px dotted rgba(0, 0, 0, 0.75); + + &:nth-child(3) { + border-top-right-radius: 3px; + border-bottom-right-radius: 3px; + } +} + .providers.tx-landing__highlights_grid.other .feature-cell { column-gap: 15px; flex-direction: row; @@ -436,6 +491,13 @@ grid-template-columns: repeat(4, 1fr) !important; } + .providers.tx-landing__highlights_grid { + grid-gap: 0px !important; + border: none; + + grid-template-columns: repeat(5, 1fr) !important; + } + .tx-landing__highlights_grid .feature-cell { } } @@ -444,9 +506,9 @@ background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.005), rgba(0, 42, 255, 0.005), rgba(225, 101, 254, 0.01)); } -.tx-landing__highlights_grid .feature-cell:hover { +/*.tx-landing__highlights_grid .feature-cell:hover { background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.03), rgba(0, 42, 255, 0.03), rgba(225, 101, 254, 0.05)); -} +}*/ .tx-landing__highlights_grid .feature-cell strong { font-weight: 500; diff --git a/docs/overrides/main.html b/docs/overrides/main.html index dccf469f4..ff2c35883 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -132,7 +132,7 @@ Discord GitHub Contributing - Ambassador program +