From f7a2b44b72e8890c3880845870ee5381391472cd Mon Sep 17 00:00:00 2001
From: Jvst Me
Date: Mon, 6 Oct 2025 13:31:53 +0200
Subject: [PATCH] [Docs]: GCP A4 cluster example
---
docs/docs/concepts/fleets.md | 7 +-
docs/docs/guides/clusters.md | 11 +--
docs/examples.md | 10 +++
docs/examples/clusters/a4/index.md | 0
examples/clusters/a3mega/README.md | 2 +-
examples/clusters/a4/README.md | 122 ++++++++++++++++++++++++++
examples/clusters/a4/fleet.dstack.yml | 13 +++
examples/clusters/efa/README.md | 2 +-
mkdocs.yml | 1 +
9 files changed, 158 insertions(+), 10 deletions(-)
create mode 100644 docs/examples/clusters/a4/index.md
create mode 100644 examples/clusters/a4/README.md
create mode 100644 examples/clusters/a4/fleet.dstack.yml
diff --git a/docs/docs/concepts/fleets.md b/docs/docs/concepts/fleets.md
index cd49ff707..42a62dc54 100644
--- a/docs/docs/concepts/fleets.md
+++ b/docs/docs/concepts/fleets.md
@@ -118,11 +118,12 @@ This ensures all instances are provisioned with optimal inter-node connectivity.
Refer to the [EFA](../../examples/clusters/efa/index.md) example for more details.
??? info "GCP"
- When you create a fleet with GCP, for the A3 Mega and A3 High instance types, [GPUDirect-TCPXO and GPUDirect-TCPX :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot){:target="_blank"} networking is automatically configured.
+ When you create a fleet with GCP, `dstack` automatically configures [GPUDirect-TCPXO and GPUDirect-TCPX :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot){:target="_blank"} networking for the A3 Mega and A3 High instance types, as well as RoCE networking for the A4 instance type.
!!! info "Backend configuration"
- Note, GPUDirect-TCPXO and GPUDirect-TCPX require `extra_vpcs` to be configured in the `gcp` backend configuration.
- Refer to the [A3 Mega](../../examples/clusters/a3mega/index.md) and
+ Depending on the instance type, you may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration.
+ Refer to the [A4](../../examples/clusters/a4/index.md),
+ [A3 Mega](../../examples/clusters/a3mega/index.md), and
[A3 High](../../examples/clusters/a3high/index.md) examples for more details.
??? info "Nebius"
diff --git a/docs/docs/guides/clusters.md b/docs/docs/guides/clusters.md
index ce81a69fc..650aed2b2 100644
--- a/docs/docs/guides/clusters.md
+++ b/docs/docs/guides/clusters.md
@@ -25,18 +25,19 @@ For cloud fleets, fast interconnect is currently supported only on the `aws`, `g
Refer to the [EFA](../../examples/clusters/efa/index.md) example for more details.
=== "GCP"
- When you create a cloud fleet with GCP, for the A3 Mega and A3 High instance types, [GPUDirect-TCPXO and GPUDirect-TCPX :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot){:target="_blank"} networking is automatically configured.
+ When you create a cloud fleet with GCP, `dstack` automatically configures [GPUDirect-TCPXO and GPUDirect-TCPX :material-arrow-top-right-thin:{ .external }](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot){:target="_blank"} networking for the A3 Mega and A3 High instance types, as well as RoCE networking for the A4 instance type.
!!! info "Backend configuration"
- Note, GPUDirect-TCPXO and GPUDirect-TCPX require `extra_vpcs` to be configured in the `gcp` backend configuration.
- Refer to the [A3 Mega](../../examples/clusters/a3mega/index.md) and
+ Depending on the instance type, you may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration.
+ Refer to the [A4](../../examples/clusters/a4/index.md),
+ [A3 Mega](../../examples/clusters/a3mega/index.md), and
[A3 High](../../examples/clusters/a3high/index.md) examples for more details.
=== "Nebius"
When you create a cloud fleet with Nebius, [InfiniBand :material-arrow-top-right-thin:{ .external }](https://docs.nebius.com/compute/clusters/gpu){:target="_blank"} networking is automatically configured if it’s supported for the corresponding instance type.
-> To request fast interconnect support for a other backends,
-file an [issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues){:target="_ blank"}.
+> To request fast interconnect support for other backends,
+file an [issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues){:target="_blank"}.
## Distributed tasks
diff --git a/docs/examples.md b/docs/examples.md
index a4e147dc0..26b95b075 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -100,6 +100,16 @@ hide:
Run multi-node RCCL tests with MPI
+
+
+ GCP A4
+
+
+
+ Set up GCP A4 clusters with optimized networking
+
+
diff --git a/docs/examples/clusters/a4/index.md b/docs/examples/clusters/a4/index.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/clusters/a3mega/README.md b/examples/clusters/a3mega/README.md
index 52fc9b5be..a0c117553 100644
--- a/examples/clusters/a3mega/README.md
+++ b/examples/clusters/a3mega/README.md
@@ -119,7 +119,7 @@ Fleet a3mega-cluster does not exist yet.
Create the fleet? [y/n]: y
Provisioning...
----> 100%
+---> 100%
```
diff --git a/examples/clusters/a4/README.md b/examples/clusters/a4/README.md
new file mode 100644
index 000000000..43e9a4609
--- /dev/null
+++ b/examples/clusters/a4/README.md
@@ -0,0 +1,122 @@
+# GCP A4
+
+This example shows how to set up a GCP A4 cluster with optimized RoCE networking and run NCCL Tests on it using `dstack`.
+
+GCP A4 instances provide eight NVIDIA B200 GPUs per VM, each with 180 GB of memory. These instances also include eight NVIDIA ConnectX-7 (CX-7) NICs that use RDMA over Converged Ethernet (RoCE) networking, making them well suited for large-scale distributed deep learning.
+
+## Configure the GCP backend
+
+First, configure the `gcp` backend for A4 RoCE support. Specify one VPC in `extra_vpcs` for general traffic between nodes (in addition to the main VPC), and one VPC in `roce_vpcs` for GPU-to-GPU communication.
+
+
+
+```yaml
+projects:
+- name: main
+ backends:
+ - type: gcp
+ project_id: my-project
+ creds:
+ type: default
+ vpc_name: my-vpc-0 # Main VPC (1 subnet, omit to use the default VPC)
+ extra_vpcs:
+ - my-vpc-1 # Extra VPC (1 subnet)
+ roce_vpcs:
+ - my-vpc-mrdma # RoCE VPC (8 subnets, RoCE profile)
+```
+
+
+
+!!! info "RoCE VPC setup"
+ The VPC listed in `roce_vpcs` must be created with the RoCE profile and have **eight subnets** (one per GPU). Follow [GCP's RoCE setup guide](https://cloud.google.com/ai-hypercomputer/docs/create/create-vm#setup-network) for details.
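+
+    Following that guide, the RoCE VPC and its eight subnets can be created roughly as below. The network name, project, region, and CIDR ranges are placeholders; verify the exact flags against the linked guide:
+
+    ```shell
+    # Create a VPC using the RoCE network profile (zone placeholder: us-west2-c)
+    gcloud beta compute networks create my-vpc-mrdma \
+        --project=my-project \
+        --network-profile=us-west2-c-vpc-roce \
+        --subnet-mode=custom
+
+    # Create eight subnets, one per CX-7 NIC
+    for N in $(seq 0 7); do
+      gcloud compute networks subnets create my-vpc-mrdma-sub-$N \
+          --project=my-project \
+          --network=my-vpc-mrdma \
+          --region=us-west2 \
+          --range=10.1.$N.0/24
+    done
+    ```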
+
+!!! info "Firewall rules"
+ Ensure all VPCs allow internal traffic between nodes for MPI/NCCL to function.
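+
+    For example, a permissive internal rule might look as follows (the rule name and source ranges are placeholders; scope them to your subnet ranges):
+
+    ```shell
+    gcloud compute firewall-rules create allow-internal-my-vpc-1 \
+        --project=my-project \
+        --network=my-vpc-1 \
+        --allow=tcp,udp,icmp \
+        --source-ranges=10.0.0.0/8
+    ```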
+
+## Create a fleet
+
+Define your fleet configuration:
+
+
+
+```yaml
+type: fleet
+name: a4-cluster
+
+nodes: 2
+placement: cluster
+
+# Specify the zone where you have configured the RoCE VPC
+availability_zones: [us-west2-c]
+backends: [gcp]
+spot_policy: auto
+
+resources:
+ gpu: B200:8
+```
+
+
+
+Then apply it with `dstack apply`:
+
+
+
+```shell
+$ dstack apply -f examples/clusters/a4/fleet.dstack.yml
+
+Provisioning...
+---> 100%
+
+ FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED
+ a4-cluster 0 gcp (us-west2) B200:180GB:8 (spot) $51.552 idle 9 mins ago
+ 1 gcp (us-west2) B200:180GB:8 (spot) $51.552 idle 9 mins ago
+```
+
+
+
+`dstack` will provision the instances and set up ten network interfaces on each instance:
+
+- 1 regular network interface in the main VPC (`vpc_name`)
+- 1 regular interface in an extra VPC (`extra_vpcs`)
+- 8 RoCE-enabled interfaces in a dedicated VPC (`roce_vpcs`)
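+
+One way to sanity-check the interfaces is to list them from a run on the fleet, e.g. with a minimal task (the task name and command here are illustrative):
+
+```yaml
+type: task
+name: check-nics
+
+nodes: 2
+
+commands:
+  # Brief listing; expect the two regular NICs plus eight RoCE NICs
+  - ip -br link
+```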
+
+!!! info "Spot instances"
+    Currently, the `gcp` backend supports A4 instances only as spot instances.
+
+## Run NCCL tests
+
+To validate networking and GPU performance, you can run [NCCL tests](https://dstack.ai/examples/clusters/nccl-tests/):
+
+
+
+```shell
+$ dstack apply -f examples/clusters/nccl-tests/.dstack.yml
+
+Provisioning...
+---> 100%
+
+ nThread 1 nGpus 1 minBytes 8 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
+ size count type redop root time algbw busbw wrong time algbw busbw wrong
+ (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
+ 8388608 2097152 float sum -1 156.9 53.47 100.25 0 167.6 50.06 93.86 0
+ 16777216 4194304 float sum -1 196.3 85.49 160.29 0 206.2 81.37 152.57 0
+ 33554432 8388608 float sum -1 258.5 129.82 243.42 0 261.8 128.18 240.33 0
+ 67108864 16777216 float sum -1 369.4 181.69 340.67 0 371.2 180.79 338.98 0
+ 134217728 33554432 float sum -1 638.5 210.22 394.17 0 587.2 228.57 428.56 0
+ 268435456 67108864 float sum -1 940.3 285.49 535.29 0 950.7 282.36 529.43 0
+ 536870912 134217728 float sum -1 1695.2 316.70 593.81 0 1666.9 322.08 603.89 0
+ 1073741824 268435456 float sum -1 3229.9 332.44 623.33 0 3201.8 335.35 628.78 0
+ 2147483648 536870912 float sum -1 6107.7 351.61 659.26 0 6157.1 348.78 653.97 0
+ 4294967296 1073741824 float sum -1 11952 359.36 673.79 0 11942 359.65 674.34 0
+ 8589934592 2147483648 float sum -1 23563 364.55 683.52 0 23702 362.42 679.54 0
+ Out of bounds values : 0 OK
+ Avg bus bandwidth : 165.789
+```
+
+
+
+!!! info "What's next"
+ 1. Learn more about [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks)
+ 2. Check [dev environments](https://dstack.ai/docs/concepts/dev-environments),
+ [services](https://dstack.ai/docs/concepts/services), and [fleets](https://dstack.ai/docs/concepts/fleets)
+ 3. Read the [Clusters](https://dstack.ai/docs/guides/clusters) guide
diff --git a/examples/clusters/a4/fleet.dstack.yml b/examples/clusters/a4/fleet.dstack.yml
new file mode 100644
index 000000000..ac97e22de
--- /dev/null
+++ b/examples/clusters/a4/fleet.dstack.yml
@@ -0,0 +1,13 @@
+type: fleet
+name: a4-cluster
+
+nodes: 2
+placement: cluster
+
+# Specify the zone where you have configured the RoCE VPC
+availability_zones: [us-west2-c]
+backends: [gcp]
+spot_policy: auto
+
+resources:
+ gpu: B200:8
diff --git a/examples/clusters/efa/README.md b/examples/clusters/efa/README.md
index 07da4ac8f..0df910a98 100644
--- a/examples/clusters/efa/README.md
+++ b/examples/clusters/efa/README.md
@@ -64,7 +64,7 @@ Provisioning...
FLEET INSTANCE BACKEND INSTANCE TYPE GPU PRICE STATUS CREATED
my-efa-fleet 0 aws (us-west-2) p4d.24xlarge H100:8:80GB $98.32 idle 3 mins ago
- 1 aws (us-west-2) p4d.24xlarge $98.32 idle 3 mins ago
+ 1 aws (us-west-2) p4d.24xlarge H100:8:80GB $98.32 idle 3 mins ago
```
diff --git a/mkdocs.yml b/mkdocs.yml
index 5d80ceaa9..ec8062f74 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -280,6 +280,7 @@ nav:
- Clusters:
- NCCL tests: examples/clusters/nccl-tests/index.md
- RCCL tests: examples/clusters/rccl-tests/index.md
+ - GCP A4: examples/clusters/a4/index.md
- GCP A3 Mega: examples/clusters/a3mega/index.md
- GCP A3 High: examples/clusters/a3high/index.md
- AWS EFA: examples/clusters/efa/index.md