From 108d193652c6fc9fe25fc6d83ba4a0d155405dfe Mon Sep 17 00:00:00 2001 From: "Solovev, Timur" Date: Thu, 6 Feb 2025 14:08:42 +0100 Subject: [PATCH 1/3] feature(aws): Extend EFA attachment --- .../_internal/core/backends/aws/compute.py | 1 + .../_internal/core/backends/aws/resources.py | 38 ++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/dstack/_internal/core/backends/aws/compute.py b/src/dstack/_internal/core/backends/aws/compute.py index 164f6ed25..7b7d2374d 100644 --- a/src/dstack/_internal/core/backends/aws/compute.py +++ b/src/dstack/_internal/core/backends/aws/compute.py @@ -240,6 +240,7 @@ def create_instance( allocate_public_ip=allocate_public_ip, placement_group_name=instance_config.placement_group_name, enable_efa=enable_efa, + max_efa_interfaces=max_efa_interfaces, reservation_id=instance_config.reservation, is_capacity_block=is_capacity_block, ) diff --git a/src/dstack/_internal/core/backends/aws/resources.py b/src/dstack/_internal/core/backends/aws/resources.py index 1819b972f..38d805ed3 100644 --- a/src/dstack/_internal/core/backends/aws/resources.py +++ b/src/dstack/_internal/core/backends/aws/resources.py @@ -140,6 +140,7 @@ def create_instances_struct( allocate_public_ip: bool = True, placement_group_name: Optional[str] = None, enable_efa: bool = False, + max_efa_interfaces: int = 0, reservation_id: Optional[str] = None, is_capacity_block: bool = False, ) -> Dict[str, Any]: @@ -199,9 +200,44 @@ def create_instances_struct( "DeviceIndex": 0, "SubnetId": subnet_id, "Groups": [security_group_id], - "InterfaceType": "efa" if enable_efa else "interface", + "InterfaceType": "efa" if max_efa_interfaces > 0 else "interface", }, ] + + if max_efa_interfaces > 1 and allocate_public_ip is False: + if instance_type == "p5.48xlarge": + # EFA configuration for P5 instances: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5 + p5d_interfaces = [ + { + "NetworkCardIndex": i, + "DeviceIndex": 1, + "InterfaceType": "efa" if i % 4 == 0 else "efa-only", + } + for i in range(1, 32) + ] + for interface in p5d_interfaces: + struct["NetworkInterfaces"].append( + { + "AssociatePublicIpAddress": allocate_public_ip, + "NetworkCardIndex": interface["NetworkCardIndex"], + "DeviceIndex": interface["DeviceIndex"], + "SubnetId": subnet_id, + "Groups": [security_group_id], + "InterfaceType": interface["InterfaceType"], + } + ) + else: + for i in range(1, max_efa_interfaces): + struct["NetworkInterfaces"].append( + { + "AssociatePublicIpAddress": allocate_public_ip, + "NetworkCardIndex": i, + "DeviceIndex": 1, + "SubnetId": subnet_id, + "Groups": [security_group_id], + "InterfaceType": "efa-only", # Set specifically to efa-only to keep the interfaces exclusively for GPU-to-GPU communications and not mix with IP traffic + } + ) else: struct["SecurityGroupIds"] = [security_group_id] From 923482e6c4ce94d8842205f4645d873a85776d3a Mon Sep 17 00:00:00 2001 From: "Solovev, Timur" Date: Fri, 7 Feb 2025 17:22:37 +0100 Subject: [PATCH 2/3] refactor(aws): Simplify NetworkInterfaces logic --- .../_internal/core/backends/aws/resources.py | 41 ++++++------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/src/dstack/_internal/core/backends/aws/resources.py b/src/dstack/_internal/core/backends/aws/resources.py index 38d805ed3..6e05cf4cf 100644 --- a/src/dstack/_internal/core/backends/aws/resources.py +++ b/src/dstack/_internal/core/backends/aws/resources.py @@ -205,39 +205,22 @@ def create_instances_struct( ] if max_efa_interfaces > 1 and allocate_public_ip is False: - if instance_type == "p5.48xlarge": - # EFA configuration for P5 instances: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5 - p5d_interfaces = [ + for i in range(1, max_efa_interfaces): + # Set to efa-only to use interfaces exclusively for GPU-to-GPU communication + interface_type = "efa-only" + if instance_type == "p5.48xlarge": + # EFA configuration for P5 instances: + interface_type = "efa" if i % 4 == 0 else "efa-only" + struct["NetworkInterfaces"].append( { + "AssociatePublicIpAddress": allocate_public_ip, "NetworkCardIndex": i, "DeviceIndex": 1, - "InterfaceType": "efa" if i % 4 == 0 else "efa-only", + "SubnetId": subnet_id, + "Groups": [security_group_id], + "InterfaceType": interface_type, } - for i in range(1, 32) - ] - for interface in p5d_interfaces: - struct["NetworkInterfaces"].append( - { - "AssociatePublicIpAddress": allocate_public_ip, - "NetworkCardIndex": interface["NetworkCardIndex"], - "DeviceIndex": interface["DeviceIndex"], - "SubnetId": subnet_id, - "Groups": [security_group_id], - "InterfaceType": interface["InterfaceType"], - } - ) - else: - for i in range(1, max_efa_interfaces): - struct["NetworkInterfaces"].append( - { - "AssociatePublicIpAddress": allocate_public_ip, - "NetworkCardIndex": i, - "DeviceIndex": 1, - "SubnetId": subnet_id, - "Groups": [security_group_id], - "InterfaceType": "efa-only", # Set specifically to efa-only to keep the interfaces exclusively for GPU-to-GPU communications and not mix with IP traffic - } - ) + ) else: struct["SecurityGroupIds"] = [security_group_id] From 85fbcc530556405d0ffb637c159b3f0d0fe51c53 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 10 Feb 2025 14:28:02 +0500 Subject: [PATCH 3/3] Update docs on EFA --- docs/docs/concepts/fleets.md | 4 ++-- src/dstack/_internal/core/backends/aws/resources.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/docs/concepts/fleets.md b/docs/docs/concepts/fleets.md index 876112a4a..f148316bd 100644 --- a/docs/docs/concepts/fleets.md +++ b/docs/docs/concepts/fleets.md @@ -47,8 +47,8 @@ This ensures all instances are provisioned in the same backend and region with o ??? info "AWS" `dstack` automatically enables the Elastic Fabric Adapter for all [EFA-capable instance types :material-arrow-top-right-thin:{ .external }](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types){:target="_blank"}. - Currently, only one EFA interface is enabled per instance, regardless of its maximum capacity. - This will change once [this issue :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/issues/1804){:target="_blank"} is resolved. + If the `aws` backend config has `public_ips: false` set, `dstack` enables the maximum number of interfaces supported by the instance. + Otherwise, if instances have public IPs, only one EFA interface is enabled per instance due to AWS limitations. > The `cluster` placement is supported only for `aws`, `azure`, `gcp`, `oci`, and `vultr` > backends. diff --git a/src/dstack/_internal/core/backends/aws/resources.py b/src/dstack/_internal/core/backends/aws/resources.py index 6e05cf4cf..ae45d1634 100644 --- a/src/dstack/_internal/core/backends/aws/resources.py +++ b/src/dstack/_internal/core/backends/aws/resources.py @@ -184,7 +184,7 @@ def create_instances_struct( # AWS allows specifying either NetworkInterfaces for specific subnet_id # or instance-level SecurityGroupIds in case of no specific subnet_id, not both. if subnet_id is not None: - # Even if the instance type supports multiple cards, we always request only one interface + # If the instance type supports multiple cards, we request multiple interfaces only if not allocate_public_ip # due to the limitation: "AssociatePublicIpAddress [...] You cannot specify more than one # network interface in the request". # Error message: "(InvalidParameterCombination) when calling the RunInstances operation: @@ -210,6 +210,7 @@ def create_instances_struct( interface_type = "efa-only" if instance_type == "p5.48xlarge": # EFA configuration for P5 instances: + # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5 interface_type = "efa" if i % 4 == 0 else "efa-only" struct["NetworkInterfaces"].append( {