From 8fc681624030a18b051a6e716eff407a8b7346c3 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 20 Aug 2025 09:50:52 +0545 Subject: [PATCH 01/10] Add DigitalOcean Backend --- .../_internal/core/backends/configurators.py | 9 + .../core/backends/digitalocean/__init__.py | 1 + .../core/backends/digitalocean/api_client.py | 97 ++++++++++ .../core/backends/digitalocean/backend.py | 16 ++ .../core/backends/digitalocean/compute.py | 178 ++++++++++++++++++ .../backends/digitalocean/configurator.py | 62 ++++++ .../core/backends/digitalocean/models.py | 50 +++++ src/dstack/_internal/core/backends/models.py | 8 + src/dstack/_internal/core/errors.py | 4 + .../_internal/core/models/backends/base.py | 4 +- 10 files changed, 428 insertions(+), 1 deletion(-) create mode 100644 src/dstack/_internal/core/backends/digitalocean/__init__.py create mode 100644 src/dstack/_internal/core/backends/digitalocean/api_client.py create mode 100644 src/dstack/_internal/core/backends/digitalocean/backend.py create mode 100644 src/dstack/_internal/core/backends/digitalocean/compute.py create mode 100644 src/dstack/_internal/core/backends/digitalocean/configurator.py create mode 100644 src/dstack/_internal/core/backends/digitalocean/models.py diff --git a/src/dstack/_internal/core/backends/configurators.py b/src/dstack/_internal/core/backends/configurators.py index a2df6a4e6..11e88d5d2 100644 --- a/src/dstack/_internal/core/backends/configurators.py +++ b/src/dstack/_internal/core/backends/configurators.py @@ -47,6 +47,15 @@ except ImportError: pass +try: + from dstack._internal.core.backends.digitalocean.configurator import ( + DigitalOceanConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(DigitalOceanConfigurator) +except ImportError: + pass + try: from dstack._internal.core.backends.gcp.configurator import GCPConfigurator diff --git a/src/dstack/_internal/core/backends/digitalocean/__init__.py b/src/dstack/_internal/core/backends/digitalocean/__init__.py new file mode 100644 index 000000000..0f0092fd9 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean/__init__.py @@ -0,0 +1 @@ +# DigitalOcean backend for dstack diff --git a/src/dstack/_internal/core/backends/digitalocean/api_client.py b/src/dstack/_internal/core/backends/digitalocean/api_client.py new file mode 100644 index 000000000..62be91aaf --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean/api_client.py @@ -0,0 +1,97 @@ +from typing import Any, Dict, List, Optional + +import requests + +from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error +from dstack._internal.core.errors import BackendRateLimitExceededError +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +# DigitalOcean API endpoints +STANDARD_CLOUD_API_URL = "https://api.digitalocean.com/v2" +AMD_CLOUD_API_URL = "https://api-amd.digitalocean.com/v2" + + +class DigitalOceanAPIClient: + def __init__(self, api_key: str, flavor: str = "standard"): + self.api_key = api_key + self.flavor = flavor + self.base_url = self._get_base_url() + + def _get_base_url(self) -> str: + if self.flavor == "amd": + return AMD_CLOUD_API_URL + return STANDARD_CLOUD_API_URL + + def validate_api_key(self) -> bool: + try: + response = self._make_request("GET", "/account") + response.raise_for_status() + return True + except requests.HTTPError as e: + status = e.response.status_code + if status == 401: + raise_invalid_credentials_error( + fields=[["creds", "api_key"]], details="Invaild API key" + ) + raise e + + def list_ssh_keys(self) -> List[Dict[str, Any]]: + response = self._make_request("GET", "/account/keys") + response.raise_for_status() + return response.json()["ssh_keys"] + + def create_ssh_key(self, name: str, public_key: str) -> Dict[str, Any]: + payload = {"name": name, "public_key": public_key} + response = self._make_request("POST", "/account/keys", json=payload) + response.raise_for_status() + return response.json()["ssh_key"] + + def get_or_create_ssh_key(self, name: str, public_key: str) -> int: + ssh_keys = self.list_ssh_keys() + for ssh_key in ssh_keys: + if ssh_key["public_key"].strip() == public_key.strip(): + return ssh_key["id"] + + ssh_key = self.create_ssh_key(name, public_key) + return ssh_key["id"] + + def create_droplet(self, droplet_config: Dict[str, Any]) -> Dict[str, Any]: + response = self._make_request("POST", "/droplets", json=droplet_config) + response.raise_for_status() + return response.json()["droplet"] + + def get_droplet(self, droplet_id: str) -> Dict[str, Any]: + response = self._make_request("GET", f"/droplets/{droplet_id}") + response.raise_for_status() + return response.json()["droplet"] + + def delete_droplet(self, droplet_id: str) -> None: + response = self._make_request("DELETE", f"/droplets/{droplet_id}") + if response.status_code == 404: + logger.debug("DigitalOcean droplet %s not found", droplet_id) + return + response.raise_for_status() + + def _make_request( + self, method: str, endpoint: str, json: Optional[Dict[str, Any]] = None, timeout: int = 30 + ) -> requests.Response: + url = f"{self.base_url}{endpoint}" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}", + } + + response = requests.request( + method=method, + url=url, + headers=headers, + json=json, + timeout=timeout, + ) + + if response.status_code == 429: + raise BackendRateLimitExceededError("API rate limit exceeded.") + + return response diff --git a/src/dstack/_internal/core/backends/digitalocean/backend.py b/src/dstack/_internal/core/backends/digitalocean/backend.py new file mode 100644 index 000000000..006ae9cc6 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.digitalocean.compute import DigitalOceanCompute +from dstack._internal.core.backends.digitalocean.models import DigitalOceanConfig +from dstack._internal.core.models.backends.base import BackendType + + +class DigitalOceanBackend(Backend): + TYPE = BackendType.DIGITALOCEAN + COMPUTE_CLASS = DigitalOceanCompute + + def __init__(self, config: DigitalOceanConfig): + self.config = config + self._compute = DigitalOceanCompute(self.config) + + def compute(self) -> DigitalOceanCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/digitalocean/compute.py b/src/dstack/_internal/core/backends/digitalocean/compute.py new file mode 100644 index 000000000..37f00a804 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean/compute.py @@ -0,0 +1,178 @@ +from typing import List, Optional + +import gpuhunt +from gpuhunt.providers.digitalocean import DigitalOceanProvider + +from dstack._internal.core.backends.base.backend import Compute +from dstack._internal.core.backends.base.compute import ( + ComputeWithCreateInstanceSupport, + generate_unique_instance_name, + get_user_data, +) +from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.digitalocean.api_client import DigitalOceanAPIClient +from dstack._internal.core.backends.digitalocean.models import DigitalOceanConfig +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +MAX_INSTANCE_NAME_LEN = 60 + +# Setup commands for DigitalOcean instances +SETUP_COMMANDS = [ + "sudo ufw delete limit ssh", + "sudo ufw allow ssh", +] + +DOCKER_INSTALL_COMMANDS = [ + "export DEBIAN_FRONTEND=noninteractive", + "mkdir -p /etc/apt/keyrings", + "curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg", + 'echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null', + "apt-get update", + "apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin", +] + + +class DigitalOceanCompute( + ComputeWithCreateInstanceSupport, + Compute, +): + def __init__(self, config: DigitalOceanConfig): + super().__init__() + self.config = config + self.api_client = DigitalOceanAPIClient(config.creds.api_key, config.flavor or "standard") + self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) + self.catalog.add_provider( + DigitalOceanProvider(token=config.creds.api_key, flavor=config.flavor or "standard") + ) + # self.catalog.add_provider( + # DigitalOceanProvider(token=config.creds.api_key, flavor="standard") + # ) + + def get_offers( + self, requirements: Optional[Requirements] = None + ) -> List[InstanceOfferWithAvailability]: + offers = get_catalog_offers( + backend=BackendType.DIGITALOCEAN, + locations=self.config.regions, + requirements=requirements, + catalog=self.catalog, + ) + return [ + InstanceOfferWithAvailability( + **offer.dict(), + availability=InstanceAvailability.AVAILABLE, + ) + for offer in offers + ] + + def create_instance( + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + instance_name = generate_unique_instance_name( + instance_config, max_length=MAX_INSTANCE_NAME_LEN + ) + + project_ssh_key = instance_config.ssh_keys[0] + ssh_key_id = self.api_client.get_or_create_ssh_key( + name=f"dstack-{instance_config.project_name}", + public_key=project_ssh_key.public, + ) + + # Use the instance name directly from the offer (gpuhunt handles flavor-specific naming) + size_slug = instance_offer.instance.name + + if not instance_offer.instance.resources.gpus: + backend_specific_commands = SETUP_COMMANDS + DOCKER_INSTALL_COMMANDS + else: + backend_specific_commands = SETUP_COMMANDS + + # Prepare droplet configuration + droplet_config = { + "name": instance_name, + "region": instance_offer.region, + "size": size_slug, + "image": self._get_image_for_instance(instance_offer), + "ssh_keys": [ssh_key_id], + "backups": False, + "ipv6": False, + "monitoring": False, + "tags": [], + "user_data": get_user_data( + authorized_keys=instance_config.get_public_keys(), + backend_specific_commands=backend_specific_commands, + ), + } + + droplet = self.api_client.create_droplet(droplet_config) + + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=str(droplet["id"]), + hostname=None, # Will be set when droplet is active + internal_ip=None, + region=instance_offer.region, + price=instance_offer.price, + username="root", + ssh_port=22, + dockerized=True, + ssh_proxy=None, + backend_data=None, + ) + + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + droplet = self.api_client.get_droplet(provisioning_data.instance_id) + if droplet["status"] == "active": + for network in droplet["networks"]["v4"]: + if network["type"] == "public": + provisioning_data.hostname = network["ip_address"] + break + + def terminate_instance( + self, instance_id: str, region: str, backend_data: Optional[str] = None + ): + self.api_client.delete_droplet(instance_id) + + def _get_image_for_instance(self, instance_offer: InstanceOfferWithAvailability) -> str: + if not instance_offer.instance.resources.gpus: + # No GPUs, use CPU image + return "ubuntu-24-04-x64" + + gpu_count = len(instance_offer.instance.resources.gpus) + gpu_name = instance_offer.instance.resources.gpus[0].name + + if gpu_name == "MI300X": + # AMD GPU + return "digitaloceanai-rocmjupyter" + else: + # NVIDIA GPUs - DO only supports 1 and 8 GPU configurations. + # DO says for single GPU plans using GPUs other than H100s use "gpu-h100x1-base". But for x8 assuming same. + # See (https://docs.digitalocean.com/products/droplets/getting-started/recommended-gpu-setup/#aiml-ready-image) + if gpu_count == 8: + return "gpu-h100x8-base" + elif gpu_count == 1: + return "gpu-h100x1-base" + else: + # For Unsupported GPU count - use single GPU image and log warning + logger.warning( + f"Unsupported NVIDIA GPU count: {gpu_count}, using single GPU image" + ) + return "gpu-h100x1-base" diff --git a/src/dstack/_internal/core/backends/digitalocean/configurator.py b/src/dstack/_internal/core/backends/digitalocean/configurator.py new file mode 100644 index 000000000..c0aef559c --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean/configurator.py @@ -0,0 +1,62 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, +) +from dstack._internal.core.backends.digitalocean.api_client import DigitalOceanAPIClient +from dstack._internal.core.backends.digitalocean.backend import DigitalOceanBackend +from dstack._internal.core.backends.digitalocean.models import ( + AnyDigitalOceanBackendConfig, + AnyDigitalOceanCreds, + DigitalOceanBackendConfig, + DigitalOceanBackendConfigWithCreds, + DigitalOceanConfig, + DigitalOceanCreds, + DigitalOceanStoredConfig, +) +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class DigitalOceanConfigurator(Configurator): + TYPE = BackendType.DIGITALOCEAN + BACKEND_CLASS = DigitalOceanBackend + + def validate_config( + self, config: DigitalOceanBackendConfigWithCreds, default_creds_enabled: bool + ): + self._validate_creds(config.creds, config.flavor or "standard") + + def create_backend( + self, project_name: str, config: DigitalOceanBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=DigitalOceanStoredConfig( + **DigitalOceanBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=DigitalOceanCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config( + self, record: BackendRecord, include_creds: bool + ) -> AnyDigitalOceanBackendConfig: + config = self._get_config(record) + if include_creds: + return DigitalOceanBackendConfigWithCreds.__response__.parse_obj(config) + return DigitalOceanBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> DigitalOceanBackend: + config = self._get_config(record) + return DigitalOceanBackend(config=config) + + def _get_config(self, record: BackendRecord) -> DigitalOceanConfig: + return DigitalOceanConfig.__response__( + **json.loads(record.config), + creds=DigitalOceanCreds.parse_raw(record.auth), + ) + + def _validate_creds(self, creds: AnyDigitalOceanCreds, flavor: str): + api_client = DigitalOceanAPIClient(creds.api_key, flavor) + api_client.validate_api_key() diff --git a/src/dstack/_internal/core/backends/digitalocean/models.py b/src/dstack/_internal/core/backends/digitalocean/models.py new file mode 100644 index 000000000..43549d832 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean/models.py @@ -0,0 +1,50 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class DigitalOceanAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The API key")] + + +AnyDigitalOceanCreds = DigitalOceanAPIKeyCreds +DigitalOceanCreds = AnyDigitalOceanCreds + + +class DigitalOceanBackendConfig(CoreModel): + type: Annotated[ + Literal["digitalocean"], + Field(description="The type of backend"), + ] = "digitalocean" + flavor: Annotated[ + Optional[Literal["standard", "amd"]], + Field( + description="The DigitalOcean cloud flavor. Either 'standard' or 'amd'. Defaults to 'standard'" + ), + ] = "standard" + regions: Annotated[ + Optional[List[str]], + Field(description="The list of DigitalOcean regions. Omit to use all regions"), + ] = None + + +class DigitalOceanBackendConfigWithCreds(DigitalOceanBackendConfig): + creds: Annotated[AnyDigitalOceanCreds, Field(description="The credentials")] + + +AnyDigitalOceanBackendConfig = Union[DigitalOceanBackendConfig, DigitalOceanBackendConfigWithCreds] + + +class DigitalOceanBackendFileConfigWithCreds(DigitalOceanBackendConfig): + creds: Annotated[AnyDigitalOceanCreds, Field(description="The credentials")] + + +class DigitalOceanStoredConfig(DigitalOceanBackendConfig): + pass + + +class DigitalOceanConfig(DigitalOceanStoredConfig): + creds: AnyDigitalOceanCreds diff --git a/src/dstack/_internal/core/backends/models.py b/src/dstack/_internal/core/backends/models.py index 109745970..b0502aa01 100644 --- a/src/dstack/_internal/core/backends/models.py +++ b/src/dstack/_internal/core/backends/models.py @@ -20,6 +20,11 @@ DataCrunchBackendConfig, DataCrunchBackendConfigWithCreds, ) +from dstack._internal.core.backends.digitalocean.models import ( + DigitalOceanBackendConfig, + DigitalOceanBackendConfigWithCreds, + DigitalOceanBackendFileConfigWithCreds, +) from dstack._internal.core.backends.dstack.models import ( DstackBackendConfig, DstackBaseBackendConfig, @@ -77,6 +82,7 @@ CloudRiftBackendConfig, CudoBackendConfig, DataCrunchBackendConfig, + DigitalOceanBackendConfig, GCPBackendConfig, HotAisleBackendConfig, KubernetesBackendConfig, @@ -100,6 +106,7 @@ CloudRiftBackendConfigWithCreds, CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, + DigitalOceanBackendConfigWithCreds, GCPBackendConfigWithCreds, HotAisleBackendConfigWithCreds, KubernetesBackendConfigWithCreds, @@ -122,6 +129,7 @@ CloudRiftBackendConfigWithCreds, CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, + DigitalOceanBackendFileConfigWithCreds, GCPBackendFileConfigWithCreds, HotAisleBackendFileConfigWithCreds, KubernetesBackendFileConfigWithCreds, diff --git a/src/dstack/_internal/core/errors.py b/src/dstack/_internal/core/errors.py index 0bfd5f6f3..cd6e3d7e5 100644 --- a/src/dstack/_internal/core/errors.py +++ b/src/dstack/_internal/core/errors.py @@ -90,6 +90,10 @@ class BackendAuthError(BackendError): pass +class BackendRateLimitExceededError(BackendError): + pass + + class ComputeError(BackendError): pass diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index 067e181f6..705ff5d3d 100644 --- a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -8,9 +8,10 @@ class BackendType(str, enum.Enum): AZURE (BackendType): Microsoft Azure CLOUDRIFT (BackendType): CloudRift CUDO (BackendType): Cudo + DATACRUNCH (BackendType): DataCrunch + DIGITALOCEAN (BackendType): DigitalOcean DSTACK (BackendType): dstack Sky GCP (BackendType): Google Cloud Platform - DATACRUNCH (BackendType): DataCrunch HOTAISLE (BackendType): Hot Aisle KUBERNETES (BackendType): Kubernetes LAMBDA (BackendType): Lambda Cloud @@ -27,6 +28,7 @@ class BackendType(str, enum.Enum): CLOUDRIFT = "cloudrift" CUDO = "cudo" DATACRUNCH = "datacrunch" + DIGITALOCEAN = "digitalocean" DSTACK = "dstack" GCP = "gcp" HOTAISLE = "hotaisle" From 41f5ec37bea8ab3032b6295155c50989720488d7 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 26 Aug 2025 15:21:06 +0545 Subject: [PATCH 02/10] Add DigitalOcean base class with DigitalOceanCloud and AMDDevCloud sub classes --- .../core/backends/amddevcloud/__init__.py | 1 + .../core/backends/amddevcloud/backend.py | 16 ++ .../core/backends/amddevcloud/compute.py | 5 + .../core/backends/amddevcloud/configurator.py | 24 +++ .../_internal/core/backends/base/offers.py | 2 + .../_internal/core/backends/configurators.py | 6 + .../core/backends/digitalocean/backend.py | 10 +- .../core/backends/digitalocean/compute.py | 179 +----------------- .../backends/digitalocean/configurator.py | 63 ++---- .../core/backends/digitalocean/models.py | 50 ----- .../backends/digitalocean_base/__init__.py | 1 + .../api_client.py | 34 ++-- .../backends/digitalocean_base/backend.py | 5 + .../backends/digitalocean_base/compute.py | 175 +++++++++++++++++ .../digitalocean_base/configurator.py | 53 ++++++ .../core/backends/digitalocean_base/models.py | 46 +++++ src/dstack/_internal/core/backends/models.py | 14 +- src/dstack/_internal/core/errors.py | 4 - .../_internal/core/models/backends/base.py | 2 + 19 files changed, 376 insertions(+), 314 deletions(-) create mode 100644 src/dstack/_internal/core/backends/amddevcloud/__init__.py create mode 100644 src/dstack/_internal/core/backends/amddevcloud/backend.py create mode 100644 src/dstack/_internal/core/backends/amddevcloud/compute.py create mode 100644 src/dstack/_internal/core/backends/amddevcloud/configurator.py delete mode 100644 src/dstack/_internal/core/backends/digitalocean/models.py create mode 100644 src/dstack/_internal/core/backends/digitalocean_base/__init__.py rename src/dstack/_internal/core/backends/{digitalocean => digitalocean_base}/api_client.py (70%) create mode 100644 src/dstack/_internal/core/backends/digitalocean_base/backend.py create mode 100644 src/dstack/_internal/core/backends/digitalocean_base/compute.py create mode 100644 src/dstack/_internal/core/backends/digitalocean_base/configurator.py create mode 100644 src/dstack/_internal/core/backends/digitalocean_base/models.py diff --git a/src/dstack/_internal/core/backends/amddevcloud/__init__.py b/src/dstack/_internal/core/backends/amddevcloud/__init__.py new file mode 100644 index 000000000..16e553969 --- /dev/null +++ b/src/dstack/_internal/core/backends/amddevcloud/__init__.py @@ -0,0 +1 @@ +# This package contains the implementation for the AMDDevCloud backend. diff --git a/src/dstack/_internal/core/backends/amddevcloud/backend.py b/src/dstack/_internal/core/backends/amddevcloud/backend.py new file mode 100644 index 000000000..9a0477d76 --- /dev/null +++ b/src/dstack/_internal/core/backends/amddevcloud/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.amddevcloud.compute import AMDDevCloudCompute +from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend +from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig +from dstack._internal.core.models.backends.base import BackendType + + +class AMDDevCloudBackend(BaseDigitalOceanBackend): + TYPE = BackendType.AMDDEVCLOUD + COMPUTE_CLASS = AMDDevCloudCompute + + def __init__(self, config: BaseDigitalOceanConfig, api_url: str): + self.config = config + self._compute = AMDDevCloudCompute(self.config, api_url=api_url, type=self.TYPE) + + def compute(self) -> AMDDevCloudCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/amddevcloud/compute.py b/src/dstack/_internal/core/backends/amddevcloud/compute.py new file mode 100644 index 000000000..945eb63f9 --- /dev/null +++ b/src/dstack/_internal/core/backends/amddevcloud/compute.py @@ -0,0 +1,5 @@ +from dstack._internal.core.backends.digitalocean_base.compute import BaseDigitalOceanCompute + + +class AMDDevCloudCompute(BaseDigitalOceanCompute): + pass diff --git a/src/dstack/_internal/core/backends/amddevcloud/configurator.py b/src/dstack/_internal/core/backends/amddevcloud/configurator.py new file mode 100644 index 000000000..f9634fbe0 --- /dev/null +++ b/src/dstack/_internal/core/backends/amddevcloud/configurator.py @@ -0,0 +1,24 @@ +from dstack._internal.core.backends.amddevcloud.backend import AMDDevCloudBackend +from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient +from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend +from dstack._internal.core.backends.digitalocean_base.configurator import ( + BaseDigitalOceanConfigurator, +) +from dstack._internal.core.backends.digitalocean_base.models import AnyBaseDigitalOceanCreds +from dstack._internal.core.models.backends.base import ( + BackendType, +) + + +class AMDDevCloudConfigurator(BaseDigitalOceanConfigurator): + TYPE = BackendType.AMDDEVCLOUD + BACKEND_CLASS = AMDDevCloudBackend + API_URL = "https://api-amd.digitalocean.com" + + def get_backend(self, record) -> BaseDigitalOceanBackend: + config = self._get_config(record) + return AMDDevCloudBackend(config=config, api_url=self.API_URL) + + def _validate_creds(self, creds: AnyBaseDigitalOceanCreds): + api_client = DigitalOceanAPIClient(creds.api_key, self.API_URL) + api_client.validate_api_key() diff --git a/src/dstack/_internal/core/backends/base/offers.py b/src/dstack/_internal/core/backends/base/offers.py index ed9b44a08..d3d004172 100644 --- a/src/dstack/_internal/core/backends/base/offers.py +++ b/src/dstack/_internal/core/backends/base/offers.py @@ -34,6 +34,8 @@ def get_catalog_offers( provider = backend.value if backend == BackendType.LAMBDA: provider = "lambdalabs" + if backend == BackendType.AMDDEVCLOUD: + provider = "digitalocean" q = requirements_to_query_filter(requirements) q.provider = [provider] offers = [] diff --git a/src/dstack/_internal/core/backends/configurators.py b/src/dstack/_internal/core/backends/configurators.py index 11e88d5d2..6284dd0a5 100644 --- a/src/dstack/_internal/core/backends/configurators.py +++ b/src/dstack/_internal/core/backends/configurators.py @@ -5,6 +5,12 @@ _CONFIGURATOR_CLASSES: List[Type[Configurator]] = [] +try: + from dstack._internal.core.backends.amddevcloud.configurator import AMDDevCloudConfigurator + + _CONFIGURATOR_CLASSES.append(AMDDevCloudConfigurator) +except ImportError: + pass try: from dstack._internal.core.backends.aws.configurator import AWSConfigurator diff --git a/src/dstack/_internal/core/backends/digitalocean/backend.py b/src/dstack/_internal/core/backends/digitalocean/backend.py index 006ae9cc6..fc09b4c03 100644 --- a/src/dstack/_internal/core/backends/digitalocean/backend.py +++ b/src/dstack/_internal/core/backends/digitalocean/backend.py @@ -1,16 +1,16 @@ -from dstack._internal.core.backends.base.backend import Backend from dstack._internal.core.backends.digitalocean.compute import DigitalOceanCompute -from dstack._internal.core.backends.digitalocean.models import DigitalOceanConfig +from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend +from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig from dstack._internal.core.models.backends.base import BackendType -class DigitalOceanBackend(Backend): +class DigitalOceanBackend(BaseDigitalOceanBackend): TYPE = BackendType.DIGITALOCEAN COMPUTE_CLASS = DigitalOceanCompute - def __init__(self, config: DigitalOceanConfig): + def __init__(self, config: BaseDigitalOceanConfig, api_url: str): self.config = config - self._compute = DigitalOceanCompute(self.config) + self._compute = DigitalOceanCompute(self.config, api_url=api_url, type=self.TYPE) def compute(self) -> DigitalOceanCompute: return self._compute diff --git a/src/dstack/_internal/core/backends/digitalocean/compute.py b/src/dstack/_internal/core/backends/digitalocean/compute.py index 37f00a804..e3b26d026 100644 --- a/src/dstack/_internal/core/backends/digitalocean/compute.py +++ b/src/dstack/_internal/core/backends/digitalocean/compute.py @@ -1,178 +1,5 @@ -from typing import List, Optional +from ..digitalocean_base.compute import BaseDigitalOceanCompute -import gpuhunt -from gpuhunt.providers.digitalocean import DigitalOceanProvider -from dstack._internal.core.backends.base.backend import Compute -from dstack._internal.core.backends.base.compute import ( - ComputeWithCreateInstanceSupport, - generate_unique_instance_name, - get_user_data, -) -from dstack._internal.core.backends.base.offers import get_catalog_offers -from dstack._internal.core.backends.digitalocean.api_client import DigitalOceanAPIClient -from dstack._internal.core.backends.digitalocean.models import DigitalOceanConfig -from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import ( - InstanceAvailability, - InstanceConfiguration, - InstanceOfferWithAvailability, -) -from dstack._internal.core.models.placement import PlacementGroup -from dstack._internal.core.models.runs import JobProvisioningData, Requirements -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - -MAX_INSTANCE_NAME_LEN = 60 - -# Setup commands for DigitalOcean instances -SETUP_COMMANDS = [ - "sudo ufw delete limit ssh", - "sudo ufw allow ssh", -] - -DOCKER_INSTALL_COMMANDS = [ - "export DEBIAN_FRONTEND=noninteractive", - "mkdir -p /etc/apt/keyrings", - "curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg", - 'echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null', - "apt-get update", - "apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin", -] - - -class DigitalOceanCompute( - ComputeWithCreateInstanceSupport, - Compute, -): - def __init__(self, config: DigitalOceanConfig): - super().__init__() - self.config = config - self.api_client = DigitalOceanAPIClient(config.creds.api_key, config.flavor or "standard") - self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) - self.catalog.add_provider( - DigitalOceanProvider(token=config.creds.api_key, flavor=config.flavor or "standard") - ) - # self.catalog.add_provider( - # DigitalOceanProvider(token=config.creds.api_key, flavor="standard") - # ) - - def get_offers( - self, requirements: Optional[Requirements] = None - ) -> List[InstanceOfferWithAvailability]: - offers = get_catalog_offers( - backend=BackendType.DIGITALOCEAN, - locations=self.config.regions, - requirements=requirements, - catalog=self.catalog, - ) - return [ - InstanceOfferWithAvailability( - **offer.dict(), - availability=InstanceAvailability.AVAILABLE, - ) - for offer in offers - ] - - def create_instance( - self, - instance_offer: InstanceOfferWithAvailability, - instance_config: InstanceConfiguration, - placement_group: Optional[PlacementGroup], - ) -> JobProvisioningData: - instance_name = generate_unique_instance_name( - instance_config, max_length=MAX_INSTANCE_NAME_LEN - ) - - project_ssh_key = instance_config.ssh_keys[0] - ssh_key_id = self.api_client.get_or_create_ssh_key( - name=f"dstack-{instance_config.project_name}", - public_key=project_ssh_key.public, - ) - - # Use the instance name directly from the offer (gpuhunt handles flavor-specific naming) - size_slug = instance_offer.instance.name - - if not instance_offer.instance.resources.gpus: - backend_specific_commands = SETUP_COMMANDS + DOCKER_INSTALL_COMMANDS - else: - backend_specific_commands = SETUP_COMMANDS - - # Prepare droplet configuration - droplet_config = { - "name": instance_name, - "region": instance_offer.region, - "size": size_slug, - "image": self._get_image_for_instance(instance_offer), - "ssh_keys": [ssh_key_id], - "backups": False, - "ipv6": False, - "monitoring": False, - "tags": [], - "user_data": get_user_data( - authorized_keys=instance_config.get_public_keys(), - backend_specific_commands=backend_specific_commands, - ), - } - - droplet = self.api_client.create_droplet(droplet_config) - - return JobProvisioningData( - backend=instance_offer.backend, - instance_type=instance_offer.instance, - instance_id=str(droplet["id"]), - hostname=None, # Will be set when droplet is active - internal_ip=None, - region=instance_offer.region, - price=instance_offer.price, - username="root", - ssh_port=22, - dockerized=True, - ssh_proxy=None, - backend_data=None, - ) - - def update_provisioning_data( - self, - provisioning_data: JobProvisioningData, - project_ssh_public_key: str, - project_ssh_private_key: str, - ): - droplet = self.api_client.get_droplet(provisioning_data.instance_id) - if droplet["status"] == "active": - for network in droplet["networks"]["v4"]: - if network["type"] == "public": - provisioning_data.hostname = network["ip_address"] - break - - def terminate_instance( - self, instance_id: str, region: str, backend_data: Optional[str] = None - ): - self.api_client.delete_droplet(instance_id) - - def _get_image_for_instance(self, instance_offer: InstanceOfferWithAvailability) -> str: - if not instance_offer.instance.resources.gpus: - # No GPUs, use CPU image - return "ubuntu-24-04-x64" - - gpu_count = len(instance_offer.instance.resources.gpus) - gpu_name = instance_offer.instance.resources.gpus[0].name - - if gpu_name == "MI300X": - # AMD GPU - return "digitaloceanai-rocmjupyter" - else: - # NVIDIA GPUs - DO only supports 1 and 8 GPU configurations. - # DO says for single GPU plans using GPUs other than H100s use "gpu-h100x1-base". But for x8 assuming same. - # See (https://docs.digitalocean.com/products/droplets/getting-started/recommended-gpu-setup/#aiml-ready-image) - if gpu_count == 8: - return "gpu-h100x8-base" - elif gpu_count == 1: - return "gpu-h100x1-base" - else: - # For Unsupported GPU count - use single GPU image and log warning - logger.warning( - f"Unsupported NVIDIA GPU count: {gpu_count}, using single GPU image" - ) - return "gpu-h100x1-base" +class DigitalOceanCompute(BaseDigitalOceanCompute): + pass diff --git a/src/dstack/_internal/core/backends/digitalocean/configurator.py b/src/dstack/_internal/core/backends/digitalocean/configurator.py index c0aef559c..292b5a38a 100644 --- a/src/dstack/_internal/core/backends/digitalocean/configurator.py +++ b/src/dstack/_internal/core/backends/digitalocean/configurator.py @@ -1,62 +1,27 @@ -import json - -from dstack._internal.core.backends.base.configurator import ( - BackendRecord, - Configurator, -) -from dstack._internal.core.backends.digitalocean.api_client import DigitalOceanAPIClient +from dstack._internal.core.backends.base.configurator import BackendRecord from dstack._internal.core.backends.digitalocean.backend import DigitalOceanBackend -from dstack._internal.core.backends.digitalocean.models import ( - AnyDigitalOceanBackendConfig, - AnyDigitalOceanCreds, - DigitalOceanBackendConfig, - DigitalOceanBackendConfigWithCreds, - DigitalOceanConfig, - DigitalOceanCreds, - DigitalOceanStoredConfig, +from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient +from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend +from dstack._internal.core.backends.digitalocean_base.configurator import ( + BaseDigitalOceanConfigurator, +) +from dstack._internal.core.backends.digitalocean_base.models import ( + AnyBaseDigitalOceanCreds, ) from dstack._internal.core.models.backends.base import ( BackendType, ) -class DigitalOceanConfigurator(Configurator): +class DigitalOceanConfigurator(BaseDigitalOceanConfigurator): TYPE = BackendType.DIGITALOCEAN BACKEND_CLASS = DigitalOceanBackend + API_URL = "https://api.digitalocean.com" - def validate_config( - self, config: DigitalOceanBackendConfigWithCreds, default_creds_enabled: bool - ): - self._validate_creds(config.creds, config.flavor or "standard") - - def create_backend( - self, project_name: str, config: DigitalOceanBackendConfigWithCreds - ) -> BackendRecord: - return BackendRecord( - config=DigitalOceanStoredConfig( - **DigitalOceanBackendConfig.__response__.parse_obj(config).dict() - ).json(), - auth=DigitalOceanCreds.parse_obj(config.creds).json(), - ) - - def get_backend_config( - self, record: BackendRecord, include_creds: bool - ) -> AnyDigitalOceanBackendConfig: - config = self._get_config(record) - if include_creds: - return DigitalOceanBackendConfigWithCreds.__response__.parse_obj(config) - return DigitalOceanBackendConfig.__response__.parse_obj(config) - - def get_backend(self, record: BackendRecord) -> DigitalOceanBackend: + def get_backend(self, record: BackendRecord) -> BaseDigitalOceanBackend: config = self._get_config(record) - return DigitalOceanBackend(config=config) - - def _get_config(self, record: BackendRecord) -> DigitalOceanConfig: - return DigitalOceanConfig.__response__( - **json.loads(record.config), - creds=DigitalOceanCreds.parse_raw(record.auth), - ) + return DigitalOceanBackend(config=config, api_url=self.API_URL) - def _validate_creds(self, creds: AnyDigitalOceanCreds, flavor: str): - api_client = DigitalOceanAPIClient(creds.api_key, flavor) + def _validate_creds(self, creds: AnyBaseDigitalOceanCreds): + api_client = DigitalOceanAPIClient(creds.api_key, self.API_URL) api_client.validate_api_key() diff --git a/src/dstack/_internal/core/backends/digitalocean/models.py b/src/dstack/_internal/core/backends/digitalocean/models.py deleted file mode 100644 index 43549d832..000000000 --- a/src/dstack/_internal/core/backends/digitalocean/models.py +++ /dev/null @@ -1,50 +0,0 @@ -from typing import Annotated, List, Literal, Optional, Union - -from pydantic import Field - -from dstack._internal.core.models.common import CoreModel - - -class DigitalOceanAPIKeyCreds(CoreModel): - type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" - api_key: Annotated[str, Field(description="The API key")] - - -AnyDigitalOceanCreds = DigitalOceanAPIKeyCreds -DigitalOceanCreds = AnyDigitalOceanCreds - - -class DigitalOceanBackendConfig(CoreModel): - type: Annotated[ - Literal["digitalocean"], - Field(description="The type of backend"), - ] = "digitalocean" - flavor: Annotated[ - Optional[Literal["standard", "amd"]], - Field( - description="The DigitalOcean cloud flavor. Either 'standard' or 'amd'. Defaults to 'standard'" - ), - ] = "standard" - regions: Annotated[ - Optional[List[str]], - Field(description="The list of DigitalOcean regions. Omit to use all regions"), - ] = None - - -class DigitalOceanBackendConfigWithCreds(DigitalOceanBackendConfig): - creds: Annotated[AnyDigitalOceanCreds, Field(description="The credentials")] - - -AnyDigitalOceanBackendConfig = Union[DigitalOceanBackendConfig, DigitalOceanBackendConfigWithCreds] - - -class DigitalOceanBackendFileConfigWithCreds(DigitalOceanBackendConfig): - creds: Annotated[AnyDigitalOceanCreds, Field(description="The credentials")] - - -class DigitalOceanStoredConfig(DigitalOceanBackendConfig): - pass - - -class DigitalOceanConfig(DigitalOceanStoredConfig): - creds: AnyDigitalOceanCreds diff --git a/src/dstack/_internal/core/backends/digitalocean_base/__init__.py b/src/dstack/_internal/core/backends/digitalocean_base/__init__.py new file mode 100644 index 000000000..cc8247e94 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean_base/__init__.py @@ -0,0 +1 @@ +# This package contains the base classes for DigitalOcean and AMDDevCloud backends. diff --git a/src/dstack/_internal/core/backends/digitalocean/api_client.py b/src/dstack/_internal/core/backends/digitalocean_base/api_client.py similarity index 70% rename from src/dstack/_internal/core/backends/digitalocean/api_client.py rename to src/dstack/_internal/core/backends/digitalocean_base/api_client.py index 62be91aaf..7398f6a3e 100644 --- a/src/dstack/_internal/core/backends/digitalocean/api_client.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/api_client.py @@ -3,30 +3,20 @@ import requests from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error -from dstack._internal.core.errors import BackendRateLimitExceededError +from dstack._internal.core.errors import NoCapacityError from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) -# DigitalOcean API endpoints -STANDARD_CLOUD_API_URL = "https://api.digitalocean.com/v2" -AMD_CLOUD_API_URL = "https://api-amd.digitalocean.com/v2" - class DigitalOceanAPIClient: - def __init__(self, api_key: str, flavor: str = "standard"): + def __init__(self, api_key: str, api_url: str): self.api_key = api_key - self.flavor = flavor - self.base_url = self._get_base_url() - - def _get_base_url(self) -> str: - if self.flavor == "amd": - return AMD_CLOUD_API_URL - return STANDARD_CLOUD_API_URL + self.base_url = api_url def validate_api_key(self) -> bool: try: - response = self._make_request("GET", "/account") + response = self._make_request("GET", "/v2/account") response.raise_for_status() return True except requests.HTTPError as e: @@ -38,13 +28,13 @@ def validate_api_key(self) -> bool: raise e def list_ssh_keys(self) -> List[Dict[str, Any]]: - response = self._make_request("GET", "/account/keys") + response = self._make_request("GET", "/v2/account/keys") response.raise_for_status() return response.json()["ssh_keys"] def create_ssh_key(self, name: str, public_key: str) -> Dict[str, Any]: payload = {"name": name, "public_key": public_key} - response = self._make_request("POST", "/account/keys", json=payload) + response = self._make_request("POST", "/v2/account/keys", json=payload) response.raise_for_status() return response.json()["ssh_key"] @@ -58,17 +48,19 @@ def get_or_create_ssh_key(self, name: str, public_key: str) -> int: return ssh_key["id"] def create_droplet(self, droplet_config: Dict[str, Any]) -> Dict[str, Any]: - response = self._make_request("POST", "/droplets", json=droplet_config) + response = self._make_request("POST", "/v2/droplets", json=droplet_config) + if response.status_code == 422: + raise NoCapacityError(response.json()["message"]) response.raise_for_status() return response.json()["droplet"] def get_droplet(self, droplet_id: str) -> Dict[str, Any]: - response = self._make_request("GET", f"/droplets/{droplet_id}") + response = self._make_request("GET", f"/v2/droplets/{droplet_id}") response.raise_for_status() return response.json()["droplet"] def delete_droplet(self, droplet_id: str) -> None: - response = self._make_request("DELETE", f"/droplets/{droplet_id}") + response = self._make_request("DELETE", f"/v2/droplets/{droplet_id}") if response.status_code == 404: logger.debug("DigitalOcean droplet %s not found", droplet_id) return @@ -90,8 +82,4 @@ def _make_request( json=json, timeout=timeout, ) - - if response.status_code == 429: - raise BackendRateLimitExceededError("API rate limit exceeded.") - return response diff --git a/src/dstack/_internal/core/backends/digitalocean_base/backend.py b/src/dstack/_internal/core/backends/digitalocean_base/backend.py new file mode 100644 index 000000000..42884b307 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean_base/backend.py @@ -0,0 +1,5 @@ +from dstack._internal.core.backends.base.backend import Backend + + +class BaseDigitalOceanBackend(Backend): + pass diff --git a/src/dstack/_internal/core/backends/digitalocean_base/compute.py b/src/dstack/_internal/core/backends/digitalocean_base/compute.py new file mode 100644 index 000000000..9579b7fdb --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean_base/compute.py @@ -0,0 +1,175 @@ +from typing import List, Optional + +import gpuhunt +from gpuhunt.providers.digitalocean import DigitalOceanProvider + +from dstack._internal.core.backends.base.backend import Compute +from dstack._internal.core.backends.base.compute import ( + ComputeWithCreateInstanceSupport, + generate_unique_instance_name, + get_user_data, +) +from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient +from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import PlacementGroup +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +MAX_INSTANCE_NAME_LEN = 60 + +# Setup commands for DigitalOcean instances +SETUP_COMMANDS = [ + "sudo ufw delete limit ssh", + "sudo ufw allow ssh", +] + +DOCKER_INSTALL_COMMANDS = [ + "export DEBIAN_FRONTEND=noninteractive", + "mkdir -p /etc/apt/keyrings", + "curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg", + 'echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null', + "apt-get update", + "apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin", +] + + +class BaseDigitalOceanCompute( + ComputeWithCreateInstanceSupport, + Compute, +): + def __init__(self, config: BaseDigitalOceanConfig, api_url: str, type: BackendType): + super().__init__() + self.config = config + self.api_client = DigitalOceanAPIClient(config.creds.api_key, api_url) + self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) + self.BACKEND_TYPE = type + self.catalog.add_provider( + DigitalOceanProvider(api_key=config.creds.api_key, api_url=api_url) + ) + + def get_offers( + self, requirements: Optional[Requirements] = None + ) -> List[InstanceOfferWithAvailability]: + offers = get_catalog_offers( + backend=self.BACKEND_TYPE, + locations=self.config.regions, + requirements=requirements, + catalog=self.catalog, + ) + return [ + InstanceOfferWithAvailability( + **offer.dict(), + availability=InstanceAvailability.AVAILABLE, + ) + for offer in offers + ] + + def create_instance( + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + instance_name = generate_unique_instance_name( + instance_config, max_length=MAX_INSTANCE_NAME_LEN + ) + + project_ssh_key = instance_config.ssh_keys[0] + ssh_key_id = self.api_client.get_or_create_ssh_key( + name=f"dstack-{instance_config.project_name}", + public_key=project_ssh_key.public, + ) + + size_slug = instance_offer.instance.name + + if not instance_offer.instance.resources.gpus: + backend_specific_commands = SETUP_COMMANDS + DOCKER_INSTALL_COMMANDS + else: + backend_specific_commands = SETUP_COMMANDS + + # Prepare droplet configuration + droplet_config = { + "name": instance_name, + "region": instance_offer.region, + "size": size_slug, + "image": self._get_image_for_instance(instance_offer), + "ssh_keys": [ssh_key_id], + "backups": False, + "ipv6": False, + "monitoring": False, + "tags": [], + "user_data": get_user_data( + authorized_keys=instance_config.get_public_keys(), + backend_specific_commands=backend_specific_commands, + ), + } + + droplet = self.api_client.create_droplet(droplet_config) + + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=str(droplet["id"]), + hostname=None, + internal_ip=None, + region=instance_offer.region, + price=instance_offer.price, + username="root", + ssh_port=22, + dockerized=True, + ssh_proxy=None, + backend_data=None, + ) + + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + droplet = self.api_client.get_droplet(provisioning_data.instance_id) + if droplet["status"] == "active": + for network in droplet["networks"]["v4"]: + if network["type"] == "public": + provisioning_data.hostname = network["ip_address"] + break + + def terminate_instance( + self, instance_id: str, region: str, backend_data: Optional[str] = None + ): + self.api_client.delete_droplet(instance_id) + + def _get_image_for_instance(self, instance_offer: InstanceOfferWithAvailability) -> str: + if not instance_offer.instance.resources.gpus: + # No GPUs, use CPU image + return "ubuntu-24-04-x64" + + gpu_count = len(instance_offer.instance.resources.gpus) + gpu_name = instance_offer.instance.resources.gpus[0].name + + if gpu_name == "MI300X": + # AMD GPU + return "digitaloceanai-rocmjupyter" + else: + # NVIDIA GPUs - DO only supports 1 and 8 GPU configurations. + # DO says for single GPU plans using GPUs other than H100s use "gpu-h100x1-base". DO does not provide guidance for x8 GPUs so assuming the same applies. + # See (https://docs.digitalocean.com/products/droplets/getting-started/recommended-gpu-setup/#aiml-ready-image) + if gpu_count == 8: + return "gpu-h100x8-base" + elif gpu_count == 1: + return "gpu-h100x1-base" + else: + # For Unsupported GPU count - use single GPU image and log warning + logger.warning( + f"Unsupported NVIDIA GPU count: {gpu_count}, using single GPU image" + ) + return "gpu-h100x1-base" diff --git a/src/dstack/_internal/core/backends/digitalocean_base/configurator.py b/src/dstack/_internal/core/backends/digitalocean_base/configurator.py new file mode 100644 index 000000000..0612fc16e --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean_base/configurator.py @@ -0,0 +1,53 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, +) +from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend +from dstack._internal.core.backends.digitalocean_base.models import ( + AnyBaseDigitalOceanBackendConfig, + AnyBaseDigitalOceanCreds, + BaseDigitalOceanBackendConfig, + BaseDigitalOceanBackendConfigWithCreds, + BaseDigitalOceanConfig, + BaseDigitalOceanCreds, + BaseDigitalOceanStoredConfig, +) + + +class BaseDigitalOceanConfigurator(Configurator): + def validate_config( + self, config: BaseDigitalOceanBackendConfigWithCreds, default_creds_enabled: bool + ): + self._validate_creds(config.creds) + + def create_backend( + self, project_name: str, config: BaseDigitalOceanBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=BaseDigitalOceanStoredConfig( + **BaseDigitalOceanBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=BaseDigitalOceanCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config( + self, record: BackendRecord, include_creds: bool + ) -> AnyBaseDigitalOceanBackendConfig: + config = self._get_config(record) + if include_creds: + return BaseDigitalOceanBackendConfigWithCreds.__response__.parse_obj(config) + return BaseDigitalOceanBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> BaseDigitalOceanBackend: + pass + + def _get_config(self, record: BackendRecord) -> BaseDigitalOceanConfig: + return BaseDigitalOceanConfig.__response__( + **json.loads(record.config), + creds=BaseDigitalOceanCreds.parse_raw(record.auth), + ) + + def _validate_creds(self, creds: AnyBaseDigitalOceanCreds): + pass diff --git a/src/dstack/_internal/core/backends/digitalocean_base/models.py b/src/dstack/_internal/core/backends/digitalocean_base/models.py new file mode 100644 index 000000000..a910a44a7 --- /dev/null +++ b/src/dstack/_internal/core/backends/digitalocean_base/models.py @@ -0,0 +1,46 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class BaseDigitalOceanAPIKeyCreds(CoreModel): + type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key" + api_key: Annotated[str, Field(description="The API key")] + + +AnyBaseDigitalOceanCreds = BaseDigitalOceanAPIKeyCreds +BaseDigitalOceanCreds = AnyBaseDigitalOceanCreds + + +class BaseDigitalOceanBackendConfig(CoreModel): + type: Annotated[ + Literal["amddevcloud", "digitalocean"], + Field(description="The type of backend"), + ] = "digitalocean" + regions: Annotated[ + Optional[List[str]], + Field(description="The list of DigitalOcean regions. Omit to use all regions"), + ] = None + + +class BaseDigitalOceanBackendConfigWithCreds(BaseDigitalOceanBackendConfig): + creds: Annotated[AnyBaseDigitalOceanCreds, Field(description="The credentials")] + + +AnyBaseDigitalOceanBackendConfig = Union[ + BaseDigitalOceanBackendConfig, BaseDigitalOceanBackendConfigWithCreds +] + + +class BaseDigitalOceanBackendFileConfigWithCreds(BaseDigitalOceanBackendConfig): + creds: Annotated[AnyBaseDigitalOceanCreds, Field(description="The credentials")] + + +class BaseDigitalOceanStoredConfig(BaseDigitalOceanBackendConfig): + pass + + +class BaseDigitalOceanConfig(BaseDigitalOceanStoredConfig): + creds: AnyBaseDigitalOceanCreds diff --git a/src/dstack/_internal/core/backends/models.py b/src/dstack/_internal/core/backends/models.py index b0502aa01..b35fa2c33 100644 --- a/src/dstack/_internal/core/backends/models.py +++ b/src/dstack/_internal/core/backends/models.py @@ -20,10 +20,10 @@ DataCrunchBackendConfig, DataCrunchBackendConfigWithCreds, ) -from dstack._internal.core.backends.digitalocean.models import ( - DigitalOceanBackendConfig, - DigitalOceanBackendConfigWithCreds, - DigitalOceanBackendFileConfigWithCreds, +from dstack._internal.core.backends.digitalocean_base.models import ( + BaseDigitalOceanBackendConfig, + BaseDigitalOceanBackendConfigWithCreds, + BaseDigitalOceanBackendFileConfigWithCreds, ) from dstack._internal.core.backends.dstack.models import ( DstackBackendConfig, @@ -82,7 +82,7 @@ CloudRiftBackendConfig, CudoBackendConfig, DataCrunchBackendConfig, - DigitalOceanBackendConfig, + BaseDigitalOceanBackendConfig, GCPBackendConfig, HotAisleBackendConfig, KubernetesBackendConfig, @@ -106,7 +106,7 @@ CloudRiftBackendConfigWithCreds, CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, - DigitalOceanBackendConfigWithCreds, + BaseDigitalOceanBackendConfigWithCreds, GCPBackendConfigWithCreds, HotAisleBackendConfigWithCreds, KubernetesBackendConfigWithCreds, @@ -129,7 +129,7 @@ CloudRiftBackendConfigWithCreds, CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, - DigitalOceanBackendFileConfigWithCreds, + BaseDigitalOceanBackendFileConfigWithCreds, GCPBackendFileConfigWithCreds, HotAisleBackendFileConfigWithCreds, KubernetesBackendFileConfigWithCreds, diff --git a/src/dstack/_internal/core/errors.py b/src/dstack/_internal/core/errors.py index cd6e3d7e5..0bfd5f6f3 100644 --- a/src/dstack/_internal/core/errors.py +++ b/src/dstack/_internal/core/errors.py @@ -90,10 +90,6 @@ class BackendAuthError(BackendError): pass -class BackendRateLimitExceededError(BackendError): - pass - - class ComputeError(BackendError): pass diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index 705ff5d3d..3d33e75b6 100644 --- a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -4,6 +4,7 @@ class BackendType(str, enum.Enum): """ Attributes: + AMDDEVCLOUD (BackendType): AMD Developer Cloud AWS (BackendType): Amazon Web Services AZURE (BackendType): Microsoft Azure CLOUDRIFT (BackendType): CloudRift @@ -23,6 +24,7 @@ class BackendType(str, enum.Enum): VULTR (BackendType): Vultr """ + AMDDEVCLOUD = "amddevcloud" AWS = "aws" AZURE = "azure" CLOUDRIFT = "cloudrift" From 1393761cbe33791af7b9642949619e4ca5862418 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 26 Aug 2025 20:27:29 +0545 Subject: [PATCH 03/10] Add project_name config --- .../core/backends/digitalocean_base/api_client.py | 12 ++++++++++++ .../core/backends/digitalocean_base/compute.py | 3 ++- .../core/backends/digitalocean_base/models.py | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/dstack/_internal/core/backends/digitalocean_base/api_client.py b/src/dstack/_internal/core/backends/digitalocean_base/api_client.py index 7398f6a3e..170631315 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/api_client.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/api_client.py @@ -32,6 +32,18 @@ def list_ssh_keys(self) -> List[Dict[str, Any]]: response.raise_for_status() return response.json()["ssh_keys"] + def list_projects(self) -> List[Dict[str, Any]]: + response = self._make_request("GET", "/v2/projects") + response.raise_for_status() + return response.json()["projects"] + + def get_project_id(self, project_name: str) -> Optional[str]: + projects = self.list_projects() + for project in projects: + if project["name"] == project_name: + return project["id"] + return None + def create_ssh_key(self, name: str, public_key: str) -> Dict[str, Any]: payload = {"name": name, "public_key": public_key} response = self._make_request("POST", "/v2/account/keys", json=payload) diff --git a/src/dstack/_internal/core/backends/digitalocean_base/compute.py b/src/dstack/_internal/core/backends/digitalocean_base/compute.py index 9579b7fdb..d1dbefde6 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/compute.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/compute.py @@ -96,7 +96,7 @@ def create_instance( else: backend_specific_commands = SETUP_COMMANDS - # Prepare droplet configuration + project_id = self.api_client.get_project_id(self.config.project_name) droplet_config = { "name": instance_name, "region": instance_offer.region, @@ -111,6 +111,7 @@ def create_instance( authorized_keys=instance_config.get_public_keys(), backend_specific_commands=backend_specific_commands, ), + **({"project_id": project_id} if project_id is not None else {}), } droplet = self.api_client.create_droplet(droplet_config) diff --git a/src/dstack/_internal/core/backends/digitalocean_base/models.py b/src/dstack/_internal/core/backends/digitalocean_base/models.py index a910a44a7..84dd6f466 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/models.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/models.py @@ -19,6 +19,7 @@ class BaseDigitalOceanBackendConfig(CoreModel): Literal["amddevcloud", "digitalocean"], Field(description="The type of backend"), ] = "digitalocean" + project_name: Annotated[str, Field(description="The name of the DigitalOcean project")] = None regions: Annotated[ Optional[List[str]], Field(description="The list of DigitalOcean regions. Omit to use all regions"), From ba37fb0357c8282dd3219162d87149413fd76171 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Fri, 29 Aug 2025 19:47:29 +0545 Subject: [PATCH 04/10] Resolve Review Comments and Update docs --- docs/docs/concepts/backends.md | 126 ++++++++++++++---- docs/docs/reference/server/config.yml.md | 67 +++++++--- .../core/backends/amddevcloud/configurator.py | 6 +- .../backends/digitalocean/configurator.py | 6 +- .../backends/digitalocean_base/api_client.py | 9 +- .../backends/digitalocean_base/compute.py | 11 +- .../digitalocean_base/configurator.py | 5 +- .../core/backends/digitalocean_base/models.py | 4 +- src/dstack/_internal/core/backends/models.py | 3 +- 9 files changed, 180 insertions(+), 57 deletions(-) diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index 81a6cf48e..6b4d16a63 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -579,34 +579,6 @@ gcloud projects list --format="json(projectId)" Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets. Additionally, [Cloud NAT](https://cloud.google.com/nat/docs/overview) must be configured to provide access to external resources for provisioned instances. -## Hot Aisle - -Log in to the SSH TUI as described in the [Hot Aisle Quick Start :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/quick-start/). -Create a new team and generate an API key for the member in the team. - -Then, go ahead and configure the backend: - -
- -```yaml -projects: -- name: main - backends: - - type: hotaisle - team_handle: hotaisle-team-handle - creds: - type: api_key - api_key: 9c27a4bb7a8e472fae12ab34.3f2e3c1db75b9a0187fd2196c6b3e56d2b912e1c439ba08d89e7b6fcd4ef1d3f -``` - -
- -??? info "Required permissions" - The API key must have the following roles assigned: - - * **Owner role for the user** - Required for creating and managing SSH keys - * **Operator role for the team** - Required for managing virtual machines within the team - ## Lambda Log into your [Lambda Cloud :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/service/gpu-cloud) account, click API keys in the sidebar, and then click the `Generate API key` @@ -937,6 +909,104 @@ projects: +## AMD Developer Cloud +Log into your [AMD Developer Cloud :material-arrow-top-right-thin:{ .external }](https://amd.digitalocean.com/login) account. Click `API` in the sidebar and click the button `Generate New Token`. + +Then, go ahead and configure the backend: + +
+ +```yaml +projects: +- name: main + backends: + - type: amddevcloud + project_name: my-amd-project + creds: + type: api_key + api_key: dop_v1_71ea79a0c4bf2ffa70ac9d2a7b2689d2b41768567b22ebabe58a80066dcc5e92 +``` + +
+ +??? info "Project Name" + **project_name** configuration is optional. If it is not provided, the default project is used. + +??? info "Required permissions" + The API key must have the following scopes assigned: + + * **account** - read + * **droplet** - create,read,update,delete,admin + * **project** - create,read,update,delete + * **regions** - read + * **sizes** - read + * **ssh_key** - create,read,update,delete + + + +## Digital Ocean +Log into your [Digital Ocean :material-arrow-top-right-thin:{ .external }](https://cloud.digitalocean.com/login) account. Click `API` in the sidebar and click the button `Generate New Token`. + +Then, go ahead and configure the backend: + +
+ +```yaml +projects: +- name: main + backends: + - type: digitalocean + project_name: my-digital-ocean-project + creds: + type: api_key + api_key: dop_v1_71ea79a0c4bf2ffa70ac9d2a7b2689d2b41768567b22ebabe58a80066dcc5e92 +``` + +
+ +??? info "Project Name" + **project_name** configuration is optional. If it is not provided, the default project is used. + +??? info "Required permissions" + The API key must have the following scopes assigned: + + * **account** - read + * **droplet** - create,read,update,delete,admin + * **project** - create,read,update,delete + * **regions** - read + * **sizes** - read + * **ssh_key** - create,read,update,delete + + +## Hot Aisle + +Log in to the SSH TUI as described in the [Hot Aisle Quick Start :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/quick-start/). +Create a new team and generate an API key for the member in the team. + +Then, go ahead and configure the backend: + +
+ +```yaml +projects: +- name: main + backends: + - type: hotaisle + team_handle: hotaisle-team-handle + creds: + type: api_key + api_key: 9c27a4bb7a8e472fae12ab34.3f2e3c1db75b9a0187fd2196c6b3e56d2b912e1c439ba08d89e7b6fcd4ef1d3f +``` + +
+ +??? info "Required permissions" + The API key must have the following roles assigned: + + * **Owner role for the user** - Required for creating and managing SSH keys + * **Operator role for the team** - Required for managing virtual machines within the team + + ## CloudRift Log into your [CloudRift :material-arrow-top-right-thin:{ .external }](https://console.cloudrift.ai/) console, click `API Keys` in the sidebar and click the button to create a new API key. diff --git a/docs/docs/reference/server/config.yml.md b/docs/docs/reference/server/config.yml.md index a8dc3b2ca..25f649ddf 100644 --- a/docs/docs/reference/server/config.yml.md +++ b/docs/docs/reference/server/config.yml.md @@ -126,22 +126,6 @@ to configure [backends](../../concepts/backends.md) and other [server-level sett type: required: true -##### `projects[n].backends[type=hotaisle]` { #hotaisle data-toc-label="hotaisle" } - -#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleBackendConfigWithCreds - overrides: - show_root_heading: false - type: - required: true - item_id_prefix: hotaisle- - -###### `projects[n].backends[type=hotaisle].creds` { #hotaisle-creds data-toc-label="creds" } - -#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleAPIKeyCreds - overrides: - show_root_heading: false - type: - required: true ##### `projects[n].backends[type=lambda]` { #lambda data-toc-label="lambda" } @@ -332,6 +316,57 @@ to configure [backends](../../concepts/backends.md) and other [server-level sett type: required: true +##### `projects[n].backends[type=amddevcloud]` { #amddevcloud data-toc-label="amddevcloud" } + +#SCHEMA# dstack._internal.core.backends.digitalocean_base.models.BaseDigitalOceanBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: amddevcloud- + +###### `projects[n].backends[type=amddevcloud].creds` { #amddevcloud-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.digitalocean_base.models.BaseDigitalOceanAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=digitalocean]` { #digitalocean data-toc-label="digitalocean" } + +#SCHEMA# dstack._internal.core.backends.digitalocean_base.models.BaseDigitalOceanBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: digitalocean- + +###### `projects[n].backends[type=digitalocean].creds` { #digitalocean-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.digitalocean_base.models.BaseDigitalOceanAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + +##### `projects[n].backends[type=hotaisle]` { #hotaisle data-toc-label="hotaisle" } + +#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: hotaisle- + +###### `projects[n].backends[type=hotaisle].creds` { #hotaisle-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleAPIKeyCreds + overrides: + show_root_heading: false + type: + required: true + ##### `projects[n].backends[type=cloudrift]` { #cloudrift data-toc-label="cloudrift" } #SCHEMA# dstack._internal.core.backends.cloudrift.models.CloudRiftBackendConfigWithCreds diff --git a/src/dstack/_internal/core/backends/amddevcloud/configurator.py b/src/dstack/_internal/core/backends/amddevcloud/configurator.py index f9634fbe0..1f464fc54 100644 --- a/src/dstack/_internal/core/backends/amddevcloud/configurator.py +++ b/src/dstack/_internal/core/backends/amddevcloud/configurator.py @@ -1,3 +1,5 @@ +from typing import Optional + from dstack._internal.core.backends.amddevcloud.backend import AMDDevCloudBackend from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend @@ -19,6 +21,8 @@ def get_backend(self, record) -> BaseDigitalOceanBackend: config = self._get_config(record) return AMDDevCloudBackend(config=config, api_url=self.API_URL) - def _validate_creds(self, creds: AnyBaseDigitalOceanCreds): + def _validate_creds(self, creds: AnyBaseDigitalOceanCreds, project_name: Optional[str] = None): api_client = DigitalOceanAPIClient(creds.api_key, self.API_URL) api_client.validate_api_key() + if project_name: + api_client.validate_project_name(project_name) diff --git a/src/dstack/_internal/core/backends/digitalocean/configurator.py b/src/dstack/_internal/core/backends/digitalocean/configurator.py index 292b5a38a..045372312 100644 --- a/src/dstack/_internal/core/backends/digitalocean/configurator.py +++ b/src/dstack/_internal/core/backends/digitalocean/configurator.py @@ -1,3 +1,5 @@ +from typing import Optional + from dstack._internal.core.backends.base.configurator import BackendRecord from dstack._internal.core.backends.digitalocean.backend import DigitalOceanBackend from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient @@ -22,6 +24,8 @@ def get_backend(self, record: BackendRecord) -> BaseDigitalOceanBackend: config = self._get_config(record) return DigitalOceanBackend(config=config, api_url=self.API_URL) - def _validate_creds(self, creds: AnyBaseDigitalOceanCreds): + def _validate_creds(self, creds: AnyBaseDigitalOceanCreds, project_name: Optional[str] = None): api_client = DigitalOceanAPIClient(creds.api_key, self.API_URL) api_client.validate_api_key() + if project_name: + api_client.validate_project_name(project_name) diff --git a/src/dstack/_internal/core/backends/digitalocean_base/api_client.py b/src/dstack/_internal/core/backends/digitalocean_base/api_client.py index 170631315..d9454b5df 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/api_client.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/api_client.py @@ -27,6 +27,14 @@ def validate_api_key(self) -> bool: ) raise e + def validate_project_name(self, project_name: str) -> bool: + if self.get_project_id(project_name) is None: + raise_invalid_credentials_error( + fields=[["config", "project_name"]], + details=f"Project with name '{project_name}' does not exist", + ) + return True + def list_ssh_keys(self) -> List[Dict[str, Any]]: response = self._make_request("GET", "/v2/account/keys") response.raise_for_status() @@ -83,7 +91,6 @@ def _make_request( ) -> requests.Response: url = f"{self.base_url}{endpoint}" headers = { - "Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}", } diff --git a/src/dstack/_internal/core/backends/digitalocean_base/compute.py b/src/dstack/_internal/core/backends/digitalocean_base/compute.py index d1dbefde6..b443c2007 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/compute.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/compute.py @@ -88,7 +88,6 @@ def create_instance( name=f"dstack-{instance_config.project_name}", public_key=project_ssh_key.public, ) - size_slug = instance_offer.instance.name if not instance_offer.instance.resources.gpus: @@ -96,7 +95,11 @@ def create_instance( else: backend_specific_commands = SETUP_COMMANDS - project_id = self.api_client.get_project_id(self.config.project_name) + project_id = ( + self.api_client.get_project_id(self.config.project_name) + if self.config.project_name + else None + ) droplet_config = { "name": instance_name, "region": instance_offer.region, @@ -155,9 +158,9 @@ def _get_image_for_instance(self, instance_offer: InstanceOfferWithAvailability) return "ubuntu-24-04-x64" gpu_count = len(instance_offer.instance.resources.gpus) - gpu_name = instance_offer.instance.resources.gpus[0].name + gpu_vendor = instance_offer.instance.resources.gpus[0].vendor - if gpu_name == "MI300X": + if gpu_vendor == "amd": # AMD GPU return "digitaloceanai-rocmjupyter" else: diff --git a/src/dstack/_internal/core/backends/digitalocean_base/configurator.py b/src/dstack/_internal/core/backends/digitalocean_base/configurator.py index 0612fc16e..b57559f1a 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/configurator.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/configurator.py @@ -1,4 +1,5 @@ import json +from typing import Optional from dstack._internal.core.backends.base.configurator import ( BackendRecord, @@ -20,7 +21,7 @@ class BaseDigitalOceanConfigurator(Configurator): def validate_config( self, config: BaseDigitalOceanBackendConfigWithCreds, default_creds_enabled: bool ): - self._validate_creds(config.creds) + self._validate_creds(config.creds, config.project_name) def create_backend( self, project_name: str, config: BaseDigitalOceanBackendConfigWithCreds @@ -49,5 +50,5 @@ def _get_config(self, record: BackendRecord) -> BaseDigitalOceanConfig: creds=BaseDigitalOceanCreds.parse_raw(record.auth), ) - def _validate_creds(self, creds: AnyBaseDigitalOceanCreds): + def _validate_creds(self, creds: AnyBaseDigitalOceanCreds, project_name: Optional[str] = None): pass diff --git a/src/dstack/_internal/core/backends/digitalocean_base/models.py b/src/dstack/_internal/core/backends/digitalocean_base/models.py index 84dd6f466..074d0494a 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/models.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/models.py @@ -19,10 +19,10 @@ class BaseDigitalOceanBackendConfig(CoreModel): Literal["amddevcloud", "digitalocean"], Field(description="The type of backend"), ] = "digitalocean" - project_name: Annotated[str, Field(description="The name of the DigitalOcean project")] = None + project_name: Annotated[Optional[str], Field(description="The name of the project")] = None regions: Annotated[ Optional[List[str]], - Field(description="The list of DigitalOcean regions. Omit to use all regions"), + Field(description="The list of regions. Omit to use all regions"), ] = None diff --git a/src/dstack/_internal/core/backends/models.py b/src/dstack/_internal/core/backends/models.py index b35fa2c33..1715080f8 100644 --- a/src/dstack/_internal/core/backends/models.py +++ b/src/dstack/_internal/core/backends/models.py @@ -23,7 +23,6 @@ from dstack._internal.core.backends.digitalocean_base.models import ( BaseDigitalOceanBackendConfig, BaseDigitalOceanBackendConfigWithCreds, - BaseDigitalOceanBackendFileConfigWithCreds, ) from dstack._internal.core.backends.dstack.models import ( DstackBackendConfig, @@ -129,7 +128,7 @@ CloudRiftBackendConfigWithCreds, CudoBackendConfigWithCreds, DataCrunchBackendConfigWithCreds, - BaseDigitalOceanBackendFileConfigWithCreds, + BaseDigitalOceanBackendConfigWithCreds, GCPBackendFileConfigWithCreds, HotAisleBackendFileConfigWithCreds, KubernetesBackendFileConfigWithCreds, From 566bc215c80e6c9980292776d8b56d16313e4f18 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 2 Sep 2025 16:23:11 +0545 Subject: [PATCH 05/10] Update src/dstack/_internal/core/backends/digitalocean_base/api_client.py Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> --- .../_internal/core/backends/digitalocean_base/api_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dstack/_internal/core/backends/digitalocean_base/api_client.py b/src/dstack/_internal/core/backends/digitalocean_base/api_client.py index d9454b5df..afedd36ff 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/api_client.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/api_client.py @@ -30,7 +30,7 @@ def validate_api_key(self) -> bool: def validate_project_name(self, project_name: str) -> bool: if self.get_project_id(project_name) is None: raise_invalid_credentials_error( - fields=[["config", "project_name"]], + fields=[["project_name"]], details=f"Project with name '{project_name}' does not exist", ) return True From 385bbe198c56d2fa35c41996772d7308d1c1bb32 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 2 Sep 2025 16:25:53 +0545 Subject: [PATCH 06/10] Update src/dstack/_internal/core/backends/digitalocean_base/compute.py Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> --- .../core/backends/digitalocean_base/compute.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/dstack/_internal/core/backends/digitalocean_base/compute.py b/src/dstack/_internal/core/backends/digitalocean_base/compute.py index b443c2007..5e260bf17 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/compute.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/compute.py @@ -95,11 +95,11 @@ def create_instance( else: backend_specific_commands = SETUP_COMMANDS - project_id = ( - self.api_client.get_project_id(self.config.project_name) - if self.config.project_name - else None - ) + project_id = None + if self.config.project_name: + project_id = self.api_client.get_project_id(self.config.project_name) + if project_id is None: + raise BackendError(f"Project {self.config.project_name} does not exist") droplet_config = { "name": instance_name, "region": instance_offer.region, From feeaa1cbd70686b364d6a04561663ebd699472fa Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 2 Sep 2025 16:27:44 +0545 Subject: [PATCH 07/10] Update src/dstack/_internal/core/backends/digitalocean_base/compute.py Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> --- src/dstack/_internal/core/backends/digitalocean_base/compute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dstack/_internal/core/backends/digitalocean_base/compute.py b/src/dstack/_internal/core/backends/digitalocean_base/compute.py index 5e260bf17..6ee00e2d9 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/compute.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/compute.py @@ -160,7 +160,7 @@ def _get_image_for_instance(self, instance_offer: InstanceOfferWithAvailability) gpu_count = len(instance_offer.instance.resources.gpus) gpu_vendor = instance_offer.instance.resources.gpus[0].vendor - if gpu_vendor == "amd": + if gpu_vendor == gpuhunt.AcceleratorVendor.AMD: # AMD GPU return "digitaloceanai-rocmjupyter" else: From f4c306ed04d5753cdeeb442be64d68473f5888c0 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 2 Sep 2025 16:28:59 +0545 Subject: [PATCH 08/10] Update src/dstack/_internal/core/backends/digitalocean_base/models.py Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> --- .../_internal/core/backends/digitalocean_base/models.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/dstack/_internal/core/backends/digitalocean_base/models.py b/src/dstack/_internal/core/backends/digitalocean_base/models.py index 074d0494a..4f0b43e86 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/models.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/models.py @@ -35,10 +35,6 @@ class BaseDigitalOceanBackendConfigWithCreds(BaseDigitalOceanBackendConfig): ] -class BaseDigitalOceanBackendFileConfigWithCreds(BaseDigitalOceanBackendConfig): - creds: Annotated[AnyBaseDigitalOceanCreds, Field(description="The credentials")] - - class BaseDigitalOceanStoredConfig(BaseDigitalOceanBackendConfig): pass From 2fd9d31a8976d9b06b44d8cecf5933edd84269ab Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 2 Sep 2025 16:29:13 +0545 Subject: [PATCH 09/10] Update src/dstack/_internal/core/backends/digitalocean_base/models.py Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> --- src/dstack/_internal/core/backends/digitalocean_base/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dstack/_internal/core/backends/digitalocean_base/models.py b/src/dstack/_internal/core/backends/digitalocean_base/models.py index 4f0b43e86..e3d179fcc 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/models.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/models.py @@ -18,7 +18,7 @@ class BaseDigitalOceanBackendConfig(CoreModel): type: Annotated[ Literal["amddevcloud", "digitalocean"], Field(description="The type of backend"), - ] = "digitalocean" + ] project_name: Annotated[Optional[str], Field(description="The name of the project")] = None regions: Annotated[ Optional[List[str]], From 8a5f925227c6cfe3da6ae079e219c3033719ddd9 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 2 Sep 2025 17:10:49 +0545 Subject: [PATCH 10/10] Bump gpuhunt to 0.1.8 --- pyproject.toml | 2 +- src/dstack/_internal/core/backends/digitalocean_base/compute.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a14ac0cbb..342f6571a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "python-multipart>=0.0.16", "filelock", "psutil", - "gpuhunt==0.1.7", + "gpuhunt==0.1.8", "argcomplete>=3.5.0", "ignore-python>=0.2.0", "orjson", diff --git a/src/dstack/_internal/core/backends/digitalocean_base/compute.py b/src/dstack/_internal/core/backends/digitalocean_base/compute.py index 6ee00e2d9..6a5325c54 100644 --- a/src/dstack/_internal/core/backends/digitalocean_base/compute.py +++ b/src/dstack/_internal/core/backends/digitalocean_base/compute.py @@ -12,6 +12,7 @@ from dstack._internal.core.backends.base.offers import get_catalog_offers from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig +from dstack._internal.core.errors import BackendError from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.instances import ( InstanceAvailability,