Add tpu support in gcp (#1323)

Bihan · Bihan  Rana · web-flow · commit 71365d5eb1a8 · 2024-06-13T11:38:23.000+05:00
* Add TPU support in gcp

* Filter TPU Pods for initial release

* Modify pretty_resources for TPU

---------

Co-authored-by: Bihan  Rana <bihan@Bihans-MacBook-Pro.local>
diff --git a/setup.py b/setup.py
@@ -91,6 +91,7 @@ def get_long_description():
     "google-cloud-logging>=2.0.0",
     "google-api-python-client>=2.80.0",
     "google-cloud-billing>=1.11.0",
+    "google-cloud-tpu>=1.18.3",
 ]
 
 DATACRUNCH_DEPS = ["datacrunch"]
diff --git a/src/dstack/_internal/core/backends/gcp/compute.py b/src/dstack/_internal/core/backends/gcp/compute.py
@@ -4,13 +4,15 @@
 
 import google.api_core.exceptions
 import google.cloud.compute_v1 as compute_v1
+from google.cloud import tpu_v2
 
 import dstack._internal.core.backends.gcp.auth as auth
 import dstack._internal.core.backends.gcp.resources as gcp_resources
 from dstack._internal.core.backends.base.compute import (
     Compute,
     get_gateway_user_data,
     get_instance_name,
+    get_shim_commands,
     get_user_data,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -45,6 +47,7 @@ def __init__(self, config: GCPConfig):
         self.firewalls_client = compute_v1.FirewallsClient(credentials=self.credentials)
         self.regions_client = compute_v1.RegionsClient(credentials=self.credentials)
         self.subnetworks_client = compute_v1.SubnetworksClient(credentials=self.credentials)
+        self.tpu_client = tpu_v2.TpuClient(credentials=self.credentials)
 
     def get_offers(
         self, requirements: Optional[Requirements] = None
@@ -70,7 +73,7 @@ def get_offers(
             availability = InstanceAvailability.NO_QUOTA
             if _has_gpu_quota(quotas[region], offer.instance.resources):
                 availability = InstanceAvailability.UNKNOWN
-            # todo quotas: cpu, memory, global gpu
+            # todo quotas: cpu, memory, global gpu, tpu
             offers_with_availability.append(
                 InstanceOfferWithAvailability(**offer.dict(), availability=availability)
             )
@@ -84,13 +87,22 @@ def terminate_instance(
         # Old instances have region set to zone, e.g. us-central1-a.
         # New instance have region set to region, e.g. us-central1. Zone is stored in backend_data.
         zone = region
+        is_tpu = False
         if backend_data is not None:
             backend_data_dict = json.loads(backend_data)
             zone = backend_data_dict["zone"]
+            is_tpu = backend_data_dict.get("is_tpu", False)
         try:
-            self.instances_client.delete(
-                project=self.config.project_id, zone=zone, instance=instance_id
-            )
+            if is_tpu:
+                name = f"projects/{self.project_id}/locations/{zone}/nodes/{instance_id}"
+                delete_request = tpu_v2.DeleteNodeRequest(
+                    name=name,
+                )
+                self.tpu_client.delete_node(request=delete_request)
+            else:
+                self.instances_client.delete(
+                    project=self.config.project_id, zone=zone, instance=instance_id
+                )
         except google.api_core.exceptions.NotFound:
             pass
 
@@ -120,21 +132,74 @@ def create_instance(
                 network=self.config.vpc_resource_name,
             )
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
-
         # Choose any usable subnet in a VPC.
         # Configuring a specific subnet per region is not supported yet.
         subnetwork = _get_vpc_subnet(
             subnetworks_client=self.subnetworks_client,
             config=self.config,
             region=instance_offer.region,
         )
+        commands = get_shim_commands(authorized_keys=authorized_keys)
+        startup_script = " ".join([" && ".join(commands)])
+        startup_script = "#! /bin/bash\n" + startup_script
+        instance_id = f"tpu-{instance_config.instance_name}"
 
         labels = {
             "owner": "dstack",
             "dstack_project": instance_config.project_name.lower(),
             "dstack_user": instance_config.user.lower(),
         }
         labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}
+        tpu = (
+            _is_tpu(instance_offer.instance.resources.gpus[0].name)
+            if instance_offer.instance.resources.gpus
+            else False
+        )
+        if tpu:
+            for zone in _get_instance_zones(instance_offer):
+                tpu_node = gcp_resources.create_tpu_node_struct(
+                    instance_name=instance_offer.instance.name,
+                    startup_script=startup_script,
+                    authorized_keys=authorized_keys,
+                    spot=instance_offer.instance.resources.spot,
+                    labels=labels,
+                )
+
+                create_node_request = tpu_v2.CreateNodeRequest(
+                    parent=f"projects/{self.config.project_id}/locations/{zone}",
+                    node_id=instance_id,
+                    node=tpu_node,
+                )
+                try:
+                    operation = self.tpu_client.create_node(request=create_node_request)
+                    gcp_resources.wait_for_operation(
+                        operation, verbose_name="tpu instance creation"
+                    )
+                except (
+                    google.api_core.exceptions.ServiceUnavailable,
+                    google.api_core.exceptions.NotFound,
+                    google.api_core.exceptions.ResourceExhausted,
+                ):
+                    continue
+                node_request = tpu_v2.GetNodeRequest(
+                    name=f"projects/dstack/locations/{zone}/nodes/{instance_id}",
+                )
+                instance = self.tpu_client.get_node(request=node_request)
+                return JobProvisioningData(
+                    backend=instance_offer.backend,
+                    instance_type=instance_offer.instance,
+                    instance_id=instance_id,
+                    hostname=instance.network_endpoints[0].access_config.external_ip,
+                    internal_ip=None,
+                    region=zone,
+                    price=instance_offer.price,
+                    ssh_port=22,
+                    username="ubuntu",
+                    ssh_proxy=None,
+                    dockerized=True,
+                    backend_data=json.dumps({"is_tpu": tpu, "zone": zone}),
+                )
+            raise NoCapacityError()
 
         for zone in _get_instance_zones(instance_offer):
             request = compute_v1.InsertInstanceRequest()
@@ -301,6 +366,9 @@ def _filter(offer: InstanceOffer) -> bool:
         # strip zone
         if offer.region[:-2] not in regions:
             return False
+        # remove TPU Pod for initial release
+        if _is_tpu(f"tpu-{offer.instance.name}") and _is_pod(offer.instance.name):
+            return False
         for family in [
             "e2-medium",
             "e2-standard-",
@@ -324,6 +392,8 @@ def _has_gpu_quota(quotas: Dict[str, float], resources: Resources) -> bool:
     if not resources.gpus:
         return True
     gpu = resources.gpus[0]
+    if _is_tpu(gpu.name):
+        return True
     quota_name = f"NVIDIA_{gpu.name}_GPUS"
     if gpu.name == "A100" and gpu.memory_mib == 80 * 1024:
         quota_name = "NVIDIA_A100_80GB_GPUS"
@@ -352,3 +422,31 @@ def _get_instance_zones(instance_offer: InstanceOffer) -> List[str]:
             continue
         zones.append(offer.region)
     return zones
+
+
+def _is_tpu(name: str) -> bool:
+    tpu_versions = ["tpu-v2", "tpu-v3", "tpu-v4", "tpu-v5p", "tpu-v5litepod"]
+    parts = name.split("-")
+    if len(parts) == 3:
+        version = f"{parts[0]}-{parts[1]}"
+        cores = parts[2]
+        if version in tpu_versions and cores.isdigit():
+            return True
+    return False
+
+
+def _is_pod(instance_name: str) -> bool:
+    parts = instance_name.split("-")
+    if len(parts) != 2:
+        raise ValueError(f"Invalid tpu type: {instance_name}")
+    version, tensor_cores = parts
+    try:
+        tensor_cores = int(tensor_cores)
+    except ValueError:
+        raise ValueError(f"Invalid number in tpu tensor cores: {tensor_cores}")
+    if version in ["v2", "v3"]:
+        return tensor_cores > 8
+    elif version in ["v4", "v5p", "v5litepod"]:
+        return True
+    else:
+        raise ValueError(f"Unknown TPU version: {version}")
diff --git a/src/dstack/_internal/core/backends/gcp/resources.py b/src/dstack/_internal/core/backends/gcp/resources.py
@@ -6,6 +6,8 @@
 import google.api_core.exceptions
 import google.cloud.compute_v1 as compute_v1
 from google.api_core.extended_operation import ExtendedOperation
+from google.api_core.operation import Operation
+from google.cloud import tpu_v2
 
 import dstack.version as version
 from dstack._internal.core.errors import ComputeError
@@ -278,3 +280,32 @@ def generate_random_resource_name(length: int = 40) -> str:
     return random.choice(string.ascii_lowercase) + "".join(
         random.choice(string.ascii_lowercase + string.digits) for _ in range(length)
     )
+
+
+def create_tpu_node_struct(
+    instance_name: str,
+    startup_script: str,
+    authorized_keys: List[str],
+    spot: bool,
+    labels: Dict[str, str],
+) -> tpu_v2.Node:
+    node = tpu_v2.Node()
+    if spot:
+        node.scheduling_config = tpu_v2.SchedulingConfig(preemptible=True)
+    node.accelerator_type = instance_name
+    node.runtime_version = "tpu-ubuntu2204-base"
+    node.network_config = tpu_v2.NetworkConfig(enable_external_ips=True)
+    ssh_keys = "\n".join(f"ubuntu:{key}" for key in authorized_keys)
+    node.metadata = {"ssh-keys": ssh_keys, "startup-script": startup_script}
+    node.labels = labels
+    return node
+
+
+def wait_for_operation(operation: Operation, verbose_name: str = "operation", timeout: int = 300):
+    try:
+        result = operation.result(timeout=timeout)
+    except Exception as e:
+        logger.error("Error during %s: %s", verbose_name, e)
+        logger.error("Operation ID: %s", operation)
+        raise operation.exception() or RuntimeError(str(e))
+    return result
diff --git a/src/dstack/_internal/utils/common.py b/src/dstack/_internal/utils/common.py
@@ -85,13 +85,27 @@ def pretty_resources(
     """
     parts = []
     if cpus is not None:
-        parts.append(f"{cpus}xCPU")
+        if isinstance(cpus, int):
+            if cpus > 0:
+                parts.append(f"{cpus}xCPU")
+        else:
+            parts.append(f"{cpus}xCPU")
     if memory is not None:
-        parts.append(f"{memory}")
+        if isinstance(memory, str):
+            memory_value = int(memory[:-2])
+            if memory_value > 0:
+                parts.append(f"{memory}")
+        else:
+            parts.append(f"{memory}")
     if gpu_count:
         gpu_parts = []
         if gpu_memory is not None:
-            gpu_parts.append(f"{gpu_memory}")
+            if isinstance(gpu_memory, str):
+                gpu_memory_value = int(gpu_memory[:-2])
+                if gpu_memory_value > 0:
+                    parts.append(f"{gpu_memory}")
+            else:
+                gpu_parts.append(f"{gpu_memory}")
         if total_gpu_memory is not None:
             gpu_parts.append(f"total {total_gpu_memory}")
         if compute_capability is not None:

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -91,6 +91,7 @@ def get_long_description():` |
| `91` | `91` | `"google-cloud-logging>=2.0.0",` |
| `92` | `92` | `"google-api-python-client>=2.80.0",` |
| `93` | `93` | `"google-cloud-billing>=1.11.0",` |
|  | `94` | `+ "google-cloud-tpu>=1.18.3",` |
| `94` | `95` | `]` |
| `95` | `96` |  |
| `96` | `97` | `DATACRUNCH_DEPS = ["datacrunch"]` |