google/cloud/tpu/v1/cloud_tpu.proto

// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.tpu.v1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/tpu/apiv1/tpupb;tpupb";
option java_multiple_files = true;
option java_outer_classname = "CloudTpuProto";
option java_package = "com.google.cloud.tpu.v1";

// Manages TPU nodes and other resources
//
// TPU API v1
service Tpu {
  option (google.api.default_host) = "tpu.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Lists nodes.
  rpc ListNodes(ListNodesRequest) returns (ListNodesResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/nodes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets the details of a node.
  rpc GetNode(GetNodeRequest) returns (Node) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Creates a node.
  rpc CreateNode(CreateNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{parent=projects/*/locations/*}/nodes"
      body: "node"
    };
    option (google.api.method_signature) = "parent,node,node_id";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Deletes a node.
  rpc DeleteNode(DeleteNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      delete: "/v1/{name=projects/*/locations/*/nodes/*}"
    };
    option (google.api.method_signature) = "name";
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Reimages a node's OS.
  rpc ReimageNode(ReimageNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:reimage"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Stops a node, this operation is only available with single TPU nodes.
  rpc StopNode(StopNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:stop"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // Starts a node.
  rpc StartNode(StartNodeRequest) returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/nodes/*}:start"
      body: "*"
    };
    option (google.longrunning.operation_info) = {
      response_type: "Node"
      metadata_type: "OperationMetadata"
    };
  }

  // List TensorFlow versions supported by this API.
  rpc ListTensorFlowVersions(ListTensorFlowVersionsRequest)
      returns (ListTensorFlowVersionsResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/tensorflowVersions"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets TensorFlow Version.
  rpc GetTensorFlowVersion(GetTensorFlowVersionRequest)
      returns (TensorFlowVersion) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/tensorflowVersions/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Lists accelerator types supported by this API.
  rpc ListAcceleratorTypes(ListAcceleratorTypesRequest)
      returns (ListAcceleratorTypesResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/acceleratorTypes"
    };
    option (google.api.method_signature) = "parent";
  }

  // Gets AcceleratorType.
  rpc GetAcceleratorType(GetAcceleratorTypeRequest) returns (AcceleratorType) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/acceleratorTypes/*}"
    };
    option (google.api.method_signature) = "name";
  }
}

// Sets the scheduling options for this node.
message SchedulingConfig {
  // Defines whether the node is preemptible.
  bool preemptible = 1;

  // Whether the node is created under a reservation.
  bool reserved = 2;
}

// A network endpoint over which a TPU worker can be reached.
message NetworkEndpoint {
  // The IP address of this network endpoint.
  string ip_address = 1;

  // The port of this network endpoint.
  int32 port = 2;
}

// A TPU instance.
message Node {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/Node"
    pattern: "projects/{project}/locations/{location}/nodes/{node}"
  };

  // Represents the different states of a TPU node during its lifecycle.
  enum State {
    // TPU node state is not known/set.
    STATE_UNSPECIFIED = 0;

    // TPU node is being created.
    CREATING = 1;

    // TPU node has been created.
    READY = 2;

    // TPU node is restarting.
    RESTARTING = 3;

    // TPU node is undergoing reimaging.
    REIMAGING = 4;

    // TPU node is being deleted.
    DELETING = 5;

    // TPU node is being repaired and may be unusable. Details can be
    // found in the `help_description` field.
    REPAIRING = 6;

    // TPU node is stopped.
    STOPPED = 8;

    // TPU node is currently stopping.
    STOPPING = 9;

    // TPU node is currently starting.
    STARTING = 10;

    // TPU node has been preempted. Only applies to Preemptible TPU Nodes.
    PREEMPTED = 11;

    // TPU node has been terminated due to maintenance or has reached the end of
    // its life cycle (for preemptible nodes).
    TERMINATED = 12;

    // TPU node is currently hiding.
    HIDING = 13;

    // TPU node has been hidden.
    HIDDEN = 14;

    // TPU node is currently unhiding.
    UNHIDING = 15;
  }

  // Health defines the status of a TPU node as reported by
  // Health Monitor.
  enum Health {
    // Health status is unknown: not initialized or failed to retrieve.
    HEALTH_UNSPECIFIED = 0;

    // The resource is healthy.
    HEALTHY = 1;

    // The resource is unhealthy.
    DEPRECATED_UNHEALTHY = 2;

    // The resource is unresponsive.
    TIMEOUT = 3;

    // The in-guest ML stack is unhealthy.
    UNHEALTHY_TENSORFLOW = 4;

    // The node is under maintenance/priority boost caused rescheduling and
    // will resume running once rescheduled.
    UNHEALTHY_MAINTENANCE = 5;
  }

  // TPU API Version.
  enum ApiVersion {
    // API version is unknown.
    API_VERSION_UNSPECIFIED = 0;

    // TPU API V1Alpha1 version.
    V1_ALPHA1 = 1;

    // TPU API V1 version.
    V1 = 2;

    // TPU API V2Alpha1 version.
    V2_ALPHA1 = 3;
  }

  // Output only. Immutable. The name of the TPU
  string name = 1 [
    (google.api.field_behavior) = IMMUTABLE,
    (google.api.field_behavior) = OUTPUT_ONLY
  ];

  // The user-supplied description of the TPU. Maximum of 512 characters.
  string description = 3;

  // Required. The type of hardware accelerators associated with this node.
  string accelerator_type = 5 [(google.api.field_behavior) = REQUIRED];

  // Output only. DEPRECATED! Use network_endpoints instead.
  // The network address for the TPU Node as visible to Compute Engine
  // instances.
  string ip_address = 8 [deprecated = true];

  // Output only. DEPRECATED! Use network_endpoints instead.
  // The network port for the TPU Node as visible to Compute Engine instances.
  string port = 14 [deprecated = true];

  // Output only. The current state for the TPU Node.
  State state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. If this field is populated, it contains a description of why
  // the TPU Node is unhealthy.
  string health_description = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The version of Tensorflow running in the Node.
  string tensorflow_version = 11 [(google.api.field_behavior) = REQUIRED];

  // The name of a network they wish to peer the TPU node to. It must be a
  // preexisting Compute Engine network inside of the project on which this API
  // has been activated. If none is provided, "default" will be used.
  string network = 12;

  // The CIDR block that the TPU node will use when selecting an IP address.
  // This CIDR block must be a /29 block; the Compute Engine networks API
  // forbids a smaller block, and using a larger block would be wasteful (a
  // node can only consume one IP address). Errors will occur if the CIDR block
  // has already been used for a currently existing TPU node, the CIDR block
  // conflicts with any subnetworks in the user's provided network, or the
  // provided network is peered with another network that is using that CIDR
  // block.
  string cidr_block = 13;

  // Output only. The service account used to run the tensor flow services
  // within the node. To share resources, including Google Cloud Storage data,
  // with the Tensorflow job running in the Node, this account must have
  // permissions to that data.
  string service_account = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the node was created.
  google.protobuf.Timestamp create_time = 16
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The scheduling options for this node.
  SchedulingConfig scheduling_config = 17;

  // Output only. The network endpoints where TPU workers can be accessed and
  // sent work. It is recommended that Tensorflow clients of the node reach out
  // to the 0th entry in this map first.
  repeated NetworkEndpoint network_endpoints = 21
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // The health status of the TPU node.
  Health health = 22;

  // Resource labels to represent user-provided metadata.
  map<string, string> labels = 24;

  // Whether the VPC peering for the node is set up through Service Networking
  // API. The VPC Peering should be set up before provisioning the node.
  // If this field is set, cidr_block field should not be specified. If the
  // network, that you want to peer the TPU Node to, is Shared VPC networks,
  // the node must be created with this this field enabled.
  bool use_service_networking = 27;

  // Output only. The API version that created this Node.
  ApiVersion api_version = 38 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The Symptoms that have occurred to the TPU Node.
  repeated Symptom symptoms = 39 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Request for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
message ListNodesRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { child_type: "tpu.googleapis.com/Node" }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;
}

// Response for [ListNodes][google.cloud.tpu.v1.Tpu.ListNodes].
message ListNodesResponse {
  // The listed nodes.
  repeated Node nodes = 1;

  // The next page token or empty if none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// Request for [GetNode][google.cloud.tpu.v1.Tpu.GetNode].
message GetNodeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [CreateNode][google.cloud.tpu.v1.Tpu.CreateNode].
message CreateNodeRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { child_type: "tpu.googleapis.com/Node" }
  ];

  // The unqualified resource name.
  string node_id = 2;

  // Required. The node.
  Node node = 3 [(google.api.field_behavior) = REQUIRED];
}

// Request for [DeleteNode][google.cloud.tpu.v1.Tpu.DeleteNode].
message DeleteNodeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = { type: "tpu.googleapis.com/Node" }
  ];
}

// Request for [ReimageNode][google.cloud.tpu.v1.Tpu.ReimageNode].
message ReimageNodeRequest {
  // The resource name.
  string name = 1;

  // The version for reimage to create.
  string tensorflow_version = 2;
}

// Request for [StopNode][google.cloud.tpu.v1.Tpu.StopNode].
message StopNodeRequest {
  // The resource name.
  string name = 1;
}

// Request for [StartNode][google.cloud.tpu.v1.Tpu.StartNode].
message StartNodeRequest {
  // The resource name.
  string name = 1;
}

// A tensorflow version that a Node can be configured with.
message TensorFlowVersion {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/TensorFlowVersion"
    pattern: "projects/{project}/locations/{location}/tensorFlowVersions/{tensor_flow_version}"
  };

  // The resource name.
  string name = 1;

  // the tensorflow version.
  string version = 2;
}

// Request for
// [GetTensorFlowVersion][google.cloud.tpu.v1.Tpu.GetTensorFlowVersion].
message GetTensorFlowVersionRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/TensorFlowVersion"
    }
  ];
}

// Request for
// [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
message ListTensorFlowVersionsRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/TensorFlowVersion"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}

// Response for
// [ListTensorFlowVersions][google.cloud.tpu.v1.Tpu.ListTensorFlowVersions].
message ListTensorFlowVersionsResponse {
  // The listed nodes.
  repeated TensorFlowVersion tensorflow_versions = 1;

  // The next page token or empty if none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// A accelerator type that a Node can be configured with.
message AcceleratorType {
  option (google.api.resource) = {
    type: "tpu.googleapis.com/AcceleratorType"
    pattern: "projects/{project}/locations/{location}/acceleratorTypes/{accelerator_type}"
  };

  // The resource name.
  string name = 1;

  // the accelerator type.
  string type = 2;
}

// Request for [GetAcceleratorType][google.cloud.tpu.v1.Tpu.GetAcceleratorType].
message GetAcceleratorTypeRequest {
  // Required. The resource name.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/AcceleratorType"
    }
  ];
}

// Request for
// [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesRequest {
  // Required. The parent resource name.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "tpu.googleapis.com/AcceleratorType"
    }
  ];

  // The maximum number of items to return.
  int32 page_size = 2;

  // The next_page_token value returned from a previous List request, if any.
  string page_token = 3;

  // List filter.
  string filter = 5;

  // Sort results.
  string order_by = 6;
}

// Response for
// [ListAcceleratorTypes][google.cloud.tpu.v1.Tpu.ListAcceleratorTypes].
message ListAcceleratorTypesResponse {
  // The listed nodes.
  repeated AcceleratorType accelerator_types = 1;

  // The next page token or empty if none.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}

// Metadata describing an [Operation][google.longrunning.Operation]
message OperationMetadata {
  // The time the operation was created.
  google.protobuf.Timestamp create_time = 1;

  // The time the operation finished running.
  google.protobuf.Timestamp end_time = 2;

  // Target of the operation - for example
  // projects/project-1/connectivityTests/test-1
  string target = 3;

  // Name of the verb executed by the operation.
  string verb = 4;

  // Human-readable status of the operation, if any.
  string status_detail = 5;

  // Specifies if cancellation was requested for the operation.
  bool cancel_requested = 6;

  // API version.
  string api_version = 7;
}

// A Symptom instance.
message Symptom {
  // SymptomType represents the different types of Symptoms that a TPU can be
  // at.
  enum SymptomType {
    // Unspecified symptom.
    SYMPTOM_TYPE_UNSPECIFIED = 0;

    // TPU VM memory is low.
    LOW_MEMORY = 1;

    // TPU runtime is out of memory.
    OUT_OF_MEMORY = 2;

    // TPU runtime execution has timed out.
    EXECUTE_TIMED_OUT = 3;

    // TPU runtime fails to construct a mesh that recognizes each TPU device's
    // neighbors.
    MESH_BUILD_FAIL = 4;

    // TPU HBM is out of memory.
    HBM_OUT_OF_MEMORY = 5;

    // Abusive behaviors have been identified on the current project.
    PROJECT_ABUSE = 6;
  }

  // Timestamp when the Symptom is created.
  google.protobuf.Timestamp create_time = 1;

  // Type of the Symptom.
  SymptomType symptom_type = 2;

  // Detailed information of the current Symptom.
  string details = 3;

  // A string used to uniquely distinguish a worker within a TPU node.
  string worker_id = 4;
}