/
custom_job.proto
240 lines (202 loc) · 10.1 KB
/
custom_job.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.cloud.aiplatform.v1beta1;
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1beta1/encryption_spec.proto";
import "google/cloud/aiplatform/v1beta1/env_var.proto";
import "google/cloud/aiplatform/v1beta1/io.proto";
import "google/cloud/aiplatform/v1beta1/job_state.proto";
import "google/cloud/aiplatform/v1beta1/machine_resources.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";
import "google/api/annotations.proto";
option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1";
option go_package = "google.golang.org/genproto/googleapis/cloud/aiplatform/v1beta1;aiplatform";
option java_multiple_files = true;
option java_outer_classname = "CustomJobProto";
option java_package = "com.google.cloud.aiplatform.v1beta1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1";
option ruby_package = "Google::Cloud::AIPlatform::V1beta1";
// A job that executes a custom workload, for example a Docker container or a
// Python package. One CustomJob may hold several worker pools, and every
// worker pool can carry its own machine and input spec. The CustomJob is
// cleaned up once it reaches a terminal state (failed or succeeded).
message CustomJob {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/CustomJob"
    pattern: "projects/{project}/locations/{location}/customJobs/{custom_job}"
  };

  // Output only. The resource name of this CustomJob.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. Display name of the CustomJob.
  // It may be at most 128 characters long and may consist of any UTF-8
  // characters.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. The job spec.
  CustomJobSpec job_spec = 4 [(google.api.field_behavior) = REQUIRED];

  // Output only. Detailed state of the job.
  JobState state = 5 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Creation time of the CustomJob.
  google.protobuf.Timestamp create_time = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time at which the CustomJob first entered the
  // `JOB_STATE_RUNNING` state.
  google.protobuf.Timestamp start_time = 7
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time at which the CustomJob entered one of these states:
  // `JOB_STATE_SUCCEEDED`, `JOB_STATE_FAILED`, `JOB_STATE_CANCELLED`.
  google.protobuf.Timestamp end_time = 8
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time of the most recent update to the CustomJob.
  google.protobuf.Timestamp update_time = 9
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Populated only when the job's state is `JOB_STATE_FAILED` or
  // `JOB_STATE_CANCELLED`.
  google.rpc.Status error = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // User-defined metadata labels used to organize CustomJobs.
  //
  // Neither label keys nor label values may exceed 64 characters (Unicode
  // codepoints); they may contain only lowercase letters, numeric characters,
  // underscores and dashes. International characters are allowed.
  //
  // For more information and examples of labels, see https://goo.gl/xmQnxf.
  map<string, string> labels = 11;

  // Options for a customer-managed encryption key for this CustomJob. When
  // set, every resource created by the CustomJob is encrypted with the
  // provided encryption key.
  EncryptionSpec encryption_spec = 12;

  // Output only. Web access URIs for the training job.
  // Keys are the names of nodes in the training job, e.g. workerpool0-0.
  // Values are the URIs of each node's web portal in the job.
  map<string, string> web_access_uris = 16
      [(google.api.field_behavior) = OUTPUT_ONLY];
}
// The specification of a CustomJob.
message CustomJobSpec {
  // Required. Specs of the worker pools, including machine type and Docker
  // image. Every worker pool other than the first is optional and can be
  // skipped by providing an empty value.
  repeated WorkerPoolSpec worker_pool_specs = 1
      [(google.api.field_behavior) = REQUIRED];

  // Scheduling options for a CustomJob.
  Scheduling scheduling = 3;

  // The service account to use as the workload run-as account.
  // Users who submit jobs must have act-as permission on this run-as account.
  // When unspecified, the [AI Platform Custom Code Service
  // Agent](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents)
  // of the CustomJob's project is used.
  string service_account = 4;

  // Full name of the Compute Engine
  // [network](/compute/docs/networks-and-firewalls#networks) with which the
  // Job should be peered, for example `projects/12345/global/networks/myVPC`.
  // The [format](/compute/docs/reference/rest/v1/networks/insert)
  // is `projects/{project}/global/networks/{network}`,
  // where {project} is a project number, as in `12345`, and {network} is a
  // network name.
  //
  // Private services access must already be configured for the network. When
  // left unspecified, the job is not peered with any network.
  string network = 5 [(google.api.resource_reference) = {
    type: "compute.googleapis.com/Network"
  }];

  // Cloud Storage location in which the output of this CustomJob or
  // HyperparameterTuningJob is stored. For a HyperparameterTuningJob, the
  // baseOutputDirectory of each child CustomJob backing a Trial is set to a
  // subdirectory named after the Trial's
  // [id][google.cloud.aiplatform.v1beta1.Trial.id] under the parent
  // HyperparameterTuningJob's baseOutputDirectory.
  //
  // When this field is set, the following Vertex AI environment variables
  // are passed to containers or python modules:
  //
  //  For CustomJob:
  //
  //  * AIP_MODEL_DIR = `<base_output_directory>/model/`
  //  * AIP_CHECKPOINT_DIR = `<base_output_directory>/checkpoints/`
  //  * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/logs/`
  //
  //  For CustomJob backing a Trial of HyperparameterTuningJob:
  //
  //  * AIP_MODEL_DIR = `<base_output_directory>/<trial_id>/model/`
  //  * AIP_CHECKPOINT_DIR = `<base_output_directory>/<trial_id>/checkpoints/`
  //  * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/<trial_id>/logs/`
  GcsDestination base_output_directory = 6;

  // Optional. Name of the Vertex AI
  // [Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard] resource to
  // which this CustomJob will upload Tensorboard logs.
  // Format:
  // `projects/{project}/locations/{location}/tensorboards/{tensorboard}`
  string tensorboard = 7 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.resource_reference) = {
      type: "aiplatform.googleapis.com/Tensorboard"
    }
  ];

  // Optional. Whether Vertex AI will enable web portal access to the
  // containers. The portals can be reached on the web through the URLs given
  // by
  // [CustomJob.web_access_uris][google.cloud.aiplatform.v1beta1.CustomJob.web_access_uris].
  bool enable_web_access = 10 [(google.api.field_behavior) = OPTIONAL];
}
// The specification of one worker pool in a job.
message WorkerPoolSpec {
  // The custom task that this worker pool executes.
  oneof task {
    // A custom container task.
    ContainerSpec container_spec = 6;

    // A Python packaged task.
    PythonPackageSpec python_package_spec = 7;
  }

  // Optional. Immutable. Specification of a single machine.
  MachineSpec machine_spec = 1 [
    (google.api.field_behavior) = OPTIONAL,
    (google.api.field_behavior) = IMMUTABLE
  ];

  // Optional. Number of worker replicas to use for this worker pool.
  int64 replica_count = 2 [(google.api.field_behavior) = OPTIONAL];

  // Disk spec.
  DiskSpec disk_spec = 5;
}
// The specification of a Container.
message ContainerSpec {
  // Required. URI of the container image in the Container Registry that is
  // to be run on each worker replica.
  string image_uri = 1 [(google.api.field_behavior) = REQUIRED];

  // Command invoked when the container is started.
  // When provided, it overrides the entrypoint instruction in the Dockerfile.
  repeated string command = 2;

  // Arguments passed when starting the container.
  repeated string args = 3;
}
// The specification of Python packaged code.
message PythonPackageSpec {
  // Required. URI of the container image in Artifact Registry that will run
  // the provided Python package. Vertex AI offers a wide range of executor
  // images with pre-installed packages to meet users' various use cases; see
  // the list of [pre-built containers for
  // training](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers).
  // You must use an image from this list.
  string executor_image_uri = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Google Cloud Storage location of the Python package files,
  // which are the training program and its dependent packages.
  // At most 100 package URIs may be given.
  repeated string package_uris = 2
      [(google.api.field_behavior) = REQUIRED];

  // Required. Name of the Python module to run after the packages are
  // installed.
  string python_module = 3 [(google.api.field_behavior) = REQUIRED];

  // Command line arguments passed to the Python task.
  repeated string args = 4;
}
// Every parameter that relates to the queuing and scheduling of custom jobs.
message Scheduling {
  // Maximum running time of the job. Defaults to 7 days.
  google.protobuf.Duration timeout = 1;

  // Restarts the whole CustomJob whenever a worker gets restarted.
  // Distributed training jobs that are not resilient to workers leaving and
  // joining a job can use this feature.
  bool restart_job_on_worker_restart = 3;
}