-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
pipelines.proto
291 lines (245 loc) · 11.1 KB
/
pipelines.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.cloud.contentwarehouse.v1;
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/contentwarehouse/v1/common.proto";
import "google/iam/v1/policy.proto";
import "google/rpc/status.proto";
option csharp_namespace = "Google.Cloud.ContentWarehouse.V1";
option go_package = "cloud.google.com/go/contentwarehouse/apiv1/contentwarehousepb;contentwarehousepb";
option java_multiple_files = true;
option java_outer_classname = "PipelinesProto";
option java_package = "com.google.cloud.contentwarehouse.v1";
option php_namespace = "Google\\Cloud\\ContentWarehouse\\V1";
option ruby_package = "Google::Cloud::ContentWarehouse::V1";
option (google.api.resource_definition) = {
type: "cloudfunctions.googleapis.com/CloudFunction"
pattern: "projects/{project}/locations/{location}/functions/{function}"
};
// Response message of the RunPipeline method.
// Intentionally empty today; new fields can be added later without breaking
// callers, which is why a dedicated response type is used instead of
// google.protobuf.Empty.
message RunPipelineResponse {}
// Metadata message of the RunPipeline method, reported while a pipeline
// long-running operation is in progress. Exactly one of the nested
// `*PipelineMetadata` messages is populated, matching the pipeline variant
// that was requested.
message RunPipelineMetadata {
  // The metadata message for the GcsIngest pipeline.
  message GcsIngestPipelineMetadata {
    // The input Cloud Storage folder in this pipeline.
    // Format: `gs://<bucket-name>/<folder-name>`.
    string input_path = 1;
  }

  // The metadata message for the Export-to-CDW pipeline.
  message ExportToCdwPipelineMetadata {
    // The input list of all the resource names of the documents to be exported.
    repeated string documents = 1;

    // The output CDW dataset resource name.
    string doc_ai_dataset = 2;

    // The output Cloud Storage folder in this pipeline.
    string output_path = 3;
  }

  // The metadata message for the Process-with-DocAi pipeline.
  message ProcessWithDocAiPipelineMetadata {
    // The input list of all the resource names of the documents to be
    // processed.
    repeated string documents = 1;

    // The DocAI processor to process the documents with.
    ProcessorInfo processor_info = 2;
  }

  // The status of processing a single document.
  message IndividualDocumentStatus {
    // Document identifier of an existing document.
    string document_id = 1;

    // The status of processing the document.
    google.rpc.Status status = 2;
  }

  // Number of files that were processed by the pipeline.
  int32 total_file_count = 1;

  // Number of files that have failed at some point in the pipeline.
  int32 failed_file_count = 2;

  // User unique identification and groups information.
  UserInfo user_info = 3;

  // The pipeline metadata. Which member is set identifies the pipeline kind.
  // Field number 5 is absent here because it is used by
  // `individual_document_statuses` below; do not reuse it.
  oneof pipeline_metadata {
    // The pipeline metadata for the GcsIngest pipeline.
    GcsIngestPipelineMetadata gcs_ingest_pipeline_metadata = 4;

    // The pipeline metadata for the Export-to-CDW pipeline.
    ExportToCdwPipelineMetadata export_to_cdw_pipeline_metadata = 6;

    // The pipeline metadata for the Process-with-DocAi pipeline.
    ProcessWithDocAiPipelineMetadata process_with_doc_ai_pipeline_metadata = 7;
  }

  // The list of response details of each document.
  repeated IndividualDocumentStatus individual_document_statuses = 5;
}
// The DocAI processor information: which processor to run and which Document
// Warehouse schema/type to apply to its output.
message ProcessorInfo {
  // The processor resource name.
  // Format is `projects/{project}/locations/{location}/processors/{processor}`,
  // or
  // `projects/{project}/locations/{location}/processors/{processor}/processorVersions/{processorVersion}`
  string processor_name = 1;

  // The processor will process the documents with this document type.
  string document_type = 2;

  // The Document schema resource name. All documents processed by this
  // processor will use this schema.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string schema_name = 3;
}
// The ingestion pipeline config, shared by the Cloud Storage ingestion
// pipelines to customize ACLs, text extraction, folder linking, and an
// optional Cloud Function transform hook.
message IngestPipelineConfig {
  // The document level acl policy config.
  // This refers to an Identity and Access (IAM) policy, which specifies access
  // controls for all documents ingested by the pipeline. The
  // [role][google.iam.v1.Binding.role] and
  // [members][google.iam.v1.Binding.members] under the policy need to be
  // specified.
  //
  // The following roles are supported for document level acl control:
  // * roles/contentwarehouse.documentAdmin
  // * roles/contentwarehouse.documentEditor
  // * roles/contentwarehouse.documentViewer
  //
  // The following members are supported for document level acl control:
  // * user:user-email@example.com
  // * group:group-email@example.com
  // Note that for documents searched with LLM, only single level user or group
  // acl check is supported.
  google.iam.v1.Policy document_acl_policy = 1;

  // The document text extraction enabled flag.
  // If the flag is set to true, DWH will perform text extraction on the raw
  // document.
  bool enable_document_text_extraction = 2;

  // Optional. The name of the folder to which all ingested documents will be
  // linked during the ingestion process. Format is
  // `projects/{project}/locations/{location}/documents/{folder_id}`
  string folder = 3 [(google.api.field_behavior) = OPTIONAL];

  // The Cloud Function resource name. The Cloud Function needs to live inside
  // the consumer project and be accessible to the Document AI Warehouse P4SA.
  // Only Cloud Functions V2 is supported. Cloud function execution should
  // complete within 5 minutes or this file ingestion may fail due to timeout.
  // Format: `https://{region}-{project_id}.cloudfunctions.net/{cloud_function}`
  //
  // The following keys are available in the request json payload.
  // * display_name
  // * properties
  // * plain_text
  // * reference_id
  // * document_schema_name
  // * raw_document_path
  // * raw_document_file_type
  //
  // The following keys from the cloud function json response payload will be
  // ingested to the Document AI Warehouse as part of Document proto content
  // and/or related information. The original values will be overridden if any
  // key is present in the response.
  // * display_name
  // * properties
  // * plain_text
  // * document_acl_policy
  // * folder
  string cloud_function = 4 [(google.api.resource_reference) = {
    type: "cloudfunctions.googleapis.com/CloudFunction"
  }];
}
// The configuration of the Cloud Storage Ingestion pipeline.
message GcsIngestPipeline {
  // The input Cloud Storage folder. All files under this folder will be
  // imported to Document Warehouse.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string input_path = 1;

  // The Document Warehouse schema resource name. All documents processed by
  // this pipeline will use this schema.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string schema_name = 2;

  // The Doc AI processor type name. Only used when the format of ingested
  // files is Doc AI Document proto format.
  string processor_type = 3;

  // The flag whether to skip already-ingested documents.
  // If it is set to true, documents in Cloud Storage whose custom metadata
  // contains the key "status" with the value "status=ingested" will be
  // skipped during ingestion.
  bool skip_ingested_documents = 4;

  // Optional. The config for the Cloud Storage Ingestion pipeline.
  // It provides additional customization options to run the pipeline and can be
  // skipped if it is not applicable.
  IngestPipelineConfig pipeline_config = 5
      [(google.api.field_behavior) = OPTIONAL];
}
// The configuration of the Cloud Storage Ingestion with DocAI Processors
// pipeline: files are split/classified, routed to a matching extract
// processor, and the results are ingested into Document Warehouse.
message GcsIngestWithDocAiProcessorsPipeline {
  // The input Cloud Storage folder. All files under this folder will be
  // imported to Document Warehouse.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string input_path = 1;

  // The split and classify processor information.
  // The split and classify result will be used to find a matched extract
  // processor.
  ProcessorInfo split_classify_processor_info = 2;

  // The extract processors information.
  // One matched extract processor will be used to process documents based on
  // the classify processor result. If no classify processor is specified, the
  // first extract processor will be used.
  repeated ProcessorInfo extract_processor_infos = 3;

  // The Cloud Storage folder path used to store the raw results from
  // processors.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string processor_results_folder_path = 4;

  // The flag whether to skip already-ingested documents.
  // If it is set to true, documents in Cloud Storage whose custom metadata
  // contains the key "status" with the value "status=ingested" will be
  // skipped during ingestion.
  bool skip_ingested_documents = 5;

  // Optional. The config for the Cloud Storage Ingestion with DocAI Processors
  // pipeline. It provides additional customization options to run the pipeline
  // and can be skipped if it is not applicable.
  IngestPipelineConfig pipeline_config = 6
      [(google.api.field_behavior) = OPTIONAL];
}
// The configuration of the pipeline that exports documents from the Document
// Warehouse to CDW (Document AI Workbench).
message ExportToCdwPipeline {
  // The list of all the resource names of the documents to be processed.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  repeated string documents = 1;

  // The Cloud Storage folder path used to store the exported documents before
  // being sent to CDW.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string export_folder_path = 2;

  // Optional. The CDW dataset resource name. This field is optional. If not
  // set, the documents will be exported to Cloud Storage only. Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string doc_ai_dataset = 3 [(google.api.field_behavior) = OPTIONAL];

  // Ratio of training dataset split. When importing into Document AI Workbench,
  // documents will be automatically split into training and test split category
  // with the specified ratio. This field is required if doc_ai_dataset is set.
  // NOTE(review): valid range is not stated here — presumably (0.0, 1.0];
  // confirm against the service documentation.
  float training_split_ratio = 4;
}
// The configuration of the pipeline that processes documents already in the
// Document Warehouse with DocAI processors.
message ProcessWithDocAiPipeline {
  // The list of all the resource names of the documents to be processed.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  repeated string documents = 1;

  // The Cloud Storage folder path used to store the exported documents before
  // being sent to CDW.
  // NOTE(review): this comment mirrors ExportToCdwPipeline.export_folder_path;
  // in this pipeline the documents are presumably staged here before DocAI
  // processing — confirm the CDW wording is intended.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string export_folder_path = 2;

  // The CDW processor information.
  ProcessorInfo processor_info = 3;

  // The Cloud Storage folder path used to store the raw results from
  // processors.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string processor_results_folder_path = 4;
}