-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
pipelines.proto
291 lines (245 loc) · 11.1 KB
/
pipelines.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.cloud.contentwarehouse.v1;
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/contentwarehouse/v1/common.proto";
import "google/iam/v1/policy.proto";
import "google/rpc/status.proto";
option csharp_namespace = "Google.Cloud.ContentWarehouse.V1";
option go_package = "cloud.google.com/go/contentwarehouse/apiv1/contentwarehousepb;contentwarehousepb";
option java_multiple_files = true;
option java_outer_classname = "PipelinesProto";
option java_package = "com.google.cloud.contentwarehouse.v1";
option php_namespace = "Google\\Cloud\\ContentWarehouse\\V1";
option ruby_package = "Google::Cloud::ContentWarehouse::V1";
option (google.api.resource_definition) = {
type: "cloudfunctions.googleapis.com/CloudFunction"
pattern: "projects/{project}/locations/{location}/functions/{function}"
};
// Response message of the RunPipeline method.
// Intentionally empty today; new fields can be added later without breaking
// callers, which is why a dedicated response type is used instead of
// google.protobuf.Empty.
message RunPipelineResponse {}
// Metadata message of the RunPipeline method, reported while a pipeline
// long-running operation is in progress. Exactly one of the nested
// `*PipelineMetadata` messages is populated, matching the pipeline variant
// that was requested.
message RunPipelineMetadata {
  // The metadata message for the GcsIngest pipeline.
  message GcsIngestPipelineMetadata {
    // The input Cloud Storage folder in this pipeline.
    // Format: `gs://<bucket-name>/<folder-name>`.
    string input_path = 1;
  }

  // The metadata message for the Export-to-CDW pipeline.
  message ExportToCdwPipelineMetadata {
    // The input list of all the resource names of the documents to be exported.
    repeated string documents = 1;

    // The output CDW dataset resource name.
    string doc_ai_dataset = 2;

    // The output Cloud Storage folder in this pipeline.
    string output_path = 3;
  }

  // The metadata message for the Process-with-DocAi pipeline.
  message ProcessWithDocAiPipelineMetadata {
    // The input list of all the resource names of the documents to be
    // processed.
    repeated string documents = 1;

    // The DocAI processor to process the documents with.
    ProcessorInfo processor_info = 2;
  }

  // The status of processing a single document.
  message IndividualDocumentStatus {
    // Document identifier of an existing document.
    string document_id = 1;

    // The status of processing the document.
    google.rpc.Status status = 2;
  }

  // Number of files that were processed by the pipeline.
  int32 total_file_count = 1;

  // Number of files that have failed at some point in the pipeline.
  int32 failed_file_count = 2;

  // User unique identification and groups information.
  UserInfo user_info = 3;

  // The pipeline metadata. Which member is set identifies the pipeline kind.
  // Field number 5 is absent here because it is used by
  // `individual_document_statuses` below; do not reuse it.
  oneof pipeline_metadata {
    // The pipeline metadata for the GcsIngest pipeline.
    GcsIngestPipelineMetadata gcs_ingest_pipeline_metadata = 4;

    // The pipeline metadata for the Export-to-CDW pipeline.
    ExportToCdwPipelineMetadata export_to_cdw_pipeline_metadata = 6;

    // The pipeline metadata for the Process-with-DocAi pipeline.
    ProcessWithDocAiPipelineMetadata process_with_doc_ai_pipeline_metadata = 7;
  }

  // The list of response details of each document.
  repeated IndividualDocumentStatus individual_document_statuses = 5;
}
// The DocAI processor information: which processor to run and which Document
// Warehouse schema/type to apply to its output.
message ProcessorInfo {
  // The processor resource name.
  // Format is `projects/{project}/locations/{location}/processors/{processor}`,
  // or
  // `projects/{project}/locations/{location}/processors/{processor}/processorVersions/{processorVersion}`
  string processor_name = 1;

  // The processor will process the documents with this document type.
  string document_type = 2;

  // The Document schema resource name. All documents processed by this
  // processor will use this schema.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string schema_name = 3;
}
// The ingestion pipeline config, shared by the Cloud Storage ingestion
// pipelines to customize ACLs, text extraction, folder linking, and an
// optional Cloud Function transform hook.
message IngestPipelineConfig {
  // The document level acl policy config.
  // This refers to an Identity and Access (IAM) policy, which specifies access
  // controls for all documents ingested by the pipeline. The
  // [role][google.iam.v1.Binding.role] and
  // [members][google.iam.v1.Binding.members] under the policy need to be
  // specified.
  //
  // The following roles are supported for document level acl control:
  // * roles/contentwarehouse.documentAdmin
  // * roles/contentwarehouse.documentEditor
  // * roles/contentwarehouse.documentViewer
  //
  // The following members are supported for document level acl control:
  // * user:user-email@example.com
  // * group:group-email@example.com
  // Note that for documents searched with LLM, only single level user or group
  // acl check is supported.
  google.iam.v1.Policy document_acl_policy = 1;

  // The document text extraction enabled flag.
  // If the flag is set to true, DWH will perform text extraction on the raw
  // document.
  bool enable_document_text_extraction = 2;

  // Optional. The name of the folder to which all ingested documents will be
  // linked during the ingestion process. Format is
  // `projects/{project}/locations/{location}/documents/{folder_id}`
  string folder = 3 [(google.api.field_behavior) = OPTIONAL];

  // The Cloud Function resource name. The Cloud Function needs to live inside
  // the consumer project and be accessible to the Document AI Warehouse P4SA.
  // Only Cloud Functions V2 is supported. Cloud function execution should
  // complete within 5 minutes or this file ingestion may fail due to timeout.
  // Format: `https://{region}-{project_id}.cloudfunctions.net/{cloud_function}`
  //
  // The following keys are available in the request json payload.
  // * display_name
  // * properties
  // * plain_text
  // * reference_id
  // * document_schema_name
  // * raw_document_path
  // * raw_document_file_type
  //
  // The following keys from the cloud function json response payload will be
  // ingested to the Document AI Warehouse as part of Document proto content
  // and/or related information. The original values will be overridden if any
  // key is present in the response.
  // * display_name
  // * properties
  // * plain_text
  // * document_acl_policy
  // * folder
  string cloud_function = 4 [(google.api.resource_reference) = {
    type: "cloudfunctions.googleapis.com/CloudFunction"
  }];
}
// The configuration of the Cloud Storage Ingestion pipeline.
message GcsIngestPipeline {
  // The input Cloud Storage folder. All files under this folder will be
  // imported to Document Warehouse.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string input_path = 1;

  // The Document Warehouse schema resource name. All documents processed by
  // this pipeline will use this schema.
  // Format:
  // projects/{project_number}/locations/{location}/documentSchemas/{document_schema_id}.
  string schema_name = 2;

  // The Doc AI processor type name. Only used when the format of ingested
  // files is Doc AI Document proto format.
  string processor_type = 3;

  // The flag whether to skip already-ingested documents.
  // If it is set to true, documents in Cloud Storage whose custom metadata
  // contains the key "status" with the value "status=ingested" will be
  // skipped during ingestion.
  bool skip_ingested_documents = 4;

  // Optional. The config for the Cloud Storage Ingestion pipeline.
  // It provides additional customization options to run the pipeline and can be
  // skipped if it is not applicable.
  IngestPipelineConfig pipeline_config = 5
      [(google.api.field_behavior) = OPTIONAL];
}
// The configuration of the Cloud Storage Ingestion with DocAI Processors
// pipeline: files are split/classified, routed to a matching extract
// processor, and the results are ingested into Document Warehouse.
message GcsIngestWithDocAiProcessorsPipeline {
  // The input Cloud Storage folder. All files under this folder will be
  // imported to Document Warehouse.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string input_path = 1;

  // The split and classify processor information.
  // The split and classify result will be used to find a matched extract
  // processor.
  ProcessorInfo split_classify_processor_info = 2;

  // The extract processors information.
  // One matched extract processor will be used to process documents based on
  // the classify processor result. If no classify processor is specified, the
  // first extract processor will be used.
  repeated ProcessorInfo extract_processor_infos = 3;

  // The Cloud Storage folder path used to store the raw results from
  // processors.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string processor_results_folder_path = 4;

  // The flag whether to skip already-ingested documents.
  // If it is set to true, documents in Cloud Storage whose custom metadata
  // contains the key "status" with the value "status=ingested" will be
  // skipped during ingestion.
  bool skip_ingested_documents = 5;

  // Optional. The config for the Cloud Storage Ingestion with DocAI Processors
  // pipeline. It provides additional customization options to run the pipeline
  // and can be skipped if it is not applicable.
  IngestPipelineConfig pipeline_config = 6
      [(google.api.field_behavior) = OPTIONAL];
}
// The configuration of the pipeline that exports documents from the Document
// Warehouse to CDW (Document AI Workbench).
message ExportToCdwPipeline {
  // The list of all the resource names of the documents to be processed.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  repeated string documents = 1;

  // The Cloud Storage folder path used to store the exported documents before
  // being sent to CDW.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string export_folder_path = 2;

  // Optional. The CDW dataset resource name. This field is optional. If not
  // set, the documents will be exported to Cloud Storage only. Format:
  // projects/{project}/locations/{location}/processors/{processor}/dataset
  string doc_ai_dataset = 3 [(google.api.field_behavior) = OPTIONAL];

  // Ratio of training dataset split. When importing into Document AI Workbench,
  // documents will be automatically split into training and test split category
  // with the specified ratio. This field is required if doc_ai_dataset is set.
  // NOTE(review): valid range is not stated here — presumably (0.0, 1.0];
  // confirm against the service documentation.
  float training_split_ratio = 4;
}
// The configuration of the pipeline that processes documents already in the
// Document Warehouse with DocAI processors.
message ProcessWithDocAiPipeline {
  // The list of all the resource names of the documents to be processed.
  // Format:
  // projects/{project_number}/locations/{location}/documents/{document_id}.
  repeated string documents = 1;

  // The Cloud Storage folder path used to store the exported documents before
  // being sent to CDW.
  // NOTE(review): this comment mirrors ExportToCdwPipeline.export_folder_path;
  // in this pipeline the documents are presumably staged here before DocAI
  // processing — confirm the CDW wording is intended.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string export_folder_path = 2;

  // The CDW processor information.
  ProcessorInfo processor_info = 3;

  // The Cloud Storage folder path used to store the raw results from
  // processors.
  // Format: `gs://<bucket-name>/<folder-name>`.
  string processor_results_folder_path = 4;
}