Skip to content

Commit

Permalink
feat: A new message FoundationModelTuningOptions is added
Browse files Browse the repository at this point in the history
feat: A new field foundation_model_tuning_options is added to message TrainProcessorVersionRequest
feat: A new field `labels` is added to messages `ProcessRequest` and `BatchProcessRequest`
feat: A new field `display_name` is added to message `DocumentSchema`
fix: deprecate `Dataset.document_warehouse_config`
docs: updated comments

PiperOrigin-RevId: 607358355
  • Loading branch information
Google APIs authored and Copybara-Service committed Feb 15, 2024
1 parent 9a9bc9b commit 1da5299
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 32 deletions.
5 changes: 3 additions & 2 deletions google/cloud/documentai/v1beta3/dataset.proto
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,10 @@ message Dataset {
GCSManagedConfig gcs_managed_config = 3
[(google.api.field_behavior) = OPTIONAL];

// Optional. Document AI Warehouse-based dataset configuration.
// Optional. Deprecated. Warehouse-based dataset configuration is not
// supported.
DocumentWarehouseConfig document_warehouse_config = 5
[(google.api.field_behavior) = OPTIONAL];
[deprecated = true, (google.api.field_behavior) = OPTIONAL];

// Optional. Unmanaged dataset configuration. Use this configuration if the
// dataset documents are managed by the document service internally (not
Expand Down
3 changes: 2 additions & 1 deletion google/cloud/documentai/v1beta3/document.proto
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,8 @@ message Document {
[deprecated = true, (google.api.field_behavior) = OPTIONAL];

// Optional. Identifies the bounding polygon of a layout element on the
// page.
// page. If `layout_type` is set, the bounding polygon must be exactly the
// same as that of the layout element it refers to.
BoundingPoly bounding_poly = 4 [(google.api.field_behavior) = OPTIONAL];

// Optional. Confidence of detected page element, if applicable. Range
Expand Down
49 changes: 40 additions & 9 deletions google/cloud/documentai/v1beta3/document_processor_service.proto
Original file line number Diff line number Diff line change
Expand Up @@ -349,12 +349,11 @@ message ProcessOptions {
}

// A subset of pages to process. If not specified, all pages are processed.
// If a page range is set, only the given pages are extracted and processed
// from the document. In the output document,
// [Document.Page.page_number][google.cloud.documentai.v1beta3.Document.Page.page_number]
// refers to the page number in the original document. This configuration
// only applies to sync requests. `page_range` can be only one of the
// following:
// If a page range is set, only the given pages are extracted and processed
// from the document. In the output document,
// [Document.Page.page_number][google.cloud.documentai.v1beta3.Document.Page.page_number]
// refers to the page number in the original document. This configuration
// only applies to sync requests.
oneof page_range {
// Which pages to process (1-indexed).
IndividualPageSelector individual_page_selector = 5;
Expand All @@ -367,8 +366,8 @@ message ProcessOptions {
int32 from_end = 7;
}

// Only applicable to `OCR_PROCESSOR`. Returns error if set on other
// processor types.
// Only applicable to `OCR_PROCESSOR` and `FORM_PARSER_PROCESSOR`.
// Returns error if set on other processor types.
OcrConfig ocr_config = 1;

// Optional. Override the schema of the
Expand Down Expand Up @@ -428,6 +427,14 @@ message ProcessRequest {

// Inference-time options for the process API
ProcessOptions process_options = 7;

// Optional. The labels with user-defined metadata for the request.
//
// Label keys and values can be no longer than 63 characters
// (Unicode codepoints) and can only contain lowercase letters, numeric
// characters, underscores, and dashes. International characters are allowed.
// Label values are optional. Label keys must start with a letter.
map<string, string> labels = 10 [(google.api.field_behavior) = OPTIONAL];
}

// The status of human review on a processed document.
Expand Down Expand Up @@ -550,6 +557,14 @@ message BatchProcessRequest {

// Inference-time options for the process API
ProcessOptions process_options = 7;

// Optional. The labels with user-defined metadata for the request.
//
// Label keys and values can be no longer than 63 characters
// (Unicode codepoints) and can only contain lowercase letters, numeric
// characters, underscores, and dashes. International characters are allowed.
// Label values are optional. Label keys must start with a letter.
map<string, string> labels = 9 [(google.api.field_behavior) = OPTIONAL];
}

// Response message for
Expand Down Expand Up @@ -878,7 +893,8 @@ message CreateProcessorRequest {

// Required. The processor to be created, requires
// [Processor.type][google.cloud.documentai.v1beta3.Processor.type] and
// [Processor.display_name]][] to be set. Also, the
// [Processor.display_name][google.cloud.documentai.v1beta3.Processor.display_name]
// to be set. Also, the
// [Processor.kms_key_name][google.cloud.documentai.v1beta3.Processor.kms_key_name]
// field must be set if the processor is under CMEK.
Processor processor = 2 [(google.api.field_behavior) = REQUIRED];
Expand Down Expand Up @@ -1027,9 +1043,24 @@ message TrainProcessorVersionRequest {
TrainingMethod training_method = 3;
}

// Options to control foundation model tuning of the processor.
//
// NOTE(review): the fields below use numbers 2 and 3; no field numbered 1 is
// visible in this message. Confirm whether 1 is reserved or intentionally
// left unused before assigning it to a new field.
message FoundationModelTuningOptions {
// Optional. The number of steps to run for model tuning. Valid values are
// between 1 and 400. If not provided, the recommended number of steps is
// used.
//
// NOTE(review): 0 is outside the documented range; presumably it is treated
// the same as "not provided" -- confirm against the service behavior.
int32 train_steps = 2 [(google.api.field_behavior) = OPTIONAL];

// Optional. The multiplier to apply to the recommended learning rate. Valid
// values are between 0.1 and 10. If not provided, the recommended learning
// rate is used.
float learning_rate_multiplier = 3 [(google.api.field_behavior) = OPTIONAL];
}

// Training options specific to the kind of processor being trained. Because
// these fields are members of a oneof, at most one of them can be set on a
// request; setting one clears the other.
oneof processor_flags {
// Options to control Custom Document Extraction (CDE) Processor.
CustomDocumentExtractionOptions custom_document_extraction_options = 5;

// Options to control foundation model tuning of a processor.
FoundationModelTuningOptions foundation_model_tuning_options = 12;
}

// Required. The parent (project, location and processor) to create the new
Expand Down
19 changes: 11 additions & 8 deletions google/cloud/documentai/v1beta3/document_schema.proto
Original file line number Diff line number Diff line change
Expand Up @@ -96,14 +96,14 @@ message DocumentSchema {
// Defines properties that can be part of the entity type.
message Property {
// Types of occurrences of the entity type in the document. This
// represents the number of instances of instances of an entity, not
// number of mentions of an entity. For example, a bank statement may
// only have one `account_number`, but this account number may be
// mentioned in several places on the document. In this case the
// 'account_number' would be considered a `REQUIRED_ONCE` entity type. If,
// on the other hand, we expect a bank statement to contain the status of
// multiple different accounts for the customers, the occurrence type will
// be set to `REQUIRED_MULTIPLE`.
// represents the number of instances, not mentions, of an entity.
// For example, a bank statement might only have one
// `account_number`, but this account number can be mentioned in several
// places on the document. In this case, the `account_number` is
// considered a `REQUIRED_ONCE` entity type. If, on the other hand, we
// expect a bank statement to contain the status of multiple different
// accounts for the customers, the occurrence type is set to
// `REQUIRED_MULTIPLE`.
enum OccurrenceType {
// Unspecified occurrence type.
OCCURRENCE_TYPE_UNSPECIFIED = 0;
Expand All @@ -127,6 +127,9 @@ message DocumentSchema {
// EntityType name.
string name = 1;

// User-defined name for the property.
string display_name = 6;

// A reference to the value type of the property. This type is subject
// to the same conventions as the `Entity.base_types` field.
string value_type = 2;
Expand Down
30 changes: 18 additions & 12 deletions google/cloud/documentai/v1beta3/document_service.proto
Original file line number Diff line number Diff line change
Expand Up @@ -135,18 +135,18 @@ enum DatasetSplitType {
DATASET_SPLIT_UNASSIGNED = 3;
}

// Describes the labelling status of a document.
// Describes the labeling status of a document.
enum DocumentLabelingState {
// Default value if the enum is not set.
DOCUMENT_LABELING_STATE_UNSPECIFIED = 0;

// Document has been labelled.
// Document has been labeled.
DOCUMENT_LABELED = 1;

// Document has not been labelled.
// Document has not been labeled.
DOCUMENT_UNLABELED = 2;

// Document has been auto-labelled.
// Document has been auto-labeled.
DOCUMENT_AUTO_LABELED = 3;
}

Expand Down Expand Up @@ -315,8 +315,9 @@ message ListDocumentsRequest {
// - String match is case sensitive (for filter `DisplayName` & `EntityType`).
string filter = 4 [(google.api.field_behavior) = OPTIONAL];

// Optional. Controls if the ListDocuments request requires a total size
// of matched documents. See ListDocumentsResponse.total_size.
// Optional. Controls if the request requires a total size of matched
// documents. See
// [ListDocumentsResponse.total_size][google.cloud.documentai.v1beta3.ListDocumentsResponse.total_size].
//
// Enabling this flag may adversely impact performance.
//
Expand All @@ -325,19 +326,24 @@ message ListDocumentsRequest {

// Optional. Number of results to skip beginning from the `page_token` if
// provided. https://google.aip.dev/158#skipping-results. It must be a
// non-negative integer. Negative values wil be rejected. Note that this is
// non-negative integer. Negative values will be rejected. Note that this is
// not the number of pages to skip. If this value causes the cursor to move
// past the end of results, `ListDocumentsResponse.document_metadata` and
// `ListDocumentsResponse.next_page_token` will be empty.
// past the end of results,
// [ListDocumentsResponse.document_metadata][google.cloud.documentai.v1beta3.ListDocumentsResponse.document_metadata]
// and
// [ListDocumentsResponse.next_page_token][google.cloud.documentai.v1beta3.ListDocumentsResponse.next_page_token]
// will be empty.
int32 skip = 8 [(google.api.field_behavior) = OPTIONAL];
}

message ListDocumentsResponse {
// Document metadata corresponding to the listed documents.
repeated DocumentMetadata document_metadata = 1;

// A token, which can be sent as `page_token` to retrieve the next page.
// If this field is omitted, there are no subsequent pages.
// A token, which can be sent as
// [ListDocumentsRequest.page_token][google.cloud.documentai.v1beta3.ListDocumentsRequest.page_token]
// to retrieve the next page. If this field is omitted, there are no
// subsequent pages.
string next_page_token = 2;

// Total count of documents queried.
Expand Down Expand Up @@ -430,7 +436,7 @@ message DocumentMetadata {
// Type of the dataset split to which the document belongs.
DatasetSplitType dataset_type = 3;

// Labelling state of the document.
// Labeling state of the document.
DocumentLabelingState labeling_state = 5;

// The display name of the document.
Expand Down

0 comments on commit 1da5299

Please sign in to comment.