Skip to content

Commit

Permalink
feat: add data_stats to Model
Browse files — browse the repository at this point in the history
feat: add data_stats to ExportDataResponse
feat: add filter_split to ExportDataConfig
feat: add saved_query_id to ExportDataConfig
feat: add annotation_schema_uri to ExportDataConfig
feat: add export_use to ExportDataConfig
docs: update exported_files description in ExportDataResponse

PiperOrigin-RevId: 588136649
  • Loading branch information
Google APIs authored and Copybara-Service committed Dec 5, 2023
1 parent 3ab9a05 commit 57b57f4
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 1 deletion.
99 changes: 99 additions & 0 deletions google/cloud/aiplatform/v1/dataset.proto
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,18 @@ message ImportDataConfig {
// Describes what part of the Dataset is to be exported, the destination of
// the export and how to export.
message ExportDataConfig {
// ExportUse indicates the usage of the exported files. It restricts the file
// destination, the file format, the annotations to be exported, whether
// unannotated data may be exported, and whether files are cloned to a
// temporary Cloud Storage bucket.
enum ExportUse {
// Regular user export. Default when no export use is specified.
EXPORT_USE_UNSPECIFIED = 0;

// Export for custom code training.
// NOTE(review): the value jumps from 0 to 6 — presumably values 1-5 are
// used or reserved in an internal variant of this enum; confirm before
// assigning new numbers here.
CUSTOM_CODE_TRAINING = 6;
}

// The destination of the output.
oneof destination {
// The Google Cloud Storage location where the output is to be written to.
Expand All @@ -178,13 +190,64 @@ message ExportDataConfig {
oneof split {
// Split based on fractions defining the size of each set.
ExportFractionSplit fraction_split = 5;

// Split based on the provided filters for each set.
ExportFilterSplit filter_split = 7;
}

// An expression for filtering what part of the Dataset is to be exported.
// Only Annotations that match this filter will be exported. The filter syntax
// is the same as in
// [ListAnnotations][google.cloud.aiplatform.v1.DatasetService.ListAnnotations].
string annotations_filter = 2;

// The ID of a SavedQuery (annotation set) under the Dataset specified by
// [dataset_id][] used for filtering Annotations for training.
//
// Only used for custom training data export use cases.
// Only applicable to Datasets that have SavedQueries.
//
// Only Annotations that are associated with this SavedQuery are used in
// respectively training. When used in conjunction with
// [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter],
// the Annotations used for training are filtered by both
// [saved_query_id][google.cloud.aiplatform.v1.ExportDataConfig.saved_query_id]
// and
// [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter].
//
// Only one of
// [saved_query_id][google.cloud.aiplatform.v1.ExportDataConfig.saved_query_id]
// and
// [annotation_schema_uri][google.cloud.aiplatform.v1.ExportDataConfig.annotation_schema_uri]
// should be specified as both of them represent the same thing: problem type.
string saved_query_id = 11;

// The Cloud Storage URI that points to a YAML file describing the annotation
// schema. The schema is defined as an OpenAPI 3.0.2 [Schema
// Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
// The schema files that can be used here are found in
// gs://google-cloud-aiplatform/schema/dataset/annotation/, note that the
// chosen schema must be consistent with
// [metadata][google.cloud.aiplatform.v1.Dataset.metadata_schema_uri] of the
// Dataset specified by [dataset_id][].
//
// Only used for custom training data export use cases.
// Only applicable to Datasets that have DataItems and Annotations.
//
// Only Annotations that both match this schema and belong to DataItems not
// ignored by the split method are used in respectively training, validation
// or test role, depending on the role of the DataItem they are on.
//
// When used in conjunction with
// [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter],
// the Annotations used for training are filtered by both
// [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter]
// and
// [annotation_schema_uri][google.cloud.aiplatform.v1.ExportDataConfig.annotation_schema_uri].
string annotation_schema_uri = 12;

// Indicates the usage of the exported files.
ExportUse export_use = 4;
}

// Assigns the input data to training, validation, and test sets as per the
Expand All @@ -203,3 +266,39 @@ message ExportFractionSplit {
// The fraction of the input data that is to be used to evaluate the Model.
double test_fraction = 3;
}

// Assigns input data to training, validation, and test sets based on the
// given filters; data pieces not matched by any filter are ignored.
// Currently only supported for Datasets containing DataItems.
// If any of the filters in this message should match nothing, set it to
// '-' (the minus sign).
//
// Supported only for unstructured Datasets.
message ExportFilterSplit {
// Required. A filter on DataItems of the Dataset. DataItems that match
// this filter are used to train the Model. A filter with the same syntax
// as the one used in
// [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
// may be used. If a single DataItem is matched by more than one of the
// FilterSplit filters, then it is assigned to the first set that applies to
// it in the training, validation, test order.
string training_filter = 1 [(google.api.field_behavior) = REQUIRED];

// Required. A filter on DataItems of the Dataset. DataItems that match
// this filter are used to validate the Model. A filter with the same syntax
// as the one used in
// [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
// may be used. If a single DataItem is matched by more than one of the
// FilterSplit filters, then it is assigned to the first set that applies to
// it in the training, validation, test order.
string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];

// Required. A filter on DataItems of the Dataset. DataItems that match
// this filter are used to test the Model. A filter with the same syntax
// as the one used in
// [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
// may be used. If a single DataItem is matched by more than one of the
// FilterSplit filters, then it is assigned to the first set that applies to
// it in the training, validation, test order.
string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
}
10 changes: 9 additions & 1 deletion google/cloud/aiplatform/v1/dataset_service.proto
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import "google/cloud/aiplatform/v1/annotation_spec.proto";
import "google/cloud/aiplatform/v1/data_item.proto";
import "google/cloud/aiplatform/v1/dataset.proto";
import "google/cloud/aiplatform/v1/dataset_version.proto";
import "google/cloud/aiplatform/v1/model.proto";
import "google/cloud/aiplatform/v1/operation.proto";
import "google/cloud/aiplatform/v1/saved_query.proto";
import "google/longrunning/operations.proto";
Expand Down Expand Up @@ -412,8 +413,15 @@ message ExportDataRequest {
// Response message for
// [DatasetService.ExportData][google.cloud.aiplatform.v1.DatasetService.ExportData].
message ExportDataResponse {
// All of the files that are exported in this export operation. For a custom
// code training export, only three Cloud Storage paths (training,
// validation, and test) in wildcard format are populated
// (for example, gs://.../training-*).
repeated string exported_files = 1;

// Only present for the custom code training export use case. Records data
// stats, i.e., train/validation/test item/annotation counts calculated
// during the export operation.
Model.DataStats data_stats = 2;
}

// Runtime operation information for
Expand Down
38 changes: 38 additions & 0 deletions google/cloud/aiplatform/v1/model.proto
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,35 @@ message Model {
[(google.api.field_behavior) = OUTPUT_ONLY];
}

// Stats of the data used to train or evaluate the Model.
message DataStats {
// Number of DataItems that were used for training this Model.
int64 training_data_items_count = 1;

// Number of DataItems that were used for validating this Model during
// training.
int64 validation_data_items_count = 2;

// Number of DataItems that were used for evaluating this Model. If the
// Model is evaluated multiple times, this will be the number of test
// DataItems used by the first evaluation. If the Model is not evaluated,
// the number is 0.
int64 test_data_items_count = 3;

// Number of Annotations that are used for training this Model.
int64 training_annotations_count = 4;

// Number of Annotations that are used for validating this Model during
// training.
int64 validation_annotations_count = 5;

// Number of Annotations that are used for evaluating this Model. If the
// Model is evaluated multiple times, this will be the number of test
// Annotations used by the first evaluation. If the Model is not evaluated,
// the number is 0.
int64 test_annotations_count = 6;
}

// Contains information about the original Model if this Model is a copy.
message OriginalModelInfo {
// Output only. The resource name of the Model this Model is a copy of,
Expand Down Expand Up @@ -395,6 +424,12 @@ message Model {
// See https://goo.gl/xmQnxf for more information and examples of labels.
map<string, string> labels = 17;

// Stats of data used for training or evaluating the Model.
//
// Only populated when the Model is trained by a TrainingPipeline with
// [data_input_config][TrainingPipeline.data_input_config].
DataStats data_stats = 21;

// Customer-managed encryption key spec for a Model. If set, this
// Model and all sub-resources of this Model will be secured by this key.
EncryptionSpec encryption_spec = 24;
Expand Down Expand Up @@ -747,6 +782,9 @@ message ModelSourceInfo {

// The Model is saved or tuned from Genie.
GENIE = 5;

// The Model is uploaded by text embedding finetuning pipeline.
CUSTOM_TEXT_EMBEDDING = 6;
}

// Type of the model source.
Expand Down

0 comments on commit 57b57f4

Please sign in to comment.