Skip to content

Commit

Permalink
feat: add data_stats to Model
Browse files — browse the repository at this point in the history
feat: add data_stats to ExportDataResponse
feat: add filter_split to ExportDataConfig
feat: add saved_query_id to ExportDataConfig
feat: add annotation_schema_uri to ExportDataConfig
feat: add export_use to ExportDataConfig
docs: update exported_files description in ExportDataResponse

PiperOrigin-RevId: 588136649
  • Loading branch information
Google APIs authored and Copybara-Service committed Dec 5, 2023
1 parent 3ab9a05 commit 57b57f4
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 1 deletion.
99 changes: 99 additions & 0 deletions google/cloud/aiplatform/v1/dataset.proto
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,18 @@ message ImportDataConfig {
// Describes what part of the Dataset is to be exported, the destination of
// the export and how to export.
message ExportDataConfig {
// ExportUse indicates the usage of the exported files. It restricts the file
// destination, the file format, the annotations to be exported, whether
// unannotated data may be exported, and whether files are cloned to a
// temporary Cloud Storage bucket.
enum ExportUse {
// Regular user export. Default when no export use is specified.
EXPORT_USE_UNSPECIFIED = 0;

// Export for custom code training.
// NOTE(review): the value jumps from 0 to 6 — presumably values 1-5 are
// used or reserved in an internal variant of this enum; confirm before
// assigning new numbers here.
CUSTOM_CODE_TRAINING = 6;
}

// The destination of the output.
oneof destination {
// The Google Cloud Storage location where the output is to be written to.
Expand All @@ -178,13 +190,64 @@ message ExportDataConfig {
oneof split {
// Split based on fractions defining the size of each set.
ExportFractionSplit fraction_split = 5;

// Split based on the provided filters for each set.
ExportFilterSplit filter_split = 7;
}

// An expression for filtering what part of the Dataset is to be exported.
// Only Annotations that match this filter will be exported. The filter syntax
// is the same as in
// [ListAnnotations][google.cloud.aiplatform.v1.DatasetService.ListAnnotations].
string annotations_filter = 2;

// The ID of a SavedQuery (annotation set) under the Dataset specified by
// [dataset_id][] used for filtering Annotations for training.
//
// Only used for custom training data export use cases.
// Only applicable to Datasets that have SavedQueries.
//
// Only Annotations that are associated with this SavedQuery are used in
// respectively training. When used in conjunction with
// [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter],
// the Annotations used for training are filtered by both
// [saved_query_id][google.cloud.aiplatform.v1.ExportDataConfig.saved_query_id]
// and
// [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter].
//
// Only one of
// [saved_query_id][google.cloud.aiplatform.v1.ExportDataConfig.saved_query_id]
// and
// [annotation_schema_uri][google.cloud.aiplatform.v1.ExportDataConfig.annotation_schema_uri]
// should be specified as both of them represent the same thing: problem type.
string saved_query_id = 11;

// The Cloud Storage URI that points to a YAML file describing the annotation
// schema. The schema is defined as an OpenAPI 3.0.2 [Schema
// Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
// The schema files that can be used here are found in
// gs://google-cloud-aiplatform/schema/dataset/annotation/, note that the
// chosen schema must be consistent with
// [metadata][google.cloud.aiplatform.v1.Dataset.metadata_schema_uri] of the
// Dataset specified by [dataset_id][].
//
// Only used for custom training data export use cases.
// Only applicable to Datasets that have DataItems and Annotations.
//
// Only Annotations that both match this schema and belong to DataItems not
// ignored by the split method are used in respectively training, validation
// or test role, depending on the role of the DataItem they are on.
//
// When used in conjunction with
// [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter],
// the Annotations used for training are filtered by both
// [annotations_filter][google.cloud.aiplatform.v1.ExportDataConfig.annotations_filter]
// and
// [annotation_schema_uri][google.cloud.aiplatform.v1.ExportDataConfig.annotation_schema_uri].
string annotation_schema_uri = 12;

// Indicates the usage of the exported files.
ExportUse export_use = 4;
}

// Assigns the input data to training, validation, and test sets as per the
Expand All @@ -203,3 +266,39 @@ message ExportFractionSplit {
// The fraction of the input data that is to be used to evaluate the Model.
double test_fraction = 3;
}

// Assigns input data to training, validation, and test sets based on the
// given filters; data pieces not matched by any filter are ignored.
// Currently only supported for Datasets containing DataItems.
// If any of the filters in this message should match nothing, set it to
// '-' (the minus sign).
//
// Supported only for unstructured Datasets.
message ExportFilterSplit {
// Required. A filter on DataItems of the Dataset. DataItems that match
// this filter are used to train the Model. A filter with the same syntax
// as the one used in
// [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
// may be used. If a single DataItem is matched by more than one of the
// FilterSplit filters, then it is assigned to the first set that applies to
// it in the training, validation, test order.
string training_filter = 1 [(google.api.field_behavior) = REQUIRED];

// Required. A filter on DataItems of the Dataset. DataItems that match
// this filter are used to validate the Model. A filter with the same syntax
// as the one used in
// [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
// may be used. If a single DataItem is matched by more than one of the
// FilterSplit filters, then it is assigned to the first set that applies to
// it in the training, validation, test order.
string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];

// Required. A filter on DataItems of the Dataset. DataItems that match
// this filter are used to test the Model. A filter with the same syntax
// as the one used in
// [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems]
// may be used. If a single DataItem is matched by more than one of the
// FilterSplit filters, then it is assigned to the first set that applies to
// it in the training, validation, test order.
string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
}
10 changes: 9 additions & 1 deletion google/cloud/aiplatform/v1/dataset_service.proto
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import "google/cloud/aiplatform/v1/annotation_spec.proto";
import "google/cloud/aiplatform/v1/data_item.proto";
import "google/cloud/aiplatform/v1/dataset.proto";
import "google/cloud/aiplatform/v1/dataset_version.proto";
import "google/cloud/aiplatform/v1/model.proto";
import "google/cloud/aiplatform/v1/operation.proto";
import "google/cloud/aiplatform/v1/saved_query.proto";
import "google/longrunning/operations.proto";
Expand Down Expand Up @@ -412,8 +413,15 @@ message ExportDataRequest {
// Response message for
// [DatasetService.ExportData][google.cloud.aiplatform.v1.DatasetService.ExportData].
message ExportDataResponse {
// All of the files that are exported in this export operation. For a custom
// code training export, only three Cloud Storage paths (training,
// validation, and test) in wildcard format are populated
// (for example, gs://.../training-*).
repeated string exported_files = 1;

// Only present for the custom code training export use case. Records data
// stats, i.e., train/validation/test item/annotation counts calculated
// during the export operation.
Model.DataStats data_stats = 2;
}

// Runtime operation information for
Expand Down
38 changes: 38 additions & 0 deletions google/cloud/aiplatform/v1/model.proto
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,35 @@ message Model {
[(google.api.field_behavior) = OUTPUT_ONLY];
}

// Stats of the data used to train or evaluate the Model.
message DataStats {
// Number of DataItems that were used for training this Model.
int64 training_data_items_count = 1;

// Number of DataItems that were used for validating this Model during
// training.
int64 validation_data_items_count = 2;

// Number of DataItems that were used for evaluating this Model. If the
// Model is evaluated multiple times, this will be the number of test
// DataItems used by the first evaluation. If the Model is not evaluated,
// the number is 0.
int64 test_data_items_count = 3;

// Number of Annotations that are used for training this Model.
int64 training_annotations_count = 4;

// Number of Annotations that are used for validating this Model during
// training.
int64 validation_annotations_count = 5;

// Number of Annotations that are used for evaluating this Model. If the
// Model is evaluated multiple times, this will be the number of test
// Annotations used by the first evaluation. If the Model is not evaluated,
// the number is 0.
int64 test_annotations_count = 6;
}

// Contains information about the original Model if this Model is a copy.
message OriginalModelInfo {
// Output only. The resource name of the Model this Model is a copy of,
Expand Down Expand Up @@ -395,6 +424,12 @@ message Model {
// See https://goo.gl/xmQnxf for more information and examples of labels.
map<string, string> labels = 17;

// Stats of data used for training or evaluating the Model.
//
// Only populated when the Model is trained by a TrainingPipeline with
// [data_input_config][TrainingPipeline.data_input_config].
DataStats data_stats = 21;

// Customer-managed encryption key spec for a Model. If set, this
// Model and all sub-resources of this Model will be secured by this key.
EncryptionSpec encryption_spec = 24;
Expand Down Expand Up @@ -747,6 +782,9 @@ message ModelSourceInfo {

// The Model is saved or tuned from Genie.
GENIE = 5;

// The Model is uploaded by text embedding finetuning pipeline.
CUSTOM_TEXT_EMBEDDING = 6;
}

// Type of the model source.
Expand Down

0 comments on commit 57b57f4

Please sign in to comment.