// google/cloud/aiplatform/v1beta1/schema/trainingjob/definition/automl_time_series_forecasting.proto

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1beta1.schema.trainingjob.definition;

import "google/cloud/aiplatform/v1beta1/schema/trainingjob/definition/export_evaluated_data_items_config.proto";
import "google/api/annotations.proto";

option go_package = "google.golang.org/genproto/googleapis/cloud/aiplatform/v1beta1/schema/trainingjob/definition;definition";
option java_multiple_files = true;
option java_outer_classname = "AutoMLForecastingProto";
option java_package = "com.google.cloud.aiplatform.v1beta1.schema.trainingjob.definition";

// A TrainingJob that trains and uploads an AutoML Forecasting Model.
message AutoMlForecasting {
  // The input parameters of this TrainingJob.
  AutoMlForecastingInputs inputs = 1;

  // The metadata information of this TrainingJob, such as the actual
  // training cost reported in `AutoMlForecastingMetadata`.
  AutoMlForecastingMetadata metadata = 2;
}

// The input parameters of an AutoML Forecasting TrainingJob.
message AutoMlForecastingInputs {
  // A transformation that the training pipeline applies to one input column.
  // The kind of transformation is selected through `transformation_detail`.
  message Transformation {
    // Training pipeline will infer the proper transformation based on the
    // statistic of dataset.
    message AutoTransformation {
      // The name of the input column this transformation applies to.
      string column_name = 1;
    }

    // Training pipeline will perform following transformation functions.
    //
    // *  The value converted to float32.
    //
    // *  The z_score of the value.
    //
    // *  log(value+1) when the value is greater than or equal to 0. Otherwise,
    //    this transformation is not applied and the value is considered a
    //    missing value.
    //
    // *  z_score of log(value+1) when the value is greater than or equal to 0.
    //    Otherwise, this transformation is not applied and the value is
    //    considered a missing value.
    //
    // *  A boolean value that indicates whether the value is valid.
    message NumericTransformation {
      // The name of the input column this transformation applies to.
      string column_name = 1;
    }

    // Training pipeline will perform following transformation functions.
    //
    // *  The categorical string as is--no change to case, punctuation,
    //    spelling, tense, and so on.
    //
    // *  Convert the category name to a dictionary lookup index and generate an
    //    embedding for each index.
    //
    // *  Categories that appear less than 5 times in the training dataset are
    //    treated as the "unknown" category. The "unknown" category gets its own
    //    special lookup index and resulting embedding.
    message CategoricalTransformation {
      // The name of the input column this transformation applies to.
      string column_name = 1;
    }

    // Training pipeline will perform following transformation functions.
    //
    // *  Apply the transformation functions for Numerical columns.
    //
    // *  Determine the year, month, day, and weekday. Treat each value from the
    //    timestamp as a Categorical column.
    //
    // *  Invalid numerical values (for example, values that fall outside of a
    //    typical timestamp range, or are extreme values) receive no special
    //    treatment and are not removed.
    message TimestampTransformation {
      // The name of the input column this transformation applies to.
      string column_name = 1;

      // The format in which the time field is expressed. The time_format must
      // either be one of:
      //
      // * `unix-seconds`
      //
      // * `unix-milliseconds`
      //
      // * `unix-microseconds`
      //
      // * `unix-nanoseconds`
      //
      // (for respectively number of seconds, milliseconds, microseconds and
      // nanoseconds since start of the Unix epoch);
      //
      // or be written in `strftime` syntax.
      //
      // If time_format is not set, then the
      // default format is RFC 3339 `date-time` format, where
      // `time-offset` = `"Z"` (e.g. 1985-04-12T23:20:50.52Z)
      string time_format = 2;
    }

    // Training pipeline will perform following transformation functions.
    //
    // *  The text as is--no change to case, punctuation, spelling, tense, and
    //    so on.
    //
    // *  Convert the category name to a dictionary lookup index and generate an
    //    embedding for each index.
    message TextTransformation {
      // The name of the input column this transformation applies to.
      string column_name = 1;
    }

    // The transformation that the training pipeline will apply to the input
    // columns.
    oneof transformation_detail {
      // Let the training pipeline infer the transformation.
      AutoTransformation auto = 1;

      // Treat the column as numeric.
      NumericTransformation numeric = 2;

      // Treat the column as categorical.
      CategoricalTransformation categorical = 3;

      // Treat the column as a timestamp.
      TimestampTransformation timestamp = 4;

      // Treat the column as text.
      TextTransformation text = 5;
    }
  }

  // A duration of time expressed in time granularity units.
  message Granularity {
    // The time granularity unit of this time period.
    // The supported units are:
    //
    //  * "minute"
    //
    //  * "hour"
    //
    //  * "day"
    //
    //  * "week"
    //
    //  * "month"
    //
    //  * "year"
    string unit = 1;

    // The number of `unit`s between data points in the training
    // data. If `unit` is `minute`,
    // can be 1, 5, 10, 15, or 30. For all other values of `unit`,
    // must be 1.
    int64 quantity = 2;
  }

  // The name of the column that the model is to predict.
  string target_column = 1;

  // The name of the column that identifies the time series.
  string time_series_identifier_column = 2;

  // The name of the column that identifies time order in the time series.
  string time_column = 3;

  // Each transformation will apply a transform function to the given input
  // column, and the result will be used for training.
  // When creating transformation for BigQuery Struct column, the column should
  // be flattened using "." as the delimiter.
  repeated Transformation transformations = 4;

  // Objective function the model is optimizing towards. The training process
  // creates a model that optimizes the value of the objective
  // function over the validation set.
  //
  // The supported optimization objectives:
  //
  //   * "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
  //
  //   * "minimize-mae" - Minimize mean-absolute error (MAE).
  //
  //   * "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
  //
  //   * "minimize-rmspe" - Minimize root-mean-squared percentage error (RMSPE).
  //
  //   * "minimize-wape-mae" - Minimize the combination of weighted absolute
  //     percentage error (WAPE) and mean-absolute-error (MAE).
  //
  //   * "minimize-quantile-loss" - Minimize the quantile loss at the quantiles
  //     defined in `quantiles`.
  string optimization_objective = 5;

  // Required. The train budget of creating this model, expressed in milli node
  // hours i.e. 1,000 value in this field means 1 node hour.
  //
  // The training cost of the model will not exceed this budget. The final cost
  // will be attempted to be close to the budget, though may end up being (even)
  // noticeably smaller - at the backend's discretion. This especially may
  // happen when further model training ceases to provide any improvements.
  //
  // If the budget is set to a value known to be insufficient to train a
  // model for the given dataset, the training won't be attempted and
  // will error.
  //
  // The train budget must be between 1,000 and 72,000 milli node hours,
  // inclusive.
  int64 train_budget_milli_node_hours = 6;

  // Column name that should be used as the weight column.
  // Higher values in this column give more importance to the row
  // during model training. The column must have numeric values between 0 and
  // 10000 inclusively; 0 means the row is ignored for training. If weight
  // column field is not set, then all rows are assumed to have equal weight
  // of 1.
  string weight_column = 7;

  // Column names that should be used as attribute columns.
  // The value of these columns does not vary as a function of time.
  // For example, store ID or item color.
  repeated string time_series_attribute_columns = 19;

  // Names of columns that are unavailable when a forecast is requested.
  // These columns contain information for the given entity (identified
  // by the time_series_identifier_column) that is unknown before the forecast.
  // For example, actual weather on a given day.
  repeated string unavailable_at_forecast_columns = 20;

  // Names of columns that are available and provided when a forecast
  // is requested. These columns
  // contain information for the given entity (identified by the
  // time_series_identifier_column) that is known at forecast.
  // For example, predicted weather for a specific day.
  repeated string available_at_forecast_columns = 21;

  // Expected difference in time granularity between rows in the data.
  Granularity data_granularity = 22;

  // The amount of time into the future for which forecasted values for the
  // target are returned. Expressed in number of units defined by the
  // `data_granularity` field.
  int64 forecast_horizon = 23;

  // The amount of time into the past training and prediction data is used
  // for model training and prediction respectively. Expressed in number of
  // units defined by the `data_granularity` field.
  int64 context_window = 24;

  // Configuration for exporting test set predictions to a BigQuery table. If
  // this configuration is absent, then the export is not performed.
  ExportEvaluatedDataItemsConfig export_evaluated_data_items_config = 15;

  // Quantiles to use for minimize-quantile-loss `optimization_objective`. Up to
  // 5 quantiles are allowed of values between 0 and 1, exclusive. Required if
  // the value of optimization_objective is minimize-quantile-loss. Represents
  // the percent quantiles to use for that objective. Quantiles must be unique.
  repeated double quantiles = 16;

  // Validation options for the data validation component. The available options
  // are:
  //
  //   * "fail-pipeline" - (default) run the validation and fail the pipeline
  //     if the validation fails.
  //
  //   * "ignore-validation" - ignore the results of the validation and continue
  //     the pipeline.
  string validation_options = 17;

  // Additional experiment flags for the time series forecasting training.
  repeated string additional_experiments = 25;
}

// Model metadata specific to AutoML Forecasting.
message AutoMlForecastingMetadata {
  // Output only. The actual training cost of the model, expressed in milli
  // node hours, i.e. 1,000 value in this field means 1 node hour. Guaranteed
  // to not exceed the train budget
  // (`AutoMlForecastingInputs.train_budget_milli_node_hours`).
  int64 train_cost_milli_node_hours = 1;
}