In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 02 - Vertex Pipelines using Custom KubeFlow Components

## Overview 

This notebook shows how to use Kubeflow components to build a custom regression workflow on `Vertex AI Pipelines`.

You will build a pipeline in this notebook that looks like this:

<img src="img/vertex-pipelines.png" width="80%"/>





## Notebook Objective

In this notebook, you will learn to use `Vertex AI Pipelines` with only `KubeFlow Components` to build a `custom` tabular regression model. In the pipeline we will orchestrate data creation, data processing, model training and evaluation, and model deployment. We'll also see how to send payloads to the deployed endpoint and how to run batch prediction jobs.

This lab uses the following Google Cloud services and resources:

- `BigQuery`
- `Vertex AI Pipelines`
- `Google Cloud Pipeline Components`
- `Vertex AI Model`
- `Vertex AI Model Registry`
- `Vertex AI Metadata`
- `Vertex AI Endpoint`

The steps performed in this notebook include:

1. [Load Configuration settings from the setup notebook](#Load-Configuration-settings-from-the-setup-notebook)
1. [Vertex Pipelines Introduction](#Vertex-Pipelines-Introduction)
1. [Create a KFP Pipeline](#Create-a-KFP-Pipeline)
    1. [Create a dataset in BigQuery](#Step-1:-Create-a-dataset-in-BigQuery)
    1. [Transform the Data](#Step-2:-Transform-the-Data)
    1. [Train and Evaluate our custom Regression Model](#Step-3:-Train-and-Evaluate-our-custom-Regression-Model)
    1. [Upload Model to Vertex AI and Deploy Endpoint](#Step-4:-Upload-Model-to-Vertex-AI-and-Deploy-Endpoint)
1. [Compile the KFP Pipeline](#Compile-the-KFP-Pipeline)
1. [Execute the KFP Pipeline using Vertex AI Pipelines](#Execute-the-KFP-Pipeline-using-Vertex-AI-Pipelines)
1. [Inspect Experiments](#Inspect-Experiments)
1. [Online Predictions with Deployed Endpoint](#Online-Predictions-with-Deployed-Endpoint)
1. [Batch Predictions with Created Model](#Batch-Predictions-with-Created-Model)


The KubeFlow Components are [documented here](https://www.kubeflow.org/docs/components/pipelines/v1/sdk-v2/python-function-components/)


### Dataset

In this workshop we'll use the **public dataset** [Auto MPG](https://archive.ics.uci.edu/ml/datasets/auto+mpg) for demonstration purposes. The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes. The objective will be to build a model to predict "MPG" (Miles per Gallon).

Check notebook  `01_exploratory_data_analysis.ipynb` for further details of the dataset

## Load Configuration settings from the setup notebook

In [None]:
# Settings written by notebook 00_environment_setup.ipynb
from src.config import config

(
    PROJECT_ID,
    REGION,
    ID,
    BUCKET_NAME,
    GCS_DATA_URI,
    BQ_DATASET_URI,
) = (
    config[setting]
    for setting in (
        'PROJECT_ID',
        'REGION',
        'ID',
        'BUCKET_NAME',
        'GCS_DATA_URI',
        'BQ_DATASET_URI',
    )
)

### Import libraries

In [None]:
# Misc
import os
import shutil
import logging
from datetime import datetime

# Import the Vertex AI Python SDK 
from google.cloud import aiplatform as aip
from google.cloud.aiplatform import pipeline_jobs
import google.auth
from google.cloud import storage
from google_cloud_pipeline_components import aiplatform as gcc_aip

# kfp sdk, to create the Vertex AI Pipelines
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import pipeline

from typing import NamedTuple

from kfp.v2.dsl import (Artifact, Dataset, Input, Model, Output, Metrics, ClassificationMetrics, component, OutputPath, InputPath)
from kfp.v2 import compiler


# TensorFlow model building libraries.
import tensorflow as tf


# Custom Modules
from src.helper import *

------------------------------------

## Vertex Pipelines Introduction

[Vertex AI Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/introduction) helps you to automate, monitor, and govern your ML systems by orchestrating your ML workflow in a serverless manner, and storing your workflow's artifacts using Vertex ML Metadata. By storing the artifacts of your ML workflow in Vertex ML Metadata, you can analyze the lineage of your workflow's artifacts — for example, an ML model's lineage may include the training data, hyperparameters, and code that were used to create the model.

You can build your Pipelines using the battle-tested and easy-to-use `KubeFlow Pipelines (KFP) SDK` or ` TensorFlow Extended (TFX) SDK`. 

Within your Vertex Pipeline with `KubeFlow Pipelines (KFP) SDK` you can use either your own custom components using `KubeFlow Components` or already-built components using `Google Cloud Pipeline Components`.

The Google Cloud Components are [documented here](https://google-cloud-pipeline-components.readthedocs.io/en/latest/google_cloud_pipeline_components.aiplatform.html#module-google_cloud_pipeline_components.aiplatform). 

The KubeFlow Components are [documented here](https://www.kubeflow.org/docs/components/pipelines/v1/sdk-v2/python-function-components/)

<img src=img/vertex-pipelines-def.png width=80%>

-----


## Create a KFP Pipeline

To address your business requirements and get your higher performing model into production to deliver value faster, you will define a pipeline using the [**Kubeflow Pipelines (KFP) V2 SDK**](https://www.kubeflow.org/docs/components/pipelines/sdk/v2/v2-compatibility) to orchestrate the training and deployment of your model on [**Vertex Pipelines**](https://cloud.google.com/vertex-ai/docs/pipelines) below.

The pipeline consists of four custom KFP V2 components:

* `create_bq_dataset_op`: Create BigQuery Dataset from data stored as csv in Google Cloud Storage.

* `prepare_datasets_op`: Gets the required data from BigQuery and transforms it into the train and test datasets used for training.

* `train_evaluate_model_op`: Trains our regression model to predict MPG, evaluates the trained model, and decides whether to deploy the model and create an endpoint based on a passed threshold.


*  `deploy_model_endpoint_op`: Uploads the trained model to Vertex AI and creates a Google Cloud Vertex Endpoint resource that maps physical machine resources with your model to enable it to serve online predictions. Online predictions have low latency requirements; providing resources to the model in advance reduces latency. 


### Step 1: Create a dataset in BigQuery

In [None]:
@component(
    base_image="python:3.9",
    packages_to_install=['google-cloud-bigquery', 'pandas'],
    output_component_file="create_bq_dataset_op.yml"
)
def create_bq_dataset_op(
    gcs_uri: str,
    project_id: str,
    dataset_name: str,
    table_name: str,
) -> NamedTuple('Outputs',[('bq_dataset_uri', str)]):
    """
    Load the CSV file(s) stored in Cloud Storage into a BigQuery table.

    The dataset is created in the "US" location. If a dataset with the same
    name already exists it is deleted together with its tables and recreated,
    so every pipeline run starts from a clean dataset.
    Args:
        gcs_uri: gcs uri (gs://...) of the source data
        project_id: project that owns the BigQuery dataset
        dataset_name: name of the BigQuery dataset to (re)create
        table_name: name of the table to load the data into
    Output:
        bq_dataset_uri: string, "bq://<project>.<dataset>.<table>"
    """
    from collections import namedtuple

    from google.api_core.exceptions import Conflict
    from google.cloud import bigquery

    # Create bigquery dataset
    bq_client = bigquery.Client(project=project_id)

    dataset_id = "{}.{}".format(bq_client.project, dataset_name)
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = "US"

    try:
        dataset = bq_client.create_dataset(dataset, timeout=30)
        print("Created dataset {}.{}".format(bq_client.project, dataset.dataset_id))
    except Conflict:
        # Only an "already exists" conflict triggers the destructive reset;
        # any other failure (permissions, timeout, ...) is left to propagate
        # instead of deleting the dataset.
        bq_client.delete_dataset(dataset_id, delete_contents=True)
        dataset = bq_client.create_dataset(dataset, timeout=30)

    # Create table
    table_id = f"{dataset_id}.{table_name}"

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("mpg", bigquery.enums.SqlTypeNames.FLOAT64),
            bigquery.SchemaField("cyl", bigquery.enums.SqlTypeNames.INTEGER),
            bigquery.SchemaField("dis", bigquery.enums.SqlTypeNames.FLOAT64),
            bigquery.SchemaField("hp", bigquery.enums.SqlTypeNames.FLOAT64),
            bigquery.SchemaField("weight", bigquery.enums.SqlTypeNames.FLOAT64),
            bigquery.SchemaField("accel", bigquery.enums.SqlTypeNames.FLOAT64),
            bigquery.SchemaField("year", bigquery.enums.SqlTypeNames.INTEGER),
            bigquery.SchemaField("origin", bigquery.enums.SqlTypeNames.INTEGER),
        ],
        # Replace the table contents on every load
        write_disposition="WRITE_TRUNCATE")

    job = bq_client.load_table_from_uri(
        gcs_uri, table_id, job_config=job_config)

    # Block until the load job has finished (raises if the load failed)
    job.result()

    bq_dataset_uri = f"bq://{dataset_id}.{table_name}"
    output = namedtuple('Outputs',['bq_dataset_uri'])

    return output(bq_dataset_uri)

------

### Step 2: Transform the Data

In [None]:
@component(
    base_image="python:3.9",
    packages_to_install= ['google-cloud-bigquery[bqstorage,pandas]'],
    output_component_file="prepare_datasets_op.yml"
)
def prepare_datasets_op(
    bq_dataset: str,
    dataset_train: Output[Dataset],
    dataset_test: Output[Dataset],
)-> NamedTuple("Outputs", [("train_dataset_path", str), ("test_dataset_path", str)]):
    
    # print(f"Input dataset is: {bq_dataset}")
    # print(f"Input dataset is: {bq_dataset.uri}")
    # print(f"Input dataset is: {bq_dataset.location}")
    from google.cloud import bigquery
    import pandas as pd
    import logging
    import os
    
    
    
    bqclient = bigquery.Client()
    
    def download_table(bq_table_uri: str):
        prefix = "bq://"
        if bq_table_uri.startswith(prefix):
            bq_table_uri = bq_table_uri[len(prefix):]

        table = bigquery.TableReference.from_string(bq_table_uri)
        rows = bqclient.list_rows(
            table,
        )
        return rows.to_dataframe(create_bqstorage_client=False)
    
    
    raw_dataset = download_table(bq_dataset)
    raw_dataset.rename(columns = {
        'mpg':'MPG',
        'cyl':'Cylinders',
        'dis':'Displacement',
        'hp': 'Horsepower',
        'weight': 'Weight',
        'accel': 'Acceleration',
        'year': 'Model Year',
        'origin': 'Origin'}, inplace = True)

    # Get data in shape
    dataset = raw_dataset.copy()
    dataset.tail()
    dataset = dataset.dropna()
    dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
    dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')
    train_dataset = dataset.sample(frac=0.8, random_state=0)
    test_dataset = dataset.drop(train_dataset.index)
    
    train_dataset_path = dataset_train.path
    test_dataset_path = dataset_test.path
    logging.info(f"Dataset Train is be stored in: {dataset_train.path}")
    logging.info(f"Dataset Test is be stored in: {dataset_test.path}")
    
    train_dataset.to_csv(dataset_train.path, index=False)
    test_dataset.to_csv(dataset_train.path, index=False)
    
    return (train_dataset_path, test_dataset_path)
    


---------

### Step 3: Train and Evaluate our custom Regression Model

In [None]:
@component(
    base_image="tensorflow/tensorflow:2.10.0",
    packages_to_install=['pandas'],
    output_component_file="train_evaluate_model_op.yml"
)
def train_evaluate_model_op(
    dataset_train: Input[Dataset],
    dataset_test: Input[Dataset],
    threshold_dict_str: str,
    pipeline_name: str,
    metrics: Output[Metrics],
    model: Output[Model],
    model_uri: OutputPath(str),
) -> NamedTuple(
        'Outputs',[
            ('loss', float),
            ('val_loss', float),
            ('dep_decision', str),
            ('model_path', str)
        ]):
    
    import os
    import logging
    import tensorflow as tf
    import pandas as pd
    import numpy as np
    from tensorflow import keras
    from tensorflow.keras import layers
    import json
    
    logging.info(f"Train Dataset path {dataset_train.path}")
    logging.info(f"Test Dataset path {dataset_test.path}")
    logging.info(f"train dataset - {dataset_train.path}")
    
    train_dataset = pd.read_csv(dataset_train.path)
    test_dataset = pd.read_csv(dataset_train.path)
    train_features = train_dataset.copy()
    test_features = test_dataset.copy()
    train_labels = train_features.pop('MPG')
    test_labels = test_features.pop('MPG')

    
    # Create model
    normalizer = tf.keras.layers.Normalization(axis=-1)
    normalizer = tf.keras.layers.Normalization(axis=-1)
    normalizer.adapt(np.array(train_features))
    normalizer = tf.keras.layers.Normalization(axis=-1)
    normalizer.adapt(np.array(train_features))
    first = np.array(train_features[:1])
    horsepower = np.array(train_features['Horsepower'])
    horsepower_normalizer = layers.Normalization(input_shape=[1,], axis=None)
    horsepower_normalizer.adapt(horsepower)

    def build_and_compile_model(norm):
        model = keras.Sequential([
            norm,
            layers.Dense(64, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(1)
        ])
        
        metrics_tf = [tf.metrics.MeanAbsoluteError(), tf.metrics.MeanAbsolutePercentageError(), 
               tf.metrics.MeanSquaredError()]
        
        model.compile(loss='mean_absolute_error', 
                      optimizer=tf.keras.optimizers.Adam(0.001),
                      metrics=metrics_tf
                     )
        return model

    dnn_model = build_and_compile_model(normalizer)
    dnn_model.summary()

    history = dnn_model.fit(
        train_features,
        train_labels,
        validation_split=0.2,
        verbose=0, epochs=100
    )

    
    loss_test, mae, mape, mse = dnn_model.evaluate(test_features,
                                                       test_labels,
                                                       verbose=0
                                                      )
    
    logging.info(f"TEST METRICS: {loss_test}, {mae}, {mape}, {mse}")

    # Log metrics
    metrics_training = {metric: values[-1] for metric, values in history.history.items()}
    metrics.log_metric('loss', metrics_training['loss'])
    metrics.log_metric('val_loss', metrics_training['val_loss'])
    metrics.log_metric('test_loss', loss_test)
    metrics.log_metric('test_mae', mae)
    metrics.log_metric('test_mape', mape)
    metrics.log_metric('test_mse', mse)
  
    
    model.metadata['loss'] = metrics_training['loss']
    model.metadata['val_loss'] = metrics_training['val_loss']
    model.metadata['test_loss'] = loss_test
    model.metadata['pipeline'] = pipeline_name

    # Save the model to GCS
    dnn_model.save(model.path)

    threshold_dict = json.loads(threshold_dict_str)
    kpi_decision = float(threshold_dict['mae'])
    
        
    if mae <= kpi_decision:
        dep_decision = "true"
    else:
        dep_decision = "false"
    logging.info(f"deployment decision is {dep_decision}")
    logging.info(f"model will be stored in {model.path}")
    
    model_uri = model.path

    return (metrics_training['loss'], metrics_training['val_loss'], dep_decision, model.path)

----------------

### Step 4: Upload Model to Vertex AI and Deploy Endpoint

In [None]:
@component(
  base_image="python:3.9",
  packages_to_install=['google-cloud-aiplatform'],
  output_component_file="upload_model_op.yml"
)
def deploy_model_endpoint_op(
    model: Input[Model],
    region: str,
    model_name: str,
    pipeline_name: str,
    project_id: str,
    serving_container_image_uri: str,
    machine_type: str,
    model_display_name: str,
    endpoint_display_name:str,
    vertex_model: Output[Artifact]
):
    """
    Upload the trained model to Vertex AI and deploy it to an endpoint.

    The most recently created endpoint with the given display name is reused;
    a new endpoint is created when none exists.
    Args:
        model: trained model artifact; its `uri` is used as the artifact URI
        region: Vertex AI location
        model_name: display name of the uploaded model
        pipeline_name: stored in the `pipeline` label of the model
        project_id: Google Cloud project
        serving_container_image_uri: prediction container image
        machine_type: machine type used for the deployment
        model_display_name: display name of the deployed model
        endpoint_display_name: display name of the endpoint to reuse or create
        vertex_model: output artifact whose `uri` receives the resource name
            returned by the deployment
    """
    import logging
    from google.cloud import aiplatform as aip

    labels = {'pipeline': str(pipeline_name),
              'country': 'germany'}

    aip.init(project=project_id, location=region)

    logging.info(f"Model URI {model.uri}")
    logging.info(f"Model Name: {model_name}")
    logging.info(f"Model Labels: {labels}")
    logging.info(f"serving_container_image_uri {serving_container_image_uri}")

    def create_endpoint(endpoint_display_name, project, region):
        """Return the newest endpoint with this display name, creating one if needed."""
        endpoints = aip.Endpoint.list(
            filter='display_name="{}"'.format(endpoint_display_name),
            order_by='create_time desc',
            project=project,
            location=region,
        )
        if len(endpoints) > 0:
            return endpoints[0]  # most recently created
        # The SDK is imported as `aip`; the previous `aiplatform.` reference
        # raised NameError whenever no endpoint existed yet.
        return aip.Endpoint.create(
            display_name=endpoint_display_name, project=project, location=region
        )

    # Create Endpoint
    endpoint = create_endpoint(endpoint_display_name, project_id, region)

    model_upload = aip.Model.upload(
        display_name=model_name,
        artifact_uri=model.uri,
        description='Regression model for fuel prediction',
        labels=labels,
        serving_container_image_uri=serving_container_image_uri
        )

    logging.info(f'Input Endpoint {endpoint}')
    model_deploy = model_upload.deploy(
        machine_type=machine_type,
        endpoint=endpoint,
        traffic_split={"0": 100},  # key "0" refers to the model being deployed
        deployed_model_display_name=model_display_name,
    )

    # Save data to the output params
    # NOTE(review): per the SDK docs `Model.deploy` returns the endpoint, so
    # this is presumably the endpoint resource name - confirm that is what an
    # artifact named `vertex_model` should carry.
    vertex_model.uri = model_deploy.resource_name
    logging.info(model_deploy.resource_name)
    
#

--------

### Compile the KFP Pipeline

#### Define Pipeline Parameters

In [None]:
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# Replace the placeholder with your own name in lowercase; the value is reused
# in PIPELINE_ROOT and in the `creator` label below.
USER = 'add-your-name-lowercase'
if USER == 'add-your-name-lowercase':
    USER = 'unknown'


EXPERIMENT_NAME = "fuel-model"
EXPERIMENT_DESCRIPTION = "Fuel prediction pipeline"
PIPELINE_ROOT = f"gs://{BUCKET_NAME}/pipeline_root/{USER}"

## Important the pipeline name always has the timestamp as part of the name.
PIPELINE_NAME = f"{EXPERIMENT_NAME}-{TIMESTAMP}"
PIPELINE_PACKAGE_PATH = f'{PIPELINE_NAME}-path.json'


THRESHOLD_DICT_STR = '{"mae": 10000}'

DATASET_NAME = 'fuel_dataset'
TABLE_NAME = 'main'
MODEL_NAME = 'fuel-prediction'
DATA_GCS_DIR = GCS_DATA_URI


ENDPOINT_DISPLAY_NAME = 'fuel-endpoint'
SERVING_CONTAINER_IMAGE_URI = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-6:latest'
ENDPOINT_MACHINE_TYPE = "n1-standard-4"


LABELS = {
    'creator': USER,
    'workflow': 'fuel-prediction',
    'type': 'regression'}

# Pin the SDK to REGION so the experiment and the pipeline job live in the same
# location that the components and `get_experiments_data` use.
aip.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=f"gs://{BUCKET_NAME}",
    experiment=EXPERIMENT_NAME,
    experiment_description=EXPERIMENT_DESCRIPTION)

In [None]:
# PIPELINE
# Chains the four custom components: load data into BigQuery, build the
# train/test splits, train + evaluate, and (conditionally) deploy.
@dsl.pipeline(
    name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(
    gcs_uri: str,
    dataset_name: str,
    threshold_dict_str: str,
    user:str,
    managed_dataset_name: str,
    endpoint_display_name: str,
    endpoint_machine_type:str,
    table_name: str,
    project_id: str,
    pipeline_name: str,
    serving_container_image_uri:str,
    model_name: str,
    region: str):

    # STEP 1: load the CSV from Cloud Storage into a BigQuery dataset/table
    create_bq_dataset_task = create_bq_dataset_op(
        gcs_uri=gcs_uri,
        project_id=project_id,
        dataset_name=dataset_name,
        table_name=table_name,
    )
    create_bq_dataset_task.set_caching_options(True)
    create_bq_dataset_task.set_display_name('create-bq-dataset-table-op')

    # STEP 2: build the train and test datasets from the BigQuery table
    prepare_data_task = prepare_datasets_op(
        bq_dataset=create_bq_dataset_task.outputs['bq_dataset_uri'],
    )
    prepare_data_task.set_caching_options(True)
    prepare_data_task.set_display_name('prepare-datasets-op')

    # STEP 3: train the model and evaluate it against the threshold
    train_evaluate_model_task = train_evaluate_model_op(
        dataset_train=prepare_data_task.outputs['dataset_train'],
        dataset_test=prepare_data_task.outputs['dataset_test'],
        threshold_dict_str=threshold_dict_str,
        pipeline_name=pipeline_name,
    )
    train_evaluate_model_task.set_display_name('training-evaluation-job-op')
    train_evaluate_model_task.set_caching_options(True)

    # STEP 4: deploy only when the training step approved the model
    deploy_approved = train_evaluate_model_task.outputs["dep_decision"] == "true"
    with dsl.Condition(deploy_approved, name="deploy_decision"):
        # Upload Model, create (or reuse) a Vertex Endpoint and deploy the model to it.
        deploy_task = deploy_model_endpoint_op(
            model=train_evaluate_model_task.outputs['model'],
            region=region,
            model_name=model_name,
            pipeline_name=pipeline_name,
            project_id=project_id,
            serving_container_image_uri=serving_container_image_uri,
            machine_type=endpoint_machine_type,
            model_display_name=model_name,
            endpoint_display_name=endpoint_display_name,
        )
        deploy_task.set_display_name('deploy-model-op')
        deploy_task.set_caching_options(True)
    


In [None]:
# Write the pipeline definition to PIPELINE_PACKAGE_PATH, the template file the
# PipelineJob below is created from.
compiler.Compiler().compile(package_path=PIPELINE_PACKAGE_PATH, pipeline_func=pipeline)

--------------------

### Execute the KFP Pipeline using Vertex AI Pipelines

In [None]:
# Create the job from the compiled template; the keys of `parameter_values`
# match the parameters of the `pipeline` function.
job = aip.PipelineJob(
    template_path=PIPELINE_PACKAGE_PATH,
    display_name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
    labels=LABELS,
    enable_caching=True,
    parameter_values=dict(
        gcs_uri=DATA_GCS_DIR,
        dataset_name=DATASET_NAME,
        threshold_dict_str=THRESHOLD_DICT_STR,
        user=USER,
        managed_dataset_name=DATASET_NAME,
        endpoint_display_name=ENDPOINT_DISPLAY_NAME,
        endpoint_machine_type=ENDPOINT_MACHINE_TYPE,
        table_name=TABLE_NAME,
        project_id=PROJECT_ID,
        pipeline_name=PIPELINE_NAME,
        serving_container_image_uri=SERVING_CONTAINER_IMAGE_URI,
        model_name=MODEL_NAME,
        region=REGION,
    ),
)

# Submit the job and associate the run with the experiment
job.submit(experiment=EXPERIMENT_NAME)

------------

## Inspect Experiments

In [None]:
def get_experiments_data(experiment_name: str, project: str, location: str):
    """
    Return the experiment's data as a DataFrame.

    Re-initializes the Vertex AI SDK with the given experiment, project and
    location before querying, so the SDK defaults point at them afterwards.
    """
    aip.init(project=project, location=location, experiment=experiment_name)
    return aip.get_experiment_df()

In [None]:
# Fetch the data recorded for this notebook's experiment and display it
df = get_experiments_data(location=REGION, project=PROJECT_ID, experiment_name=EXPERIMENT_NAME)

df

-----------

## Online Predictions with Deployed Endpoint

Retrieve the `Endpoint` deployed by the pipeline and use it to query your model for online predictions.

Configure the `Endpoint()` function below with the following parameters:

*  `endpoint_name`: A fully-qualified endpoint resource name or endpoint ID. Example: "projects/123/locations/us-central1/endpoints/456" or "456" when project and location are initialized or passed.
*  `project`: GCP project ID.
*  `location`: GCP region.

Call `predict()` to return a fuel-consumption (MPG) prediction for a sample car.

In [None]:
# Placeholder: replace with the ID of the endpoint deployed by the pipeline run above
ENDPOINT_ID = 'insert-your-endpoint-id'

In [None]:
# The Vertex AI SDK is imported as `aip` in this notebook (`vertexai` is never
# imported). Project and location are passed explicitly so a bare endpoint ID
# resolves in the region the pipeline deployed to.
endpoint = aip.Endpoint(endpoint_name=ENDPOINT_ID, project=PROJECT_ID, location=REGION)

In [None]:
# `instances` is a list of instances, so the single car is wrapped in an outer list.
# Feature order matches training: Cylinders, Displacement, Horsepower, Weight,
# Acceleration, Model Year, Europe, Japan, USA.
prediction = endpoint.predict(instances=[[4, 90.0, 75.0, 2125.0, 14.5, 74, 0, 0, 1]])

In [None]:
# Display the response returned by `endpoint.predict`
prediction

----

## Batch Predictions with Created Model

In [None]:
## Create a fake batch file in Cloud Storage by randomly sampling our dataset
import os

import pandas as pd

# The raw CSV has no header row; columns follow the BigQuery load schema
# (mpg, cyl, dis, hp, weight, accel, year, origin).
raw_columns = ['MPG', 'Cylinders', 'Displacement', 'Horsepower',
               'Weight', 'Acceleration', 'Model Year', 'Origin']
dataset = pd.read_csv(GCS_DATA_URI, header=None, names=raw_columns)

# Remove label and apply the same preparation as `prepare_datasets_op`, so the
# batch rows carry the 9 features the model was trained on (one-hot Origin).
dataset = dataset.drop(columns=['MPG']).dropna()
dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='', dtype=float)

# Fixed seed so re-running the notebook produces the same batch file
batch_data = dataset.sample(10, random_state=0)

# Make sure the local output folder exists before writing
os.makedirs('data', exist_ok=True)
batch_data.to_csv('data/batch_data_ex.csv', index=False)
batch_data.head()


In [None]:
## Upload data to Cloud Storage
from src.helper import upload_file_to_gcs

# Copy the local batch file into the bucket under a timestamped blob name
gcs_batch_input_data_path = upload_file_to_gcs(
    blob_name=f'data/batch_prediction/input_data/fuel_data_{TIMESTAMP}.csv',
    source='data/batch_data_ex.csv',
    target=BUCKET_NAME,
    project_id=PROJECT_ID,
)
gcs_batch_input_data_path

In [None]:
# Define batch job args
# TIMESTAMP is refreshed here, so the output folder is stamped with the time
# this cell ran rather than the time the pipeline was defined.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
machine_type = "n1-standard-2"
instances_format = 'csv'
batch_job_display_name = "fuel-batch-prediction-job"
gcs_batch_data = gcs_batch_input_data_path
gcs_dest_results = 'gs://{}/batch_jobs/output/{}/'.format(BUCKET_NAME, TIMESTAMP)

In [None]:
## List all Models and pick the Model ID 
!gcloud ai models list --region=us-central1

In [None]:
# Placeholder: replace with a Model ID taken from the `gcloud ai models list` output above
MODEL_ID = 'insert-your-model-id'

In [None]:
# Fully-qualified resource name of the model picked above
model_resource_name = 'projects/{}/locations/{}/models/{}'.format(PROJECT_ID, REGION, MODEL_ID)
model_resource_name

In [None]:
# Get a handle on the uploaded model through its full resource name
model = aip.Model(model_name=model_resource_name)

In [None]:
# Run the batch prediction with the arguments declared in the
# "Define batch job args" cell; `instances_format` was previously declared
# there but ignored in favour of a hard-coded 'csv'.
batch_prediction_job = model.batch_predict(
    job_display_name=batch_job_display_name,
    instances_format=instances_format,
    gcs_source=[gcs_batch_data],
    gcs_destination_prefix=gcs_dest_results,
    machine_type=machine_type,  # must be present
)

------

# IMPORTANT! CLEAN UP ALL RESOURCES CREATED