## Create the template file for creating the pipeline

## Steps
1. Define your pipeline function
2. Build any custom components you need
3. Use the v2 compiler to compile your code 
4. Initialize the `google.cloud.aiplatform` SDK to establish a connection to AI Platform
5. Run the job from the client

## Imports

In [1]:
# Example imports
import kfp
import json
import os
import datetime
from kfp.v2 import compiler
from kfp.v2.google.client import AIPlatformClient
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip

# Old imports
# import os
# import datetime
# from jinja2 import Template
# from kfp.components import func_to_container_op
# from kfp.dsl.types import Dict
# from kfp.dsl.types import GCPProjectID
# from kfp.dsl.types import GCPRegion
# from kfp.dsl.types import GCSPath
# from kfp.dsl.types import String
# from kfp.gcp import use_gcp_secret

## Set up the environment

In [2]:
# Defaults and environment settings
# REGION is passed to aiplatform.init() and to the pipeline's `region` parameter
# in the submission cell.
REGION = 'us-central1'
# NOTE(review): this host name contains "us-central2" while REGION is
# 'us-central1', and ENDPOINT is only exported via %env in the visible notebook
# -- confirm it is still needed and points at the intended region.
ENDPOINT = 'https://fda9da3634d2db2-dot-us-central2.pipelines.googleusercontent.com'
# Project bucket: reused as DEST_BUCKET_NAME and, via ARTIFACT_STORE_URI, as the
# pipeline root and `gcs_root` when the job is submitted.
BUCKET_NAME = 'rrusson-kubeflow-test'
ARTIFACT_STORE_URI = f'gs://{BUCKET_NAME}'
# GCP project that owns the gcr.io images and the pipeline job.
PROJECT_ID = "mwpmltr"

# IPython `%env` magics: set each environment variable to the value of the
# Python variable of the same name (`$NAME` is expanded by IPython).
%env ENDPOINT=$ENDPOINT
%env PROJECT_ID=$PROJECT_ID
%env REGION=$REGION
%env BUCKET_NAME=$BUCKET_NAME
%env ARTIFACT_STORE_URI=$ARTIFACT_STORE_URI

from kfp_component.func_components import load_raw_data
from kfp_component.func_components import split_data
from kfp_component.func_components import disp_loss
from kfp_component.func_components import vertex_custom_job

env: ENDPOINT=https://fda9da3634d2db2-dot-us-central2.pipelines.googleusercontent.com
env: PROJECT_ID=mwpmltr
env: REGION=us-central1
env: BUCKET_NAME=rrusson-kubeflow-test
env: ARTIFACT_STORE_URI=gs://rrusson-kubeflow-test


## Create the Docker images and upload to gcr.io

In [3]:
# Fully qualified reference of the base image: gcr.io/<project>/<image>:<tag>.
IMAGE_NAME = 'nasa-iot-base'
TAG = 'v1'
BASE_IMAGE = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}'
print(BASE_IMAGE)

gcr.io/mwpmltr/nasa-iot-base:v1


In [4]:
# DON'T RUN THIS IF THE IMAGE EXISTS!
# !gcloud builds submit --timeout 15m --tag $BASE_IMAGE base_image --async

In [5]:
# Fully qualified reference of the trainer image: gcr.io/<project>/<image>:<tag>.
# TRAINER_IMAGE is the container the pipeline's training step runs.
IMAGE_NAME = 'nasa-iot-trainer'
TAG = 'v5'
TRAINER_IMAGE = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}'
print(TRAINER_IMAGE)

gcr.io/mwpmltr/nasa-iot-trainer:v5


In [6]:
# DON'T RUN THIS IF THE IMAGE EXISTS!
# !gcloud builds submit --timeout 15m --tag $TRAINER_IMAGE train_image --async

In [7]:
# Fully qualified reference of the Data Describe image: gcr.io/<project>/<image>:<tag>.
IMAGE_NAME = 'nasa-iot-datadescribe'
TAG = 'v1'
DD_IMAGE = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}'
print(DD_IMAGE)

gcr.io/mwpmltr/nasa-iot-datadescribe:v1


In [8]:
# DON'T RUN THIS IF THE IMAGE EXISTS!
# !gcloud builds submit --timeout 15m --tag $DD_IMAGE dd_image --async

## Define the Pipeline

In [9]:
# Define the pipeline
# Define the pipeline
#
# Chains three components imported from kfp_component.func_components:
#   load_raw_data -> split_data -> vertex_custom_job
#
# Parameters (all supplied through `parameter_values` when the job is submitted):
#   project_id         -- forwarded to vertex_custom_job as `project`.
#   region             -- not referenced by the active body (only by the
#                         commented-out caip_train_op call).
#   source_bucket_name -- forwarded to load_raw_data.
#   prefix             -- forwarded to load_raw_data.
#   dest_bucket_name   -- forwarded to load_raw_data.
#   dest_file_name     -- forwarded to load_raw_data.
#   gcs_root           -- only used to build `job_dir`.
#   dataset_location   -- not referenced by the body; defaults to 'US'.
#
# pipeline_root is left empty in the decorator; the submission cell passes
# pipeline_root=ARTIFACT_STORE_URI to aiplatform.PipelineJob instead.
@kfp.dsl.pipeline(
    name="bearing-sensor-data-training",
    description="The pipeline for training and deploying an anomaly detector based on an autoencoder",
    pipeline_root="")

def pipeline(project_id: str,
             region: str,
             source_bucket_name: str, 
             prefix: str,
             dest_bucket_name: str,
             dest_file_name: str,
             gcs_root: str,
             dataset_location:str='US'):
    
    # Read in the raw sensor data from the public dataset and load in the project bucket
    raw_data_op = load_raw_data(source_bucket_name,
                                prefix,
                                dest_bucket_name,
                                dest_file_name)
    
    
    # # Prepare some output from Data Describe
    # dd_out = datadescribe_op(gcs_root, 
    #                          raw_data.outputs['dest_file_name'])
    
    
    # Preprocess and split the raw data by time
    # The bucket and file name come from load_raw_data's outputs.
    # NOTE(review): the timestamp literal is presumably the train/test split
    # point and the meaning of the trailing `True` is defined by split_data in
    # kfp_component.func_components -- confirm against that component.
    split_data_op = split_data(raw_data_op.outputs['dest_bucket_name'],
                               raw_data_op.outputs['dest_file_name'],
                               '2004-02-15 12:52:39',
                               True)
    
    # Set up the training args
    # The flags are serialized as one JSON-encoded list; each split_data output
    # is converted with str() before encoding, so the JSON holds the outputs'
    # string form rather than the output objects themselves.
    train_args = json.dumps(
        ["--bucket", str(split_data_op.outputs['bucket_name']),
         "--train_file", str(split_data_op.outputs['train_dest_file']),
         "--test_file", str(split_data_op.outputs['test_dest_file'])
        ]
    )
    
    # NOTE(review): job_dir is not used by any active statement below; it is
    # only referenced by the commented-out caip_train_op call.
    job_dir = "{0}/{1}/{2}".format(gcs_root, 'jobdir', kfp.dsl.RUN_ID_PLACEHOLDER)
    
    # Train the model with the vertex_custom_job component, running TRAINER_IMAGE
    # NOTE(review): datetime.now() is evaluated when this function body executes,
    # so the HHMMSS suffix is presumably fixed at compile time rather than per
    # run -- confirm this is the intended display name.
    train_model = vertex_custom_job(
        project=project_id,
        display_name=f"anomaly-detection-{datetime.datetime.now().strftime('%H%M%S')}",
        container_image_uri=TRAINER_IMAGE,
        train_args=train_args, 
    )

    # train_model = caip_train_op(project_id,
    #                             region=region,
    #                             master_image_uri=TRAINER_IMAGE,
    #                             python_version="3.7",
    #                             job_id_prefix=f'anomaly-detection-{datetime.datetime.now().strftime("%H%M%S")}_',
    #                             job_dir=job_dir,
    #                             args=train_args)
    
    # Expose artifacts to the Kubeflow UI
    # disp_loss_img = disp_loss_op(train_model.outputs['job_id'])
    # disp_loss_dist_img = disp_loss_op(train_model.outputs['job_id'])
    


In [10]:
# Where the raw bearing sensor data is read from.
SOURCE_BUCKET_NAME = "amazing-public-data"
PREFIX = "bearing_sensor_data/bearing_sensor_data/"

# Where the raw data is written: the project bucket and the target file name.
DEST_FILE_NAME = "raw_bearing_data.csv"
DEST_BUCKET_NAME = BUCKET_NAME

# Run bookkeeping; not referenced by the other visible cells.
RUN_ID = "Run_001"
EXPERIMENT_NAME = "AnomalyDetector"

## Compile the pipeline

In [11]:
# Compile the pipeline function into the JSON job spec that the submission
# cell passes to aiplatform.PipelineJob.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='nasa_iot_training.json',
)

## Submit a Run

In [12]:
# Point the Vertex AI SDK at the target project and region.
aiplatform.init(location=REGION, project=PROJECT_ID)

# Build the job from the compiled spec. The parameter_values keys match the
# parameters of the `pipeline` function.
pipejob = aiplatform.PipelineJob(
    display_name='nasa_iot_training',
    template_path='nasa_iot_training.json',
    pipeline_root=ARTIFACT_STORE_URI,
    parameter_values=dict(
        project_id=PROJECT_ID,
        region=REGION,
        source_bucket_name=SOURCE_BUCKET_NAME,
        prefix=PREFIX,
        dest_bucket_name=DEST_BUCKET_NAME,
        dest_file_name=DEST_FILE_NAME,
        gcs_root=ARTIFACT_STORE_URI,
        dataset_location="US",
    ),
)

In [13]:
# Submit the PipelineJob built in the previous cell.
# NOTE(review): the recorded output of this cell fails while refreshing
# credentials ("invalid_grant: Bad Request"). Presumably the local Google
# credentials have expired or been revoked and need to be renewed (e.g.
# `gcloud auth application-default login`) before re-running -- confirm.
pipejob.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
ERROR:grpc._plugin_wrapping:AuthMetadataPluginCallback "<google.auth.transport.grpc.AuthMetadataPlugin object at 0x7f9990003eb0>" raised exception!
Traceback (most recent call last):
  File "/Users/ryanrusson/opt/miniconda3/envs/nasa-iot/lib/python3.8/site-packages/grpc/_plugin_wrapping.py", line 88, in __call__
    self._metadata_plugin(
  File "/Users/ryanrusson/opt/miniconda3/envs/nasa-iot/lib/python3.8/site-packages/google/auth/transport/grpc.py", line 101, in __call__
    callback(self._get_authorization_headers(context), None)
  File "/Users/ryanrusson/opt/miniconda3/envs/nasa-iot/lib/python3.8/site-packages/google/auth/transport/grpc.py", line 87, in _get_authorization_headers
    self._credentials.before_request(
  File "/Users/ryanrusson/opt/miniconda3/envs/nasa-iot/lib/python3.8/site-packages/google/auth/credentials.py", line 133, in before_request
    self.refresh(request)
  File "/Users/ryanrusson/opt/miniconda

ServiceUnavailable: 503 Getting metadata from plugin failed with error: ('invalid_grant: Bad Request', {'error': 'invalid_grant', 'error_description': 'Bad Request'})