### Write the pipeline definition file

In [1]:
%%writefile ./pipeline/sensor_training_pipeline.py
import os
import datetime
from func_components import load_raw_data
from func_components import split_data
from func_components import disp_loss
from jinja2 import Template
import kfp
from kfp.components import func_to_container_op
from kfp.dsl.types import Dict
from kfp.dsl.types import GCPProjectID
from kfp.dsl.types import GCPRegion
from kfp.dsl.types import GCSPath
from kfp.dsl.types import String
from kfp.gcp import use_gcp_secret

# Settings taken from the environment when this module is imported.
# Every value is a string, or None when the variable is not set.
BASE_IMAGE = os.environ.get('BASE_IMAGE')
TRAINER_IMAGE = os.environ.get('TRAINER_IMAGE')
DD_IMAGE = os.environ.get('DD_IMAGE')
RUNTIME_VERSION = os.environ.get('RUNTIME_VERSION')
PYTHON_VERSION = os.environ.get('PYTHON_VERSION')
COMPONENT_URL_SEARCH_PREFIX = os.environ.get('COMPONENT_URL_SEARCH_PREFIX')
USE_KFP_SA = os.environ.get('USE_KFP_SA')

# Component store that looks components up under the configured URL prefix only
# (no local search paths).
component_store = kfp.components.ComponentStore(
    local_search_paths=None,
    url_search_prefixes=[COMPONENT_URL_SEARCH_PREFIX],
)

# Prebuilt training component loaded from the component store.
caip_train_op = component_store.load_component('ml_engine/train')

# Ops built from the plain Python functions in func_components.
retrieve_raw_data_op = func_to_container_op(load_raw_data, base_image=BASE_IMAGE)
split_preprocess_data_op = func_to_container_op(split_data, base_image=BASE_IMAGE)

# NOTE(review): unlike the two ops above, no base_image is passed here, so this
# op presumably runs on the kfp default image -- confirm this is intentional.
disp_loss_op = func_to_container_op(disp_loss)

def datadescribe_op(gcs_root, filepath):
    """Build the pipeline step that runs the Data Describe container.

    Args:
        gcs_root: Value forwarded to the container's ``--gcs_root`` flag.
        filepath: Value forwarded to the container's ``--file`` flag.

    Returns:
        A ``kfp.dsl.ContainerOp`` named ``Run_Data_Describe`` that runs
        ``DD_IMAGE`` with the two flags above.
    """
    cli_args = ['--gcs_root', gcs_root, '--file', filepath]
    return kfp.dsl.ContainerOp(
        name='Run_Data_Describe',
        image=DD_IMAGE,
        arguments=cli_args,
    )


# Define the pipeline
@kfp.dsl.pipeline(
    name='Bearing Sensor Data Training',
    description='The pipeline for training and deploying an anomaly detector based on an autoencoder')
def pipeline_run(project_id,
                 region,
                 source_bucket_name,
                 prefix,
                 dest_bucket_name,
                 dest_file_name,
                 gcs_root,
                 dataset_location='US'):
    """Wire together the steps of the bearing-sensor training pipeline.

    Steps, in order: load the raw data, run Data Describe on the loaded file,
    preprocess and split the data, train on AI Platform, and run the loss
    display op on the training job.

    Args:
        project_id: Passed as the first argument to the training component.
        region: Passed to the training component as ``region``.
        source_bucket_name: Passed to the raw-data loading op.
        prefix: Passed to the raw-data loading op.
        dest_bucket_name: Passed to the raw-data loading op.
        dest_file_name: Passed to the raw-data loading op.
        gcs_root: Root used for the Data Describe step and for the training
            job directory (``<gcs_root>/jobdir/<run id>``).
        dataset_location: Currently not used by the pipeline body; kept so the
            pipeline's parameter list stays unchanged.
    """
    # Read in the raw sensor data from the public dataset and load in the project bucket
    raw_data = retrieve_raw_data_op(source_bucket_name,
                                    prefix,
                                    dest_bucket_name,
                                    dest_file_name)

    # Prepare some output from Data Describe. The returned op is not referenced
    # again, so it is not bound to a name.
    datadescribe_op(gcs_root, raw_data.outputs['dest_file_name'])

    # Preprocess and split the raw data by time.
    # The local is deliberately not called `split_data`, which would shadow the
    # function of that name imported at module level.
    # NOTE(review): the timestamp literal is presumably the train/test cut-off
    # and the meaning of `True` is defined by func_components.split_data -- confirm.
    split_task = split_preprocess_data_op(raw_data.outputs['dest_bucket_name'],
                                          raw_data.outputs['dest_file_name'],
                                          '2004-02-15 12:52:39',
                                          True)

    # Set up the training args
    train_args = ["--bucket", split_task.outputs['bucket_name'],
                  "--train_file", split_task.outputs['train_dest_file'],
                  "--test_file", split_task.outputs['test_dest_file']
                 ]

    job_dir = "{0}/{1}/{2}".format(gcs_root, 'jobdir', kfp.dsl.RUN_ID_PLACEHOLDER)

    # NOTE: this function body runs when the pipeline is compiled, so the
    # timestamp below is fixed at compile time and is identical for every run
    # of the compiled package.
    job_id_prefix = f'anomaly-detection-{datetime.datetime.now().strftime("%H%M%S")}_'

    # Train the model on AI Platform.
    # Honour the PYTHON_VERSION environment setting; fall back to the previous
    # hard-coded value when it is not set.
    train_model = caip_train_op(project_id,
                                region=region,
                                master_image_uri=TRAINER_IMAGE,
                                python_version=PYTHON_VERSION or "3.7",
                                job_id_prefix=job_id_prefix,
                                job_dir=job_dir,
                                args=train_args)

    # Expose artifacts to the Kubeflow UI.
    # NOTE(review): both steps call the same op with the same input; the second
    # was presumably meant to be a separate loss-distribution op -- confirm.
    disp_loss_op(train_model.outputs['job_id'])
    disp_loss_op(train_model.outputs['job_id'])
    


Overwriting ./pipeline/sensor_training_pipeline.py


### Set up the environment

In [2]:
# Deployment-specific settings used throughout the notebook.
# Passed to the pipeline as the `region` run parameter.
REGION = 'us-central1'
# Passed as --endpoint to the kfp CLI commands.
ENDPOINT = 'https://fda9da3634d2db2-dot-us-central2.pipelines.googleusercontent.com'
BUCKET_NAME = 'dlaw_bucket'
# Passed to the pipeline as the `gcs_root` run parameter.
ARTIFACT_STORE_URI = f'gs://{BUCKET_NAME}'
# Used to build the gcr.io image URIs and passed to the pipeline as `project_id`.
PROJECT_ID = "mwpmltr"

In [3]:
# Export the notebook settings as environment variables of the kernel process.
%env ENDPOINT=$ENDPOINT
%env PROJECT_ID=$PROJECT_ID
%env REGION=$REGION
%env BUCKET_NAME=$BUCKET_NAME
%env ARTIFACT_STORE_URI=$ARTIFACT_STORE_URI

env: ENDPOINT=https://fda9da3634d2db2-dot-us-central2.pipelines.googleusercontent.com
env: PROJECT_ID=ups-ai-ml-sandbox
env: REGION=us-central1
env: BUCKET_NAME=dlaw_bucket
env: ARTIFACT_STORE_URI=gs://dlaw_bucket


### Create the base image and load it into gcr.io

In [4]:
# Fully qualified gcr.io URI of the base image.
IMAGE_NAME = 'nasa-iot-base'
TAG = 'v1'
BASE_IMAGE = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}'

In [5]:
# DON'T RUN THIS IF THE IMAGE EXISTS!
# !gcloud builds submit --timeout 15m --tag $BASE_IMAGE base_image --async

### Create the training image from the base image and load it into gcr.io (TODO: consider using a single image)

In [6]:
# Fully qualified gcr.io URI of the training image.
IMAGE_NAME = 'nasa-iot-trainer'
TAG = 'v5'
TRAINER_IMAGE = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}'

In [7]:
# DON'T RUN THIS IF THE IMAGE EXISTS!
# !gcloud builds submit --timeout 15m --tag $TRAINER_IMAGE train_image --async

### Create the Data Describe image from the base image and load it into gcr.io

In [8]:
# Fully qualified gcr.io URI of the Data Describe image.
IMAGE_NAME = 'nasa-iot-datadescribe'
TAG = 'v1'
DD_IMAGE = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}'

In [9]:
# DON'T RUN THIS IF THE IMAGE EXISTS!
# !gcloud builds submit --timeout 15m --tag $DD_IMAGE dd_image --async

### Compile the Pipeline

In [10]:
# Settings that pipeline/sensor_training_pipeline.py reads with os.getenv when it
# is compiled.
# NOTE: %env stores values as strings, so the pipeline file receives USE_KFP_SA
# as the string 'False' (truthy in Python), not the boolean False.
USE_KFP_SA = False

# URL prefix under which the pipeline file's ComponentStore looks up components.
COMPONENT_URL_SEARCH_PREFIX = 'https://raw.githubusercontent.com/kubeflow/pipelines/0.2.5/components/gcp/'
RUNTIME_VERSION = '1.15'
PYTHON_VERSION = '3.7'

# Export the values as environment variables for the compile step.
%env USE_KFP_SA={USE_KFP_SA}
%env BASE_IMAGE={BASE_IMAGE}
%env TRAINER_IMAGE={TRAINER_IMAGE}
%env ENDPOINT={ENDPOINT}
%env DD_IMAGE={DD_IMAGE}
%env COMPONENT_URL_SEARCH_PREFIX={COMPONENT_URL_SEARCH_PREFIX}
%env RUNTIME_VERSION={RUNTIME_VERSION}
%env PYTHON_VERSION={PYTHON_VERSION}

env: USE_KFP_SA=False
env: BASE_IMAGE=gcr.io/ups-ai-ml-sandbox/nasa-iot-base:v1
env: TRAINER_IMAGE=gcr.io/ups-ai-ml-sandbox/nasa-iot-trainer:v5
env: ENDPOINT=https://fda9da3634d2db2-dot-us-central2.pipelines.googleusercontent.com
env: DD_IMAGE=gcr.io/ups-ai-ml-sandbox/nasa-iot-datadescribe:v1
env: COMPONENT_URL_SEARCH_PREFIX=https://raw.githubusercontent.com/kubeflow/pipelines/0.2.5/components/gcp/
env: RUNTIME_VERSION=1.15
env: PYTHON_VERSION=3.7


In [11]:
# Compile the pipeline definition into the YAML package that is uploaded next.
!dsl-compile --py pipeline/sensor_training_pipeline.py --output sensor_training_pipeline.yaml

### Upload and List the Pipeline in AI Platform Pipelines

In [12]:
# Name under which the compiled pipeline is registered at the endpoint.
PIPELINE_NAME='bearing_sensor_anomaly_v1.0'

# Upload the compiled YAML package; the output reports the new pipeline's ID.
!kfp --endpoint $ENDPOINT pipeline upload \
-p $PIPELINE_NAME \
sensor_training_pipeline.yaml

Pipeline 17e93454-fb5b-40ba-8d17-39322c3f1d3b has been submitted

Pipeline Details
------------------
ID           17e93454-fb5b-40ba-8d17-39322c3f1d3b
Name         bearing_sensor_anomaly_v1.0
Description
Uploaded at  2020-12-09T03:08:11+00:00
+--------------------+-----------------+
| Parameter Name     | Default Value   |
| project_id         |                 |
+--------------------+-----------------+
| region             |                 |
+--------------------+-----------------+
| source_bucket_name |                 |
+--------------------+-----------------+
| prefix             |                 |
+--------------------+-----------------+
| dest_bucket_name   |                 |
+--------------------+-----------------+
| dest_file_name     |                 |
+--------------------+-----------------+
| gcs_root           |                 |
+--------------------+-----------------+
| dataset_location   | US              |
+--------------------+-----------------+


In [13]:
# List the pipelines registered at the endpoint.
!kfp --endpoint $ENDPOINT pipeline list

+--------------------------------------+-------------------------------------------------+---------------------------+
| Pipeline ID                          | Name                                            | Uploaded at               |
| 17e93454-fb5b-40ba-8d17-39322c3f1d3b | bearing_sensor_anomaly_v1.0                     | 2020-12-09T03:08:11+00:00 |
+--------------------------------------+-------------------------------------------------+---------------------------+
| 4a62afc3-e9c3-450d-8500-1ea3454703fb | [Tutorial] DSL - Control structures             | 2020-12-09T01:06:29+00:00 |
+--------------------------------------+-------------------------------------------------+---------------------------+
| da69e0c9-35e0-4ea5-a80c-223ab7dee137 | [Tutorial] Data passing in python components    | 2020-12-09T01:06:28+00:00 |
+--------------------------------------+-------------------------------------------------+---------------------------+
| 9873fae7-3ecc-4299-90a3-769f633f075f | [Demo] 

### Submit a Run

In [14]:
# Pipeline ID copied by hand from the upload output; update it after each new upload.
PIPELINE_ID='17e93454-fb5b-40ba-8d17-39322c3f1d3b'

In [15]:
# Experiment and run name passed to `kfp run submit`.
EXPERIMENT_NAME = 'AnomalyDetector'
RUN_ID = 'Run_001'
# Pipeline parameters for the raw-data loading step.
SOURCE_BUCKET_NAME = 'amazing-public-data'
PREFIX = 'bearing_sensor_data/bearing_sensor_data/'
# The destination is the project bucket configured in the environment setup cell.
DEST_BUCKET_NAME = BUCKET_NAME
DEST_FILE_NAME = 'raw_bearing_data.csv'

In [16]:
# Export the run settings as environment variables of the kernel process.
%env PIPELINE_ID=$PIPELINE_ID
%env EXPERIMENT_NAME=$EXPERIMENT_NAME
%env RUN_ID=$RUN_ID
%env SOURCE_BUCKET_NAME=$SOURCE_BUCKET_NAME
%env PREFIX=$PREFIX
%env DEST_BUCKET_NAME=$DEST_BUCKET_NAME
%env DEST_FILE_NAME=$DEST_FILE_NAME

env: PIPELINE_ID=17e93454-fb5b-40ba-8d17-39322c3f1d3b
env: EXPERIMENT_NAME=AnomalyDetector
env: RUN_ID=Run_001
env: SOURCE_BUCKET_NAME=amazing-public-data
env: PREFIX=bearing_sensor_data/bearing_sensor_data/
env: DEST_BUCKET_NAME=dlaw_bucket
env: DEST_FILE_NAME=raw_bearing_data.csv


In [21]:
# Submit a run of the uploaded pipeline: -e experiment name, -r run name,
# -p pipeline ID, followed by the pipeline parameters as key=value pairs.
# dataset_location is not passed, so the pipeline default ('US') applies.
!kfp --endpoint $ENDPOINT run submit \
-e $EXPERIMENT_NAME \
-r $RUN_ID \
-p $PIPELINE_ID \
project_id=$PROJECT_ID \
gcs_root=$ARTIFACT_STORE_URI \
region=$REGION \
source_bucket_name=$SOURCE_BUCKET_NAME \
prefix=$PREFIX \
dest_bucket_name=$DEST_BUCKET_NAME \
dest_file_name=$DEST_FILE_NAME

Run 8eebcc03-cb9d-41d2-b35e-c6242fffd153 is submitted
+--------------------------------------+---------+----------+---------------------------+
| run id                               | name    | status   | created at                |
| 8eebcc03-cb9d-41d2-b35e-c6242fffd153 | Run_001 |          | 2020-12-09T03:10:34+00:00 |
+--------------------------------------+---------+----------+---------------------------+
