In [51]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Automated MLOps pipeline build, testing and deployment

In the previous notebook, you created a machine learning pipeline to train a model. In this session, it's all about automating the training and deployment of this model. Hence, the objective this week is to:

1. Write a pipeline into a Python file that can be compiled into a pipeline definition file (`pipeline.json`) in an automated fashion.
2. Define some dummy unit tests.
3. Write a script to deploy a compiled Kubeflow pipeline to Vertex AI.
4. Use Cloud Build (CI/CD) to compile, test, and run your Kubeflow pipeline.


In [52]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
BUCKET_NAME = f"{PROJECT_ID}-mlops" 
REGION = "us-central1"

# Experiments
EXPERIMENT_NAME = "test-experiment"
PIPELINE_NAME = "mlops-pipeline-prod"



### Create a script containing your Vertex AI/Kubeflow Pipeline to compile the pipeline into `pipeline.json`

> <font color='green'>**Task 1**</font>
>
> Create a Python script `src/pipeline.py` that creates a file named `pipeline.json` from the Kubeflow pipeline you developed last week. With the KFP SDK version pinned in this notebook (`kfp==1.8.18`), the compiled output is in JSON format.
>

In [53]:
# Create the directory that the %%writefile cells below write into.
!mkdir -p src

In [54]:
%%writefile src/requirements.txt
# Kubeflow Pipelines SDK; pipeline.py imports kfp.v2.compiler and kfp.v2.dsl.
kfp==1.8.18
# Test tooling (the notebook runs the tests with unittest; pytest is offered as an alternative).
pytest==7.2.0
pytz==2022.7
# Vertex AI SDK; submit-pipeline.py uses aiplatform.PipelineJob.
google-cloud-aiplatform==1.20.0
# Pinned Google client libraries.
google-api-core==2.10.2
google-auth==1.35.0
google-cloud-bigquery==1.20.0
google-cloud-core==1.7.3
google-cloud-resource-manager==1.6.3
google-cloud-storage==2.2.1

Writing src/requirements.txt


In [55]:
%%writefile src/pipeline.py

import argparse

import kfp.v2.compiler as compiler
import kfp.v2.dsl as dsl


@dsl.component(packages_to_install=["scikit-learn", "pandas", "joblib"])
def model_training_op(
        dataset: dsl.Input[dsl.Dataset],
        model: dsl.Output[dsl.Model]
):
    """Train a random-forest classifier on the extracted data and persist it.

    Reads the CSV files under ``<dataset.path>/train`` and ``<dataset.path>/val``,
    fits a ``RandomForestClassifier`` on the training split and writes
    ``model.joblib`` plus ``metrics.json`` (AUC, confusion matrix, ROC curve)
    into ``model.path``.

    Args:
        dataset: Input dataset artifact whose path holds the ``train`` and ``val`` CSV data.
        model: Output model artifact; its path receives the model and metrics files.
    """
    import glob
    import json
    import os

    import joblib
    import pandas as pd

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import roc_auc_score
    from sklearn.metrics import roc_curve
    from sklearn.model_selection import train_test_split

    TARGET_COLUMN = "tip_bin"
    TARGET_LABELS = ["tip<20%", "tip>=20%"]
    POSITIVE_CLASS = 1
    RANDOM_STATE = 42

    def get_dataframe(path: str):
        """Load one CSV file, a glob of CSV files, or every CSV in a directory."""
        if os.path.isdir(path):  # base data directory is passed
            files = glob.glob(f"{path}/*.csv")
        elif "*" in path:  # a glob expression is passed
            files = glob.glob(path)
        else:  # single file is passed
            files = [path]
        if not files:
            # pd.concat would otherwise fail with an unhelpful "No objects to concatenate".
            raise FileNotFoundError(f"No CSV files found for '{path}'")
        frames = [pd.read_csv(f, header=0) for f in files]
        return pd.concat(frames, ignore_index=True)

    def create_datasets(training_data_dir: str, validation_data_dir: str):
        """Creates training and validation datasets."""
        train_dataset = get_dataframe(training_data_dir)

        if validation_data_dir:
            return train_dataset, get_dataframe(validation_data_dir)
        # No validation data given: hold out a quarter of the training data.
        return train_test_split(train_dataset, test_size=.25, random_state=RANDOM_STATE)

    def log_metrics(y_true: pd.Series, y_pred, y_score, output_dir: str):
        """Write AUC, confusion matrix and ROC curve to ``<output_dir>/metrics.json``.

        The ROC curve and AUC use the positive-class scores (``y_score``); the
        confusion matrix uses the hard predictions (``y_pred``).
        """
        curve = roc_curve(y_true=y_true, y_score=y_score)
        auc = roc_auc_score(y_true=y_true, y_score=y_score)
        cm = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])

        metrics = {
            "auc": auc,
            "confusion_matrix": {
                "categories": TARGET_LABELS,
                "matrix": cm.tolist(),
            },
            "roc_curve": {
                "fpr": curve[0].tolist(),
                "tpr": curve[1].tolist(),
                "thresholds": curve[2].tolist(),
            },
        }
        with open(f"{output_dir}/metrics.json", "w") as f:
            json.dump(metrics, f, indent=2)

    def split(df: pd.DataFrame):
        """Separate the feature columns from the target column."""
        return df.drop(TARGET_COLUMN, axis=1), df[TARGET_COLUMN]

    def train(training_data_dir: str, validation_data_dir: str, output_dir: str):
        """Fit the classifier, save it with its metrics, and return its accuracy."""
        train_df, val_df = create_datasets(training_data_dir, validation_data_dir)

        X_train, y_train = split(train_df)
        X_test, y_test = split(val_df)

        # Seeded so repeated pipeline runs on the same data give the same model.
        classifier = RandomForestClassifier(random_state=RANDOM_STATE)
        classifier.fit(X_train, y_train)

        os.makedirs(output_dir, exist_ok=True)
        joblib.dump(classifier, f"{output_dir}/model.joblib")

        y_pred = classifier.predict(X_test)
        # ROC/AUC need a continuous score; hard labels collapse the curve to one point.
        positive_column = list(classifier.classes_).index(POSITIVE_CLASS)
        y_score = classifier.predict_proba(X_test)[:, positive_column]
        log_metrics(y_test, y_pred, y_score, output_dir)

        return classifier.score(X_test, y_test)

    train(f"{dataset.path}/train", f"{dataset.path}/val", f"{model.path}")


@dsl.component()
def data_validation_op(dataset: dsl.Input[dsl.Dataset]) -> str:
    """Placeholder validation step: reports every dataset as valid without inspecting it."""
    validation_status = "valid"
    return validation_status


@dsl.component()
def data_preparation_op():
    """Placeholder preparation step: performs no work."""


@dsl.component(packages_to_install=["google-cloud-aiplatform"])
def model_validation_op(
        metrics: dsl.Input[dsl.ClassificationMetrics],
        threshold_auc: float = 0.50
) -> str:
    """Compare the logged AUC with a threshold.

    Args:
        metrics: Metrics artifact whose metadata carries an ``auc`` entry.
        threshold_auc: AUC value the model must strictly exceed.

    Returns:
        ``"valid"`` when the AUC is above the threshold, otherwise ``"not_valid"``.
    """
    observed_auc = metrics.metadata["auc"]
    if observed_auc > threshold_auc:
        return "valid"
    return 'not_valid'


@dsl.component(packages_to_install=["google-cloud-aiplatform"])
def model_upload_op(
        model: dsl.Input[dsl.Model],
        serving_container_image_uri: str,
        project_id: str,
        location: str,
        model_name: str) -> str:
    """Upload the trained model to the Vertex AI Model Registry.

    When a model with the same display name already exists, the upload is
    registered as a new version of that model; otherwise a new model is created.

    Args:
        model: Model artifact whose URI holds the serialized model.
        serving_container_image_uri: Container image used to serve predictions.
        project_id: Google Cloud project to upload into.
        location: Vertex AI region to upload into.
        model_name: Display name of the model in the registry.

    Returns:
        The versioned resource name of the uploaded model.
    """
    from google.cloud import aiplatform

    aiplatform.init(project=project_id, location=location)
    # Quote the value so display names containing spaces or punctuation form a valid filter.
    matches = aiplatform.Model.list(filter=f'display_name="{model_name}"')
    parent_model = matches[0].resource_name if matches else None

    registered_model = aiplatform.Model.upload(
        display_name=model_name,
        parent_model=parent_model,
        artifact_uri=model.uri,
        serving_container_image_uri=serving_container_image_uri
    )

    return registered_model.versioned_resource_name


@dsl.component(packages_to_install=["google-cloud-aiplatform"])
def model_evaluation_upload_op(
        metrics: dsl.Input[dsl.ClassificationMetrics],
        model_resource_name: str,
        project_id: str,
        location: str):
    """Attach the pipeline's AUC to a registered model as a model evaluation.

    Args:
        metrics: Metrics artifact whose metadata carries an ``auc`` entry.
        model_resource_name: Resource name of the model the evaluation is imported into.
        project_id: Google Cloud project of the model.
        location: Region of the model; also selects the regional API endpoint.
    """
    from google.api_core import gapic_v1
    from google.cloud import aiplatform
    from google.protobuf.struct_pb2 import Struct
    from google.protobuf.struct_pb2 import Value

    # The evaluation carries a single metric: the AUC, stored under the key "auRoc".
    auc_value = Value(number_value=metrics.metadata["auc"])
    evaluation_payload = {
        "display_name": "pipeline-eval",
        "metrics": Value(struct_value=Struct(fields={"auRoc": auc_value})),
        "metrics_schema_uri": "gs://google-cloud-aiplatform/schema/modelevaluation/classification_metrics_1.0.0.yaml"
    }

    aiplatform.init(project=project_id, location=location)

    # The low-level client is pointed at the regional endpoint explicitly.
    regional_endpoint = location + '-aiplatform.googleapis.com'
    caller_info = gapic_v1.client_info.ClientInfo(user_agent="google-cloud-pipeline-components")
    model_service = aiplatform.gapic.ModelServiceClient(
        client_info=caller_info,
        client_options={"api_endpoint": regional_endpoint},
    )
    model_service.import_model_evaluation(
        parent=model_resource_name,
        model_evaluation=evaluation_payload,
    )


@dsl.component()
def model_evaluation_op(model: dsl.Input[dsl.Model], metrics: dsl.Output[dsl.ClassificationMetrics]):
    """Publish the metrics stored next to the model as a classification-metrics artifact.

    Args:
        model: Model artifact whose path contains ``metrics.json``.
        metrics: Output artifact receiving the confusion matrix, the ROC curve and the AUC.
    """
    import json

    metrics_file = f"{model.path}/metrics.json"
    with open(metrics_file, "r") as handle:
        stored = json.load(handle)

    confusion = stored["confusion_matrix"]
    metrics.log_confusion_matrix(
        categories=confusion["categories"],
        matrix=confusion["matrix"],
    )

    roc = stored["roc_curve"]
    metrics.log_roc_curve(
        fpr=roc["fpr"],
        tpr=roc["tpr"],
        threshold=roc["thresholds"],
    )

    # The AUC goes into the metadata, where downstream components read it.
    metrics.metadata["auc"] = stored["auc"]


@dsl.component(packages_to_install=["google-cloud-bigquery"])
def data_extract_op(project_id: str, location: str, dataset: dsl.Output[dsl.Dataset]):
    """Export train/val/test samples of the NYC yellow-taxi trips table to CSV.

    For each split a BigQuery ``EXPORT DATA`` job writes CSV files (with header)
    into ``<dataset.path>/<split>``.

    Args:
        project_id: Project the BigQuery jobs run in.
        location: Currently ignored; the jobs always run in the ``us`` location
            because that is where the source dataset lives.
        dataset: Output dataset artifact receiving the ``train``, ``val`` and
            ``test`` sub-directories.
    """
    import os

    from google.cloud import bigquery

    client = bigquery.Client()
    query = """
    EXPORT DATA OPTIONS(
        uri='{path}/*.csv',
        format='CSV',
        overwrite=true,
        header=true,
        field_delimiter=',') AS
    SELECT
        EXTRACT(MONTH from pickup_datetime) as trip_month,
        EXTRACT(DAY from pickup_datetime) as trip_day,
        EXTRACT(DAYOFWEEK from pickup_datetime) as trip_day_of_week,
        EXTRACT(HOUR from pickup_datetime) as trip_hour,
        TIMESTAMP_DIFF(dropoff_datetime, pickup_datetime, SECOND) as trip_duration,
        trip_distance,
        payment_type,
        pickup_location_id as pickup_zone,
        dropoff_location_id as dropoff_zone,
        IF((SAFE_DIVIDE(tip_amount, fare_amount) >= 0.2), 1, 0) AS tip_bin
    FROM
        `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_{year}` TABLESAMPLE SYSTEM (1 PERCENT)
    WHERE
        TIMESTAMP_DIFF(dropoff_datetime, pickup_datetime, SECOND) BETWEEN 300 AND 10800
    LIMIT {limit}
    """
    # (sub-directory, table year, row limit) for each exported split.
    splits = [
        ("train", 2020, 10000),
        ("val", 2020, 5000),
        ("test", 2020, 1000)
    ]
    for split_name, year, limit in splits:
        local_dir = f"{dataset.path}/{split_name}"
        # BigQuery needs a gs:// URI; the artifact path uses the /gcs/ prefix.
        gcs_uri = local_dir.replace("/gcs/", "gs://", 1)
        # Create the directory on the artifact path, not on the gs:// URI
        # (which os.makedirs would treat as a relative local path named "gs:").
        os.makedirs(local_dir, exist_ok=True)
        # ignoring the provided location as this dataset is in US
        job = client.query(
            query.format(path=gcs_uri, year=year, limit=limit),
            project=project_id,
            location="us",
        )
        job.result()



# Pipeline graph: extract data -> validate data -> prepare data -> train ->
# evaluate -> validate model -> (only if valid) register model and its evaluation.
@dsl.pipeline(name="taxi-tips-training")
def training_pipeline(
        project_id: str, location: str):
    # Display name under which the model is registered.
    model_name = "taxi-tips"

    data_extraction_task = data_extract_op(
        project_id=project_id, location=location
    ).set_display_name("extract-data")

    data_validation_task = data_validation_op(
        dataset=data_extraction_task.outputs["dataset"]
    ).set_display_name("validate-data")

    # Preparation has no data dependency, so it is ordered explicitly after validation.
    data_preparation_task = data_preparation_op().set_display_name("prepare-data")
    data_preparation_task.after(data_validation_task)

    # Training consumes the extracted dataset directly and is ordered after preparation.
    training_task = model_training_op(
        dataset=data_extraction_task.outputs["dataset"],
    ).set_display_name("train-model")
    training_task.after(data_preparation_task)

    model_evaluation_task = model_evaluation_op(
        model=training_task.outputs["model"]
    ).set_display_name("evaluate-model")

    # Uses the component's default AUC threshold.
    model_validation_task = model_validation_op(
        metrics=model_evaluation_task.outputs["metrics"],
    ).set_display_name("validate-model")

    # Registration steps run only when the validation step returned "valid".
    with dsl.Condition(model_validation_task.output == "valid", name="check-performance"):
        # NOTE(review): the serving image comes from the europe registry regardless of
        # `location` — confirm this is intended.
        model_upload_task = model_upload_op(
            model=training_task.outputs["model"],
            model_name=model_name,
            serving_container_image_uri="europe-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-23:latest",
            project_id=project_id,
            location=location
        ).set_display_name("register-model")

        # The evaluation is imported into the model version returned by the upload step.
        model_evaluation_upload_task = model_evaluation_upload_op(
            metrics=model_evaluation_task.outputs["metrics"],
            model_resource_name=model_upload_task.output,
            project_id=project_id,
            location=location
        ).set_display_name("register-model-evaluation")

def compile(filename: str):
    """Compile ``training_pipeline`` and write the pipeline definition to ``filename``."""
    compiler.Compiler().compile(
        pipeline_func=training_pipeline,
        package_path=filename,
    )


if __name__ == "__main__":
    # Command-line entry point: compile the pipeline into the requested file.
    cli = argparse.ArgumentParser()
    cli.add_argument("--pipeline-file-name", type=str, default="pipeline.json")
    output_file = cli.parse_args().pipeline_file_name
    compile(output_file)


Writing src/pipeline.py


Using the next command, you can test the materialized pipeline generated by your script. You can view the output in a file named `pipeline.json`.

In [56]:
# Compile the pipeline; with no arguments the script writes pipeline.json to the current directory.
!python src/pipeline.py



In [57]:
# Preview the first 20 lines of the compiled pipeline definition.
!head -n20 pipeline.json

{
  "pipelineSpec": {
    "components": {
      "comp-condition-check-performance-1": {
        "dag": {
          "tasks": {
            "model-evaluation-upload-op": {
              "cachingOptions": {
                "enableCache": true
              },
              "componentRef": {
                "name": "comp-model-evaluation-upload-op"
              },
              "dependentTasks": [
                "model-upload-op"
              ],
              "inputs": {
                "artifacts": {
                  "metrics": {
                    "componentInputArtifact": "pipelineparam--model-evaluation-op-metrics"


### Test the Pipeline

> <font color='green'>**Task 2**</font>
> Write unit/integration tests for the pipeline you created to ensure the component logic that you added works as expected

In [58]:
# Create the directory that holds the test modules.
!mkdir -p src/tests

In [59]:
%%writefile src/tests/test_pipeline.py

import json
import os
import tempfile
import unittest

from pipeline import compile as compile_pipeline
from pipeline import training_pipeline


class TestBasicPipeline(unittest.TestCase):
    """Smoke tests for the pipeline defined in ``pipeline.py``."""

    def test_pipeline(self):
        """The pipeline compiles into a JSON definition that lists its components."""
        self.assertTrue(callable(training_pipeline))

        # Compile into a throw-away directory so the test leaves no files behind.
        with tempfile.TemporaryDirectory() as tmp_dir:
            package_path = os.path.join(tmp_dir, "pipeline.json")
            compile_pipeline(package_path)
            with open(package_path) as f:
                compiled = json.load(f)

        self.assertIn("pipelineSpec", compiled)
        self.assertIn("components", compiled["pipelineSpec"])


if __name__ == '__main__':
    unittest.main()

Writing src/tests/test_pipeline.py


Using the next command, you can run the tests in the script using python `unittest` test runner. It discovers all the test files that start with `test_*`

You can also use another testing framework of your choice (e.g. `pytest`).

In [60]:
# Put src/ on the import path so the tests can import `pipeline`, then run unittest discovery in src/tests/.
!PYTHONPATH=src python -m unittest discover -s src/tests/

.
----------------------------------------------------------------------
Ran 1 test in 0.000s

OK


### Create a script to submit your compiled Kubeflow pipeline (`pipeline.json`) to Vertex AI

In [61]:
%%writefile src/submit-pipeline.py
"""Submit a compiled pipeline definition to Vertex AI Pipelines.

Configuration is read from environment variables:

* ``REGION``, ``BUCKET_NAME``, ``PIPELINE_NAME`` -- required.
* ``PROJECT_ID`` -- optional; falls back to the project of the default credentials.
* ``EXPERIMENT_NAME``, ``ENDPOINT_NAME`` -- optional, with dummy defaults.
* ``PIPELINE_TEMPLATE_PATH`` -- optional; defaults to ``pipeline.json``.
* ``SUBMIT_PIPELINE_SYNC`` -- ``true``/``1``/``t`` makes the script wait for the run.
"""
import os

from google.cloud import aiplatform
import google.auth

PROJECT_ID = os.getenv("PROJECT_ID")
if not PROJECT_ID:
    # Only the project is needed here; the SDK resolves credentials itself.
    _, PROJECT_ID = google.auth.default()
if not PROJECT_ID:
    # Fail early with a clear message instead of an obscure error from the SDK.
    raise RuntimeError(
        "No Google Cloud project found: set PROJECT_ID or configure a default project."
    )

REGION = os.environ["REGION"]
BUCKET_NAME = os.environ["BUCKET_NAME"]
PIPELINE_NAME = os.environ["PIPELINE_NAME"]
EXPERIMENT_NAME = os.environ.get("EXPERIMENT_NAME", "dummy-experiment")
# Read for parity with the build configuration; not used by this script.
ENDPOINT_NAME = os.environ.get("ENDPOINT_NAME", "dummy-endpoint")
# The compiled pipeline definition; the default matches what pipeline.py writes.
PIPELINE_TEMPLATE_PATH = os.environ.get("PIPELINE_TEMPLATE_PATH", "pipeline.json")

aiplatform.init(project=PROJECT_ID, location=REGION)
sync_pipeline = os.getenv("SUBMIT_PIPELINE_SYNC", 'False').lower() in ('true', '1', 't')

job = aiplatform.PipelineJob(
    display_name=PIPELINE_NAME,
    template_path=PIPELINE_TEMPLATE_PATH,
    location=REGION,
    project=PROJECT_ID,
    enable_caching=True,
    pipeline_root=f'gs://{BUCKET_NAME}',
    parameter_values={'project_id': PROJECT_ID, 'location': REGION}
)
print(f"Submitting pipeline {PIPELINE_NAME} in experiment {EXPERIMENT_NAME}.")
job.submit(experiment=EXPERIMENT_NAME)

if sync_pipeline:
    # Wait for the run to finish before the script exits.
    job.wait()

Writing src/submit-pipeline.py


Let's test this script in the Notebook. You can check the pipeline's status by clicking on the link printed by the script.

In [62]:
# Export the notebook configuration as environment variables for the submit script.
# Each $NAME is expanded from the notebook variable of the same name; when that
# variable is not defined in the kernel, the literal text (e.g. "$ENDPOINT_NAME")
# is exported instead.
%set_env REGION=$REGION
%set_env BUCKET_NAME=$BUCKET_NAME
%set_env EXPERIMENT_NAME=$EXPERIMENT_NAME
%set_env PIPELINE_NAME=$PIPELINE_NAME
%set_env ENDPOINT_NAME=$ENDPOINT_NAME
# Make the script wait for the pipeline run to finish.
%set_env SUBMIT_PIPELINE_SYNC=1

!python src/submit-pipeline.py

env: REGION=us-central1
env: BUCKET_NAME=qinetiq-workshop23lon-5240-mlops
env: EXPERIMENT_NAME=test-experiment
env: PIPELINE_NAME=mlops-pipeline-prod
env: ENDPOINT_NAME=$ENDPOINT_NAME
env: SUBMIT_PIPELINE_SYNC=1
Submitting pipeline mlops-pipeline-prod in experiment test-experiment.
Creating PipelineJob
PipelineJob created. Resource name: projects/450304494793/locations/us-central1/pipelineJobs/taxi-tips-training-20230122184911
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/450304494793/locations/us-central1/pipelineJobs/taxi-tips-training-20230122184911')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/taxi-tips-training-20230122184911?project=450304494793
Associating projects/450304494793/locations/us-central1/pipelineJobs/taxi-tips-training-20230122184911 to Experiment: test-experiment
PipelineJob projects/450304494793/locations/us-central1/pipelineJobs/taxi-tips-training-2023012218491

### Automate Kubeflow pipeline compilation, template generation, and execution through Cloud Build

Cloud Build is a service that executes your builds on Google Cloud. In this exercise, we want to use it to both compile and run your machine learning pipeline. For more information, please refer to the [Cloud Build documentation](https://cloud.google.com/build/docs/overview).

In [32]:
%%writefile src/cloudbuild.yaml
# CI/CD for the training pipeline: install dependencies, compile, test, upload
# the compiled definition, then run the pipeline for QA (blocking) and prod
# (non-blocking). The Python image is pinned so the pinned packages in
# requirements.txt install against a known interpreter version.
steps:
  # Install dependencies
  # NOTE(review): --user presumably installs into the home directory that later
  # steps share — confirm.
  - name: 'python:3.9'
    entrypoint: 'pip'
    args: ["install", "-r", "requirements.txt", "--user"]

  # Compile pipeline
  - name: 'python:3.9'
    entrypoint: 'python'
    args: ['pipeline.py']
    id: 'compile'

  # Test the Pipeline Components
  - name: 'python:3.9'
    entrypoint: 'python'
    args: ['-m', 'unittest', 'discover', 'tests/']
    id: 'test_pipeline'
    waitFor: ['compile']

  # Upload compiled pipeline to GCS
  - name: 'gcr.io/cloud-builders/gsutil'
    args: ['cp', 'pipeline.json', 'gs://${_BUCKET_NAME}']
    id: 'upload'
    waitFor: ['test_pipeline']

  # Run the Vertex AI Pipeline (synchronously for test/qa environment).
  - name: 'python:3.9'
    id: 'test'
    entrypoint: 'python'
    env: ['BUCKET_NAME=${_BUCKET_NAME}', 'EXPERIMENT_NAME=qa-${_EXPERIMENT_NAME}', 'PIPELINE_NAME=${_PIPELINE_NAME}',
          'REGION=${_REGION}', 'ENDPOINT_NAME=qa-${_ENDPOINT_NAME}', 'SUBMIT_PIPELINE_SYNC=true']
    args: ['submit-pipeline.py']

  # Run the Vertex AI Pipeline (asynchronously for prod environment). In a real production scenario, this would run in a different GCP project.
  - name: 'python:3.9'
    id: 'prod'
    entrypoint: 'python'
    env: ['BUCKET_NAME=${_BUCKET_NAME}', 'EXPERIMENT_NAME=prod-${_EXPERIMENT_NAME}', 'PIPELINE_NAME=${_PIPELINE_NAME}',
          'REGION=${_REGION}', 'ENDPOINT_NAME=prod-${_ENDPOINT_NAME}', 'SUBMIT_PIPELINE_SYNC=false']
    args: ['submit-pipeline.py']
    

Writing src/cloudbuild.yaml


Cloud Build uses a special service account to execute builds on your behalf. When you enable the Cloud Build API on a Google Cloud project, the Cloud Build service account is automatically created and granted the Cloud Build Service Account role for the project. This role gives the service account permissions to perform several tasks, however you can grant more permissions to the service account to perform additional tasks. [This page](https://cloud.google.com/build/docs/securing-builds/configure-access-for-cloud-build-service-account) explains how to grant and revoke permissions to the Cloud Build service account.

For Cloud Build to be able to deploy your pipeline, you need to give its service account `{PROJECT_NUMBER}@cloudbuild.gserviceaccount.com` the **Vertex AI User** and **Service Account User** roles. (This step was already done as part of the workshop setup.)

Now you are ready to trigger this pipeline. This CI/CD pipeline can be embedded in your repository and triggered whenever a file changes, with any
Git provider; see more about triggers [here](https://cloud.google.com/build/docs/automating-builds/create-manage-triggers).

In [48]:
# Upload ./src as the build source and run it with src/cloudbuild.yaml, passing the
# notebook configuration as substitutions. Every $NAME must be defined in the kernel,
# otherwise the literal text is passed through.
!gcloud builds submit ./src --config=src/cloudbuild.yaml --substitutions=_BUCKET_NAME=$BUCKET_NAME,_EXPERIMENT_NAME=$EXPERIMENT_NAME,_PIPELINE_NAME=$PIPELINE_NAME,_REGION=$REGION,_ENDPOINT_NAME=$ENDPOINT_NAME

Creating temporary tarball archive of 7 file(s) totalling 27.0 KiB before compression.
Uploading tarball of [./src] to [gs://qinetiq-workshop23lon-5240_cloudbuild/source/1674412711.713397-699cbb6eff4b4cb39f6b826850350ccc.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/qinetiq-workshop23lon-5240/locations/global/builds/aa69153a-7355-4855-98f6-0396eb6ba97e].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/aa69153a-7355-4855-98f6-0396eb6ba97e?project=450304494793 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "aa69153a-7355-4855-98f6-0396eb6ba97e"

FETCHSOURCE
Fetching storage object: gs://qinetiq-workshop23lon-5240_cloudbuild/source/1674412711.713397-699cbb6eff4b4cb39f6b826850350ccc.tgz#1674412711972244
Copying gs://qinetiq-workshop23lon-5240_cloudbuild/source/1674412711.713397-699cbb6eff4b4cb39f6b826850350ccc.tgz#1674412711972244...
/ [1 files][ 10.1 KiB/ 10.1 KiB]                                