### Vertex AI Pipelines Demo
* Submits Vertex AI Training job
* Training job submits Dataproc Serverless batch job for preprocessing
* Results stored in GCS bucket
* Preprocessing.py only splits the data into two categories (anomaly vs. non-anomaly) based on certain column values

In [None]:
# !pip install google-cloud-pipeline-components

In [None]:
# !pyspark --version

In [1]:
# Notebook-wide settings.
PROJECT_ID = "gurkomal-playground"  # GCP project handed to aiplatform.init
REGION = "us-central1"
BUCKET_URI = "gs://avoxi_workshop_bucket"  # parent of the Vertex AI staging folder

In [2]:
import google.cloud.aiplatform as aiplatform
from kfp.v2 import dsl
from kfp.v2.dsl import component, Output, Dataset, Model
from kfp.v2 import compiler
from datetime import datetime

  from kfp.v2 import dsl


In [4]:
# Configure the Vertex AI SDK once for the whole notebook. REGION is passed
# explicitly so the location does not silently depend on the SDK default.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=f'{BUCKET_URI}/staging',
)

In [5]:
from kfp import local

# Run component calls made in this notebook locally, inside Docker containers.
# NOTE(review): raise_on_error=False presumably makes a failed task report in
# the streamed logs instead of raising here — confirm against the KFP docs.
local.init(
    runner=local.DockerRunner(),
    raise_on_error=False,
)

In [6]:
@component(packages_to_install=["google-cloud-bigquery==3.25.0"])
def create_bigquery_tables(
    anomaly_data_path: str,
    no_anomaly_data_path: str,
    anomaly_table_name: str,
    no_anomaly_table_name: str,
):
    """Register two external BigQuery tables backed by CSV files.

    Each table reads every object matching ``<data_path>*.csv`` and lets
    BigQuery autodetect the schema. Tables are created with
    ``exists_ok=True``, so a table that already exists is left as it is
    (it is NOT replaced).

    Args:
        anomaly_data_path: URI prefix of the anomaly CSV files.
        no_anomaly_data_path: URI prefix of the non-anomaly CSV files.
        anomaly_table_name: Table id for the anomaly table.
        no_anomaly_table_name: Table id for the non-anomaly table.
    """
    from google.cloud import bigquery

    bq = bigquery.Client()

    # (table id, CSV prefix) pairs, processed in the same order as before:
    # anomaly first, then non-anomaly.
    targets = (
        (anomaly_table_name, anomaly_data_path),
        (no_anomaly_table_name, no_anomaly_data_path),
    )
    for table_id, data_prefix in targets:
        csv_source = bigquery.ExternalConfig('CSV')
        csv_source.source_uris = [f"{data_prefix}*.csv"]
        csv_source.autodetect = True

        external_table = bigquery.Table(table_id)
        external_table.external_data_configuration = csv_source
        bq.create_table(external_table, exists_ok=True)

    print(f"BigQuery tables created successfully: {anomaly_table_name}, {no_anomaly_table_name}")

  return component_factory.create_component_from_func(


In [7]:
def get_config(config_gcs_path: str) -> dict:
    """Load the pipeline configuration YAML from GCS and stamp a batch id on it.

    Args:
        config_gcs_path: Location of the YAML file, e.g.
            ``gs://bucket/path/configuration.yaml``. A leading ``gs://`` is
            stripped if present.

    Returns:
        The parsed YAML mapping with an added ``batch_id`` entry of the form
        ``avoxi-workshop-<YYYYmmddHHMMSS>``, built from ``datetime.now()``.

    Raises:
        ValueError: If the path has no bucket or no object name, or if the
            YAML document is not a mapping (for example, an empty file).
    """
    from datetime import datetime

    # Validate the path before importing or contacting anything cloud-related,
    # so a malformed argument fails fast with a readable message.
    scheme = "gs://"
    if config_gcs_path.startswith(scheme):
        path = config_gcs_path[len(scheme):]
    else:
        path = config_gcs_path
    bucket_name, _, blob_name = path.partition("/")
    if not bucket_name or not blob_name:
        raise ValueError(
            f"Expected a path like 'gs://<bucket>/<object>', got {config_gcs_path!r}"
        )

    import yaml
    from google.cloud import storage

    # Download the YAML file from GCS
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    config_data = yaml.safe_load(bucket.blob(blob_name).download_as_text())

    # safe_load returns None for an empty document; the batch id can only be
    # attached to a mapping.
    if not isinstance(config_data, dict):
        raise ValueError(
            f"Configuration at {config_gcs_path!r} is not a YAML mapping"
        )

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    config_data['batch_id'] = "avoxi-workshop-" + timestamp
    return config_data

In [8]:
# Pull the shared pipeline settings from the workshop bucket.
config = get_config(
    "gs://avoxi_workshop_bucket/data_pipeline/configuration.yaml"
)

In [9]:
# Create the external BigQuery tables over the two normalized output
# locations listed under `dataproc_args` in the config.
create_bigquery_tables(
    anomaly_data_path=config["dataproc_args"]["anomaly_normalized_output"],
    no_anomaly_data_path=config["dataproc_args"]["no_anomaly_normalized_output"],
    anomaly_table_name=config["anomaly_table_name"],
    no_anomaly_table_name=config["no_anomaly_table_name"],
)

15:38:41.086 - INFO - Executing task [96m'create-bigquery-tables'[0m
15:38:41.087 - INFO - Streamed logs:

    Found image 'python:3.8'

    ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
    kfp 2.8.0 requires click<9,>=8.0.0, which is not installed.
    kfp 2.8.0 requires docstring-parser<1,>=0.7.3, which is not installed.
    kfp 2.8.0 requires google-cloud-storage<3,>=2.2.1, which is not installed.
    kfp 2.8.0 requires kfp-pipeline-spec==0.3.0, which is not installed.
    kfp 2.8.0 requires kfp-server-api<2.1.0,>=2.0.0, which is not installed.
    kfp 2.8.0 requires kubernetes<27,>=8.0.0, which is not installed.
    kfp 2.8.0 requires PyYAML<7,>=5.3, which is not installed.
    kfp 2.8.0 requires requests-toolbelt<1,>=0.8.0, which is not installed.
    kfp 2.8.0 requires tabulate<1,>=0.8.6, which is not installed.
    kfp 2.8.0 requires protobuf<5,>=4.

<kfp.dsl.pipeline_task.PipelineTask at 0x7fcdc0c27b80>

In [59]:
# Create a Vertex AI endpoint whose display name comes from the config.
# NOTE(review): this calls Endpoint.create unconditionally, so re-running the
# cell presumably creates another endpoint with the same display name — confirm.
endpoint = aiplatform.Endpoint.create(
    display_name=config["endpoint_name"],
    project=config["project_id"],
    location=config["location"],
)

Creating Endpoint
Create Endpoint backing LRO: projects/506365831141/locations/us-central1/endpoints/2692098145516519424/operations/4810817641620963328
Endpoint created. Resource name: projects/506365831141/locations/us-central1/endpoints/2692098145516519424
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/506365831141/locations/us-central1/endpoints/2692098145516519424')


In [41]:
@component(packages_to_install=["google-cloud-bigquery==3.25.0"])
def train_model(config: dict):
    """Run the model-creation SQL stored in the config and wait for it to finish.

    Args:
        config: Pipeline settings. ``create_model_query`` is a ``str.format``
            template that is filled in from the config's own top-level keys;
            ``model_name`` and ``model_registry_path`` are used in the final
            log line.
    """
    from google.cloud import bigquery

    bq = bigquery.Client()

    # Every top-level config key is available as a placeholder in the template.
    sql = config['create_model_query'].format(**config)

    # result() blocks until the query job has completed.
    bq.query(sql).result()

    print(f"Model {config['model_name']} created and saved in model registry at {config['model_registry_path']} successfully.")

In [43]:
# Train model: calling the component directly executes it as a task through
# the local runner configured earlier in the notebook.
train_model(config=config)

21:16:47.857 - INFO - Executing task [96m'train-model'[0m
21:16:47.860 - INFO - Streamed logs:

    Found image 'python:3.8'

    ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
    kfp 2.8.0 requires click<9,>=8.0.0, which is not installed.
    kfp 2.8.0 requires docstring-parser<1,>=0.7.3, which is not installed.
    kfp 2.8.0 requires google-cloud-storage<3,>=2.2.1, which is not installed.
    kfp 2.8.0 requires kfp-pipeline-spec==0.3.0, which is not installed.
    kfp 2.8.0 requires kfp-server-api<2.1.0,>=2.0.0, which is not installed.
    kfp 2.8.0 requires kubernetes<27,>=8.0.0, which is not installed.
    kfp 2.8.0 requires PyYAML<7,>=5.3, which is not installed.
    kfp 2.8.0 requires requests-toolbelt<1,>=0.8.0, which is not installed.
    kfp 2.8.0 requires tabulate<1,>=0.8.6, which is not installed.
    kfp 2.8.0 requires protobuf<5,>=4.21.1, but y

<kfp.dsl.pipeline_task.PipelineTask at 0x7f9da264cca0>

In [118]:
@component(packages_to_install=["google-cloud-bigquery==3.25.0"])
def evaluate_model(model_name: str, dataset_name: str, metric: str) -> float:
    """Evaluate a BigQuery ML model and return one metric from ML.EVALUATE.

    Args:
        model_name: Model id, interpolated between backticks into the query.
        dataset_name: Table whose rows are passed to ML.EVALUATE.
        metric: Name of the ML.EVALUATE output column to return.

    Returns:
        The value of ``metric`` in the first row of the evaluation result.

    Raises:
        ValueError: If the evaluation returns no rows, or the metric value
            in the first row is NULL.
    """
    from google.cloud import bigquery

    # Initialize BigQuery client
    client = bigquery.Client()

    # NOTE: the identifiers are interpolated straight into the SQL text, so
    # they must come from trusted configuration only.
    evaluate_model_query = f"""
    SELECT
        {metric}
    FROM
        ML.EVALUATE(MODEL `{model_name}`, (
            SELECT
                *
            FROM
                `{dataset_name}`
        ))
    """

    # Execute the query and wait for the rows
    results = client.query(evaluate_model_query).result()

    # Only the first row is of interest.
    first_row = next(iter(results), None)
    score = None
    if first_row is not None:
        score = first_row[metric]
        print(f"{metric} score: {score}")

    if score is None:
        raise ValueError(f"Metric '{metric}' not found in evaluation results.")

    print(f"Model {model_name} evaluated with {metric}: {score}")
    return score

In [120]:
# Evaluate model: `mse` is the task returned by the component call; its
# `.output` is what later cells consume.
mse = evaluate_model(
    model_name=config["model_name"],
    dataset_name=config["eval_dataset"],
    metric=config["eval_metric"],
)

23:15:22.914 - INFO - Executing task [96m'evaluate-model'[0m
23:15:22.915 - INFO - Streamed logs:

    Found image 'python:3.7'

    ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
    kfp 2.7.0 requires click<9,>=8.0.0, which is not installed.
    kfp 2.7.0 requires docstring-parser<1,>=0.7.3, which is not installed.
    kfp 2.7.0 requires google-cloud-storage<3,>=2.2.1, which is not installed.
    kfp 2.7.0 requires kfp-pipeline-spec==0.3.0, which is not installed.
    kfp 2.7.0 requires kfp-server-api<2.1.0,>=2.0.0, which is not installed.
    kfp 2.7.0 requires kubernetes<27,>=8.0.0, which is not installed.
    kfp 2.7.0 requires PyYAML<7,>=5.3, which is not installed.
    kfp 2.7.0 requires requests-toolbelt<1,>=0.8.0, which is not installed.
    kfp 2.7.0 requires tabulate<1,>=0.8.6, which is not installed.
    kfp 2.7.0 requires urllib3<2.0.0, but you 

In [125]:
@component
def compare_metrics(blessed_mse: float, challenger_mse: float, default_mse_threshold: float) -> bool:
    """Decide whether the challenger model should replace the blessed model.

    The challenger wins only if its MSE is strictly lower than both the
    blessed model's MSE and ``default_mse_threshold``.

    Args:
        blessed_mse: MSE of the currently blessed model.
        challenger_mse: MSE of the newly trained model.
        default_mse_threshold: Upper bound the challenger's MSE must stay
            below to be deployable. The caller's value is honoured; it is no
            longer overwritten with a hard-coded 0.1.

    Returns:
        True if the challenger should be deployed, False otherwise.
    """
    beats_blessed = challenger_mse < blessed_mse
    under_threshold = challenger_mse < default_mse_threshold

    if beats_blessed and under_threshold:
        print(f"Challenger model MSE ({challenger_mse}) is better than blessed model MSE ({blessed_mse}). Deploying challenger model.")
        return True

    # Either check may have failed, so the message names both.
    print(
        f"Challenger model MSE ({challenger_mse}) is not better than blessed model MSE ({blessed_mse}) "
        f"or is not below the threshold ({default_mse_threshold}). Keeping blessed model."
    )
    return False

In [126]:
# Compare metrics: feed the evaluation task's output in as the challenger score.
should_deploy = compare_metrics(
    blessed_mse=config["blessed_model_mse"],
    challenger_mse=mse.output,
    default_mse_threshold=config["default_mse_threshold"],
)

23:18:44.618 - INFO - Executing task [96m'compare-metrics'[0m
23:18:44.621 - INFO - Streamed logs:

    Found image 'python:3.7'

    [KFP Executor 2024-08-15 23:18:51,119 INFO]: Looking for component `compare_metrics` in --component_module_path `/tmp/tmp.LmVsaeNFvp/ephemeral_component.py`
    [KFP Executor 2024-08-15 23:18:51,119 INFO]: Loading KFP component "compare_metrics" from /tmp/tmp.LmVsaeNFvp/ephemeral_component.py (directory "/tmp/tmp.LmVsaeNFvp" and module name "ephemeral_component")
    [KFP Executor 2024-08-15 23:18:51,121 INFO]: Got executor_input:
    {
        "inputs": {
            "parameterValues": {
                "default_mse_threshold": 0.1,
                "challenger_mse": 11.438526625708217,
                "blessed_mse": 0.15
            }
        },
        "outputs": {
            "parameters": {
                "Output": {
                    "outputFile": "/home/jupyter/local_outputs/compare-metrics-2024-08-15-23-18-44-618003/compare-metrics/Output"
  

In [132]:
import yaml

# Rebind `config` from a local YAML file. The encoding is given explicitly so
# parsing does not depend on the platform's locale default.
config_file = "config.yaml"
with open(config_file, 'r', encoding="utf-8") as file:
    config = yaml.safe_load(file)

In [130]:
@component(packages_to_install=["google-cloud-aiplatform"])
def deploy_model(project_id: str, location: str, model_name: str, endpoint_name: str, should_deploy: bool, machine_type: str):
    """Deploy a Vertex AI model to an endpoint when the comparison step allows it.

    Args:
        project_id: Project used to initialise the Vertex AI SDK.
        location: Region used to initialise the Vertex AI SDK.
        model_name: Identifier passed to ``aiplatform.Model``.
        endpoint_name: Identifier passed to ``aiplatform.Endpoint``.
        should_deploy: When false, nothing is deployed and a skip message
            is printed.
        machine_type: Machine type forwarded to ``Model.deploy``.
    """
    from google.cloud import aiplatform

    # Guard clause: nothing to do unless the comparison step approved.
    if not should_deploy:
        print("Model deployment skipped as per comparison result.")
        return

    aiplatform.init(project=project_id, location=location)

    # NOTE(review): elsewhere in this notebook config['endpoint_name'] is used
    # as the endpoint's display name; verify that the values passed here are
    # identifiers these constructors accept.
    candidate = aiplatform.Model(model_name)
    target = aiplatform.Endpoint(endpoint_name)

    candidate.deploy(endpoint=target, machine_type=machine_type)

    print(f"Model {model_name} deployed to endpoint {endpoint_name} successfully.")

In [133]:
# Deploy model: gated on the output of the comparison task.
deploy_model(
    project_id=config["project_id"],
    location=config["location"],
    model_name=config["model_name"],
    endpoint_name=config["endpoint_name"],
    should_deploy=should_deploy.output,
    machine_type=config["machine_type"],
)

23:23:26.859 - INFO - Executing task [96m'deploy-model'[0m
23:23:26.864 - INFO - Streamed logs:

    Found image 'python:3.7'

    ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
    kfp 2.7.0 requires click<9,>=8.0.0, which is not installed.
    kfp 2.7.0 requires docstring-parser<1,>=0.7.3, which is not installed.
    kfp 2.7.0 requires kfp-pipeline-spec==0.3.0, which is not installed.
    kfp 2.7.0 requires kfp-server-api<2.1.0,>=2.0.0, which is not installed.
    kfp 2.7.0 requires kubernetes<27,>=8.0.0, which is not installed.
    kfp 2.7.0 requires PyYAML<7,>=5.3, which is not installed.
    kfp 2.7.0 requires requests-toolbelt<1,>=0.8.0, which is not installed.
    kfp 2.7.0 requires tabulate<1,>=0.8.6, which is not installed.
    kfp 2.7.0 requires urllib3<2.0.0, but you have urllib3 2.0.7 which is incompatible.
    
    [notice] A new release of pip 

<kfp.dsl.pipeline_task.PipelineTask at 0x7efef1bc2e30>

Query is running:   0%|          |

In [None]:
%%bigquery
-- Run ML.DETECT_ANOMALIES with model `anomaly_autoencoder_v3` over the
-- `eval_set` table and store every output column in a new table.
-- NOTE(review): a plain CREATE TABLE presumably fails if detected_anomalies
-- already exists — confirm before re-running this cell.
CREATE TABLE avoxi_workshop.detected_anomalies AS(
SELECT
  *
FROM
  ML.DETECT_ANOMALIES(MODEL `avoxi_workshop.anomaly_autoencoder_v3`,
    -- contamination is set to 0.03; see the ML.DETECT_ANOMALIES reference for how it determines the anomaly threshold
    STRUCT(0.03 AS contamination),
    TABLE `avoxi_workshop.eval_set`))