# Install packages

In [1]:
# USER_FLAG = "--user"
# ! pip3 install {USER_FLAG} google-cloud-aiplatform==1.7.0
# ! pip3 install {USER_FLAG} kfp==1.8.9

# Restart kernel

In [2]:
# import os
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

In [3]:
from google.cloud import aiplatform
from kfp.v2 import compiler
from kfp.v2 import dsl
from kfp.v2.dsl import component, pipeline, Input, Output, OutputPath, Dataset, Metrics, Model, Artifact

# Define constants

In [4]:
REGION = "europe-west1"
BUCKET_NAME = "gs://my-artifacts-pipeline"
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root/"  # Cloud Storage path where artifacts created by our pipeline will be written

shell_output = ! gcloud auth list 2>/dev/null
SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = shell_output[0]

# Create a Cloud Storage bucket

In [5]:
# Start from a clean slate: recursively remove BUCKET_NAME, then create it again in REGION.
# WARNING: the removal is unconditional and permanently deletes everything stored
# under BUCKET_NAME (e.g. artifacts of earlier pipeline runs).
# NOTE(review): on a first run, when the bucket does not exist yet, the `rm` line
# presumably only prints an error and the `create` line still runs — confirm.
! gcloud storage rm --recursive $BUCKET_NAME
! gcloud storage buckets create $BUCKET_NAME --location=$REGION

Removing objects:
Removing gs://my-artifacts-pipeline/pipeline_root/#1668808263033090...         
Removing gs://my-artifacts-pipeline/pipeline_root/539271406833/artifacts-pipeline-large-20221118215013/#1668808289886807...
Removing gs://my-artifacts-pipeline/pipeline_root/539271406833/artifacts-pipeline-large-20221118215013/get-data_3757410563622174720/output_data_path#1668808295515741...
Removing gs://my-artifacts-pipeline/pipeline_root/539271406833/artifacts-pipeline-small-20221118215013/train-model_-8123085253381193728/model.joblib#1668808341973912...
Removing gs://my-artifacts-pipeline/pipeline_root/539271406833/artifacts-pipeline-small-20221118215013/get-data_1100286783473582080/output_data_path#1668808268730669...
Removing gs://my-artifacts-pipeline/pipeline_root/539271406833/artifacts-pipeline-small-20221118215013/train-model_-8123085253381193728/executor_output.json#1668808342278381...
Removing gs://my-artifacts-pipeline/pipeline_root/539271406833/artifacts-pipeline-small-202211

# Create components

In [6]:
@component(
    packages_to_install=["google-cloud-bigquery", "pandas", "db-dtypes"],
    base_image="python:3.9",
    output_component_file="get_data.yaml"
)
def get_data(
    bq_table: str,
    output_data_path: OutputPath("Dataset")
):
    """Export a BigQuery table to a shuffled CSV file.

    Args:
        bq_table: Table identifier string, parsed with
            ``bigquery.TableReference.from_string``
            (e.g. ``"project.dataset.table"``).
        output_data_path: File path the CSV is written to.

    Raises:
        KeyError: If the ``CLOUD_ML_PROJECT_ID`` environment variable is not set.
    """
    import os
    from google.cloud import bigquery

    # The project used for the BigQuery client is read from the environment.
    project_number = os.environ["CLOUD_ML_PROJECT_ID"]
    bqclient = bigquery.Client(project=project_number)
    table = bigquery.TableReference.from_string(bq_table)
    rows = bqclient.list_rows(table)
    dataframe = rows.to_dataframe(create_bqstorage_client=True)
    # Shuffle all rows; the fixed seed keeps the row order reproducible.
    dataframe = dataframe.sample(frac=1, random_state=2)
    # index=False keeps the (shuffled) row index out of the file. Without it the
    # index is written as an extra unnamed column, which a plain
    # `pd.read_csv` downstream would load as if it were a real feature.
    dataframe.to_csv(output_data_path, index=False)

In [7]:
@component(
    base_image="python:3.9",
    packages_to_install=["scikit-learn", "pandas", "joblib", "db-dtypes"],
    output_component_file="train_model.yaml"
)
def train_model(
    dataset: Input[Dataset],
    metrics: Output[Metrics],
    model: Output[Model]
):
    """Fit a decision tree on the CSV dataset and record metrics and the model file.

    Args:
        dataset: Input artifact whose ``path`` points to a CSV file containing
            a ``Class`` column (used as the label) plus feature columns.
        metrics: Output artifact that receives ``accuracy`` (in percent),
            ``framework`` and ``dataset_size``.
        model: Output artifact; the fitted classifier is written to
            ``model.path + ".joblib"``.
    """
    import joblib
    import pandas as pd
    from sklearn import model_selection, tree

    frame = pd.read_csv(dataset.path)
    # Remove the label column first so that only features remain in `frame`.
    targets = frame.pop("Class").tolist()
    features = frame.values.tolist()

    # Library-default train/test split (no explicit size or seed).
    x_train, x_test, y_train, y_test = model_selection.train_test_split(features, targets)

    classifier = tree.DecisionTreeClassifier()
    classifier.fit(x_train, y_train)
    accuracy = classifier.score(x_test, y_test)
    print("accuracy is:", accuracy)

    # Log the metrics in a fixed order: accuracy as a percentage, then metadata.
    for metric_name, metric_value in (
        ("accuracy", (accuracy * 100.0)),
        ("framework", "Scikit-learn"),
        ("dataset_size", len(frame)),
    ):
        metrics.log_metric(metric_name, metric_value)

    # The file is stored next to the artifact path, with a ".joblib" suffix.
    joblib.dump(classifier, model.path + ".joblib")

In [8]:
@component(
    packages_to_install=["google-cloud-aiplatform"],
    base_image="python:3.9",
    output_component_file="deploy_model.yaml"
)
def deploy_model(
    model: Input[Model],
    project: str,
    region: str,
    vertex_model: Output[Model],
    vertex_endpoint: Output[Artifact]
):
    """Upload the trained model to Vertex AI and deploy it.

    Args:
        model: Input model artifact; the directory containing its URI is
            uploaded as the model's artifact directory.
        project: Project passed to ``aiplatform.init``.
        region: Location passed to ``aiplatform.init``.
        vertex_model: Output artifact whose ``uri`` is set to the uploaded
            model's resource name.
        vertex_endpoint: Output artifact whose ``uri`` is set to the resource
            name of the endpoint returned by ``deploy``.
    """
    from google.cloud import aiplatform

    aiplatform.init(project=project, location=region)

    # Use the parent directory of the artifact URI (with a trailing slash).
    # Cutting only the last path segment avoids the pitfall of
    # `uri.replace("/model", "/")`, which rewrites *every* "/model" occurrence
    # and corrupts URIs such as "gs://bucket/models/.../model".
    artifact_dir = model.uri.rsplit("/", 1)[0] + "/"

    deployed_model = aiplatform.Model.upload(
        display_name="artifacts-model",
        artifact_uri=artifact_dir,
        serving_container_image_uri="europe-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest"
    )
    # No endpoint is passed here; NOTE(review): the SDK is expected to create
    # a new endpoint for this deployment — confirm.
    endpoint = deployed_model.deploy(machine_type="n1-standard-4")

    # Expose the created Vertex AI resources through the output artifacts.
    vertex_model.uri = deployed_model.resource_name
    vertex_endpoint.uri = endpoint.resource_name

# Define pipeline

In [9]:
# The decorator is referenced as `dsl.pipeline` because the function below is
# itself named `pipeline` and therefore rebinds that name. With a bare
# `@pipeline(...)`, running this cell a second time would call the function
# instead of the decorator and fail with a TypeError.
@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name="artifacts-pipeline"
)
def pipeline(
    bq_table: str,
    output_data_path: str = "data.csv",
    project: str = PROJECT_ID,
    region: str = REGION
):
    """Chain the three components: get data, train model, deploy model.

    Args:
        bq_table: BigQuery table identifier passed to ``get_data``.
        output_data_path: Not used by any step; kept so the pipeline's
            parameter list stays unchanged.
        project: Project passed to ``deploy_model``.
        region: Region passed to ``deploy_model``.
    """
    task_get_data = get_data(bq_table)
    task_train_model = train_model(task_get_data.output)
    # train_model has several outputs, so the model is selected by name.
    deploy_model(task_train_model.outputs["model"], project=project, region=region)

# Compile pipeline

This generates the JSON file `artifacts_pipeline.json`, which the pipeline runs below reference through `template_path`.

In [10]:
# Compile the `pipeline` function into the JSON spec file used by the runs below.
compiler.Compiler().compile(
    package_path="artifacts_pipeline.json",
    pipeline_func=pipeline,
)



# 🕑 Define a timestamp to use for our pipeline job IDs

In [11]:
from datetime import datetime

# Current local time as a 14-digit string (year through seconds), shared by the job IDs below.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

# Create pipeline runs

In [12]:
# Run of the compiled pipeline on the small dataset table, with caching turned off.
run1 = aiplatform.PipelineJob(
    job_id=f"artifacts-pipeline-small-{TIMESTAMP}",
    display_name="artifacts-pipeline",
    template_path="artifacts_pipeline.json",
    location=REGION,
    enable_caching=False,
    parameter_values={"bq_table": "sara-vertex-demos.beans_demo.small_dataset"},
)

In [13]:
# Run of the compiled pipeline on the large dataset table, with caching turned off.
run2 = aiplatform.PipelineJob(
    job_id=f"artifacts-pipeline-large-{TIMESTAMP}",
    display_name="artifacts-pipeline",
    template_path="artifacts_pipeline.json",
    location=REGION,
    enable_caching=False,
    parameter_values={"bq_table": "sara-vertex-demos.beans_demo.large_dataset"},
)

# Start pipeline runs

In [14]:
# Submit the small-dataset job, passing SERVICE_ACCOUNT as the service account for the run.
run1.submit(service_account=SERVICE_ACCOUNT)

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/539271406833/locations/europe-west1/pipelineJobs/artifacts-pipeline-small-20221119202133
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/539271406833/locations/europe-west1/pipelineJobs/artifacts-pipeline-small-20221119202133')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west1/pipelines/runs/artifacts-pipeline-small-20221119202133?project=539271406833


In [15]:
# Submit the large-dataset job, passing SERVICE_ACCOUNT as the service account for the run.
run2.submit(service_account=SERVICE_ACCOUNT)

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/539271406833/locations/europe-west1/pipelineJobs/artifacts-pipeline-large-20221119202133
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/539271406833/locations/europe-west1/pipelineJobs/artifacts-pipeline-large-20221119202133')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west1/pipelines/runs/artifacts-pipeline-large-20221119202133?project=539271406833
