In [1]:
from datetime import datetime
from typing import NamedTuple

from kfp.v2 import dsl, compiler
from kfp.v2.dsl import component, Input, Output, Dataset, Model, Metrics, ClassificationMetrics, Artifact
from google.cloud.aiplatform import pipeline_jobs

# Set up global variables

In [2]:
# Get project name: capture the output of `gcloud config get-value project`
# through IPython shell capture and take its first line.
# NOTE(review): assumes the first captured line is the project ID (i.e. gcloud
# emits no warning lines ahead of it) — confirm in the target environment.
shell_output = !gcloud config get-value project
PROJECT_ID = shell_output[0]

# Region used as the pipeline's default `region` and as the PipelineJob location.
REGION = "europe-west1"
# Bucket URI derived from the project ID.
# NOTE(review): nothing in this notebook creates the bucket — presumably it
# already exists; verify before running.
BUCKET_NAME = f"gs://{PROJECT_ID}-bucket-wine-quality"
# Default root passed to @dsl.pipeline(pipeline_root=...).
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root_wine/"

# 1️⃣ Load data

In [4]:
@component(
    packages_to_install=["pandas", "pyarrow", "scikit-learn==1.0.0"],
    base_image="python:3.9",
    output_component_file="get_wine_data.yaml"
)
def get_wine_data(url: str, train_dataset: Output[Dataset], test_dataset: Output[Dataset]):
    """Download the wine CSV, derive a binary target and write a train/test split.

    Args:
        url: Location of a ";"-delimited CSV that contains at least the
            "quality" and "total sulfur dioxide" columns.
        train_dataset: Output artifact; 70% of the rows are written to
            ``train_dataset.path + ".csv"``.
        test_dataset: Output artifact; the remaining 30% of the rows are
            written to ``test_dataset.path + ".csv"``.
    """
    # Imports live inside the function because a lightweight component is
    # executed as a self-contained function in its own container.
    import pandas as pd
    from sklearn.model_selection import train_test_split

    df_wine = pd.read_csv(url, delimiter=";")

    # "target" is 1 for wines rated 7 or higher and 0 otherwise.
    df_wine["target"] = (df_wine["quality"] >= 7).astype(int)
    df_wine = df_wine.drop(columns=["quality", "total sulfur dioxide"])

    # Fixed seed so that re-running the pipeline reproduces the same split.
    train, test = train_test_split(df_wine, test_size=0.3, random_state=42)

    # Downstream components read these files back as `<artifact>.path + ".csv"`.
    train.to_csv(train_dataset.path + ".csv", index=False, encoding="utf-8-sig")
    test.to_csv(test_dataset.path + ".csv", index=False, encoding="utf-8-sig")

# 2️⃣ Train model

In [5]:
@component(
    base_image="python:3.9",
    packages_to_install=["pandas", "scikit-learn==1.0.0"],
)
def train_winequality(dataset: Input[Dataset], model: Output[Model]):
    """Fit a 10-tree random forest on the training CSV and pickle the result.

    Args:
        dataset: Input artifact; the training rows are read from
            ``dataset.path + ".csv"`` and must contain a "target" column.
        model: Output artifact; the fitted estimator is pickled to
            ``model.path + ".pkl"`` and ``metadata["framework"]`` is set to "RF".
    """
    import pickle

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    training_frame = pd.read_csv(dataset.path + ".csv")
    features = training_frame.drop(columns=["target"])
    labels = training_frame["target"]

    classifier = RandomForestClassifier(n_estimators=10)
    classifier.fit(features, labels)

    model.metadata["framework"] = "RF"

    # Stored next to the artifact path so later steps can find "<path>.pkl".
    pickle_path = model.path + ".pkl"
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(classifier, pickle_file)

# 3️⃣ Evaluate model

In [6]:
@component(
    # Same scikit-learn version as the training component so the pickled model
    # is loaded by the library version that produced it ("sklearn" on PyPI is
    # only a deprecated alias).
    packages_to_install=["pandas", "scikit-learn==1.0.0"],
    base_image="python:3.9"
)
def winequality_evaluation(test_dataset: Input[Dataset], model_rf: Input[Model], thresholds: str,
                            metrics: Output[ClassificationMetrics], kpi: Output[Metrics]) -> NamedTuple("output", [("deploy", str)]):
    """Score the trained model on the test split and decide whether to deploy it.

    Args:
        test_dataset: Input artifact; rows are read from
            ``test_dataset.path + ".csv"`` and must contain a "target" column.
        model_rf: Input artifact; the estimator is unpickled from
            ``model_rf.path + ".pkl"``. Its ``metadata["accuracy"]`` is set here.
        thresholds: JSON object encoded as a string; its "roc" entry is the
            minimum accuracy required for deployment (e.g. '{"roc": 0.8}').
        metrics: Output artifact receiving the ROC curve and confusion matrix.
        kpi: Output artifact receiving the scalar "accuracy" metric.

    Returns:
        A one-element tuple ``(deploy,)`` where ``deploy`` is the string "true"
        when accuracy >= threshold and "false" otherwise.
    """
    # A lightweight component runs as a self-contained function, so every
    # module it uses has to be imported here rather than at notebook level.
    import json
    import pickle

    import pandas as pd
    from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score

    def threshold_check(value, minimum):
        """Return "true" when `value` reaches `minimum`, else "false"."""
        return "true" if value >= minimum else "false"

    data = pd.read_csv(test_dataset.path + ".csv")
    x_test = data.drop(columns=["target"])
    y_true = data["target"]

    # NOTE: pickle.load executes arbitrary code; this file is expected to come
    # from the training step of the same pipeline, never from an external source.
    file_name = model_rf.path + ".pkl"
    with open(file_name, "rb") as file:
        model = pickle.load(file)

    y_pred = model.predict(x_test)
    y_scores = model.predict_proba(x_test)[:, 1]

    # Keep the ROC cut-offs in their own name: the `thresholds` parameter is
    # still needed below for the deployment decision.
    fpr, tpr, roc_thresholds = roc_curve(y_true=y_true.to_numpy(), y_score=y_scores, pos_label=1)
    metrics.log_roc_curve(fpr.tolist(), tpr.tolist(), roc_thresholds.tolist())

    # Explicit labels keep the matrix 2x2 even if one class is absent from the split.
    metrics.log_confusion_matrix(
        ["False", "True"],
        confusion_matrix(y_true, y_pred, labels=[0, 1]).tolist(),
    )

    accuracy = float(accuracy_score(y_true, y_pred))
    model_rf.metadata["accuracy"] = accuracy
    kpi.log_metric("accuracy", accuracy)

    # The threshold is fractional (e.g. 0.8); converting with int() would
    # truncate it to 0 and let every model through.
    thresholds_dict = json.loads(thresholds)
    deploy = threshold_check(accuracy, float(thresholds_dict["roc"]))

    return (deploy,)

# 4️⃣ Deploy model

In [7]:
@component(
    # The deprecated "sklearn" alias was dropped: nothing in this component
    # imports scikit-learn.
    packages_to_install=["google-cloud-aiplatform", "kfp"],
    base_image="python:3.9",
    output_component_file="model_winequality_component.yaml"
)
def deploy_winequality(model: Input[Model], project: str, region: str, serving_container_image_uri: str, vertex_endpoint: Output[Artifact], vertex_model: Output[Model]):
    """Upload the trained model to Vertex AI and deploy it to an endpoint.

    Args:
        model: Input artifact of the trained model; the directory that holds
            it is used as the upload's ``artifact_uri``.
        project: Project in which the endpoint and model are created.
        region: Location in which the endpoint and model are created.
        serving_container_image_uri: Serving container image for the upload.
        vertex_endpoint: Output artifact; its ``uri`` is set to the resource
            name of the endpoint serving the model.
        vertex_model: Output artifact; its ``uri`` is set to the resource name
            of the uploaded model.
    """
    from google.cloud import aiplatform

    DISPLAY_NAME = "winequality"
    MODEL_NAME = "winequality-rf"
    ENDPOINT_NAME = "winequality_endpoint"

    def create_endpoint():
        """Return the newest endpoint named ENDPOINT_NAME, creating one if none exists."""
        endpoints = aiplatform.Endpoint.list(
            filter=f'display_name="{ENDPOINT_NAME}"',
            order_by="create_time desc",
            project=project,
            location=region,
        )
        if len(endpoints) > 0:
            return endpoints[0]  # most recently created
        return aiplatform.Endpoint.create(display_name=ENDPOINT_NAME, project=project, location=region)

    endpoint = create_endpoint()

    # The upload needs the directory that contains the model file, so strip
    # only the final path segment; a blanket replace("model", "") would also
    # mangle any other "model" substring in the URI.
    artifact_dir = model.uri.rpartition("/")[0] + "/"

    # Import model
    model_upload = aiplatform.Model.upload(
        display_name=DISPLAY_NAME,
        artifact_uri=artifact_dir,
        serving_container_image_uri=serving_container_image_uri,
        serving_container_health_route=f"/v1/models/{MODEL_NAME}",
        serving_container_predict_route=f"/v1/models/{MODEL_NAME}:predict",
        serving_container_environment_variables={"MODEL_NAME": MODEL_NAME},
        # Same project/region as the endpoint, instead of ambient SDK defaults.
        project=project,
        location=region,
    )

    # Model.deploy returns the endpoint the model was deployed to.
    deployed_endpoint = model_upload.deploy(
        machine_type="n1-standard-4",
        endpoint=endpoint,
        traffic_split={"0": 100},
        deployed_model_display_name=DISPLAY_NAME
    )

    # Save data to the output params
    vertex_endpoint.uri = deployed_endpoint.resource_name
    vertex_model.uri = model_upload.resource_name

# Create the pipeline

In [8]:
# A second-resolution timestamp makes each pipeline job name unique.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
DISPLAY_NAME = f"pipeline-winequality-job-{TIMESTAMP}"

In [9]:
# Define the pipeline and then compile it into a .json file

@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,  # default pipeline root. you can override it when submitting the pipeline
    name="pipeline-winequality"  # a name for the pipeline. use to determine the pipeline Context
)
def pipeline(
    url: str = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
    project: str = PROJECT_ID,
    region: str = REGION,
    display_name: str = DISPLAY_NAME,
    api_endpoint: str = REGION + "-aiplatform.googleapis.com",
    thresholds: str = '{"roc":0.8}',
    serving_container_image_uri: str = "europe-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest"
    ):
    """Load data, train, evaluate and conditionally deploy the wine-quality model.

    Args:
        url: CSV location passed to the data-loading component.
        project: Project passed to the deployment component.
        region: Region passed to the deployment component.
        display_name: Not used by any step in this pipeline body.
        api_endpoint: Not used by any step in this pipeline body.
        thresholds: JSON string passed to the evaluation component.
        serving_container_image_uri: Serving image passed to the deployment
            component.
            NOTE(review): the default names an sklearn 0.24 image while the
            training component installs scikit-learn==1.0.0 — confirm the
            serving container can load that pickle.
    """
    # Operators. Components are invoked with keyword arguments only, because
    # KFP v2 rejects positional arguments when instantiating a component.
    data_op = get_wine_data(url=url)
    train_model_op = train_winequality(dataset=data_op.outputs["train_dataset"])
    model_evaluation_op = winequality_evaluation(
        test_dataset=data_op.outputs["test_dataset"],
        model_rf=train_model_op.outputs["model"],
        thresholds=thresholds  # deploy the model only if the model performance is above the threshold
    )

    # The evaluation step emits the string "true" when the model may be deployed.
    with dsl.Condition(
        model_evaluation_op.outputs["deploy"] == "true",
        name="deploy-winequality"
    ):
        deploy_winequality(
            model=train_model_op.outputs["model"],
            project=project,
            region=region,
            serving_container_image_uri=serving_container_image_uri,
        )

# Compile and run the pipeline

In [10]:
# Serialize the pipeline function into the job spec file submitted below.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="ml_winequality.json",
)



In [11]:
# Build the job from the compiled spec. The timestamped DISPLAY_NAME keeps
# successive runs distinguishable, and the project is pinned to the one the
# bucket and pipeline root were derived from instead of ambient SDK defaults.
start_pipeline = pipeline_jobs.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path="ml_winequality.json",
    enable_caching=False,
    project=PROJECT_ID,
    location=REGION,
)

In [16]:
# Submit the pipeline job.
# NOTE(review): the recorded output of this cell ends with
# "InvalidArgument: 400 Exactly one of deployment_config and deployment_spec is expected."
# Presumably the installed kfp compiler and google-cloud-aiplatform SDK disagree
# on the pipeline-spec format — confirm the package versions before re-running.
start_pipeline.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob


InvalidArgument: 400 Exactly one of deployment_config and deployment_spec is expected.