Install required packages

In [10]:
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \
                                        google-cloud-aiplatform



Restart the kernel, then verify the installed package versions

In [11]:

# Sanity check after the kernel restart: import the freshly installed SDKs in
# this cell and report which versions the kernel actually picked up.
import kfp
import google_cloud_pipeline_components

# Only the two packages below are reported; google-cloud-aiplatform is not checked here.
for label, module in (
    ("KFP SDK", kfp),
    ("google-cloud-pipeline-components", google_cloud_pipeline_components),
):
    print(f"{label} version: {module.__version__}")

KFP SDK version: 2.14.6
google-cloud-pipeline-components version: 2.21.0


Imports

In [12]:
import kfp
import typing
from typing import Dict
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components.types import artifact_types

Project and pipeline configurations

In [None]:
# Google Cloud project and region identifiers.
PROJECT_ID = "assignment1"
REGION = "us-central1"

# Bucket and object path of the input CSV read by the ingestion component.
DATA_BUCKET = "assignment1group3"
DATA_FILE = "data/penguins_clean.csv"

# Cloud Storage locations derived from the same bucket.
# PIPELINE_ROOT is not referenced in the cells shown here - presumably it is
# passed when the pipeline job is submitted (TODO confirm).
PIPELINE_ROOT = f"gs://{DATA_BUCKET}/runs"
# Destination directory handed to evaluate_and_promote as `model_dir`.
MODEL_DIR = f"gs://{DATA_BUCKET}/models"

Pipeline component: Data Ingestion

In [14]:
@dsl.component(
    # pandas was installed but never used by this step, so it is no longer listed.
    packages_to_install=["google-cloud-storage"],
    base_image="python:3.10.7-slim",
)
def download_data(project_id: str,
                  bucket: str,
                  file_name: str,
                  dataset: Output[Dataset]):
    """Download one object from Cloud Storage into the output dataset artifact.

    Args:
        project_id: Project passed to the Cloud Storage client.
        bucket: Name of the bucket that holds the source file.
        file_name: Object name (path inside the bucket) of the file to download.
        dataset: Output artifact. The file is written next to the artifact path
            as ``dataset.path + '.csv'``; downstream steps read the same name.
    """
    # Imports live inside the function because a lightweight KFP component
    # executes as a standalone function inside its own container.
    import logging
    import sys

    from google.cloud import storage

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Previously `local_csv` was logged without ever being defined (NameError).
    local_csv = dataset.path + '.csv'

    client = storage.Client(project=project_id)
    # Use a separate name instead of rebinding the `bucket` string parameter.
    source_bucket = client.bucket(bucket)
    blob = source_bucket.blob(file_name)
    blob.download_to_filename(local_csv)
    logging.info(f"Downloaded to {local_csv}")


Pipeline component: Preprocess data

In [15]:
@dsl.component(
        packages_to_install=['pandas', 'scikit-learn'],
        base_image="python:3.10.7-slim"
        )
def preprocess(dataset: Input[Dataset],
               train_out: Output[Dataset],
               test_out: Output[Dataset]):
    """Select the modelling columns, drop incomplete rows and split train/test.

    Args:
        dataset: Input artifact; the CSV is read from ``dataset.path + '.csv'``.
        train_out: Output artifact for the training split
            (written to ``train_out.path + '.csv'``).
        test_out: Output artifact for the test split
            (written to ``test_out.path + '.csv'``).
    """
    # `logging` and `sys` must be imported here: the component body runs in
    # isolation, so notebook-level imports are not available (was a NameError).
    import logging
    import sys

    import pandas as pd
    from sklearn.model_selection import train_test_split

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    cols = ["bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g","species"]
    # Keep only the four features plus the label, and drop rows with any missing value.
    df = pd.read_csv(dataset.path + ".csv")[cols].dropna()
    # 80/20 split, stratified on the label so both splits keep the class balance.
    train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df["species"])
    train.to_csv(train_out.path + ".csv", index=False)
    test.to_csv(test_out.path + ".csv", index=False)
    logging.info(f"Wrote {len(train)} training rows and {len(test)} test rows")

Pipeline component: Train model

In [16]:
@dsl.component(
    base_image="python:3.10.7-slim",
    packages_to_install=["pandas","scikit-learn","joblib"]
)
def train(train_ds: Input[Dataset],
          model_art: Output[Model]):
    """Fit a scaler + logistic-regression pipeline and save it with its class list.

    Args:
        train_ds: Input artifact; training rows are read from
            ``train_ds.path + '.csv'``.
        model_art: Output artifact. The fitted pipeline is dumped to
            ``model_art.path + '.pkl'`` and the sorted class labels to
            ``model_art.path + '_meta.json'``.
    """
    # `logging` and `sys` must be imported here: the component body runs in
    # isolation, so notebook-level imports are not available (was a NameError).
    import json
    import logging
    import sys

    import joblib
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    df = pd.read_csv(train_ds.path + ".csv")
    X = df[["bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g"]]
    y = df["species"]

    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases and
    # scikit-learn is not pinned here - consider pinning the version or switching
    # to OneVsRestClassifier(LogisticRegression(...)); confirm against the docs.
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=500, multi_class="ovr", random_state=42))
    ]).fit(X, y)

    joblib.dump(pipe, model_art.path + ".pkl")
    # Store the label set alongside the model; the evaluation step copies this file too.
    with open(model_art.path + "_meta.json","w") as f:
        json.dump({"classes": sorted(y.unique())}, f)
    logging.info(f"Trained model on {len(df)} rows")

Pipeline component: Batch prediction

In [17]:
@dsl.component(
    base_image="python:3.10.7-slim",
    # scikit-learn is required to unpickle the fitted sklearn Pipeline with joblib.
    packages_to_install=["pandas","scikit-learn","joblib"]
)
def batch_predict(model_art: Input[Model],
                  features_csv: Input[Dataset],
                  predictions_out: Output[Dataset]):
    """Predict a label for every row of a feature CSV using the saved model.

    Args:
        model_art: Input artifact; the model is loaded from
            ``model_art.path + '.pkl'``.
        features_csv: Input artifact; rows are read from
            ``features_csv.path + '.csv'`` and must contain the four
            measurement columns selected below.
        predictions_out: Output artifact; a single-column CSV (``prediction``)
            is written to ``predictions_out.path + '.csv'``.
    """
    # `logging` and `sys` must be imported here: the component body runs in
    # isolation, so notebook-level imports are not available (was a NameError).
    import logging
    import sys

    import joblib
    import pandas as pd

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    X = pd.read_csv(features_csv.path + ".csv")[[
        "bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g"
    ]]
    model = joblib.load(model_art.path + ".pkl")
    preds = model.predict(X)

    out = pd.DataFrame({"prediction": preds})
    out.to_csv(predictions_out.path + ".csv", index=False)
    logging.info(f"Wrote {len(out)} predictions")


Pipeline component: Evaluation and promotion

In [18]:
@dsl.component(
    base_image="python:3.10.7-slim",
    # scikit-learn is required to unpickle the fitted sklearn Pipeline with joblib.
    packages_to_install=["pandas","scikit-learn","joblib","gcsfs","fsspec"]
)
def evaluate_and_promote(test_ds: Input[Dataset],
                         model_art: Input[Model],
                         metrics: Output[Metrics],
                         model_dir: str):
    """Score the model on the test split and copy it to the shared model directory.

    Args:
        test_ds: Input artifact; test rows are read from ``test_ds.path + '.csv'``.
        model_art: Input artifact; the model is loaded from
            ``model_art.path + '.pkl'`` and its metadata from
            ``model_art.path + '_meta.json'``.
        metrics: Output metrics artifact; receives the ``accuracy`` value.
        model_dir: Destination directory (opened through the ``gcs`` filesystem)
            that receives ``model.pkl``, ``model_meta.json`` and ``metrics.json``.
    """
    # `logging` and `sys` must be imported here: the component body runs in
    # isolation, so notebook-level imports are not available (was a NameError).
    import json
    import logging
    import sys

    import fsspec
    import joblib
    import pandas as pd

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    df = pd.read_csv(test_ds.path + ".csv")
    X = df[["bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g"]]
    y = df["species"]

    model = joblib.load(model_art.path + ".pkl")
    # Accuracy = fraction of test rows whose predicted label equals the true label.
    acc = float((model.predict(X) == y).mean())
    metrics.log_metric("accuracy", acc)
    logging.info(f"Test accuracy: {acc}")

    # Promotion is unconditional: the artifacts are always copied to fixed file
    # names directly under `model_dir` (no accuracy threshold, no sub-folder).
    fs = fsspec.filesystem("gcs")
    proj_model_dir = model_dir.rstrip("/")
    fs.put(model_art.path + ".pkl",        f"{proj_model_dir}/model.pkl")
    fs.put(model_art.path + "_meta.json",  f"{proj_model_dir}/model_meta.json")
    # Also store the accuracy next to the promoted model.
    with fs.open(f"{proj_model_dir}/metrics.json","w") as f:
        json.dump({"accuracy": acc}, f)
    logging.info(f"Promoted model to {proj_model_dir}")


Pipeline definition

In [19]:
@dsl.pipeline(name="penguins-pipeline")
def penguins_pipeline(run_id: str = "manual-run"):
    # Task graph: download -> preprocess -> train -> evaluate_and_promote.
    # `run_id` is accepted as a pipeline parameter but no step below consumes it.

    # Fetch the raw CSV from Cloud Storage.
    download_task = download_data(
        project_id=PROJECT_ID,
        bucket=DATA_BUCKET,
        file_name=DATA_FILE,
    )

    # Clean the data and produce the train/test splits.
    split_task = preprocess(dataset=download_task.outputs["dataset"])

    # Fit the model on the training split only.
    train_task = train(train_ds=split_task.outputs["train_out"])

    # Score on the held-out split and copy the model to MODEL_DIR.
    evaluate_and_promote(
        test_ds=split_task.outputs["test_out"],
        model_art=train_task.outputs["model_art"],
        model_dir=MODEL_DIR,
    )


Compilation

In [None]:
from kfp.compiler import Compiler

# Compile the pipeline function into a job spec file on disk.
# The previous cell imported `Compiler` but called `compiler.Compiler()`
# (NameError), and ended with a stray, indented `from kfp.v2 import compiler`
# placed after its use; both problems are fixed by calling the imported class.
Compiler().compile(
    pipeline_func=penguins_pipeline,
    package_path='penguins_pipeline.json',
)
