# An Introduction to Kubeflow Lightweight Components

## Imports

I like to put all my imports at the top of the notebook.

In [1]:
import time

from random import SystemRandom
from string import ascii_lowercase as lc

import kfp
from kfp import dsl
from kfp import compiler
from typing import NamedTuple

from kfp.v2.dsl import component, Input, Output, OutputPath, Dataset, Model, InputPath, OutputPath, ClassificationMetrics, Metrics

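# NOTE: `InputPath` and `OutputPath` must be imported from `kfp.v2.dsl` (as above), not from `kfp.dsl`. Otherwise v2 components fail with:
# TypeError: In v2 components, please import the Python function annotations `InputPath` and `OutputPath` from package `kfp.v2.dsl` instead of `kfp.dsl`.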

# Shared random source; random_string() draws its letters from it.
rand = SystemRandom()


def upload_pipeline(client: kfp.Client, metadata: dict, pipeline_function):
    """Compile ``pipeline_function`` into a package and upload it with ``client``.

    The package path and the pipeline name are read from ``metadata`` under
    the keys ``"pipeline_package_path"`` and ``"pipeline_name"``.

    Returns:
        Whatever ``client.upload_pipeline`` returns.
    """
    package_path = metadata.get("pipeline_package_path")

    pipeline_compiler = compiler.Compiler()
    pipeline_compiler.compile(pipeline_function, package_path)

    uploaded = client.upload_pipeline(
        package_path,
        metadata.get("pipeline_name"))
    return uploaded


def random_string():
    """Return four randomly chosen lowercase ASCII letters as one string."""
    letters = []
    for _ in range(4):
        letters.append(rand.choice(lc))
    return ''.join(letters)


def experiment_metadata(
    namespace: str,
    experiment_name: str,
    experiment_description: str,
    pipeline_name: str,
    pipeline_description: str
):
    """Build, print and return the naming metadata for a pipeline run.

    Names are lower-cased and spaces are replaced with hyphens. The
    experiment name is prefixed with the namespace; the pipeline name is
    prefixed with the experiment name and suffixed with ``random_string()``;
    the run name is the pipeline name prefixed with the current local time;
    the package path is the run name followed by ``.yaml.zip``.

    Returns:
        dict with the keys ``namespace``, ``experiment_name``,
        ``experiment_description``, ``pipeline_name``,
        ``pipeline_description``, ``run_name`` and ``pipeline_package_path``.
    """

    def slug(text):
        # Lower-case and hyphenate, the one normalisation applied to names.
        return text.lower().replace(" ", "-")

    namespace_slug = slug(namespace)
    experiment_slug = slug(f"{namespace}-{experiment_name}")
    pipeline_slug = slug(f"{experiment_slug}-{pipeline_name}-{random_string()}")
    timestamp = time.strftime('%Y%m%d-%H%M%S')
    run_name = f"{timestamp}-{pipeline_slug}"

    info = {
        "namespace": namespace_slug,
        "experiment_name": experiment_slug,
        "experiment_description": experiment_description,
        "pipeline_name": pipeline_slug,
        "pipeline_description": pipeline_description,
        "run_name": run_name,
        "pipeline_package_path": f"{run_name}.yaml.zip"
    }

    rule = "--------------------------"
    report = [
        rule,
        "Metadata",
        rule,
        "Namespace",
        f"Name:\t\t{info['namespace']}",
        rule,
        "Experiment",
        f"Name:\t\t{info['experiment_name']}",
        f"Description:\t{info['experiment_description']}",
        rule,
        "Pipeline",
        f"Name:\t\t{info['pipeline_name']}",
        f"Description:\t{info['pipeline_description']}",
        f"Zipped YAML:\t{info['pipeline_package_path']}",
        rule,
        "Run",
        f"Name:\t\t{info['run_name']}",
        rule,
    ]
    for line in report:
        print(line)

    return info

## Metadata

Fill out the metadata for the run, pipeline and experiment!

1. `namespace`: Your namespace.
1. `experiment_name`: Your pipelines are run in an experiment. Give your experiment a unique and descriptive name.
1. `experiment_description`: You should provide a short description, it will be a gift to your future self.
1. `pipeline_name`: Name your pipeline. Must be unique. Try to be descriptive.
1. `pipeline_description`: The more metadata the better!
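1. `pipeline_package_path`: The location of the zipped YAML containing the description of the pipeline. It is generated automatically as the run name followed by `.yaml.zip`.
1. `run_name`: The run's name is generated automatically by prefixing the full pipeline name (which already contains the namespace, the `experiment_name` and a short random suffix) with the current date and time.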

In [2]:
# User-supplied settings: edit these five values.
namespace = "bryanpaget"

experiment_name = "Visualize This!"
experiment_description = "Bryan's Kubeflow visualization experiment."

pipeline_name = "Yummy not Yummy"
pipeline_description = "Is it yummy or not?"

# -------------------------------------------------

# Derived names (experiment, pipeline, run, package path) are built and printed here.
metadata = experiment_metadata(
    namespace=namespace,
    experiment_name=experiment_name,
    experiment_description=experiment_description,
    pipeline_name=pipeline_name,
    pipeline_description=pipeline_description,
)

--------------------------
Metadata
--------------------------
Namespace
Name:		bryanpaget
--------------------------
Experiment
Name:		bryanpaget-visualize-this!
Description:	Bryan's Kubeflow visualization experiment.
--------------------------
Pipeline
Name:		bryanpaget-visualize-this!-yummy-not-yummy-hnam
Description:	Is it yummy or not?
Zipped YAML:	20220509-205012-bryanpaget-visualize-this!-yummy-not-yummy-hnam.yaml.zip
--------------------------
Run
Name:		20220509-205012-bryanpaget-visualize-this!-yummy-not-yummy-hnam
--------------------------


## Define Components as Functions

In [3]:
@component(
    base_image="k8scc01covidacr.azurecr.io/jupyterlab-cpu:16b01881",
    packages_to_install=["pandas"],
    output_component_file="save_confusion_matrix_op.yaml"
)
def save_confusion_matrix(mlpipeline_ui_metadata_path: OutputPath()):
    # Writes a hard-coded yummy / not-yummy confusion matrix as a JSON
    # document to `mlpipeline_ui_metadata_path`. The counts are embedded
    # inline as header-less CSV.

    import json
    import pandas as pd

    class_labels = ["yummy", "not yummy"]

    # (column name, schema type) for each CSV column, in order.
    column_types = [
        ("target", "CATEGORY"),
        ("predicted", "CATEGORY"),
        ("count", "NUMBER"),
    ]

    # One row per (target, predicted) pair with its count.
    rows = [
        ['yummy', 'yummy', 10],
        ['yummy', 'not yummy', 2],
        ['not yummy', 'yummy', 6],
        ['not yummy', 'not yummy', 7]
    ]

    df = pd.DataFrame(rows, columns=[name for name, _ in column_types])
    print(df)

    schema = [{"name": name, "type": kind} for name, kind in column_types]
    confusion_matrix_output = {
        "type": "confusion_matrix",
        "format": "csv",
        "schema": schema,
        "source": df.to_csv(header=False, index=False),
        "storage": "inline",
        "labels": class_labels,
    }
    metadata = {"outputs": [confusion_matrix_output]}

    with open(mlpipeline_ui_metadata_path, 'w') as out_file:
        json.dump(metadata, out_file)


@component(
    base_image="k8scc01covidacr.azurecr.io/jupyterlab-cpu:16b01881",
    output_component_file="produce_metrics_op.yaml"
)
def produce_metrics() -> NamedTuple("Outputs", [("mlpipeline_metrics", 'Metrics')]):
    """Emit a single hard-coded accuracy metric as a JSON string.

    The body only needs the standard-library ``json`` module, so no extra
    packages are requested for installation.

    Returns a one-element list holding the JSON-encoded metrics document,
    matching the single ``mlpipeline_metrics`` output.
    """

    import json

    accuracy = 0.9

    metrics = {
        "metrics": [{
            "name": "accuracy-score",  # The name of the metric. Visualized as the column name in the runs table.
            "numberValue":  accuracy,  # The value of the metric. Must be a numeric value.
            "format": "PERCENTAGE"     # The optional format of the metric. Supported values are "RAW" (displayed in raw format) and "PERCENTAGE" (displayed in percentage format).
        }]
    }

    return [json.dumps(metrics)]


@component(
    base_image="k8scc01covidacr.azurecr.io/jupyterlab-cpu:16b01881",
    # "scikit-learn" is the real distribution name; the "sklearn" alias on
    # PyPI is deprecated.
    packages_to_install=["scikit-learn"],
    output_component_file="iris_sgdclassifier_op.yaml"
)
def iris_sgdclassifier(
    test_samples_fraction: float,
    metrics: Output[ClassificationMetrics]
):
    """Train an SGD classifier on iris and log a confusion matrix.

    ``test_samples_fraction`` of the data is held out by the split. The
    confusion matrix is computed from 3-fold cross-validated predictions on
    the remaining training portion and logged to ``metrics``.
    """

    from sklearn import datasets, model_selection
    from sklearn.linear_model import SGDClassifier
    from sklearn.metrics import confusion_matrix

    iris_dataset = datasets.load_iris()

    # Only the training portion is used below; the held-out part is discarded.
    train_x, _, train_y, _ = model_selection.train_test_split(
        iris_dataset['data'],
        iris_dataset['target'],
        test_size=test_samples_fraction)

    classifier = SGDClassifier()

    classifier.fit(train_x, train_y)

    predictions = model_selection.cross_val_predict(
        classifier,
        train_x,
        train_y,
        cv=3)

    # Pin the label order so the matrix is always 3x3 and lines up with the
    # three category names, even if a class is absent from the training split.
    matrix = confusion_matrix(train_y, predictions, labels=[0, 1, 2])

    metrics.log_confusion_matrix(
        ['Setosa', 'Versicolour', 'Virginica'],
        matrix.tolist())
    

@component(
    base_image="k8scc01covidacr.azurecr.io/jupyterlab-cpu:16b01881",
    # "scikit-learn" is the real distribution name; the "sklearn" alias on
    # PyPI is deprecated.
    packages_to_install=["scikit-learn"],
    output_component_file="wine_classification_op.yaml"
)
def wine_classification(metrics: Output[ClassificationMetrics]):
    """Train a random forest on the wine data and log its ROC curve.

    The task is reduced to binary classification ("is the label 1?"). The ROC
    curve is computed from 3-fold cross-validated class probabilities on the
    training split and logged to ``metrics``.
    """

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_curve
    from sklearn.datasets import load_wine
    from sklearn.model_selection import train_test_split, cross_val_predict

    X, y = load_wine(return_X_y=True)
    # Binary classification problem for label 1.
    y = y == 1

    # Only the training portion is used below; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=42)
    rfc = RandomForestClassifier(n_estimators=10, random_state=42)
    rfc.fit(X_train, y_train)

    # Column 1 of the predicted probabilities is the score for the True class.
    y_scores = cross_val_predict(rfc, X_train, y_train, cv=3, method='predict_proba')
    fpr, tpr, thresholds = roc_curve(y_true=y_train, y_score=y_scores[:, 1], pos_label=True)
    metrics.log_roc_curve(fpr, tpr, thresholds)


@component(
    base_image="k8scc01covidacr.azurecr.io/jupyterlab-cpu:16b01881",
    # "scikit-learn" is the real distribution name; the "sklearn" alias on
    # PyPI is deprecated.
    packages_to_install=["scikit-learn"],
    output_component_file="digit_classification_op.yaml"
)
def digit_classification(metrics: Output[Metrics]):
    """Fit logistic regression on the iris data and log its test accuracy.

    Despite the function name, the dataset loaded here is iris. The data is
    split with a fixed seed, the model is fitted on the training part, and the
    accuracy on the held-out part (as a percentage) is logged to ``metrics``
    under the key ``accuracy``.
    """

    from sklearn import datasets
    from sklearn import model_selection
    from sklearn.linear_model import LogisticRegression

    # Load the iris dataset: feature matrix and target vector.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    # Fraction of samples held out for testing, and the seed for the split.
    test_size = 0.33
    seed = 7

    # Split data
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=seed)

    # Fit model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Accuracy on the test set, logged as a percentage.
    result = model.score(X_test, y_test)
    metrics.log_metric('accuracy', (result*100.0))

## Pipeline

In [6]:
@dsl.pipeline(
    name=metadata.get("pipeline_name"),
    description=metadata.get("pipeline_description")
)
def pipeline():
    # Each call below adds one step to the pipeline. No step consumes another
    # step's output, so the returned task objects are not kept.
    iris_sgdclassifier(test_samples_fraction=0.3)
    wine_classification()
    digit_classification()
    save_confusion_matrix()
    produce_metrics()

## Publish Pipeline and Run Pipeline in an Experiment

Once a connection to the KFP client is established, the pipeline is compiled and uploaded. The experiment is then looked up (and created if it does not exist yet), and the pipeline is run inside that experiment.

In [7]:
# Connect to the Kubeflow Pipelines API.
client = kfp.Client()

# Compile the pipeline and upload the resulting package.
response = upload_pipeline(client, metadata, pipeline)

# Reuse the experiment if it already exists, otherwise create it.
# `get_experiment` looks an experiment up by `experiment_name` (it has no
# `name` or `description` parameters) and raises ValueError when none is found.
# Catching only ValueError lets connection or permission errors surface.
# NOTE(review): the raw `experiment_name` / `namespace` are used here, not the
# normalised values stored in `metadata` — confirm which one is intended.
try:
    experiment = client.get_experiment(
        experiment_name=experiment_name,
        namespace=namespace)
except ValueError:
    experiment = client.create_experiment(
        name=experiment_name,
        description=experiment_description,
        namespace=namespace)

# Start a run of the compiled package inside the experiment.
run = client.run_pipeline(
    experiment_id=experiment.id,
    job_name=metadata.get("run_name"),
    pipeline_package_path=metadata.get("pipeline_package_path"))