# An Introduction to Kubeflow Lightweight Components

## Imports

I like to put all my imports at the top of the notebook.

In [4]:
import os
import time

import kfp
from kfp import compiler
from kfp import components as comp
from kfp import dsl
from kfp.v2.dsl import (
    ClassificationMetrics,
    Dataset,
    Input,
    InputPath,
    Output,
    component,
)

## Metadata

Fill out the metadata for the run, pipeline and experiment!

1. `namespace`: Your namespace.
1. `experiment_name`: Your pipelines are run in an experiment. Give your experiment a unique and descriptive name.
1. `experiment_description`: Provide a short description; it will be a gift to your future self.
1. `pipeline_name`: Name your pipeline. Must be unique. Try to be descriptive.
1. `pipeline_description`: The more metadata the better!
1. `pipeline_package_path`: This is the location of the zipped YAML containing the description of the pipeline.
1. `run_name`: The run's name is generated automatically by joining the `experiment_name`, the `pipeline_name` and the current date and time with hyphens.

In [5]:
namespace = "bryanpaget"
experiment_name = "happy-big-experiment"
experiment_description = "My happiest and biggest experiment to date."
pipeline_name = "happy-pipeline-lightweight-99"
pipeline_description = "This is what I'm doing now!"

run_name = f"{experiment_name}-{pipeline_name}-{time.strftime('%Y%m%d-%H%M%S')}"
pipeline_package_path = f"{run_name}.yaml.zip"

## Components

In [7]:
def download_data(output_csv: comp.OutputPath(str)):
    import pandas as pd
    print("first step")
    df = pd.read_csv("https://www.openml.org/data/get_csv/21792853/dataset")
    print("downloaded data")
    print(df.head())
    print(output_csv)
    df.to_csv(output_csv, index=False)


def price_filter(input_csv: comp.InputPath(str),
                 output_csv: comp.OutputPath(str)):
    import pandas as pd
    df = pd.read_csv(input_csv)
    df = df[df["price"] < 10000]
    print("Load Data: ", df.head())
    df.to_csv(output_csv, index=False)


@component(packages_to_install=['sklearn'], base_image='python:3.9')
def price_sgdclassifier(input_csv: comp.InputPath(str),
                        test_samples_fraction: float,
                        metrics: Output[comp.ClassificationMetrics]):

    import pandas as pd
    from sklearn import model_selection
    from sklearn.linear_model import SGDClassifier
    from sklearn.metrics import confusion_matrix

    df = pd.read_csv(input_csv)
    Y = df["price"]
    X = df.drop("price", axis=1, inplace=True)

    train_x, test_x, train_y, test_y = model_selection.train_test_split(
       X, Y, test_size=test_samples_fraction)

    classifier = SGDClassifier()
    classifier.fit(train_x, train_y)

    predictions = model_selection.cross_val_predict(
        classifier, train_x, train_y, cv=3)

    metrics.log_confusion_matrix(confusion_matrix(train_y, predictions).tolist())


download_data_op = comp.create_component_from_func(
    download_data,
    base_image="k8scc01covidacr.azurecr.io/jupyterlab-cpu:16b01881",
    packages_to_install=["pandas==1.1.4"])

price_filter_op = comp.create_component_from_func(
    price_filter,
    base_image="k8scc01covidacr.azurecr.io/jupyterlab-cpu:16b01881",
    packages_to_install=["pandas==1.1.4"])

AttributeError: module 'kfp.components' has no attribute 'ClassificationMetrics'

## Pipeline

In [None]:
@dsl.pipeline(name=pipeline_name, description=pipeline_description)
def my_pipeline():
    """
    Pipeline: download open diamond dataset (21792853) from openml;
        filter data;
        store it.
    """
    classifier_op = price_sgdclassifier(test_samples_fraction=0.3)
    downloaded_data = download_data_op()
    result = price_filter_op(downloaded_data.output)
    classifier_op(result)

## Publish Pipeline and Run Pipeline in an Experiment

First, a connection to the KFP API is established, and the pipeline is compiled and uploaded. The experiment is then looked up (and created if it does not exist yet), and the compiled pipeline is run inside that experiment.

In [None]:
client = kfp.Client()

compiler.Compiler().compile(my_pipeline, pipeline_package_path)

response = client.upload_pipeline(
    pipeline_package_path, pipeline_name=pipeline_name)

In [None]:
try:
    experiment = client.get_experiment(
        name=experiment_name,
        description=experiment_description,
        namespace=namespace)
except:
    experiment = client.create_experiment(
        name=experiment_name,
        description=experiment_description,
        namespace=namespace)

run = client.run_pipeline(
    experiment_id=experiment.id,
    job_name=run_name,
    pipeline_package_path=pipeline_package_path)