# Vertex AI Pipelines: Metrics visualization and run comparison using the KFP SDK

* Reference sample (Kubeflow Pipelines repository): https://github.com/kubeflow/pipelines/blob/master/samples/test/metrics_visualization_v2.py

In [1]:
from kfp.v2 import dsl
from kfp.v2.dsl import ClassificationMetrics, Metrics, Output, component

## Define the training components

In [2]:
@component(packages_to_install=['scikit-learn'], base_image='python:3.9')
def wine_classification(wmetrics: Output[ClassificationMetrics]):
    """Log a ROC curve for a random-forest classifier on the wine dataset.

    The wine target is reduced to the binary question "is this class 1?".
    Out-of-fold class probabilities are computed with 3-fold cross-validation
    on the training split, and the resulting ROC curve is logged to
    ``wmetrics``.

    Args:
        wmetrics: Output artifact that receives the ROC curve data points.
    """
    import math

    from sklearn.datasets import load_wine
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_curve
    from sklearn.model_selection import cross_val_predict, train_test_split

    X, y = load_wine(return_X_y=True)
    # Binary classification problem for label 1
    y = y == 1

    X_train, _, y_train, _ = train_test_split(X, y, random_state=42)
    rfc = RandomForestClassifier(n_estimators=10, random_state=42)
    # cross_val_predict fits a fresh clone of the estimator on every fold, so
    # fitting `rfc` on the full training split beforehand would not influence
    # the scores and is skipped.
    y_scores = cross_val_predict(
        rfc, X_train, y_train, cv=3, method='predict_proba'
    )
    fpr, tpr, thresholds = roc_curve(
        y_true=y_train, y_score=y_scores[:, 1], pos_label=True
    )

    # Recent scikit-learn releases report +inf as the first threshold, which
    # has no representation in standard JSON metadata. Swap any non-finite
    # threshold for `max(finite) + 1`, the convention older releases used.
    finite_thresholds = [float(t) for t in thresholds if math.isfinite(t)]
    ceiling = max(finite_thresholds) + 1 if finite_thresholds else 1.0
    safe_thresholds = [
        float(t) if math.isfinite(t) else ceiling for t in thresholds
    ]

    # Hand over plain Python lists rather than NumPy arrays.
    wmetrics.log_roc_curve(fpr.tolist(), tpr.tolist(), safe_thresholds)

In [3]:
@component(packages_to_install=['scikit-learn'], base_image='python:3.9')
def iris_sgdclassifier(
    test_samples_fraction: float,
    metricsc: Output[ClassificationMetrics]
):
    """Log a confusion matrix for an SGD classifier on the iris dataset.

    A training split is carved out of the iris data, out-of-fold predictions
    are produced with 3-fold cross-validation, and the confusion matrix of
    those predictions is logged to ``metricsc``.

    Args:
        test_samples_fraction: Fraction of the samples held out of the
            training split (forwarded as ``test_size``).
        metricsc: Output artifact that receives the confusion matrix.
    """
    from sklearn import datasets, model_selection
    from sklearn.linear_model import SGDClassifier
    from sklearn.metrics import confusion_matrix

    # Display names, positioned to match the integer targets 0, 1, 2.
    class_names = ['Setosa', 'Versicolour', 'Virginica']
    class_labels = list(range(len(class_names)))

    iris_dataset = datasets.load_iris()
    train_x, _, train_y, _ = model_selection.train_test_split(
        iris_dataset['data'],
        iris_dataset['target'],
        test_size=test_samples_fraction
    )

    classifier = SGDClassifier()
    # cross_val_predict fits a fresh clone of the estimator on every fold, so
    # a separate fit() on the whole training split is not needed.
    predictions = model_selection.cross_val_predict(
        classifier, train_x, train_y, cv=3
    )

    # Pin the label set so the matrix is always 3x3 and lines up with
    # `class_names`, even if a class is absent from the training split.
    matrix = confusion_matrix(train_y, predictions, labels=class_labels)
    metricsc.log_confusion_matrix(class_names, matrix.tolist())

In [5]:
@component(packages_to_install=['scikit-learn'], base_image='python:3.9')
def iris_logregression(
    input_seed: int,
    split_count: int,
    metrics: Output[Metrics]
):
    """Train a logistic regression on iris and log its hold-out accuracy.

    Cross-validated accuracies are printed for information; the value logged
    to ``metrics`` is the accuracy (as a percentage) on a 20% hold-out split.

    Args:
        input_seed: Random seed used for both the K-fold shuffle and the
            train/test split.
        split_count: Number of folds for the K-fold cross-validation.
        metrics: Output artifact that receives the ``accuracy`` metric.
    """
    from sklearn import datasets, model_selection
    from sklearn.linear_model import LogisticRegression

    features, targets = datasets.load_iris(return_X_y=True)
    holdout_fraction = 0.2

    folds = model_selection.KFold(
        split_count, random_state=input_seed, shuffle=True
    )
    estimator = LogisticRegression()

    # Informational only: per-fold accuracy over the full dataset.
    fold_scores = model_selection.cross_val_score(
        estimator, features, targets, cv=folds, scoring='accuracy'
    )
    print(f'results: {fold_scores}')

    train_features, holdout_features, train_targets, holdout_targets = (
        model_selection.train_test_split(
            features,
            targets,
            test_size=holdout_fraction,
            random_state=input_seed,
        )
    )

    # fit() returns the estimator itself, so scoring can be chained.
    holdout_accuracy = estimator.fit(train_features, train_targets).score(
        holdout_features, holdout_targets
    )
    print(f'result: {holdout_accuracy}')
    metrics.log_metric('accuracy', holdout_accuracy * 100)

## Define the pipeline

In [6]:
@dsl.pipeline(name='metrics-pipeline-v2')
def pipeline(seed: int, splits: int):
    # Three steps with no data passed between them; each one only emits its
    # own metrics artifact, so the returned task handles are not kept.
    #   seed   -> iris_logregression's input_seed
    #   splits -> iris_logregression's split_count
    wine_classification()
    iris_logregression(input_seed=seed, split_count=splits)
    iris_sgdclassifier(test_samples_fraction=0.3)

## Run the pipeline

In [None]:
import kfp

# Host URL handed to kfp.Client when the run is submitted.
# NOTE(review): localhost:8080 presumably relies on a local port-forward to
# the pipelines service — confirm for your environment.
endpoint = 'http://localhost:8080/pipeline'

In [9]:
# Submit a single run of `pipeline` with caching disabled.
# An alternative argument set is kept for reference, presumably for a second
# run to compare against: {'seed': 7, 'splits': 10}
run_arguments = {'seed': 5, 'splits': 7}
client = kfp.Client(host=endpoint)
client.create_run_from_pipeline_func(
    pipeline,
    arguments=run_arguments,
    mode=kfp.dsl.PipelineExecutionMode.V2_COMPATIBLE,
    enable_caching=False,
)

RunPipelineResult(run_id=e3f08e90-981a-44dc-bad5-57fdd1aa9151)