**CHECKS, IMPORTS AND ENVIRONMENT SETTINGS**

In [None]:
import os
# Google Cloud Notebook
if os.path.exists("/opt/deeplearning/metadata/env_version"):
    USER_FLAG = "--user"
else:
    USER_FLAG = ""

! pip3 install --upgrade google-cloud-aiplatform $USER_FLAG

In [None]:
# Install or upgrade the Cloud Storage client library, passing USER_FLAG to pip.
! pip3 install -U google-cloud-storage $USER_FLAG

In [None]:
! pip3 install $USER kfp --upgrade

In [5]:
if os.getenv("IS_TESTING"):
    ! pip3 install --upgrade matplotlib $USER_FLAG

In [6]:
import os

if not os.environ.get("IS_TESTING"):
    # Automatically restart the kernel after the installs above
    # (skipped whenever IS_TESTING is set).
    from IPython import Application

    Application.instance().kernel.do_shutdown(True)

In [1]:
# Print the installed KFP SDK version from a separate python3 process.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"

KFP SDK version: 1.8.12


In [2]:
# Project-level settings for this notebook.
PROJECT_ID = "ccd-test-354201"   # Google Cloud project ID
REGION = "asia-southeast1"       # region name
BUCKET_NAME = "gs://ccd-bucket"  # Cloud Storage bucket URI (gs:// form)

In [8]:
# List the contents of BUCKET_NAME (long listing, including object versions).
! gsutil ls -al $BUCKET_NAME

   3386491  2022-06-23T01:51:57Z  gs://ccd-bucket/ccd_dataset.csv#1655949117320722  metageneration=1
TOTAL: 1 objects, 3386491 bytes (3.23 MiB)


In [4]:
# GCS prefix handed to both the pipeline definition and the pipeline job as pipeline_root.
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root/ccd"

In [17]:
import google.cloud.aiplatform as aip
from kfp.v2 import dsl, compiler
from kfp.v2.dsl import Artifact, Dataset, Input, Model, ClassificationMetrics, Metrics, Output, component

In [10]:
# Initialise the Vertex AI SDK. `location` is passed explicitly so that
# resources are created in REGION instead of the SDK's default location.
aip.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME)

**CREATE COMPONENTS**

In [73]:
# "scikit-learn" is the actual PyPI distribution name; the "sklearn" alias is deprecated.
@component(
    base_image="python:3.9",
    packages_to_install=["pandas", "scikit-learn", "fsspec", "gcsfs"],
    output_component_file="ccd_preprocess.yaml",)
def preprocess(url: str, dataset_train: Output[Dataset], dataset_test: Output[Dataset]):
    """Clean the raw CSV, one-hot encode categoricals and write train/test splits.

    Args:
        url: Path of the input CSV, opened through gcsfs. After column names are
            lower-cased and stripped, the file must provide the columns
            ``default``, ``sex``, ``education`` and ``marriage``.
        dataset_train: Output artifact that receives 80% of the rows as CSV.
        dataset_test: Output artifact that receives the remaining 20% as CSV.
    """
    import gcsfs
    import pandas as pd
    from sklearn.model_selection import train_test_split as tts

    # Read CSV and normalise the column names
    fs = gcsfs.GCSFileSystem()
    with fs.open(url, 'rb') as f:
        df = pd.read_csv(f)

    df.columns = [name.lower().strip() for name in df.columns]

    # Drop NAs. dropna() returns a new frame, so the result has to be kept.
    df = df.dropna()

    # Binary label: 1 when `default` is 'Y', 0 otherwise
    df['target'] = (df['default'] == 'Y').astype(int)

    # Give the '0' placeholder categories a readable name before encoding
    df.loc[df['education'] == '0', 'education'] = 'Unknown'
    df.loc[df['marriage'] == '0', 'marriage'] = 'Other'

    sex = pd.get_dummies(df.sex, prefix='gender')
    education = pd.get_dummies(df.education, prefix='ed')
    marriage = pd.get_dummies(df.marriage, prefix='mstatus')

    # One column-wise concat, then drop the raw columns that were encoded above
    final = pd.concat([df, sex, education, marriage], axis=1)
    final = final.drop(columns=['default', 'sex', 'education', 'marriage'])

    # Fixed seed so that re-running the pipeline reproduces the same split
    train, test = tts(final, test_size=0.2, random_state=42)

    # index=False: otherwise the row index is written as an unnamed column that
    # the downstream components would read back as an extra feature.
    train.to_csv(dataset_train.path, index=False)
    test.to_csv(dataset_test.path, index=False)

In [77]:
# "scikit-learn" is the actual PyPI distribution name; the "sklearn" alias is deprecated.
@component(
    packages_to_install=["pandas", "scikit-learn", "xgboost", "fsspec"],
    base_image="python:3.9",
    output_component_file="ccd_train.yaml",)
def train(dataset: Input[Dataset], model_artifact: Output[Model]):
    """Fit an XGBoost classifier on the training split and save it.

    Args:
        dataset: Input artifact holding the training CSV; it must contain the
            columns ``id`` and ``target``. Every other column is used as a feature.
        model_artifact: Output artifact. The fitted model is saved at its path;
            ``train_score`` and ``framework`` are recorded in its metadata.
    """
    import pandas as pd
    from xgboost import XGBClassifier

    data = pd.read_csv(dataset.path)

    # `id` is dropped together with the label, so it is not used as a feature
    X = data.drop(columns=['id', 'target'])
    Y = data['target']

    model = XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=350, n_jobs=-1)
    model.fit(X, Y)

    # Scored on the same data the model was fitted on
    score = model.score(X, Y)

    model_artifact.metadata["train_score"] = float(score)
    model_artifact.metadata["framework"] = "XGBoost"

    model.save_model(model_artifact.path)

In [80]:
# "scikit-learn" is the actual PyPI distribution name; the "sklearn" alias is deprecated.
@component(
    packages_to_install=["pandas", "scikit-learn", "xgboost", "fsspec"],
    base_image="python:3.9",
    output_component_file="ccd_eval.yaml",)
def eval_model(test_set: Input[Dataset], xgb_model: Input[Model],
               metrics: Output[ClassificationMetrics], smetrics: Output[Metrics]):
    """Score a saved XGBoost model on the test split and log the metrics.

    Args:
        test_set: Input artifact holding the test CSV; it must contain the
            columns ``id`` and ``target``.
        xgb_model: Input artifact whose path holds the saved XGBoost model.
        metrics: Output artifact that receives the ROC curve and confusion matrix.
        smetrics: Output artifact that receives the scalar ``score`` metric.
    """
    import math

    import pandas as pd
    from sklearn.metrics import roc_curve, confusion_matrix
    from xgboost import XGBClassifier

    data = pd.read_csv(test_set.path)
    model = XGBClassifier()
    model.load_model(xgb_model.path)

    X = data.drop(columns=['id', 'target'])
    Y = data['target']

    score = model.score(X, Y)

    # ROC curve from the predicted probability of the positive class
    y_prob = model.predict_proba(X)[:, 1]
    fpr, tpr, thresholds = roc_curve(Y, y_prob, pos_label=1)

    # Newer scikit-learn releases return inf as the first threshold; inf is not
    # valid JSON, so it is replaced by (largest finite threshold + 1).
    threshold_list = thresholds.tolist()
    finite = [t for t in threshold_list if math.isfinite(t)]
    ceiling = max(finite) + 1.0 if finite else 1.0
    threshold_list = [t if math.isfinite(t) else ceiling for t in threshold_list]
    metrics.log_roc_curve(fpr.tolist(), tpr.tolist(), threshold_list)

    y_pred = model.predict(X)

    # labels is fixed so the matrix is always 2x2 and lines up with the logged
    # categories, even if one class is missing from the test split.
    labels = [0, 1]
    metrics.log_confusion_matrix(labels, confusion_matrix(Y, y_pred, labels=labels).tolist())

    # NOTE(review): xgb_model is an Input artifact; confirm that metadata
    # written here is actually persisted by the pipeline backend.
    xgb_model.metadata["test_score"] = float(score)
    smetrics.log_metric("score", float(score))

**CREATE PIPELINE**

In [57]:
# Location of the raw dataset. Built from BUCKET_NAME so the bucket is only
# configured in one place.
url = "{}/ccd_dataset.csv".format(BUCKET_NAME)

In [81]:
@dsl.pipeline(
    name='ccd-pipeline-test',
    pipeline_root=PIPELINE_ROOT,
)
def pipeline():
    # Three steps: split the dataset, fit on the train split, evaluate the
    # fitted model on the test split.
    split_task = preprocess(url=url)
    fit_task = train(dataset=split_task.outputs["dataset_train"])
    eval_model(
        test_set=split_task.outputs["dataset_test"],
        xgb_model=fit_task.outputs["model_artifact"],
    )


# Compile the pipeline definition to a local JSON file.
compiler.Compiler().compile(pipeline_func=pipeline, package_path='xgb_pipe.json')

In [None]:
from datetime import datetime

# NOTE(review): AIPlatformClient is not used in this cell; the import is kept
# in case later cells rely on it.
from kfp.v2.google.client import AIPlatformClient

# A job ID identifies one run, so a fixed value would collide on the second
# execution of this cell. A timestamp suffix keeps every run distinct.
job_id = "test-12-{}".format(datetime.now().strftime("%Y%m%d%H%M%S"))

job = aip.PipelineJob(
    display_name='ccd-test-v1',
    template_path="xgb_pipe.json",
    job_id=job_id,
    pipeline_root=PIPELINE_ROOT,
    # Explicit project/location so the job does not depend on SDK defaults.
    project=PROJECT_ID,
    location=REGION,
)

# Submit the job and block until it finishes.
job.run()