In [3]:
# !python3 -m pip install kfp

In [36]:
import os
import kfp
import yaml
import kfp.components as comp
import kfp.dsl as dsl
from typing import NamedTuple
from kfp.compiler import compiler

In [70]:
def data_preprocess(
    bucket_name: str,
    input_blob_name: str,
    target_column: str,
    ) -> NamedTuple('PreprocessOutput', 
              [
                  ('x_train_name', str),
                  ('x_test_name', str),
                  ('y_train_name', str),
                  ('y_test_name', str),
                  ('n_classes', int),
              ]):
    """Split a CSV dataset stored on GCS into train/test feature and label files.

    The CSV at ``gs://<bucket_name>/<input_blob_name>`` is loaded, the
    ``LOAN_SEQUENCE_NUMBER`` id column is removed when present, every
    object-typed feature column is one-hot encoded, and the rows are split
    90/10 (stratified on the target, fixed seed). The four resulting tables
    are written next to the input file with ``_x_train``, ``_x_test``,
    ``_y_train`` and ``_y_test`` inserted before the extension.

    Args:
        bucket_name: Name of the GCS bucket holding the dataset.
        input_blob_name: Path of the CSV file inside the bucket.
        target_column: Name of the label column.

    Returns:
        A ``PreprocessOutput`` namedtuple with the four output paths and the
        number of distinct values found in the target column.

    Raises:
        KeyError: If ``target_column`` is not a column of the dataset.
    """
    # All imports are local so the function stays self-contained when it is
    # packaged as a pipeline component.
    from collections import namedtuple
    from sklearn.model_selection import train_test_split
    import pandas as pd
    import os
    import logging

    # The root logger defaults to WARNING, which would silently drop every
    # logging.info call below. This is a no-op if logging is already set up.
    logging.basicConfig(level=logging.INFO)

    input_file = 'gs://{}/{}'.format(
        bucket_name, 
        input_blob_name, 
        )
    logging.info("Loading {}".format(input_file))
    dataset = pd.read_csv(input_file)
    # Drop the unique id column, which is not useful for ML. Datasets that do
    # not carry this column are accepted as-is instead of failing.
    dataset = dataset.drop(columns=['LOAN_SEQUENCE_NUMBER'], errors='ignore')

    if target_column not in dataset.columns:
        raise KeyError(
            "target column {!r} not found in {}".format(target_column, input_file))

    # Convert categorical feature columns into one-hot encodings. The target
    # is excluded: encoding it would replace it with <target>_<value> columns
    # and make every later dataset[target_column] lookup fail.
    str_cols = [
        col for col in dataset.columns
        if col != target_column and dataset[col].dtype == 'object'
    ]
    if str_cols:
        dataset = pd.get_dummies(dataset, columns=str_cols)
    # Plain int so the value matches the declared output type.
    n_classes = int(dataset[target_column].nunique())
    logging.info("No. of Classes: {}".format(n_classes))

    # Split with a small test size so as to allow our model to train on more data
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.drop(target_column, axis=1), 
        dataset[target_column], 
        test_size=0.1,
        random_state=1,
        shuffle=True, 
        stratify=dataset[target_column], 
        )

    logging.info("x_train shape = {}".format(x_train.shape))
    logging.info("x_test shape = {}".format(x_test.shape))
    logging.info("y_train shape = {}".format(y_train.shape))
    logging.info("y_test shape = {}".format(y_test.shape))

    # Outputs live beside the input: <base>_x_train<ext>, <base>_x_test<ext>, ...
    base_name, ext_name = os.path.splitext(input_file)
    x_train_name = "{}_x_train{}".format(base_name, ext_name)
    x_test_name = "{}_x_test{}".format(base_name, ext_name)
    y_train_name = "{}_y_train{}".format(base_name, ext_name)
    y_test_name = "{}_y_test{}".format(base_name, ext_name)

    x_train.to_csv(x_train_name, index=False)
    x_test.to_csv(x_test_name, index=False)
    y_train.to_csv(y_train_name, index=False)
    y_test.to_csv(y_test_name, index=False)

    logging.info("x_train saved to {}".format(x_train_name))
    logging.info("x_test saved to {}".format(x_test_name))
    logging.info("y_train saved to {}".format(y_train_name))
    logging.info("y_test saved to {}".format(y_test_name))
    logging.info("finished")
    
    PreprocessOutput = namedtuple('PreprocessOutput', 
        ['x_train_name', 'x_test_name', 'y_train_name', 'y_test_name', 'n_classes'])
    return PreprocessOutput(
        x_train_name=x_train_name,
        x_test_name=x_test_name,
        y_train_name=y_train_name,
        y_test_name=y_test_name,
        n_classes=n_classes,
    )

In [71]:
def train(
        job_name: str,
        project_id: str,
        user_name: str,
        bucket_name: str,
        job_folder_name: str,
        region: str,
        train_feature_path: str,
        train_label_path: str,
        val_feature_path: str,
        val_label_path: str,
    ) -> NamedTuple('TrainOutput', 
              [('response', str)]):
    """Submit an AI Platform training job through gcloud and return its description.

    Runs ``gcloud ai-platform jobs submit training`` with the trainer package
    and config file located under ``/pipelines/component``, then runs
    ``gcloud ai-platform jobs describe`` for the same job and returns what
    that command printed.

    Args:
        job_name: Identifier of the training job to create.
        project_id: Project passed to gcloud via ``--project``; when empty,
            gcloud's configured default project is used.
        user_name: Accepted for interface compatibility; not used here.
        bucket_name: GCS bucket that holds the job directory.
        job_folder_name: Folder inside the bucket; the job directory is
            ``gs://<bucket_name>/<job_folder_name>/jobdir``.
        region: Region passed to the submit command.
        train_feature_path: Forwarded to the trainer as ``--train_feature_name``.
        train_label_path: Forwarded to the trainer as ``--train_label_name``.
        val_feature_path: Forwarded to the trainer as ``--val_feature_name``.
        val_label_path: Forwarded to the trainer as ``--val_label_name``.

    Returns:
        A ``TrainOutput`` namedtuple whose ``response`` is the decoded stdout
        of the describe command.

    Raises:
        subprocess.CalledProcessError: If either gcloud command exits with a
            non-zero status.
    """
    from collections import namedtuple
    import subprocess

    job_dir = 'gs://{}/{}/jobdir'.format(
        bucket_name,
        job_folder_name,
        )
    package_path = "/pipelines/component/trainer"
    job_config = "/pipelines/component/config/config_hpt.yaml"
    print("JOB_NAME = ", job_name)
    print("JOB_DIR = ", job_dir)
    print("JOB_CONFIG = ", job_config)

    # --project is a global gcloud flag; honour the caller's project when given.
    project_args = ["--project", project_id] if project_id else []

    submit_cmd = [
        "gcloud", "ai-platform", "jobs", "submit", "training",
        job_name,
        "--package-path", package_path,
        "--module-name", "trainer.train_hpt",
        "--python-version", "3.7",
        "--runtime-version", "2.2",
        "--job-dir", job_dir,
        "--region", region,
        "--config", job_config,
    ] + project_args + [
        # Everything after the bare "--" is handed to the trainer module.
        "--",
        "--train_feature_name", train_feature_path,
        "--train_label_name", train_label_path,
        "--val_feature_name", val_feature_path,
        "--val_label_name", val_label_path,
    ]
    submission = subprocess.run(submit_cmd, stdout=subprocess.PIPE)
    if submission.stdout:
        # Surface gcloud's own output in the step log instead of discarding it.
        print(submission.stdout.decode())
    # Stop here on a failed submission rather than describing a job that
    # was never created.
    submission.check_returncode()

    description = subprocess.run(
        ["gcloud", "ai-platform", "jobs", "describe", job_name] + project_args,
        stdout=subprocess.PIPE,
    )
    description.check_returncode()

    TrainOutput = namedtuple('TrainOutput',['response'])

    return TrainOutput(response=description.stdout.decode())

### Compile Python functions to components

In [72]:
# Directory that receives the generated component YAML files.
component_dir = "./components"
# The YAML files are written straight into this directory, so create it up
# front; on a fresh checkout it does not exist and the compile step would fail.
os.makedirs(component_dir, exist_ok=True)

# Preprocessing component: data_preprocess packaged on a TF2 deep-learning image.
base_image = "gcr.io/deeplearning-platform-release/tf2-gpu.2-1"
yaml_name = '{}/preprocess.yaml'.format(component_dir)

preprocess_op = comp.func_to_container_op(
    data_preprocess, 
    output_component_file=yaml_name,
    base_image=base_image)


# Training component: train packaged on the project's trainer image. train()
# calls gcloud and reads files under /pipelines/component, so this image is
# presumably the one that ships them -- confirm against the image build.
base_image = "gcr.io/img-seg-3d/trainer:v1"
yaml_name = '{}/train.yaml'.format(component_dir)

train_op = comp.func_to_container_op(
    train, 
    output_component_file=yaml_name,
    base_image=base_image)


### Compile KFP pipeline

In [73]:
@dsl.pipeline(
   name='generic prediction pipeline',
   # The previous text described seismic image segmentation, which this
   # pipeline does not do; describe the two steps it actually runs.
   description='A pipeline that preprocesses a tabular CSV dataset and submits an AI Platform training job.'
)
def train_pipeline(
    job_name: str,
    project_id: str,
    region: str,
    user_name: str,
    bucket_name: str,
    input_blob_name: str,
    job_folder_name: str,
    target_column: str,
    ):
    """Chain the preprocessing step and the training step.

    Args:
        job_name: Name of the training job created by the training step.
        project_id: Project identifier forwarded to the training step.
        region: Region forwarded to the training step.
        user_name: User name forwarded to the training step.
        bucket_name: GCS bucket used by both steps.
        input_blob_name: Path of the input CSV inside the bucket.
        job_folder_name: Folder inside the bucket for the training job directory.
        target_column: Name of the label column in the input CSV.
    """
    preprocess_task = preprocess_op(
        bucket_name = bucket_name,
        input_blob_name = input_blob_name,
        target_column = target_column,
    )
    
    # Consuming the preprocess outputs makes the training step run after
    # preprocessing; the test split is passed as the validation set.
    train_task = train_op(
        job_name = job_name,
        project_id = project_id,
        user_name = user_name,
        bucket_name = bucket_name,
        job_folder_name = job_folder_name,
        region = region,
        train_feature_path = preprocess_task.outputs['x_train_name'],
        train_label_path = preprocess_task.outputs['y_train_name'],
        val_feature_path = preprocess_task.outputs['x_test_name'],
        val_label_path = preprocess_task.outputs['y_test_name'],
    )
    
# Archive that receives the compiled pipeline definition.
pipeline_pkg_path = "./train_pipeline.tar.gz"

# Compile train_pipeline into that archive.
compiler.Compiler().compile(
    pipeline_func=train_pipeline,
    package_path=pipeline_pkg_path,
)

### Run the KFP pipeline on an AI Platform-hosted Kubernetes cluster

In [75]:
from datetime import datetime
from pytz import timezone
# Timezone whose wall-clock time is stamped into the job name.
my_timezone = 'US/Pacific'
launch_stamp = datetime.now(timezone(my_timezone)).strftime("%m%d%y_%H%M")

# One entry per train_pipeline argument.
params = dict(
    job_name='xgb_train_elvinzhu_{}_hpt'.format(launch_stamp),
    project_id='img-seg-3d',
    region='us-central1',
    user_name='elvinzhu',
    job_folder_name='xgb_train_job',
    bucket_name='tuti_asset',
    input_blob_name='datasets/mortgage_structured.csv',
    target_column='TARGET',
)

# Where the run is sent, and the experiment and run names to file it under.
kfp_host_name = '6ff530db99970db2-dot-us-central2.pipelines.googleusercontent.com'
kfp_exp_name = 'xgboost_ai_platform'
kfp_run_name = 'demo_xgboost'

client = kfp.Client(host=kfp_host_name)

# Create the experiment that groups the run.
exp = client.create_experiment(name=kfp_exp_name)

# Launch a run of the compiled pipeline package with the parameters above.
run = client.run_pipeline(
    experiment_id=exp.id,
    job_name=kfp_run_name,
    pipeline_package_path=pipeline_pkg_path,
    params=params,
)