**Check environment dependencies**

In [1]:
# Print the installed versions of the three libraries this notebook relies on
# (KFP SDK, google_cloud_pipeline_components, scikit-learn).
# Each check runs in its own `python3` subprocess and fails with an ImportError
# if the package cannot be imported there.
# NOTE(review): `python3` on PATH may not be the kernel's interpreter — confirm.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"
! python3 -c "import sklearn; print('Sklearn version: {}'.format(sklearn.__version__))"

KFP SDK version: 1.8.14
google_cloud_pipeline_components version: 1.0.26
Sklearn version: 1.0.2


In [2]:
from datetime import datetime

import google.cloud.aiplatform as aip
from google.cloud import aiplatform
import kfp
from kfp.v2 import dsl, compiler
from kfp.v2.google.client import AIPlatformClient

# custom code for data processing and model training
from utils import create_data, train_model

**Define environment variables**

Update <code>BUCKET_NAME</code> to a Cloud Storage bucket you can write to. The <code>PROJECT_ID</code> is read from the active gcloud configuration.

In [3]:
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "black-friday-dataset-test"  # modify
BUCKET_URI = f"gs://{BUCKET_NAME}"
REGION = "us-central1"
PIPELINE_ROOT = "{}/pipeline_root/black_friday".format(BUCKET_URI)
DISPLAY_NAME = "black-friday-" + TIMESTAMP
PACKAGE_PATH = "pipeline.json"
project_id_shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_shell_output[0]

## Data processing and model training

**Initialize the client**

In [4]:
# Initialise the Vertex AI SDK for the rest of the notebook. The region is passed
# explicitly so that changing REGION in the configuration cell takes effect here
# instead of the SDK silently using its default location.
aip.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

**Define the pipeline**

In [5]:
@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name="black-friday-pipeline",
)
def pipeline(
    train_file_x: str,
    train_file_y: str,
    test_file_x: str,
    test_file_y: str,
    best_params_file: str,
    metrics_file: str,
    num_iterations: int,
    hp_tune: bool,
    best_params: dict,
    use_demographic: bool,
):
    """Two-step pipeline: build the dataset, then train the model on it.

    The project comes from the notebook-level ``PROJECT_ID`` rather than a
    hard-coded project name, so the pipeline follows the configured project.

    Args:
        train_file_x: Not referenced in the pipeline body; kept for interface
            compatibility with existing submissions.
        train_file_y: Not referenced in the pipeline body (see above).
        test_file_x: Not referenced in the pipeline body (see above).
        test_file_y: Not referenced in the pipeline body (see above).
        best_params_file: Not referenced in the pipeline body (see above).
        metrics_file: Not referenced in the pipeline body (see above).
        num_iterations: Forwarded to ``train_model``.
        hp_tune: Forwarded to ``train_model``.
        best_params: Forwarded to ``train_model``.
        use_demographic: Forwarded to ``create_data``.
    """
    create_data_task = create_data(
        project_id=PROJECT_ID,
        bucket_name=BUCKET_NAME,
        dataset_id="black_friday",
        use_demographic=use_demographic,
    )

    # The training step takes its four input files from the outputs of the
    # data-creation step, which also makes it run after that step.
    train_model(
        hp_tune=hp_tune,
        project_id=PROJECT_ID,
        bucket_name=BUCKET_NAME,
        best_params=best_params,
        num_iterations=num_iterations,
        train_file_x=create_data_task.outputs["train_file_x"],
        test_file_x=create_data_task.outputs["test_file_x"],
        train_file_y=create_data_task.outputs["train_file_y"],
        test_file_y=create_data_task.outputs["test_file_y"],
    )

In [6]:
# Compile the pipeline function into the file named by PACKAGE_PATH, which the
# PipelineJob submissions use as their template.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=pipeline, package_path=PACKAGE_PATH)



**Submit the pipeline to Vertex AI Pipelines**

In [None]:
# Value passed for the pipeline's `best_params` parameter.
initial_best_params = {
    "n_estimators": 644,
    "max_depth": 343,
    "min_samples_split": 6,
    "min_samples_leaf": 1,
    "max_features": "auto",
}

# Runtime parameters for the run that keeps the demographic features.
demographic_run_parameters = {
    "train_file_x": "x_train.csv",
    "train_file_y": "y_train.csv",
    "test_file_x": "x_test.csv",
    "test_file_y": "y_test.csv",
    "metrics_file": "metrics.json",
    "num_iterations": 100,
    "hp_tune": True,
    "best_params_file": "best_params.json",
    "best_params": initial_best_params,
    "use_demographic": True,
}

# Create the job from the compiled template.
job = aip.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=PACKAGE_PATH,
    pipeline_root=PIPELINE_ROOT,
    parameter_values=demographic_run_parameters,
)

job.run()

**Build same model with demographic data removed**

In [7]:
# Same submission as the run with demographic data, except that
# `use_demographic` is False. The display name gets its own suffix so the two
# runs can be told apart when looking up the model artifact in the console.
job = aip.PipelineJob(
    display_name=f"{DISPLAY_NAME}-no-demographic",
    template_path=PACKAGE_PATH,
    pipeline_root=PIPELINE_ROOT,
    parameter_values={
        "train_file_x": "x_train.csv",
        "train_file_y": "y_train.csv",
        "test_file_x": "x_test.csv",
        "test_file_y": "y_test.csv",
        "metrics_file": "metrics.json",
        "num_iterations": 100,
        "hp_tune": True,
        "best_params_file": "best_params.json",
        "best_params": {
            "n_estimators": 644,
            "max_depth": 343,
            "min_samples_split": 6,
            "min_samples_leaf": 1,
            "max_features": "auto",
        },
        "use_demographic": False,
    },
)

job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/55590906972/locations/us-central1/pipelineJobs/black-friday-pipeline-20221104045631
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/55590906972/locations/us-central1/pipelineJobs/black-friday-pipeline-20221104045631')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/black-friday-pipeline-20221104045631?project=55590906972
PipelineJob projects/55590906972/locations/us-central1/pipelineJobs/black-friday-pipeline-20221104045631 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/55590906972/locations/us-central1/pipelineJobs/black-friday-pipeline-20221104045631 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/55590906972/locations/us-central1/pipelineJobs/black-friday-pipeline-20221104045631 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/55590906972/location

**Copy the model to your local directory**

Get the URI of the model file by clicking through the Vertex AI Pipelines UI: select the model artifact of the run and follow the path to the model file.

In [27]:
# Copy the model trained without demographic data from Cloud Storage to ./model.pkl.
# The gs:// path below points at the train-model output of one specific pipeline
# run; replace it with the model artifact URI of your own run.
! gsutil cp gs://black-friday-dataset-test/pipeline_root/black_friday/55590906972/black-friday-pipeline-20221103222823/train-model_910918445577535488/model_file.pkl model.pkl

Copying gs://black-friday-dataset-test/pipeline_root/black_friday/55590906972/black-friday-pipeline-20221103222823/train-model_910918445577535488/model_file.pkl...
- [1 files][ 61.9 MiB/ 61.9 MiB]                                                
Operation completed over 1 objects/61.9 MiB.                                     


## Model Deployment

**Set environment variables for command line arguments**

For custom prediction routines, a Docker image must be provided.

In [28]:
# Export the values used by the docker and gcloud commands in the cells below.
# PROJECT_ID and REGION are filled in from the Python variables of the same
# name; REPOSITORY and IMAGE are fixed names for the Artifact Registry
# repository and the image.
%env PROJECT_ID={PROJECT_ID}
%env REGION={REGION}
%env REPOSITORY=black-friday-v1
%env IMAGE=black-friday-image

env: PROJECT_ID=55590906972
env: REGION=us-central1
env: REPOSITORY=black-friday-v1
env: IMAGE=black-friday-image


In [None]:
# Build the image from the current directory (the build context `.`) and tag it
# with the Artifact Registry path it will be pushed to.
!docker build --tag=$REGION-docker.pkg.dev/$PROJECT_ID/$REPOSITORY/$IMAGE .

In [None]:
# Create a Docker-format repository named $REPOSITORY in Artifact Registry, in
# the same region the image tag refers to.
! gcloud artifacts repositories create $REPOSITORY  \
                             --repository-format=docker \
                             --location=$REGION

In [None]:
# Push the Docker image to the newly created Artifact Registry repository.
# NOTE(review): the push presumably requires Docker to be authenticated against
# $REGION-docker.pkg.dev (e.g. via `gcloud auth configure-docker`) — confirm
# this is done outside the notebook.
! docker push $REGION-docker.pkg.dev/$PROJECT_ID/$REPOSITORY/$IMAGE

In [None]:
# Upload the model to the Vertex AI Model Registry, backed by the pushed image.
# The flags declare the container's port (5005) and the HTTP routes used for
# health checks (/healthz) and predictions (/predict).
! gcloud ai models upload \
  --region=$REGION \
  --display-name=black-friday-model \
  --container-image-uri=$REGION-docker.pkg.dev/$PROJECT_ID/$REPOSITORY/$IMAGE \
  --container-ports=5005 \
  --container-health-route=/healthz \
  --container-predict-route=/predict

In [32]:
# List the models registered under this display name to double-check the upload
# and to read off the MODEL_ID needed for deployment. The region comes from
# $REGION, like the upload command, instead of being hard-coded.
!gcloud ai models list \
  --region=$REGION \
  --filter=display_name=black-friday-model

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
MODEL_ID             DISPLAY_NAME
3372558404156391424  black-friday-model
1954628208976461824  black-friday-model


In [19]:
# Create a Vertex AI endpoint; the endpoint ID is printed in the command output.
# The region comes from $REGION, like the upload command, instead of being
# hard-coded.
!gcloud ai endpoints create \
  --region=$REGION \
  --display-name=black-friday

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Waiting for operation [8548616388137189376]...done.                            
Created Vertex AI endpoint: projects/55590906972/locations/us-central1/endpoints/549654658818244608.


**Deploy the model to the endpoint**

The endpoint ID is taken from the output of the previous cell. The model ID is taken from the list of models above.

In [None]:
# Deploy model 3372558404156391424 to endpoint 549654658818244608 on
# n1-standard-4 machines with one to two replicas. The region comes from
# $REGION, like the upload command, instead of being hard-coded.
!gcloud ai endpoints deploy-model 549654658818244608 \
  --region=$REGION \
  --model=3372558404156391424 \
  --display-name=black-friday-model \
  --machine-type=n1-standard-4 \
  --min-replica-count=1 \
  --max-replica-count=2

## Sample Prediction

In [21]:
import pandas as pd

# `test_file_x` output of the create-data step of one specific pipeline run.
TEST_FEATURES_URI = "gs://black-friday-dataset-test/pipeline_root/black_friday/55590906972/black-friday-pipeline-20221102205557/create-data_847692128933904384/test_file_x"

# header=None: no row is read as a header; every row is treated as data.
data = pd.read_csv(TEST_FEATURES_URI, header=None)

In [22]:
# ID of the endpoint created above (printed by `gcloud ai endpoints create`).
ENDPOINT_ID = "549654658818244608"
# PROJECT_ID is deliberately not re-assigned here: the prediction call reuses the
# value read from the gcloud configuration in the setup cell. Overwriting it
# with a hard-coded project number changed what every cell re-run afterwards
# (for example the %env export) saw.

In [25]:
def endpoint_predict_sample(
    project: str, location: str, instances: list, endpoint: str
):
    """Send ``instances`` to a Vertex AI endpoint, print the response and return it.

    Args:
        project: Project passed to ``aiplatform.init``.
        location: Location passed to ``aiplatform.init``.
        instances: Payload forwarded unchanged to ``Endpoint.predict``.
        endpoint: Identifier used to construct the ``aiplatform.Endpoint``.

    Returns:
        The object returned by ``Endpoint.predict``.
    """
    aiplatform.init(project=project, location=location)

    # Use a separate local name so the `endpoint` argument is not shadowed.
    deployed_endpoint = aiplatform.Endpoint(endpoint)
    response = deployed_endpoint.predict(instances=instances)

    print(response)
    return response

In [35]:
# Take the first test row and drop its first four values with [4:] to remove
# the demographic features.
first_row_features = data.iloc[0].tolist()[4:]

endpoint_predict_sample(
    project=PROJECT_ID,
    location=REGION,
    instances=first_row_features,
    endpoint=ENDPOINT_ID,
)

Prediction(predictions=['Product Category 1'], deployed_model_id='8709526272729939968', model_version_id='1', model_resource_name='projects/55590906972/locations/us-central1/models/3372558404156391424', explanations=None)


Prediction(predictions=['Product Category 1'], deployed_model_id='8709526272729939968', model_version_id='1', model_resource_name='projects/55590906972/locations/us-central1/models/3372558404156391424', explanations=None)