**Check environment dependencies**

In [1]:
# Print the installed version of each library this notebook depends on, one
# `python3 -c` invocation per package.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"
! python3 -c "import sklearn; print('Sklearn version: {}'.format(sklearn.__version__))"

KFP SDK version: 1.8.19
google_cloud_pipeline_components version: 1.0.41
Sklearn version: 1.0.2


In [3]:
from datetime import datetime

import google.cloud.aiplatform as aip
from google.cloud import aiplatform
import kfp
from kfp.v2 import dsl, compiler
from kfp.v2.google.client import AIPlatformClient

# custom code for data processing and model training
from utils import create_data, train_model

**Define environment variables**

Update <code>BUCKET_NAME</code> to a bucket you own. <code>PROJECT_ID</code> is read from the active gcloud configuration.

In [4]:
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_NAME = "black-friday-dataset-aaa"  # modify
BUCKET_URI = f"gs://{BUCKET_NAME}"
REGION = "us-central1"
PIPELINE_ROOT = "{}/pipeline_root/black_friday".format(BUCKET_URI)
DISPLAY_NAME = "black-friday-" + TIMESTAMP
PACKAGE_PATH = "pipeline.json"
project_id_shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_shell_output[0]

## Data processing and model training

**Initialize the client**

In [5]:
# Initialise the Vertex AI SDK. REGION is passed explicitly so the pipeline
# jobs use the configured region rather than the SDK's default location.
aip.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

**Define the pipeline**

In [6]:
@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name="black-friday-pipeline",
)
def pipeline(
    train_file_x: str,
    train_file_y: str,
    test_file_x: str,
    test_file_y: str,
    best_params_file: str,
    metrics_file: str,
    num_iterations: int,
    hp_tune: bool,
    best_params: dict,
    use_demographic: bool,
):
    """Build the Black Friday datasets, then train a model on them.

    The pipeline has two steps: ``create_data`` followed by ``train_model``,
    which consumes the four dataset outputs of the first step.

    Args:
        train_file_x: Not read by the pipeline body; kept for interface
            compatibility. The train step uses the create-data output instead.
        train_file_y: Not read by the pipeline body (see ``train_file_x``).
        test_file_x: Not read by the pipeline body (see ``train_file_x``).
        test_file_y: Not read by the pipeline body (see ``train_file_x``).
        best_params_file: Not read by the pipeline body.
        metrics_file: Not read by the pipeline body.
        num_iterations: Forwarded to ``train_model``.
        hp_tune: Forwarded to ``train_model``.
        best_params: Forwarded to ``train_model``.
        use_demographic: Forwarded to ``create_data``.
    """
    # Use the notebook-level PROJECT_ID (resolved from the gcloud config)
    # instead of a hard-coded project, so the pipeline runs in whichever
    # project the user has configured.
    create_data_task = create_data(
        project_id=PROJECT_ID,
        bucket_name=BUCKET_NAME,
        dataset_id="black_friday",
        use_demographic=use_demographic,
    )

    # Wiring the create-data outputs into the train step makes it depend on
    # the data step.
    train_model_task = train_model(
        hp_tune=hp_tune,
        project_id=PROJECT_ID,
        bucket_name=BUCKET_NAME,
        best_params=best_params,
        num_iterations=num_iterations,
        train_file_x=create_data_task.outputs["train_file_x"],
        test_file_x=create_data_task.outputs["test_file_x"],
        train_file_y=create_data_task.outputs["train_file_y"],
        test_file_y=create_data_task.outputs["test_file_y"],
    )

In [7]:
# Compile the pipeline function into the job spec file at PACKAGE_PATH, which
# the PipelineJob cells below load as their template.
compiler.Compiler().compile(
    package_path=PACKAGE_PATH,
    pipeline_func=pipeline,
)



**Submit the pipeline to Vertex AI Pipeline**

In [8]:
# Runtime parameters for the first run, which keeps the demographic features.
demographic_run_parameters = {
    "train_file_x": "x_train.csv",
    "train_file_y": "y_train.csv",
    "test_file_x": "x_test.csv",
    "test_file_y": "y_test.csv",
    "metrics_file": "metrics.json",
    "num_iterations": 2,
    "hp_tune": True,
    "best_params_file": "best_params.json",
    "best_params": {
        "n_estimators": 644,
        "max_depth": 343,
        "min_samples_split": 6,
        "min_samples_leaf": 1,
        "max_features": "auto",
    },
    "use_demographic": True,
}

# Create the job from the compiled template, then submit it with run().
job = aip.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=PACKAGE_PATH,
    pipeline_root=PIPELINE_ROOT,
    enable_caching=True,
    parameter_values=demographic_run_parameters,
)
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/354621994428/locations/us-central1/pipelineJobs/black-friday-pipeline-20230328211523
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/354621994428/locations/us-central1/pipelineJobs/black-friday-pipeline-20230328211523')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/black-friday-pipeline-20230328211523?project=354621994428
PipelineJob projects/354621994428/locations/us-central1/pipelineJobs/black-friday-pipeline-20230328211523 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/354621994428/locations/us-central1/pipelineJobs/black-friday-pipeline-20230328211523 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/354621994428/locations/us-central1/pipelineJobs/black-friday-pipeline-20230328211523 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/354621994428/l

**Build same model with demographic data removed**

In [9]:
# Second run: identical settings except use_demographic=False.
# The display name gets a suffix so this run can be told apart from the
# demographic run in the Vertex AI Pipelines UI, where the model artifact is
# located by hand in the next step.
job = aip.PipelineJob(
    display_name=f"{DISPLAY_NAME}-no-demographic",
    template_path=PACKAGE_PATH,
    pipeline_root=PIPELINE_ROOT,
    enable_caching=True,
    parameter_values={
        "train_file_x": "x_train.csv",
        "train_file_y": "y_train.csv",
        "test_file_x": "x_test.csv",
        "test_file_y": "y_test.csv",
        "metrics_file": "metrics.json",
        "num_iterations": 2,
        "hp_tune": True,
        "best_params_file": "best_params.json",
        "best_params": {
            "n_estimators": 644,
            "max_depth": 343,
            "min_samples_split": 6,
            "min_samples_leaf": 1,
            "max_features": "auto",
        },
        "use_demographic": False,
    },
)

job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/354621994428/locations/us-central1/pipelineJobs/black-friday-pipeline-20230328212758
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/354621994428/locations/us-central1/pipelineJobs/black-friday-pipeline-20230328212758')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/black-friday-pipeline-20230328212758?project=354621994428
PipelineJob projects/354621994428/locations/us-central1/pipelineJobs/black-friday-pipeline-20230328212758 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/354621994428/locations/us-central1/pipelineJobs/black-friday-pipeline-20230328212758 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/354621994428/locations/us-central1/pipelineJobs/black-friday-pipeline-20230328212758 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/354621994428/l

**Copy the model to your local directory**

Find the URI of the model file in the Vertex AI Pipelines UI: open the pipeline run, select the model artifact, and copy its storage path. Use that path in the command below.

In [13]:
# Download the pickled model written by the train-model step of the second
# pipeline run (20230328212758, submitted with use_demographic=False) and save
# it locally as model.pkl. Replace the gs:// path with the artifact URI from
# your own run.
! gsutil cp gs://black-friday-dataset-aaa/pipeline_root/black_friday/354621994428/black-friday-pipeline-20230328212758/train-model_-120342922049617920/model_file.pkl model.pkl

Copying gs://black-friday-dataset-aaa/pipeline_root/black_friday/354621994428/black-friday-pipeline-20230328212758/train-model_-120342922049617920/model_file.pkl...
- [1 files][347.6 MiB/347.6 MiB]                                                
Operation completed over 1 objects/347.6 MiB.                                    


## Model Deployment

**Set environment variables for command line arguments**

Custom prediction routines must be packaged as a Docker image, so the variables below name the Artifact Registry repository and image to build.

In [14]:
# Export the notebook's PROJECT_ID and REGION, plus the Artifact Registry
# repository and image names, as environment variables for the shell commands
# in the cells below.
%env PROJECT_ID={PROJECT_ID}
%env REGION={REGION}
%env REPOSITORY=black-friday-v1
%env IMAGE=black-friday-image

env: PROJECT_ID=ds-training-380514
env: REGION=us-central1
env: REPOSITORY=black-friday-v1
env: IMAGE=black-friday-image


In [15]:
# Build the serving image using the current directory as the build context,
# and tag it with the registry path it will later be pushed to.
!docker build --tag=$REGION-docker.pkg.dev/$PROJECT_ID/$REPOSITORY/$IMAGE .

Sending build context to Docker daemon  364.6MB
Step 1/6 : FROM python:3.9-slim
3.9-slim: Pulling from library/python

[1B6f570256: Pulling fs layer 
[1B01660885: Pulling fs layer 
[1B40b65f34: Pulling fs layer 
[1B763962ba: Pulling fs layer 
[1B10d4c3ff: Pull complete 196MB/3.196MBB[4A[2K[3A[2K[5A[2K[1A[2K[1A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[4A[2K[4A[2K[4A[2K[3A[2K[3A[2K[3A[2K[3A[2K[3A[2K[3A[2K[3A[2K[2A[2K[1A[2K[1A[2K[1A[2K[1A[2KDigest: sha256:5192f07402cbe8b0267eef13085b321d50ab8aaac79d2f0657f96810c3f4555c
Status: Downloaded newer image for python:3.9-slim
 ---> 3279228d157d
Step 2/6 : WORKDIR /app
 ---> Running in 94d994d7e477
Removing intermediate container 94d994d7e477
 ---> 7b635a7cf4a9
Step 3/6 : COPY . /app
 ---> bddeeec4bfbe
Step 4/6 : RUN pip3 install scikit-learn==1.1.3 gunicorn flask flask-cors
 ---> Running in 69c825e947ba
Collectin

In [16]:
# Create a Docker-format repository named $REPOSITORY in $REGION to hold the
# serving image.
! gcloud artifacts repositories create $REPOSITORY  \
                             --repository-format=docker \
                             --location=$REGION

Create request issued for: [black-friday-v1]
Waiting for operation [projects/ds-training-380514/locations/us-central1/operat
ions/8f03b197-cbb1-42f8-bc29-15e38b2b5bee] to complete...done.                 
Created repository [black-friday-v1].


In [30]:
# Push the image built above to the newly created repository. No tag is given,
# so Docker uses the default tag `latest`.
! docker push $REGION-docker.pkg.dev/$PROJECT_ID/$REPOSITORY/$IMAGE

Using default tag: latest
The push refers to repository [us-central1-docker.pkg.dev/ds-training-380514/black-friday-v1/black-friday-image]

[1B575b3d6d: Preparing 
[1B87e19ebb: Preparing 
[1Bb4af9a47: Preparing 
[1Bc5e17780: Preparing 
[1Ba85d1cef: Preparing 
[1B7cd997d9: Preparing 
[1B6c2423f1: Preparing 
[8B575b3d6d: Pushed   409.3MB/404.3MB[8A[2K[4A[2K[8A[2K[5A[2K[8A[2K[5A[2K[7A[2K[5A[2K[7A[2K[5A[2K[7A[2K[8A[2K[2A[2K[5A[2K[3A[2K[2A[2K[8A[2K[5A[2K[2A[2K[3A[2K[5A[2K[2A[2K[2A[2K[2A[2K[8A[2K[2A[2K[5A[2K[7A[2K[3A[2K[1A[2K[7A[2K[2A[2K[3A[2K[1A[2K[8A[2K[1A[2K[7A[2K[8A[2K[3A[2K[3A[2K[1A[2K[3A[2K[1A[2K[3A[2K[8A[2K[3A[2K[7A[2K[1A[2K[8A[2K[1A[2K[3A[2K[7A[2K[3A[2K[8A[2K[3A[2K[8A[2K[3A[2K[1A[2K[3A[2K[1A[2K[7A[2K[3A[2K[8A[2K[1A[2K[3A[2K[1A[2K[8A[2K[7A[2K[1A[2K[7A[2K[1A[2K[8A[2K[7A[2K[1A[2K[7A[2K[8A[2K[7A[2K[1A[2K[7A[2K[8A[2K[1A[2

In [31]:
# Register the serving image in the Vertex AI Model Registry as
# "black-friday-model". The container is declared to listen on port 5005 and
# to serve health checks at /healthz and predictions at /predict.
! gcloud ai models upload \
  --region=$REGION \
  --display-name=black-friday-model \
  --container-image-uri=$REGION-docker.pkg.dev/$PROJECT_ID/$REPOSITORY/$IMAGE \
  --container-ports=5005 \
  --container-health-route=/healthz \
  --container-predict-route=/predict

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Waiting for operation [379980017284677632]...done.                             


In [32]:
# List the models with this display name to confirm the upload succeeded and
# to read off the MODEL_ID needed by the deploy step. The region comes from
# $REGION so it stays in sync with the upload command.
!gcloud ai models list \
  --region=$REGION \
  --filter=display_name=black-friday-model

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
MODEL_ID             DISPLAY_NAME
8326014417938415616  black-friday-model


In [33]:
# Create a Vertex AI endpoint named "black-friday" in the configured region.
# The endpoint ID printed in the output is needed by the deploy step.
!gcloud ai endpoints create \
  --region=$REGION \
  --display-name=black-friday

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Waiting for operation [4209165600456441856]...done.                            
Created Vertex AI endpoint: projects/354621994428/locations/us-central1/endpoints/8591966489488130048.


**Deploy the model to the endpoint**

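The endpoint ID comes from the output of the previous cell (<code>gcloud ai endpoints create</code>). The model ID comes from the <code>gcloud ai models list</code> output above. Replace both IDs in the command below with your own values.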

In [35]:
# Deploy the registered model to the endpoint on n1-standard-4 machines,
# scaling between 1 and 2 replicas.
# The positional argument is the endpoint ID and --model is the MODEL_ID; both
# were copied from earlier cell outputs and must be replaced with your own.
!gcloud ai endpoints deploy-model 8591966489488130048 \
  --region=$REGION \
  --model=8326014417938415616 \
  --display-name=black-friday-model \
  --machine-type=n1-standard-4 \
  --min-replica-count=1 \
  --max-replica-count=2

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Waiting for operation [2860337512058978304]...done.                            
Deployed a model to the endpoint 8591966489488130048. Id of the deployed model: 514256981473624064.


## Sample Prediction

In [38]:
import pandas as pd

# Test feature file written by the create-data step of the first pipeline run
# (20230328211523, the run that kept the demographic features). The file is
# read with header=None, so columns are addressed by position.
test_features_uri = "gs://black-friday-dataset-aaa/pipeline_root/black_friday/354621994428/black-friday-pipeline-20230328211523/create-data_5752350992041508864/test_file_x"
data = pd.read_csv(test_features_uri, header=None)

In [39]:
# ID of the endpoint created above (copied from the `gcloud ai endpoints
# create` output).
ENDPOINT_ID = "8591966489488130048"
# NOTE(review): this rebinds PROJECT_ID from the gcloud project id to the
# numeric project identifier seen in the resource names above. Earlier cells
# that read PROJECT_ID would pick up this value if re-run afterwards — confirm
# that is intended.
PROJECT_ID = "354621994428"

In [40]:
def endpoint_predict_sample(
    project: str, location: str, instances: list, endpoint: str
):
    """Send ``instances`` to a deployed Vertex AI endpoint and return the result.

    Args:
        project: Project passed to ``aiplatform.init``.
        location: Region passed to ``aiplatform.init``.
        instances: Payload forwarded unchanged to ``Endpoint.predict``.
        endpoint: Identifier used to construct the ``aiplatform.Endpoint``.

    Returns:
        The object returned by ``Endpoint.predict``; it is also printed.
    """
    aiplatform.init(project=project, location=location)

    # Keep the handle under its own name rather than rebinding the
    # ``endpoint`` string argument.
    endpoint_client = aiplatform.Endpoint(endpoint)
    response = endpoint_client.predict(instances=instances)

    print(response)
    return response

In [41]:
# The deployed model was trained without demographic data, so drop the leading
# demographic columns from the sample row before sending it.
N_DEMOGRAPHIC_FEATURES = 4
sample_instance = data.iloc[0].tolist()[N_DEMOGRAPHIC_FEATURES:]

# endpoint_predict_sample already prints the prediction. Binding the return
# value stops the notebook from echoing it a second time and keeps the result
# available for later cells.
prediction = endpoint_predict_sample(
    project=PROJECT_ID,
    location=REGION,
    instances=sample_instance,
    endpoint=ENDPOINT_ID,
)

Prediction(predictions=['Product Category 1'], deployed_model_id='514256981473624064', model_version_id='1', model_resource_name='projects/354621994428/locations/us-central1/models/8326014417938415616', explanations=None)


Prediction(predictions=['Product Category 1'], deployed_model_id='514256981473624064', model_version_id='1', model_resource_name='projects/354621994428/locations/us-central1/models/8326014417938415616', explanations=None)