In [None]:
#
# Copyright 2023 Google LLC
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     https://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Setup Container

In [None]:
# Artifact Registry repository that stores the image, and the image name inside it.
# IMG_NAME also selects which Dockerfile under examples/ssh-only/ gets built.
AR_REPO = "vertex-customjob"
IMG_NAME = "intercom"

In [None]:
PROJECT_ID=!gcloud config get-value project
PROJECT_ID=PROJECT_ID[0]
LOCATION=""
BUCKET=""
IMAGE_URI="us-docker.pkg.dev/{}/{}/{}:latest".format(PROJECT_ID,AR_REPO,IMG_NAME)

In [None]:
# Switch to the repository checkout; the docker build below uses "." as its context
# and a Dockerfile path relative to this directory.
%cd ~/vertex-deepspeed

In [None]:
DOCKERFILE=f"examples/ssh-only/{IMG_NAME}.Dockerfile"
!echo $DOCKERFILE
!docker build . -t $IMAGE_URI -f $DOCKERFILE

In [None]:
# This test triggers a test run with an example Vertex $CLUSTER_SPEC
# It will eventually call train.sh, which without customization enters sleep 3600
# You can kill the test explicitly
TEST_AIP_MODEL_DIR=f"gs://{BUCKET}/model/"
!docker run --rm -e TESTING="true" -e AIP_MODEL_DIR=$TEST_AIP_MODEL_DIR $IMAGE_URI 

In [None]:
# If this throws error
# add "us-docker.pkg.dev": "gcloud" to /home/jupyter/.docker/config.json
!gcloud auth configure-docker

In [None]:
# Make sure the repo specified in $AR_REPO exists.
!echo $IMAGE_URI
!docker push $IMAGE_URI

# Test container with aiplatform.CustomJob

In [None]:
from datetime import datetime
from google.cloud import aiplatform

# Timestamp suffix used to tell job runs apart by display name.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# BUCKET holds a bare bucket name (it is wrapped as gs://{BUCKET}/model/ for the local
# test), while aiplatform.init documents staging_bucket as a gs:// URI. Normalise it
# here; an empty or already-prefixed value is passed through unchanged.
STAGING_BUCKET = (
    f"gs://{BUCKET}" if BUCKET and not BUCKET.startswith("gs://") else BUCKET
)

aiplatform.init(project=PROJECT_ID, staging_bucket=STAGING_BUCKET, location=LOCATION)

In [None]:
def _cpu_machine_spec():
    """Return a fresh machine spec dict for one n1-standard-4 replica."""
    return {
        "machine_type": "n1-standard-4",
        # Uncomment to attach an accelerator:
        # "accelerator_type": "NVIDIA_TESLA_K80",
        # "accelerator_count": 1,
    }


# `WorkerPoolSpec` for worker pool 0: the primary replica (required).
_primary_pool = {
    "machine_spec": _cpu_machine_spec(),
    "replica_count": 1,
    "container_spec": {
        "image_uri": IMAGE_URI,
        "command": [],
        "args": [],
        "env": [],
    },
    # Uncomment to change the boot disk size:
    # "disk_spec": {
    #    "boot_disk_size_gb": 200,
    # }
}

# `WorkerPoolSpec` for the second worker pool; it runs the same image.
_secondary_pool = {
    "machine_spec": _cpu_machine_spec(),
    "container_spec": {
        "image_uri": IMAGE_URI,
        "env": [],
    },
    "replica_count": 1,
}

worker_pool_specs = [_primary_pool, _secondary_pool]

# Define the job from the pool specs above; TIMESTAMP is appended to the display name.
job_display_name = f"CustomJob multinode SSH Test {TIMESTAMP}"
my_job = aiplatform.CustomJob(
    worker_pool_specs=worker_pool_specs,
    display_name=job_display_name,
    # labels={'my_key': 'my_value'},
)

In [None]:
# 
# Running the CustomJob
#
# If custom vpc peering and custom service accounts are desirable, first configure them:
#
# VPC Peering - https://cloud.google.com/vertex-ai/docs/general/vpc-peering .
# Custom Service Account - https://cloud.google.com/vertex-ai/docs/general/custom-service-account 
#
# For custom service account, 
# be sure to first grant the SA running this notebook the "Service Account User" role, 
# otherwise you won't be able to launch the job with the custom service account.
#
# Checking Service account that will launch the job
!gcloud config get account

my_job.submit(    
    enable_web_access=True, # For debugging
    # network="projects/{PROJECT_NUMBER}/global/networks/{PEER_NETWORK_NAME}",
    # service_account="{CUSTOM_SA_EMAIL}",
)