In [None]:
#
# Copyright 2023 Google LLC
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     https://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Setup Container

In [None]:
# Artifact Registry repository that receives the image.
AR_REPO = "vertex-customjob"
# Image name; also used as the stem of the Dockerfile name in the build cell.
IMG_NAME = "deepspeed-chat"

In [None]:
PROJECT_ID=!gcloud config get-value project
PROJECT_ID=PROJECT_ID[0]
LOCATION=""
BUCKET=""
IMAGE_URI="us-docker.pkg.dev/{}/{}/{}:latest".format(PROJECT_ID,AR_REPO,IMG_NAME)

In [None]:
# Work from the repository checkout: the build cell uses "." as the Docker
# build context and a Dockerfile path relative to this directory.
%cd ~/vertex-deepspeed

In [None]:
DOCKERFILE=f"examples/deepspeed-chat/{IMG_NAME}.Dockerfile"
!echo $DOCKERFILE
!docker build . -t $IMAGE_URI -f $DOCKERFILE

In [None]:
# This test triggers a test run with an example Vertex $CLUSTER_SPEC
# It then proceed to deepspeed_train.sh with a single node.
TEST_AIP_MODEL_DIR=f"gs://{BUCKET}/aiplatform-custom-job-xxxx-xx-xx-xx:xx:xx.xxx/model/"
TEST_AIP_TENSORBOARD_LOG_DIR=f"gs://{BUCKET}/aiplatform-custom-job-xxxx-xx-xx-xx:xx:xx.xxx/logs/"
!docker run \
    --gpus all \
    -e TESTING="true" \
    -e AIP_MODEL_DIR=$TEST_AIP_MODEL_DIR \
    -e AIP_TENSORBOARD_LOG_DIR=$TEST_AIP_TENSORBOARD_LOG_DIR \
    -e MODEL_PATH="facebook/opt-125m" \
    -e DATA_PATHS="Dahoas/synthetic-instruct-gptj-pairwise" \
    -e DATA_SPLIT="1,49,50" \
    -e ZERO_STAGE="3" \
    -e PER_DEVICE_BATCH_SIZE="4" \
$IMAGE_URI 

In [None]:
# If this throws error
# add "us-docker.pkg.dev": "gcloud" to /home/jupyter/.docker/config.json
!gcloud auth configure-docker

In [None]:
# Make sure the repo specified in $AR_REPO exists.
!echo $IMAGE_URI
!docker push $IMAGE_URI

# Test container with aiplatform.CustomJob

In [None]:
from datetime import datetime
from google.cloud import aiplatform

# Initialize the Vertex AI SDK with the project, region and staging bucket
# configured earlier in the notebook.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET,
)

In [None]:
######
# NOTE: This is an example to test multi-mode training with DeepSpeed on Vertex. 
#
# DeepspeedChat has 3 steps: SFT, Reward Model, and RLHF. We are only calling the SFT step here.
# DATA_SPLIT - "10,40,50" means 10% of the data is used for SFT. The DeepspeedChat code converts the string into fractions (data_utils.py).
#
# Testing facebook/opt-125m and Dahoas/synthetic-instruct-gptj-pairwise with 2 1xT4@n1-standard-4:
#   PER_DEVICE_BATCH_SIZE - 8 will utilize < half a T4's memory on each of the 2 nodes, 32 uses the memory 70+%
#
# While the Deepspeed Chat team has auto-tuning on roadmap, if you encounter CUDA OOM right now their advice is:
# - Reduce `--per_device_*_batch_size`,
# - Increase `--zero_stage {0,1,2,3}` on multi-gpu setups,
# - Enable `--gradient_checkpointing` or `--only_optimize_lora`,
# - Increase `--gradient_accumulate_steps {#}`, higher number reduces communication of gradients between steps
# 
worker_pool_specs = [
    # `WorkerPoolSpec` for worker pool 0, primary replica, required  
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_T4",
            "accelerator_count": 1,       
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": IMAGE_URI,
            "command": [],
            "args": [],
            "env": [
                {"name": "MODEL_PATH", "value": "facebook/opt-125m"},                        
                {"name": "DATA_PATHS", "value": "Dahoas/synthetic-instruct-gptj-pairwise"},                        
                {"name": "DATA_SPLIT", "value": "10,40,50"},
                {"name": "ZERO_STAGE", "value": "3"},
                {"name": "PER_DEVICE_BATCH_SIZE", "value": "32"}, 
            ],                
        },
        "disk_spec": {
            "boot_disk_size_gb": 1000,            
        }
    },
    {
       "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_T4",
            "accelerator_count": 1,           
       },
       "replica_count": 1,        
       "container_spec": {
           "image_uri": IMAGE_URI,
       },        
       "disk_spec": {
            "boot_disk_size_gb": 1000,            
       }        
    },
]

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
JOB_NAME  = "DeepSpeed Chat Test " + TIMESTAMP

my_job = aiplatform.CustomJob(
    display_name=JOB_NAME,    
    worker_pool_specs=worker_pool_specs,
)

# Checking Service account that will launch the job
!gcloud config get account

In [None]:
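#####
# Optional: either create a new Tensorboard or reuse an existing one.
# Uncomment ONE of the two options below, then pass `tensorboard.resource_name`
# to the job submission.
#
# Option 1 - create a new Tensorboard:
# tensorboard = aiplatform.Tensorboard.create(
#    display_name=JOB_NAME,
# )
# 
# Option 2 - reuse an existing Tensorboard (fill in its name first):
# tensorboard_name = ""
# tensorboard = aiplatform.Tensorboard(tensorboard_name=tensorboard_name)
# 
# print(tensorboard.resource_name)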

In [None]:
#
# Running the CustomJob
#
# Optional settings -- if any of these are desirable, configure them first and
# then uncomment the matching argument below:
#
# * VPC Peering - https://cloud.google.com/vertex-ai/docs/general/vpc-peering
#
# * Custom Service Account - https://cloud.google.com/vertex-ai/docs/general/custom-service-account
#   Be sure to first grant the SA running this notebook the "Service Account User" role,
#   otherwise you won't be able to launch the job with the custom service account.
#
# * Tensorboard - https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
#   Your training script must be configured to write TensorBoard logs to the Cloud Storage
#   bucket, the location of which the Vertex AI Training Service will automatically make
#   available through a predefined environment variable AIP_TENSORBOARD_LOG_DIR.
#
my_job.submit(
    # For debugging
    enable_web_access=True,
    # network="projects/{PROJECT_NUMBER}/global/networks/{PEER_NETWORK_NAME}",
    # service_account="{CUSTOM_SA_EMAIL}",
    # tensorboard=tensorboard.resource_name,
)