### Cost Optimization for Vertex Training/Pipelines
#### ToDos (Strikethrough when complete)
 * ~~Be able to determine cost for a given training run~~
 * ~~Be able to determine cost for a given hyperparameter training run~~
 * Be able to determine pipeline cost (with arbitrary components)
 * Use Vizier to experiment with machine shapes for a given job
   * Why is this useful?  Maybe there are some efficiencies that can be gained for a given job that is run on a schedule?
   * How do you setup the parent, child relatioship with parameters
 * Get GPU Utilization usage for long running jobs - figure out how to return null or NAN if not
 * Populate results in BQ - report in datastudio?  Or could report directly in 

In [7]:
!pip install -U --user google-cloud-billing lxml

Collecting lxml
  Downloading lxml-4.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: lxml
Successfully installed lxml-4.9.1


In [1]:
# Enable cloud billing API
!gcloud services enable cloudbilling.googleapis.com

In [85]:
# Vertex AI Machine Shape, GPU Combination
import pandas as pd

table_vertex_compute = pd.read_html('https://cloud.google.com/vertex-ai/docs/training/configure-compute', attrs={"id": "gpu-compatibility-table"})[0]

table_vertex_compute.columns = table_vertex_compute.columns.droplevel(0)

# Get rid of a2-ultras & NVIDIA_A100_80GB for now until quota available
table_vertex_compute = table_vertex_compute.loc[table_vertex_compute['Machine type'].str.contains(r'^(?!a2-ultra).*')]
table_vertex_compute = table_vertex_compute.drop('NVIDIA_A100_80GB', axis=1)
#table_vertex_compute = table_vertex_compute.fillna(0)

table_vertex_compute

Unnamed: 0,Machine type,NVIDIA_TESLA_A100,NVIDIA_TESLA_K80,NVIDIA_TESLA_P4,NVIDIA_TESLA_P100,NVIDIA_TESLA_T4,NVIDIA_TESLA_V100
4,a2-highgpu-1g,1.0,0,0,0,0,0
5,a2-highgpu-2g,2.0,0,0,0,0,0
6,a2-highgpu-4g,4.0,0,0,0,0,0
7,a2-highgpu-8g,8.0,0,0,0,0,0
8,a2-megagpu-16g,16.0,0,0,0,0,0
9,n1-standard-4,0.0,"1, 2, 4, 8","1, 2, 4","1, 2, 4","1, 2, 4","1, 2, 4, 8"
10,n1-standard-8,0.0,"1, 2, 4, 8","1, 2, 4","1, 2, 4","1, 2, 4","1, 2, 4, 8"
11,n1-standard-16,0.0,"2, 4, 8","1, 2, 4","1, 2, 4","1, 2, 4","2, 4, 8"
12,n1-standard-32,0.0,"4, 8","2, 4","2, 4","2, 4","4, 8"
13,n1-standard-64,0.0,0,4,0,4,8


In [None]:
# Re-organize into map where a given machine shape has a child of ACCELERATOR_TYPE and each of those has a child for # of accelerators

In [83]:
GPUS = table_vertex_compute.columns.tolist()[1:]
GPUS

['NVIDIA_TESLA_A100',
 'NVIDIA_TESLA_K80',
 'NVIDIA_TESLA_P4',
 'NVIDIA_TESLA_P100',
 'NVIDIA_TESLA_T4',
 'NVIDIA_TESLA_V100']

### Create the study configuration

The following is a sample study configuration, built as a hierarchical python dictionary. It is already filled out. Run the cell to configure the study.

In [None]:
import json


#RJV - Hardcoding for testing - these values will be populated by the dataframe above
param_machine_type = {"parameter_id": "machine_type","categorical_value_spec": {"values":["n1-standard-4","n1-standard-8"]}}

param_gpu_type = {"parameter_id": "gpu_type","categorical_value_spec": {"values":["NVIDIA_TESLA_K80","NVIDIA_TESLA_P4","NVIDIA_TESLA_P100","NVIDIA_TESLA_T4","NVIDIA_TESLA_V100"]}}

param_gpu_count = {"parameter_id": "gpu_count","discrete_value_spec": {"values":[1,2,4]}}
# - maybe only explore - 2 & 8 (if it is, then maybe just 2)


# Objective Metrics
metric_training_time = {"metric_id": "training_time", "goal": "MINIMIZE"}

metric_estimated_training_cost = {"metric_id": "estimated_training_cost", "goal": "MINIMIZE"}

metric_gpu_utilization = {"metric_id": "average_gpu_utilization", "goal": "MAXIMIZE"}


# Put it all together in a study configuration
study = {
    "display_name": 'Tabnet_GPU_Cost_Optimization_v2',
    "study_spec": {
        #"algorithm": "RANDOM_SEARCH",
        "parameters": [
            param_machine_type,
            param_gpu_type,
            param_gpu_count,
        ],
        "metrics": [metric_training_time, metric_estimated_training_cost, metric_gpu_utilization],
    },
}

print(json.dumps(study, indent=2, sort_keys=True))

In [50]:
from google.cloud import billing_v1
import re

CACHED_PRICES=False
GPU_PRICES= {}
INSTANCE_CORE_PRICES = {}
INSTANCE_RAM_PRICES = {}

def normalize_price(response):
    unit_price = response.pricing_info[0].pricing_expression.tiered_rates[0].unit_price
    cents = unit_price.nanos/1000000000
    if hasattr(unit_price, 'units'):
        price= unit_price.units + cents
    else:
        price = cents
    return price

def get_machine_specs(machine_shape):
    m_class,m_type,m_cores = machine_shape.split('-')
    if m_class == 'a2':
        #a2-highgpu: #12 vcpu, 72 Gb RAM per gpu
        if m_type == 'highgpu':
            gpu_mult = 12
            ram_mult = 85
        #a2-highgpu: #6 vcpu, 85 Gb RAM per gpu
        if m_type == 'megagpu':
            gpu_mult = 6
            ram_mult = 85
        #a2-highgpu: #6 vcpu, 85 Gb RAM per gpu
        if m_type == 'ultragpu':
            gpu_mult = 12
            ram_mult = 170
        gpus,_ = m_cores.split('g')
        gpus = int(gpus)
        return {'class':m_class,'cores':  gpus * gpu_mult, 'ram': gpus * ram_mult }
    elif m_class == 'n1':
        #n1-standard: 3.75 Gb RAM per core
        if m_type == 'standard':
            mult = 3.75        
        #n1-highmem: 6.5 Gb RAM per core
        if m_type == 'highmem':
            mult = 6.5
        #n1-highcpu: 1.8 Gb RAM per core
        if m_type == 'highcpu':
            mult = 1.8
        return {'class':m_class,'cores':  int(m_cores), 'ram': int(m_cores) * mult }
    elif m_class == 'e2' or m_class == 'c2':
        #e2-standard: 4 Gb RAM per core
        if m_type == 'standard':
            mult = 4        
        #e2-highmem: 8 Gb RAM per core
        if m_type == 'highmem':
            mult = 8
         #e2-highcpu: 1 Gb RAM per core
        if m_type == 'highcpu':
            mult = 1
        return {'class':m_class,'cores':  int(m_cores), 'ram': int(m_cores) * mult }


def get_hourly_price(machine_shape, gpu_type='ACCELERATOR_TYPE_UNSPECIFIED', gpu_count=0):
    global CACHED_PRICES
    
    # Create a client
    
    if not CACHED_PRICES:
        client = billing_v1.CloudCatalogClient()

        # Initialize request argument(s)
        request = billing_v1.ListSkusRequest(
            parent="services/C7E2-9256-1C43",
        )

        # Make the request
        page_result = client.list_skus(request=request)

        # Look specifically for Vertex AI Training Americas
        for response in page_result:
            print(response)
            #GPU
            match = re.match(r'Vertex AI: Training/Pipelines.+Nvidia.+Americas',response.description)
            if match is not None:
                for gpu in GPUS:
                    #_,gpu_id = gpu.split('NVIDIA_TESLA_')
                    # Minor hack here to format change for a100_80g
                    try:
                        _,gpu_id = gpu.split('NVIDIA_TESLA_')
                    except ValueError: # A100 80 Gb have a different format
                        gpu_id = 'A100'
                    gpu_match = re.search(gpu_id,response.description)
                    if gpu_match is not None:
                        price = normalize_price(response)
                        GPU_PRICES[gpu]=price
            match = re.match(r'Vertex AI: Training/Pipelines.+N1.+Americas',response.description)
            if match is not None:
                instance_match = re.search("Instance Core",response.description)
                if instance_match is not None:
                    price = normalize_price(response)
                    INSTANCE_CORE_PRICES['n1']=price
                ram_match = re.search("Instance Ram",response.description)
                if ram_match is not None:
                    price = normalize_price(response)
                    INSTANCE_RAM_PRICES['n1']=price
            match = re.match(r'Vertex AI: Training/Pipelines.+A2.+Americas',response.description)
            if match is not None:
                instance_match = re.search("Instance Core",response.description)
                if instance_match is not None:
                    price = normalize_price(response)
                    INSTANCE_CORE_PRICES['a2']=price
                ram_match = re.search("Instance Ram",response.description)
                if ram_match is not None:
                    price = normalize_price(response)
                    INSTANCE_RAM_PRICES['a2']=price
            match = re.match(r'Vertex AI: Training/Pipelines.+E2.+Americas',response.description)
            if match is not None:
                instance_match = re.search("Instance Core",response.description)
                if instance_match is not None:
                    price = normalize_price(response)
                    INSTANCE_CORE_PRICES['e2']=price
                ram_match = re.search("Instance Ram",response.description)
                if ram_match is not None:
                    price = normalize_price(response)
                    INSTANCE_RAM_PRICES['e2']=price
            match = re.match(r'Vertex AI: Training/Pipelines.+Compute optimized.+Americas',response.description)
            if match is not None:
                instance_match = re.search("Core",response.description)
                if instance_match is not None:
                    price = normalize_price(response)
                    INSTANCE_CORE_PRICES['c2']=price
                ram_match = re.search("Ram",response.description)
                if ram_match is not None:
                    price = normalize_price(response)
                    INSTANCE_RAM_PRICES['c2']=price
        CACHED_PRICES=True
        
        #Adding 'Unspecified' type for CPU-only jobs
        GPU_PRICES['ACCELERATOR_TYPE_UNSPECIFIED']=0
    
    # Calculate prices
    machine_specs = get_machine_specs(machine_shape)

    if machine_specs['class']=='n1':
        return (machine_specs['cores'] * INSTANCE_CORE_PRICES['n1']) + (machine_specs['ram'] * INSTANCE_RAM_PRICES['n1']) + (GPU_PRICES[gpu_type] * gpu_count)
    
    if machine_specs['class']=='a2':
        return (machine_specs['cores'] * INSTANCE_CORE_PRICES['a2']) + (machine_specs['ram'] * INSTANCE_RAM_PRICES['a2']) + (GPU_PRICES[gpu_type] * gpu_count)
    
    if machine_specs['class']=='e2':
        return (machine_specs['cores'] * INSTANCE_CORE_PRICES['e2']) + (machine_specs['ram'] * INSTANCE_RAM_PRICES['e2']) + (GPU_PRICES[gpu_type] * gpu_count)
    
    if machine_specs['class']=='c2':
        return (machine_specs['cores'] * INSTANCE_CORE_PRICES['c2']) + (machine_specs['ram'] * INSTANCE_RAM_PRICES['c2']) + (GPU_PRICES[gpu_type] * gpu_count)

In [52]:
get_hourly_price('a2-highgpu-1g','NVIDIA_TESLA_A100',1)
#get_hourly_price('e2-standard-4')
#get_hourly_price('c2-standard-8')

4.22439275

In [65]:
def get_gpu_utilization(project_id, job_id, start_time, end_time,trial_id=False):
    from google.cloud import monitoring_v3

    client = monitoring_v3.MetricServiceClient()
    project_name = f"projects/{project_id}"
    
    interval = monitoring_v3.TimeInterval(
        {
            "end_time": end_time,
            #"end_time": {"seconds": end_time},
            "start_time": start_time,
            #"start_time": {"seconds": start_time},

        }
    )
    
    aggregation = monitoring_v3.Aggregation(
        {
            "alignment_period": {"seconds": 3600 * 24},  # 24 hours
            "per_series_aligner": monitoring_v3.Aggregation.Aligner.ALIGN_MEAN,
            "cross_series_reducer": monitoring_v3.Aggregation.Reducer.REDUCE_NONE,
        }
    )
    filter = f'metric.type = "ml.googleapis.com/training/accelerator/utilization" AND resource.type="cloudml_job" AND resource.labels.job_id ="{job_id}"'
    
    if trial_id:
        filter += f' AND metric.labels.trial_id = "{trial_id}"'

    results = client.list_time_series(
        request={
            "name": project_name,
            "filter": filter,
            "interval": interval,
            "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL,
            "aggregation": aggregation,
        }
    )
    
    gpu_utilization = -1
    for result in results:
        #print(f"utilization: {result.points[0].value.double_value}")
        gpu_utilization = result.points[0].value.double_value
        break
    return gpu_utilization

In [66]:
from google.protobuf.timestamp_pb2 import Timestamp

start_time = Timestamp()
end_time = Timestamp()

start_time.FromJsonString('2022-08-12T20:14:00.0000Z')
end_time.FromJsonString('2022-08-13T20:19:00.0000Z')

get_gpu_utilization(project_id='gcp-ml-sandbox',job_id='7857859134383718400', start_time=start_time, end_time=end_time)

0.4981818181818182

In [None]:
from google.cloud import aiplatform

from datetime import datetime, timedelta

import pandas as pd

def get_job_details(
    project: str,
    custom_job: str,
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
):
    # The AI Platform services require regional API endpoints.
    client_options = {"api_endpoint": api_endpoint}
    # Initialize client that will be used to create and send requests.
    # This client only needs to be created once, and can be reused for multiple requests.
    client = aiplatform.gapic.JobServiceClient(client_options=client_options)
    name = client.custom_job_path(
        project=project, location=location, custom_job=custom_job
    )
    response = client.get_custom_job(name=name)
    
    gpu_utilization = get_gpu_utilization(project, custom_job, response.start_time, response.end_time)
    job_duration = (response.end_time - response.start_time)/timedelta(hours=1)
    job_cost = 0
    
    compute_resources = []
    
    for i, worker_pool_spec in enumerate(response.job_spec.worker_pool_specs):
        machine_spec = worker_pool_spec.machine_spec
        job_cost += get_hourly_price(machine_spec.machine_type, machine_spec.accelerator_type.name,machine_spec.accelerator_count) * worker_pool_spec.replica_count * job_duration
        
        compute_resources.append(
            {
            'worker_pool':i,
            'machine_type': machine_spec.machine_type, 
            'accelerator_type':machine_spec.accelerator_type.name,
            'accelerator_count':machine_spec.accelerator_count,
            'replica_count':worker_pool_spec.replica_count
            }
        )
    
    return {'compute_resources': compute_resources,
            'gpu_utilization':gpu_utilization,
            'job_duration_hours': job_duration,
            'job_cost': job_cost}


In [67]:
get_job_details('gcp-ml-sandbox','2891126792574205952')

{'compute_resources': [{'worker_pool': 0,
   'machine_type': 'n1-standard-4',
   'accelerator_type': 'NVIDIA_TESLA_T4',
   'accelerator_count': 1,
   'replica_count': 1},
  {'worker_pool': 1,
   'machine_type': 'n1-standard-4',
   'accelerator_type': 'NVIDIA_TESLA_T4',
   'accelerator_count': 1,
   'replica_count': 3}],
 'gpu_utilization': -1,
 'job_duration_hours': 0.31805555555555554,
 'job_cost': 0.7900485369444444}

In [68]:
from google.cloud import aiplatform

from datetime import datetime, timedelta

import pandas as pd

def get_hpt_job_details(
    project: str,
    hyperparameter_tuning_job: str,
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
):
    # The AI Platform services require regional API endpoints.
    client_options = {"api_endpoint": api_endpoint}

    client = aiplatform.gapic.JobServiceClient(client_options=client_options)
    name = client.hyperparameter_tuning_job_path(
        project=project, location=location, hyperparameter_tuning_job=hyperparameter_tuning_job
    )
    response = client.get_hyperparameter_tuning_job(name=name)

    machine_spec = response.trial_job_spec.worker_pool_specs[0].machine_spec
    
    
    
    total_trials = len(response.trials)
    total_job_duration = 0
    total_job_cost = 0
    agg_gpu_utilization = 0
    trial_dict = {"trial_id":[],"job_duration":[],"job_cost":[],"gpu_utilization":[]}
    
    hpt_trial_start_time = hpt_trial_end_time = 0
    
    for trial in response.trials:
        if trial.id == '1':
            hpt_trial_start_time = trial.start_time
            
        if trial.end_time:
            job_duration = (trial.end_time - trial.start_time)/timedelta(hours=1)
            job_cost = get_hourly_price(machine_spec.machine_type, machine_spec.accelerator_type.name,machine_spec.accelerator_count) * job_duration
            
            gpu_utilization = get_gpu_utilization(project, hyperparameter_tuning_job,trial.start_time, trial.end_time, trial.id)
            
            trial_dict['trial_id'].append(trial.id)
            trial_dict['job_duration'].append(job_duration)
            trial_dict['job_cost'].append(job_cost)
            trial_dict['gpu_utilization'].append(gpu_utilization)
            
            total_job_duration+=job_duration
            total_job_cost += job_cost
            agg_gpu_utilization+=gpu_utilization
            
            #Lazily setting last trial's end time to hpt end time
            hpt_trial_end_time = trial.end_time

    
    trial_df = pd.DataFrame.from_dict(trial_dict)
    #trial_df.set_index('trial_id')
    
    # plt = trial_df.plot(y='job_cost',figsize=(20,6))
    # plt.set_xlabel('trial #')
    # plt.set_ylabel('trial cost ($)')
    
    return {'machine_type': machine_spec.machine_type, 
            'accelerator_type':machine_spec.accelerator_type.name,
            'accelerator_count':machine_spec.accelerator_count,
            'avg_gpu_utilization':agg_gpu_utilization/total_trials,
            'hpt_job_cumulative_duration_hours': total_job_duration,
            'hpt_total_trials': total_trials,
            'hpt_trials_per_training_hour' : total_trials/total_job_duration,
            'job_cost': total_job_cost,
            'job_cost_per_trial_hour':total_job_cost/total_job_duration,
            'job_cost_per_trial':total_job_cost/total_trials,
            'trial_details':trial_df
           }



In [69]:
get_hpt_job_details('gcp-ml-sandbox','9161651226507476992')

{'machine_type': 'n1-standard-4',
 'accelerator_type': 'NVIDIA_TESLA_T4',
 'accelerator_count': 2,
 'avg_gpu_utilization': -0.6666666666666666,
 'hpt_job_cumulative_duration_hours': 0.49124607638888884,
 'hpt_total_trials': 9,
 'hpt_trials_per_training_hour': 18.320757014810766,
 'job_cost': 0.50278979425104,
 'job_cost_per_trial_hour': 1.0234988500000002,
 'job_cost_per_trial': 0.055865532694559995,
 'trial_details':   trial_id  job_duration  job_cost  gpu_utilization
 0        1      0.082073  0.084001               -1
 1        2      0.079851  0.081727               -1
 2        3      0.084017  0.085991               -1
 3        4      0.080472  0.082363               -1
 4        5      0.081861  0.083785               -1
 5        6      0.082972  0.084922               -1}