### Task
- fetch data via API request
- append the data to a Unity Catalog (UC) table

### Goal
- Understand the cost implications of running this task with the Python REPL versus Spark, across various compute options: serverless, job cluster, performance enabled, Photon enabled, and DLT pipeline

In [0]:
# Name -> id lookups used to pick the right rows out of the billing tables.
# The ids are hard-coded on purpose so each benchmark run can be matched to
# its billing records.

# Jobs are matched on their numeric job id.
job_id_map = {
  'repl-aeso-job-cluster': 163538606946367,
  'repl-aeso-serverless-performant': 503681326349336,
  'repl-aeso-serverless-standard': 293483550497750,
  'spark-aeso-job-cluster': 414113030992173,
  'spark-aeso-serverless-performant': 17484259545284,
  'spark-aeso-serverless-standard': 572384069053560,
  'spark-aeso-job-cluster-pooled': 485989541203671
}

# DLT pipelines are matched on their pipeline id instead; the original author
# noted (with some uncertainty) that DLT usage carries no job id.
dlt_id_map = {
  'dlt-serverless-standard-photon': '481460ab-abce-4ba1-bc70-624d7191762a',
  'dlt-serverless-performant-photon': '9d85ee72-7a03-459e-b298-346be2571537'
}


def _id_map_rows(id_map):
  """Return one (name, id) tuple per entry of id_map, with the id as a string."""
  return [(name, str(identifier)) for name, identifier in id_map.items()]


# One DataFrame + temp view per lookup so the SQL cell can join against them.
job_id_map_df = spark.createDataFrame(
  _id_map_rows(job_id_map),
  schema=["job_name", "job_id"],
)
job_id_map_df.createOrReplaceTempView("job_id_map_view")

dlt_id_map_df = spark.createDataFrame(
  _id_map_rows(dlt_id_map),
  schema=["job_name", "dlt_pipeline_id"],
)
dlt_id_map_df.createOrReplaceTempView("dlt_id_map_view")

In [0]:
%sql 
-- Total list-price cost (USD) per benchmark job / DLT pipeline.
--
-- Inputs : job_id_map_view (job_name, job_id) and
--          dlt_id_map_view (job_name, dlt_pipeline_id), both temp views,
--          plus system.billing.usage and system.billing.list_prices.
-- Output : one row per job_name with total_cost_usd = sum(DBUs * list price).
--
-- NOTE(review): the run_as filter is hard-coded to a single user; change it
-- in both usage CTEs if the jobs are run under a different identity.
CREATE OR REPLACE TEMP VIEW total_cost_usd_view AS
WITH
-- DBU usage rows belonging to the tracked jobs, tagged with the job name.
job_usage AS (
  SELECT
    u.cloud,
    u.sku_name,
    u.usage_unit,
    u.usage_quantity,
    u.usage_end_time,
    m.job_name
  FROM system.billing.usage AS u
  INNER JOIN job_id_map_view AS m
    ON u.usage_metadata.job_id = m.job_id
  WHERE
    u.identity_metadata.run_as = 'david.hurley@databricks.com'
    AND u.usage_unit = 'DBU'
),
-- DBU usage rows belonging to the tracked DLT pipelines, matched on
-- pipeline id instead of job id.
dlt_usage AS (
  SELECT
    u.cloud,
    u.sku_name,
    u.usage_unit,
    u.usage_quantity,
    u.usage_end_time,
    m.job_name
  FROM system.billing.usage AS u
  INNER JOIN dlt_id_map_view AS m
    ON u.usage_metadata.dlt_pipeline_id = m.dlt_pipeline_id
  WHERE
    u.identity_metadata.run_as = 'david.hurley@databricks.com'
    AND u.usage_unit = 'DBU'
),
-- Jobs and pipelines combined; the two id maps are separate, so UNION ALL
-- keeps every usage row without a dedup pass.
all_usage AS (
  SELECT cloud, sku_name, usage_unit, usage_quantity, usage_end_time, job_name
  FROM job_usage
  UNION ALL
  SELECT cloud, sku_name, usage_unit, usage_quantity, usage_end_time, job_name
  FROM dlt_usage
),
-- USD list prices; price_end_time is NULL for the currently active price.
sku_pricing AS (
  SELECT
    cloud,
    sku_name,
    usage_unit,
    pricing,
    price_start_time,
    price_end_time
  FROM system.billing.list_prices
  WHERE currency_code = 'USD'
),
-- Price each usage row with the list price in effect at usage_end_time.
-- The validity window is half-open [price_start_time, price_end_time) so a
-- usage row that ends exactly on a price-change boundary matches only the
-- new price instead of both the old and the new one (which would
-- double-count it). Usage with no matching price is costed at 0.
job_cost AS (
  SELECT
    u.job_name,
    COALESCE(u.usage_quantity * p.pricing.default, 0) AS usage_cost
  FROM all_usage AS u
  LEFT JOIN sku_pricing AS p
    ON u.cloud = p.cloud
    AND u.sku_name = p.sku_name
    AND u.usage_unit = p.usage_unit
    AND u.usage_end_time >= p.price_start_time
    AND (p.price_end_time IS NULL OR u.usage_end_time < p.price_end_time)
),
-- Roll the per-row cost up to one total per job / pipeline.
total_cost AS (
  SELECT
    job_name,
    SUM(usage_cost) AS total_cost_usd
  FROM job_cost
  GROUP BY job_name
)
SELECT
  job_name,
  total_cost_usd
FROM total_cost

In [0]:
# Read the per-job totals from the temp view, sorted ascending by cost,
# and render them in the notebook.
total_cost_usd_df = (
  spark.table("total_cost_usd_view")
  .orderBy("total_cost_usd")
)
display(total_cost_usd_df)