## Load Metadata - TPCDS
- Prepare metadata and merge into the control table
- The registered loads copy from Delta to Delta, for integration testing

In [0]:
%python
# Collect the size and clustering columns of every table in the source schema
# and expose them to the SQL cells as the `table_details` temp view.
catalog = dbutils.widgets.get('src_catalog')
schema = dbutils.widgets.get('src_schema')

spark.sql(f'use catalog {catalog}')

# SHOW TABLES also lists session-scoped temporary views (for example the
# `table_details` view left behind by an earlier run of this cell). Those do
# not live in {schema}, so DESCRIBE DETAIL would fail on them - skip them.
table_list = [
    row.tableName
    for row in spark.sql(f"SHOW TABLES IN {schema}").collect()
    if not row.isTemporary
]

results = []
for table in table_list:
    # Run DESCRIBE DETAIL once per table and read both fields from the same row.
    detail = spark.sql(f"DESCRIBE DETAIL {schema}.{table}").first()
    size = detail["sizeInBytes"]
    results.append({
        "catalog": catalog,
        "schema": schema,
        "table": table,
        # Whole mebibytes, truncated; stays None when no size is reported.
        "size_mb": None if size is None else size // (1024 * 1024),
        "cluster_cols": detail["clusteringColumns"],
    })

# An explicit schema keeps this working when `results` is empty or when every
# table has an empty clustering-column list, where type inference would fail.
spark.createDataFrame(
    results,
    schema="`catalog` string, `schema` string, `table` string, "
           "size_mb long, cluster_cols array<string>",
).createOrReplaceTempView('table_details')

In [0]:
-- Switch to the catalog and schema passed in as the :catalog and :schema parameters.
use catalog identifier(:catalog);
use schema identifier(:schema);

-- control_src: one control-table row per table in tpcds.sf_1000_liquid.
-- Tables that have clustering columns are flagged for a partitioned load on
-- their first clustering column; all other tables get a plain full load.
create or replace temporary view control_src as
with src_tables as (
  -- Source tables with the clustering columns gathered in table_details.
  -- is_clustered is false when the table has no match or no clustering columns.
  select
    t.table_catalog,
    t.table_schema,
    t.table_name,
    d.cluster_cols,
    coalesce(array_size(d.cluster_cols) > 0, false) as is_clustered
  from tpcds.information_schema.tables t
  left join table_details d
    on t.table_name = d.table
  where t.table_schema = 'sf_1000_liquid'
    and t.table_catalog = 'tpcds'
)
select
  'lakefed_ingest' as job_name,
  -- Partitioned loads can have a large number of queries
  -- Use a unique task_collection name & run them sequentially to avoid excessive concurrency
  case
    when st.is_clustered then concat('tpcds_', st.table_name)
    else 'tpcds'
  end as task_collection,
  'delta' as src_type,
  st.table_catalog as src_catalog,
  st.table_schema as src_schema,
  st.table_name as src_table,
  'lakefed_ingest' as sink_catalog,
  'tpcds_sf1000' as sink_schema,
  st.table_name as sink_table,
  st.cluster_cols as sink_cluster_cols,
  'full' as load_type,
  st.is_clustered as load_partitioned,
  '*' as select_list,
  null as watermark_col_name,
  null as watermark_col_type,
  null as watermark_col_start_value,
  -- Partition on the first clustering column, in 256 MB partitions.
  case when st.is_clustered then element_at(st.cluster_cols, 1) end as partition_col,
  case when st.is_clustered then 256 end as partition_size_mb,
  true as task_enabled
from src_tables st;

In [0]:
-- Preview the generated control rows before they are merged.
select *
from control_src

In [0]:
-- Upsert the rows of control_src into the control table named by :table.
-- A row is identified by its source catalog, schema and table: matching rows
-- are overwritten with the source values, unmatched rows are inserted.
merge with schema evolution into identifier(:table) as tgt
using control_src as src
  on  tgt.src_catalog = src.src_catalog
  and tgt.src_schema  = src.src_schema
  and tgt.src_table   = src.src_table
when matched then
  update set *
when not matched then
  insert *