In [0]:
from lakefed_ingest.main import *

In [0]:
# Declare the notebook parameters as widgets. The source type is restricted
# to a fixed set of choices; every other parameter is free text with an
# empty default.
dbutils.widgets.dropdown(
    "src_type",
    "sqlserver",
    ["sqlserver", "oracle", "postgresql", "redshift", "synapse", "delta"],
    "01 Source Type",
)

# (widget name, widget label) pairs for the free-text parameters.
for _widget_name, _widget_label in (
    ("src_catalog", "02 Source Catalog"),
    ("src_schema", "03 Source Schema"),
    ("src_table", "04 Source Table"),
    ("root_dir", "05 Root directory for project files"),
    ("partition_col", "08 Source Partition Column"),
    ("partition_size_mb", "09 Partition Size MB"),
    ("tgt_catalog", "10 Target Catalog"),
    ("tgt_schema", "11 Target Schema"),
    ("tgt_table", "12 Target Table"),
):
    dbutils.widgets.text(_widget_name, "", _widget_label)

In [0]:
# Read the notebook parameters supplied through the widgets.
src_type = dbutils.widgets.get('src_type')
src_catalog = dbutils.widgets.get('src_catalog')
src_schema = dbutils.widgets.get('src_schema')
src_table = dbutils.widgets.get('src_table')
partition_col = dbutils.widgets.get('partition_col')
tgt_catalog = dbutils.widgets.get('tgt_catalog')
tgt_schema = dbutils.widgets.get('tgt_schema')
tgt_table = dbutils.widgets.get('tgt_table')

# The partition size widget defaults to an empty string, so validate it
# explicitly: a bare int('') fails with an unhelpful "invalid literal"
# message, and a value of zero would only surface later as a
# ZeroDivisionError when the number of partitions is calculated.
partition_size_mb_raw = dbutils.widgets.get('partition_size_mb').strip()
if not partition_size_mb_raw:
    raise ValueError(
        "Widget 'partition_size_mb' is required and must be a positive integer"
    )
partition_size_mb = int(partition_size_mb_raw)
if partition_size_mb <= 0:
    raise ValueError(
        f"Widget 'partition_size_mb' must be a positive integer, got {partition_size_mb}"
    )

# No widget or assignment visible in this notebook defines jdbc_config_file,
# so reading the bare name raises NameError on a fresh kernel unless the
# star-import happens to provide it. Look it up defensively instead: a
# missing or empty value is normalized to None, any other value is kept.
jdbc_config_file = globals().get('jdbc_config_file') or None

In [0]:
# Look up the size of the source table. Together with the requested
# partition size it determines roughly how much data each individual
# query covers.
table_size_mb = get_table_size(
    src_catalog,
    src_schema,
    src_table,
    src_type,
)

In [0]:
# Get the lower and upper bound values of the partition column. These bounds
# are passed to get_partition_list further down to build the partitions.
lower_bound, upper_bound = get_partition_boundaries(src_catalog, src_schema, src_table, partition_col)

# The label order matches the order in which the values are printed
# (lower first, then upper) so the log line cannot be misread.
print(f'Lower and upper bound: {lower_bound}, {upper_bound}')

In [0]:
# Number of partitions = table size divided by the requested partition size,
# truncated to a whole number and never allowed to drop below 2.
num_partitions = max(int(table_size_mb / partition_size_mb), 2)

print(f'Number of partitions: {num_partitions}')

In [0]:
# Fully qualified name of the table that stores the generated partitions.
partitions_tbl = f'{tgt_catalog}.{tgt_schema}.{tgt_table}_partitions'

# Build the list of partitions between the lower and upper bound.
partition_list = get_partition_list(partition_col, lower_bound, upper_bound, num_partitions)

# Convert the partitions to a DataFrame and persist them, replacing both the
# data and the schema of any existing partitions table.
partition_df = get_partition_df(partition_list, num_partitions)
(
    partition_df.write
    .option("overwriteSchema", "true")
    .mode("overwrite")
    .saveAsTable(partitions_tbl)
)

In [0]:
# Collect every partition id from the partitions table into a single array.
id_list_qry = f"""\
    select array_agg(id) as ids
    from {partitions_tbl}
"""

# The aggregate query yields one row with one column holding the array of
# ids; sort it so the ids are processed in a stable order.
id_list = sorted(spark.sql(id_list_qry).collect()[0][0])
print(f'Count of ids: {len(id_list)}')

# Publish the ids as a job task value so that a downstream for each task
# can iterate over them.
dbutils.jobs.taskValues.set(key="id_list", value=id_list)