In [0]:
import dlt
from pyspark.sql.functions import col, length, upper, lower, trim, lit
from pyspark.sql.types import StringType, IntegerType

# Lookup table: upper-case type name as written in the config's data_type
# field -> Spark SQL type instance used when casting the output column.
DATA_TYPE_MAP = dict(
    STRING=StringType(),
    INT=IntegerType(),
)

def _config_column(row, available_columns):
    """Build the aliased, typed output column described by one config row.

    Args:
        row: A collected config row exposing ``column_name``, ``alias_name``,
            ``data_type`` and ``default_value``.
        available_columns: Names of the columns present in the input frame.

    Returns:
        A Spark ``Column`` that reads the source column when it exists,
        otherwise the configured default (or NULL), cast to the configured
        type when that type is listed in ``DATA_TYPE_MAP``.
    """
    src_col = row["column_name"]
    # A null/empty alias cannot be passed to Column.alias(); keep the
    # source column name in that case.
    alias = row["alias_name"] or src_col
    default_val = row["default_value"]
    dtype = row["data_type"]
    spark_type = DATA_TYPE_MAP.get(dtype.upper()) if dtype else None

    if src_col in available_columns:
        col_expr = col(src_col)
    elif default_val is not None:
        col_expr = lit(default_val)
    else:
        col_expr = lit(None)
        if spark_type is None:
            # An untyped NULL literal has NullType ("void"), which cannot be
            # persisted from a streaming write; give it a concrete type.
            spark_type = StringType()

    if spark_type is not None:
        col_expr = col_expr.cast(spark_type)

    return col_expr.alias(alias)


@dlt.table(
    comment="Raw customer data after data quality checks and validations",
    table_properties={"quality": "bronze_stage"}
)
@dlt.expect_or_drop("valid_customer_id", "customerId IS NOT NULL AND LENGTH(customerId) > 0")
@dlt.expect_or_drop("valid_customer_unique_id", "ucn IS NOT NULL AND LENGTH(ucn) > 0")
@dlt.expect_or_drop("valid_zip_code", "zip_prefix RLIKE '^[0-9]{4,5}$'")  # supports 4 or 5 digit zip prefix
@dlt.expect_or_drop("valid_state", "state IS NOT NULL AND LENGTH(state) = 2")
@dlt.expect_or_drop("valid_city", "city IS NOT NULL AND LENGTH(city) > 0")
def bronze_stage():
    """Project the ``bronze_raw`` stream onto the config-driven column set.

    Steps:
      1. Stream rows from ``bronze_raw``.
      2. Load the active rows of ``config_raw_to_stage``.
      3. Normalise ``city`` (trimmed, lower-case) and ``state`` (trimmed,
         upper-case).
      4. Select one output column per config row (source column or default,
         cast and aliased as configured).

    NOTE(review): the expectations above reference ``customerId``, ``ucn``,
    ``zip_prefix``, ``state`` and ``city``; presumably the config must emit
    columns with exactly these names -- confirm against the config table.

    Returns:
        The streaming DataFrame containing only the configured columns.

    Raises:
        ValueError: If the config table has no active rows.
    """
    # 1. Read streaming source
    raw_df = dlt.read_stream("bronze_raw")

    # 2. Read configuration table (expected to be small: it is collected to
    #    the driver to build the projection).
    config_rows = (
        spark.table("`unity-veersa`.config_tables.config_raw_to_stage")
        .filter(col("is_active") == True)  # noqa: E712 - Spark Column comparison
        .collect()
    )
    if not config_rows:
        # Without this guard select() would yield a frame with no columns and
        # the pipeline would fail later with a far less helpful message.
        raise ValueError(
            "No active rows found in `unity-veersa`.config_tables.config_raw_to_stage; "
            "cannot build the bronze_stage column list"
        )

    # 3. Normalize raw data (city lower, state upper)
    normalized_df = (
        raw_df
        .withColumn("city", lower(trim(col("city"))))
        .withColumn("state", upper(trim(col("state"))))
    )

    # 4. Dynamically select and alias columns based on config. The column
    #    names are read once instead of once per config row.
    available_columns = set(normalized_df.columns)
    selected_cols = [_config_column(row, available_columns) for row in config_rows]

    return normalized_df.select(*selected_cols)
