In [1]:
def get_widget_or_default(widget_name: str, default_value: str) -> str:
    """Return a notebook widget's value, or a fallback when it cannot be read.

    Args:
        widget_name: Name handed to ``dbutils.widgets.get``.
        default_value: Value returned when the lookup raises any ``Exception``.

    Returns:
        Whatever ``dbutils.widgets.get`` returns, otherwise ``default_value``.
    """
    try:
        resolved = dbutils.widgets.get(widget_name)
    except Exception:
        # Best effort: any failure (e.g. the lookup raising, or `dbutils` not
        # being defined in this runtime) falls back to the default.
        resolved = default_value
    return resolved

# Source (Silver) and target (Gold) table names used by this notebook.
SILVER_TABLE = "silver_mcc"
GOLD_TABLE   = "gold_dim_mcc"

# Run identifier read from the "gold_run_id" widget; "MANUAL_RUN" is used
# whenever the widget lookup fails.
gold_run_id = get_widget_or_default("gold_run_id", "MANUAL_RUN")

# Echo the effective configuration, one setting per line.
print(
    f"gold_run_id  = {gold_run_id}",
    f"SILVER_TABLE = {SILVER_TABLE}",
    f"GOLD_TABLE   = {GOLD_TABLE}",
    sep="\n",
)



StatementMeta(, f8e72689-05d3-49bf-9a08-4fb9b639909f, 3, Finished, Available, Finished)

gold_run_id  = MANUAL_RUN
SILVER_TABLE = silver_mcc
GOLD_TABLE   = gold_dim_mcc


In [2]:
#-------------------------------
# Silver Contract first
#-------------------------------

from pyspark.sql import functions as F

# Contract the Silver table must satisfy before it is loaded into Gold.
# Column entries are (name, Spark SQL type, nullable flag) tuples.
CONTRACT = {
    "natural_key": ["mcc_code"],
    "business_columns": [
        ("mcc_code", "STRING", False),
        ("mcc_description", "STRING", True),
    ],
    "technical_columns": [
        ("source_file", "STRING", False),
        ("ingestion_date", "DATE", False),
        ("ingestion_ts", "TIMESTAMP", False),
        ("record_hash", "STRING", False),
    ],
    "allow_additional_columns": False,
    # One row is kept per key; rows are ranked on `order_by`.
    "dedup": {
        "keys": ["mcc_code"],
        "order_by": "ingestion_ts",
        "strategy": "keep_latest",
    },
}

# Column names only, in contract order: business columns first, then technical.
BUSINESS_COLS = [name for name, _type, _nullable in CONTRACT["business_columns"]]
TECH_COLS     = [name for name, _type, _nullable in CONTRACT["technical_columns"]]
EXPECTED_COLS = [*BUSINESS_COLS, *TECH_COLS]


StatementMeta(, f8e72689-05d3-49bf-9a08-4fb9b639909f, 4, Finished, Available, Finished)

In [3]:
#-------------------------------
# Read Silver + fail fast
#-------------------------------

# Load the Silver table and compare its column names with the contract.
df_silver = spark.table(SILVER_TABLE)

actual_cols = df_silver.columns
expected_set, actual_set = set(EXPECTED_COLS), set(actual_cols)
missing = sorted(expected_set - actual_set)
extra   = sorted(actual_set - expected_set)

# Abort when a contract column is absent from the table.
if missing:
    raise ValueError(f"[FAIL FAST] Missing columns in {SILVER_TABLE}: {missing}")

# Abort on unexpected columns unless the contract explicitly allows them.
if not CONTRACT["allow_additional_columns"]:
    if extra:
        raise ValueError(f"[FAIL FAST] Additional columns not allowed in {SILVER_TABLE}: {extra}")

print(f"Schema validation OK for {SILVER_TABLE}. Column count = {len(actual_cols)}")


StatementMeta(, f8e72689-05d3-49bf-9a08-4fb9b639909f, 5, Finished, Available, Finished)

Schema validation OK for silver_mcc. Column count = 6


In [4]:
#-------------------------------
# Canonical projection + cast
#-------------------------------

def cast_expr(col_name: str, spark_type: str):
    """Build a column expression that casts a column and keeps its name.

    Args:
        col_name: Name of the column to read; also used as the output alias.
        spark_type: Target type passed to ``Column.cast`` (e.g. ``"STRING"``).

    Returns:
        The cast column expression, aliased back to ``col_name``.
    """
    casted = F.col(col_name).cast(spark_type)
    return casted.alias(col_name)

# Cast every contract column to its declared type: business columns first,
# then technical ones, so the projection follows the contract order.
contract_columns = CONTRACT["business_columns"] + CONTRACT["technical_columns"]
select_exprs = [cast_expr(name, typ) for name, typ, _nullable in contract_columns]

df_proj = df_silver.select(*select_exprs)


StatementMeta(, f8e72689-05d3-49bf-9a08-4fb9b639909f, 6, Finished, Available, Finished)

In [5]:
#-------------------------------
# Data Quality minimal
#-------------------------------

# The key column must be populated: reject NULLs and values that are empty
# once trimmed.
mcc_code = F.col("mcc_code")
is_blank_key = mcc_code.isNull() | (F.trim(mcc_code) == "")

dq_null_cnt = df_proj.where(is_blank_key).count()
if dq_null_cnt > 0:
    raise ValueError(f"[FAIL FAST] mcc_code has null/empty values: count={dq_null_cnt}")


StatementMeta(, f8e72689-05d3-49bf-9a08-4fb9b639909f, 7, Finished, Available, Finished)

In [6]:
#-------------------------------
# Deduplication
#-------------------------------

from pyspark.sql.window import Window

# Keep a single row per dedup key, driven by CONTRACT["dedup"].
dedup_keys = CONTRACT["dedup"]["keys"]
dedup_order_col = CONTRACT["dedup"]["order_by"]
dedup_strategy = CONTRACT["dedup"]["strategy"]

# Only "keep_latest" is implemented below; fail fast on anything else rather
# than silently applying the wrong rule.
if dedup_strategy != "keep_latest":
    raise ValueError(f"[FAIL FAST] Unsupported dedup strategy: {dedup_strategy}")

# Newest row first. Rows that share the same key AND the same order_by value
# would otherwise be ranked arbitrarily by row_number(); because df_dedup is
# lazily re-evaluated by every action (the checks here, the write later), a
# different row could be picked each time. Every remaining column is therefore
# added as a tie-breaker so the surviving row is always the same one.
tie_breakers = [
    F.col(c).desc()
    for c in df_proj.columns
    if c not in dedup_keys and c != dedup_order_col
]
w = Window.partitionBy(*dedup_keys).orderBy(F.col(dedup_order_col).desc(), *tie_breakers)

df_dedup = (
    df_proj
    .withColumn("_rn", F.row_number().over(w))
    .filter(F.col("_rn") == 1)
    .drop("_rn")
)

# Sanity check: no key may appear more than once after deduplication.
dup_cnt = (
    df_dedup
    .groupBy(*dedup_keys)
    .count()
    .filter(F.col("count") > 1)
    .count()
)

if dup_cnt != 0:
    raise ValueError(
        f"[FAIL FAST] Deduplication failed: still have duplicates on {', '.join(dedup_keys)} (count={dup_cnt})"
    )

print(f"Dedup OK. Rows = {df_dedup.count()}")


StatementMeta(, f8e72689-05d3-49bf-9a08-4fb9b639909f, 8, Finished, Available, Finished)

Dedup OK. Rows = 109


In [7]:
#-------------------------------
#Technical columns Gold  + Write 
#-------------------------------

# Stamp every row with the run identifier and the load timestamp.
df_gold = (
    df_dedup
    .withColumn("gold_run_id", F.lit(gold_run_id))
    .withColumn("gold_load_ts", F.current_timestamp())
)

# Full refresh as ONE Delta transaction. The previous TRUNCATE TABLE followed
# by a separate append left the Gold table empty if the append failed, and the
# TRUNCATE itself failed when the table did not exist yet. mode("overwrite")
# replaces the contents atomically and creates the table on first run. No
# overwriteSchema option is set, so a schema mismatch is still rejected.
(
    df_gold
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(GOLD_TABLE)
)

print(f"Loaded {GOLD_TABLE} successfully.")


StatementMeta(, f8e72689-05d3-49bf-9a08-4fb9b639909f, 9, Finished, Available, Finished)

Loaded gold_dim_mcc successfully.


In [8]:
#-------------------------------
#Post Load Control
#-------------------------------

# Row count and most recent load timestamp of the Gold table.
summary_query = f"""
SELECT
  count(*) AS row_count,
  max(gold_load_ts) AS last_gold_load_ts
FROM {GOLD_TABLE}
"""

# First 50 rows by mcc_code, for a visual spot check.
sample_query = f"SELECT * FROM {GOLD_TABLE} ORDER BY mcc_code LIMIT 50"

for control_query in (summary_query, sample_query):
    spark.sql(control_query).show(truncate=False)


StatementMeta(, f8e72689-05d3-49bf-9a08-4fb9b639909f, 10, Finished, Available, Finished)

+---------+--------------------------+
|row_count|last_gold_load_ts         |
+---------+--------------------------+
|109      |2026-01-05 19:21:50.875575|
+---------+--------------------------+

+--------+-----------------------------------------------+-----------------------------+--------------+--------------------------+----------------------------------------------------------------+-----------+--------------------------+
|mcc_code|mcc_description                                |source_file                  |ingestion_date|ingestion_ts              |record_hash                                                     |gold_run_id|gold_load_ts              |
+--------+-----------------------------------------------+-----------------------------+--------------+--------------------------+----------------------------------------------------------------+-----------+--------------------------+
|1711    |Heating, Plumbing, Air Conditioning Contractors|Files/landing/mcc/2025-12-07/|2025-12-07 