In [1]:
def get_widget_or_default(widget_name: str, default_value: str) -> str:
    """Return a notebook widget's value, or a fallback when it cannot be read.

    Args:
        widget_name: Name handed to ``dbutils.widgets.get``.
        default_value: Value returned when the lookup raises any ``Exception``.

    Returns:
        The widget value when the lookup succeeds, otherwise ``default_value``.
    """
    try:
        widget_value = dbutils.widgets.get(widget_name)
    except Exception:
        # Best-effort lookup: any failure (including `dbutils` not being
        # defined in the current session) falls back to the default.
        return default_value
    else:
        return widget_value

# Run identifier: read from the "gold_run_id" widget, "MANUAL_RUN" otherwise.
gold_run_id = get_widget_or_default("gold_run_id", "MANUAL_RUN")

# Source (Silver) and target (Gold) tables, plus the Gold user dimension.
SILVER_TABLE = "silver_cards"
GOLD_TABLE = "gold_dim_card"
GOLD_USER_TABLE = "gold_dim_user"

# Optional (but highly recommended): table that receives orphan anomalies.
ANOMALY_TABLE = "gold_anom_dim_card_orphans"

# Echo the effective configuration, one "name = value" line per setting.
config_summary = "\n".join(
    [
        f"gold_run_id        = {gold_run_id}",
        f"SILVER_TABLE       = {SILVER_TABLE}",
        f"GOLD_TABLE         = {GOLD_TABLE}",
        f"GOLD_USER_TABLE    = {GOLD_USER_TABLE}",
        f"ANOMALY_TABLE      = {ANOMALY_TABLE}",
    ]
)
print(config_summary)


StatementMeta(, ea1d8572-3596-4aad-ba84-2340305ca5bb, 3, Finished, Available, Finished)

gold_run_id        = MANUAL_RUN
SILVER_TABLE       = silver_cards
GOLD_TABLE         = gold_dim_card
GOLD_USER_TABLE    = gold_dim_user
ANOMALY_TABLE      = gold_anom_dim_card_orphans


In [2]:
# ----------------------------------------------
# Read Silver Contract first
# ----------------------------------------------


from pyspark.sql import functions as F

# Contract describing the Silver cards table as consumed by this notebook.
# Column entries are (column name, Spark SQL type, nullable flag) tuples.
CONTRACT = {
    "natural_key": ["card_id"],
    "business_columns": [
        ("card_id", "BIGINT", False),
        ("client_id", "BIGINT", True),
        ("card_brand", "STRING", True),
        ("card_type", "STRING", True),
        ("card_number", "STRING", True),
        ("expires_raw", "STRING", True),
        ("expires_month", "DATE", True),
        ("cvv", "STRING", True),
        ("has_chip", "BOOLEAN", True),
        ("num_cards_issued", "INT", True),
        ("credit_limit", "DECIMAL(18,2)", True),
        ("acct_open_date", "DATE", True),
        ("year_pin_last_changed", "INT", True),
        ("card_on_dark_web", "BOOLEAN", True),
    ],
    "technical_columns": [
        ("source_file", "STRING", False),
        ("ingestion_date", "DATE", False),
        ("ingestion_ts", "TIMESTAMP", False),
        ("record_hash", "STRING", False),
    ],
    # When False, any column outside the two lists above is a contract breach.
    "allow_additional_columns": False,
    # Deduplication rule: which keys identify a row and how to pick a winner.
    "dedup": {
        "keys": ["card_id"],
        "order_by": "ingestion_ts",
        "strategy": "keep_latest",
    },
}

# Column-name lists derived from the contract (business first, then technical).
BUSINESS_COLS = [col_name for col_name, _col_type, _nullable in CONTRACT["business_columns"]]
TECH_COLS = [col_name for col_name, _col_type, _nullable in CONTRACT["technical_columns"]]
EXPECTED_COLS = [*BUSINESS_COLS, *TECH_COLS]


StatementMeta(, ea1d8572-3596-4aad-ba84-2340305ca5bb, 4, Finished, Available, Finished)

In [3]:
# ----------------------------------------------
# Check Structure + fail fast
# ----------------------------------------------


# Load the Silver table and compare its column names with the contract.
df_silver = spark.table(SILVER_TABLE)
actual_cols = df_silver.columns

expected_names = set(EXPECTED_COLS)
actual_names = set(actual_cols)

# Sorted so the error messages are stable from run to run.
missing = sorted(expected_names.difference(actual_names))
extra = sorted(actual_names.difference(expected_names))

# A contract column absent from Silver always aborts the run.
if missing:
    raise ValueError(f"[FAIL FAST] Missing columns in {SILVER_TABLE}: {missing}")

# Unexpected columns abort the run only when the contract forbids them.
if not CONTRACT["allow_additional_columns"]:
    if extra:
        raise ValueError(f"[FAIL FAST] Additional columns not allowed in {SILVER_TABLE}: {extra}")

print(f"Schema validation OK for {SILVER_TABLE}. Column count = {len(actual_cols)}")


StatementMeta(, ea1d8572-3596-4aad-ba84-2340305ca5bb, 5, Finished, Available, Finished)

Schema validation OK for silver_cards. Column count = 18


In [4]:
# ----------------------------------------------
# Canonical projection + contract-driven cast
# ----------------------------------------------


def cast_expr(col_name: str, spark_type: str):
    """Build a column expression that casts ``col_name`` to ``spark_type``.

    Args:
        col_name: Name of the column to read; also used as the output alias.
        spark_type: Target type string passed to ``Column.cast``.

    Returns:
        The cast column expression, aliased back to ``col_name``.
    """
    source_col = F.col(col_name)
    casted_col = source_col.cast(spark_type)
    return casted_col.alias(col_name)

# One cast expression per contract column: business columns first, then the
# technical ones. The nullable flag is not needed for the projection.
select_exprs = [
    cast_expr(col_name, col_type)
    for col_group in ("business_columns", "technical_columns")
    for col_name, col_type, _ in CONTRACT[col_group]
]

# Keep exactly the contract columns, in contract order, with contract types.
df_proj = df_silver.select(*select_exprs)


StatementMeta(, ea1d8572-3596-4aad-ba84-2340305ca5bb, 6, Finished, Available, Finished)

In [5]:
# ----------------------------------------------
# Deduplication
# ----------------------------------------------


from pyspark.sql.window import Window

dedup_cfg = CONTRACT["dedup"]
dedup_keys = dedup_cfg["keys"]

# Only "keep_latest" is implemented below; fail fast rather than silently
# ignoring a different strategy declared in the contract.
if dedup_cfg["strategy"] != "keep_latest":
    raise ValueError(f"[FAIL FAST] Unsupported dedup strategy: {dedup_cfg['strategy']}")

# Rank rows per key, newest first. `record_hash` (a non-nullable contract
# column) is a secondary sort key: without it, rows sharing the same
# ingestion_ts would be ranked arbitrarily and the surviving row could change
# from one run to the next.
w = Window.partitionBy(*dedup_keys).orderBy(
    F.col(dedup_cfg["order_by"]).desc(),
    F.col("record_hash").desc(),
)

# Keep only the top-ranked row of each key, then drop the helper column.
df_dedup = (
    df_proj
    .withColumn("_rn", F.row_number().over(w))
    .filter(F.col("_rn") == 1)
    .drop("_rn")
)

# Fail fast uniqueness: count the keys that still occur more than once.
# Grouping uses the contract's dedup keys so the check cannot drift away from
# the window definition above.
dup_cnt = (
    df_dedup
    .groupBy(*dedup_keys)
    .count()
    .filter(F.col("count") > 1)
    .count()
)

if dup_cnt != 0:
    raise ValueError(
        f"[FAIL FAST] Deduplication failed: still have duplicates on {', '.join(dedup_keys)} (count={dup_cnt})"
    )

print(f"Dedup OK. Rows = {df_dedup.count()}")


StatementMeta(, ea1d8572-3596-4aad-ba84-2340305ca5bb, 7, Finished, Available, Finished)

Dedup OK. Rows = 6146


In [6]:
# ----------------------------------------------
# Conformance Check client_id (orphans)
# ----------------------------------------------

# Distinct client ids of the user dimension, cast to BIGINT (the type the
# contract assigns to client_id on the card side).
known_client_id = F.col("client_id").cast("BIGINT").alias("client_id")
df_users = spark.table(GOLD_USER_TABLE).select(known_client_id).distinct()

# Orphans: cards that carry a client_id with no match in the user dimension.
# Cards without any client_id are excluded from this check.
has_client_id = F.col("client_id").isNotNull()
df_orphans = df_dedup.where(has_client_id).join(df_users, on="client_id", how="left_anti")

orphans_cnt = df_orphans.count()
print(f"Orphan cards (client_id not found in {GOLD_USER_TABLE}) = {orphans_cnt}")


StatementMeta(, ea1d8572-3596-4aad-ba84-2340305ca5bb, 8, Finished, Available, Finished)

Orphan cards (client_id not found in gold_dim_user) = 0


In [7]:
# ----------------------------------------------
# Persist orphans
# ----------------------------------------------


if orphans_cnt > 0:
    # Audit columns appended to every orphan row, in this order.
    audit_columns = {
        "anom_type": F.lit("ORPHAN_CLIENT_ID"),
        "gold_run_id": F.lit(gold_run_id),
        "gold_load_ts": F.current_timestamp(),
    }

    df_anom = df_orphans
    for audit_name, audit_expr in audit_columns.items():
        df_anom = df_anom.withColumn(audit_name, audit_expr)

    # Anomalies are only ever appended (audit-friendly); nothing is rewritten.
    df_anom.write.format("delta").mode("append").saveAsTable(ANOMALY_TABLE)

    print(f"Anomalies written to {ANOMALY_TABLE}")


StatementMeta(, ea1d8572-3596-4aad-ba84-2340305ca5bb, 9, Finished, Available, Finished)

In [8]:
# ----------------------------------------------
# Filter orphans
# ----------------------------------------------


# Cards whose client_id exists in the user dimension.
df_known_client = df_dedup.join(df_users, on="client_id", how="left_semi")

# Cards with no client_id at all: they were not part of the orphan check and
# are kept as they are.
df_no_client = df_dedup.filter(F.col("client_id").isNull())

# Conformed set = matched cards + cards without a client_id (orphans removed).
df_conformed = df_known_client.unionByName(df_no_client, allowMissingColumns=False)


StatementMeta(, ea1d8572-3596-4aad-ba84-2340305ca5bb, 10, Finished, Available, Finished)

In [9]:
# ----------------------------------------------
# Gold technical columns + write
# ----------------------------------------------


# Stamp every row with the run identifier and the load timestamp.
df_gold = (
    df_conformed
    .withColumn("gold_run_id", F.lit(gold_run_id))
    .withColumn("gold_load_ts", F.current_timestamp())
)

# Full refresh in a single Delta commit. A separate TRUNCATE followed by an
# append is two transactions: readers can observe an empty table in between,
# and a failed write leaves the dimension empty. Overwrite mode replaces the
# data in one commit, keeps the previous version in the table history, and
# also works when the table does not exist yet.
(
    df_gold
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(GOLD_TABLE)
)

print(f"Loaded {GOLD_TABLE} successfully.")


StatementMeta(, ea1d8572-3596-4aad-ba84-2340305ca5bb, 11, Finished, Available, Finished)

Loaded gold_dim_card successfully.


In [10]:
# ----------------------------------------------
# Post Load Control
# ----------------------------------------------


# Control 1: total row count and most recent load timestamp of the Gold table.
row_count_sql = f"""
SELECT
  count(*) AS row_count,
  max(gold_load_ts) AS last_gold_load_ts
FROM {GOLD_TABLE}
"""

# Control 2: the 20 lowest card_id rows, for a quick visual check.
sample_rows_sql = f"""
SELECT card_id, client_id, card_brand, card_type, credit_limit, has_chip
FROM {GOLD_TABLE}
ORDER BY card_id
LIMIT 20
"""

# Run the controls in order and display each result untruncated.
for control_sql in (row_count_sql, sample_rows_sql):
    spark.sql(control_sql).show(truncate=False)


StatementMeta(, ea1d8572-3596-4aad-ba84-2340305ca5bb, 12, Finished, Available, Finished)

+---------+--------------------------+
|row_count|last_gold_load_ts         |
+---------+--------------------------+
|6146     |2026-01-05 18:25:25.799229|
+---------+--------------------------+

+-------+---------+----------+---------+------------+--------+
|card_id|client_id|card_brand|card_type|credit_limit|has_chip|
+-------+---------+----------+---------+------------+--------+
|0      |1362     |AMEX      |CREDIT   |33900.00    |true    |
|1      |550      |MASTERCARD|CREDIT   |11600.00    |true    |
|2      |556      |MASTERCARD|DEBIT    |19948.00    |true    |
|3      |1937     |VISA      |CREDIT   |16400.00    |true    |
|4      |1981     |MASTERCARD|DEBIT    |19439.00    |true    |
|5      |619      |VISA      |DEBIT    |21883.00    |true    |
|6      |1046     |AMEX      |CREDIT   |9400.00     |true    |
|7      |511      |MASTERCARD|DEBIT    |9664.00     |true    |
|8      |1107     |MASTERCARD|CREDIT   |10300.00    |false   |
|9      |1046     |AMEX      |CREDIT   |13000.00