In [None]:
# ============================================================
# nb_silver_mcc
# ============================================================

# %run ./nb_silver_utils
from pyspark.sql import functions as F

# Read the raw MCC reference data from the Bronze layer.
df_bronze = spark.table("bronze_mcc_raw")

# Columns the rest of this notebook reads from the Bronze table.
# assert_required_columns is provided by nb_silver_utils; presumably it
# raises when one of these columns is missing -- confirm against the helper.
required_bronze_columns = [
    "mcc",
    "mcc_description",
    "source_file",
    "ingestion_ts",
    "ingestion_date",
]
assert_required_columns(df_bronze, required_bronze_columns)

# Renaming / normalisation
# Every helper used below comes from nb_silver_utils and returns a DataFrame;
# each stage is given its own name so intermediate results are easy to inspect.

# Normalise the MCC code into a 4-character string.
# NOTE(review): later steps reference `mcc_code`, so the helper is expected to
# expose the normalised value under that name -- confirm against the helper.
df_normalized = normalize_mcc_code(df_bronze, col="mcc")

# Clean the description: strip leading/trailing whitespace.
df_cleaned = df_normalized.withColumn(
    "mcc_description",
    F.trim(F.col("mcc_description")),
)

# Technical columns.
df_with_tech = add_tech_columns(df_cleaned, source_file_col="source_file")

# Record hash over the business columns.
df_hashed = add_record_hash(df_with_tech, cols=["mcc_code", "mcc_description"])

# Deduplicate on the natural key (the MCC code), ordering by ingestion_ts.
# Presumably the most recent record per key is kept -- see deduplicate_latest.
df = deduplicate_latest(
    df_hashed,
    key_cols=["mcc_code"],
    order_col="ingestion_ts",
)

# Silver projection: keep only the columns published in the Silver table.
silver_columns = [
    "mcc_code",
    "mcc_description",
    "source_file",
    "ingestion_date",
    "ingestion_ts",
    "record_hash",
]
df = df.select(*silver_columns)

# Column expression shared by both validation checks below.
mcc_code = F.col("mcc_code")

# Fail fast: the key must never be null.
# limit(1) before count(): only the existence of an offending row matters.
null_key_rows = df.where(mcc_code.isNull())
if null_key_rows.limit(1).count() > 0:
    raise ValueError("Null mcc_code detected in silver_mcc.")

# Additional fail fast: the MCC must be exactly 4 digits.
# The explicit null test is kept because rlike evaluates to NULL (not False)
# on a NULL input, and a NULL predicate would silently drop the row here.
is_four_digits = mcc_code.rlike("^[0-9]{4}$")
bad_mcc = df.where(mcc_code.isNull() | ~is_four_digits)

if bad_mcc.limit(1).count() > 0:
    raise ValueError("Invalid mcc_code detected in silver_mcc (expected 4 digits, e.g., '5411').")

# Write the validated dataset to the Silver table, replacing existing content.
write_silver_mcc(df, table_name="silver_mcc", mode="overwrite")
print("silver_mcc successfully written.")

