In [None]:
# ============================================================
# nb_silver_mcc
# ============================================================

# ------------------------------------------------------------
# 0) Paramètres d'exécution (STANDARD ENTITY INTERFACE)
# ------------------------------------------------------------
# Dans Microsoft Fabric, déclarez ces paramètres dans l'UI du notebook
# (première cellule). Ils seront injectés comme variables Python :
#   - run_id
#   - entity_code
#   - load_mode
#   - as_of_date
#
# Defaults ci-dessous uniquement pour exécution manuelle (interactive).
from datetime import datetime, timezone
import time
import json

# Default each run parameter independently. Testing only run_id would leave
# entity_code / load_mode / as_of_date undefined whenever the caller injects
# some parameters but not others, and the payload built at the end of this
# notebook would then fail with NameError.
_param_defaults = {
    "run_id": f"manual-{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}",
    "entity_code": "mcc",   # must match ctl_entity_silver.entity_code
    "load_mode": "full",    # full|incremental (not used in this notebook)
    "as_of_date": "",       # YYYY-MM-DD or empty
}
for _param_name, _param_default in _param_defaults.items():
    # setdefault keeps any value that is already defined (e.g. injected by the
    # pipeline) and only fills in the names that are missing.
    globals().setdefault(_param_name, _param_default)

def _iso_utc(dt: datetime) -> str:
    return dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# Timing (contract v1.0)
_t0 = time.time()
_started = datetime.now(timezone.utc)

In [None]:
# ------------------------------------------------------------
# 1) Import des utilitaires Silver
# ------------------------------------------------------------
# Note: the `%run ./nb_silver_utils` command in the next cell is not the standard IPython `%run` magic. It is designed for use within Fabric notebooks only.

In [None]:
%run ./nb_silver_utils

In [None]:
from pyspark.sql import functions as F

# Source: raw bronze table for the MCC reference data.
df_bronze = spark.table("bronze_mcc_raw")

# Check the bronze schema before launching any Spark action, so that a missing
# column fails the run without first paying for a full count of the table.
# NOTE(review): assert_required_columns comes from nb_silver_utils; presumably
# it raises when one of the listed columns is absent — confirm there.
assert_required_columns(
    df_bronze,
    ["mcc", "mcc_description", "source_file", "ingestion_ts", "ingestion_date"]
)

# ------------------------------------------------------------
# 1bis) row_in metric (STANDARD)
# ------------------------------------------------------------
row_in = df_bronze.count()

# Contract normalization: None / empty / whitespace-only becomes None;
# any other value is trimmed so stray whitespace never reaches the payload.
as_of_date_norm = (as_of_date or "").strip() or None

# Renaming / normalization, one named stage per step.

# Normalize the MCC code to a 4-char string.
# NOTE(review): the later steps read an `mcc_code` column, so the helper
# presumably produces it from `mcc` — confirm in nb_silver_utils.
df_mcc_normalized = normalize_mcc_code(df_bronze, col="mcc")

# Clean the description: strip leading/trailing spaces.
df_mcc_trimmed = df_mcc_normalized.withColumn(
    "mcc_description",
    F.trim(F.col("mcc_description")),
)

# Technical columns.
df_mcc_tech = add_tech_columns(df_mcc_trimmed, source_file_col="source_file")

# Record hash computed over mcc_code and mcc_description.
df = add_record_hash(df_mcc_tech, cols=["mcc_code", "mcc_description"])

# ------------------------------------------------------------
# 2) Silver deduplication (dedup_dropped metric)
# ------------------------------------------------------------
# Row count going into the dedup step.
_pre_dedup_count = df.count()

# Deduplicate on the natural key (mcc_code), ordered by ingestion_ts.
df = deduplicate_latest(
    df,
    key_cols=["mcc_code"],
    order_col="ingestion_ts",
)

# Row count coming out; the difference is the number of rows dropped.
_post_dedup_count = df.count()
dedup_dropped = int(_pre_dedup_count - _post_dedup_count)

# Silver projection: keep exactly these columns, in this order.
_silver_columns = [
    "mcc_code",
    "mcc_description",
    "source_file",
    "ingestion_date",
    "ingestion_ts",
    "record_hash",
]
df = df.select(*_silver_columns)
# ------------------------------------------------------------
# 3) Fail fast + quality checks (contract v1.0)
# ------------------------------------------------------------
fail_fast_checks = []

# Fail fast: the key must not be null. limit(1) keeps the probe cheap, since
# one offending row is enough to abort.
_null_key_rows = df.filter(F.col("mcc_code").isNull())
if _null_key_rows.limit(1).count() > 0:
    raise ValueError("Null mcc_code detected in silver_mcc.")
fail_fast_checks.append({"name": "mcc_code_not_null", "passed": True})

# Additional fail fast: the MCC code must be exactly 4 digits.
# The null test is kept so this check stands on its own: a NULL code makes the
# negated rlike evaluate to NULL, and filter() would not select such a row.
_is_four_digits = F.col("mcc_code").rlike("^[0-9]{4}$")
bad_mcc = df.filter(F.col("mcc_code").isNull() | ~_is_four_digits)
if bad_mcc.limit(1).count() > 0:
    raise ValueError("Invalid mcc_code detected in silver_mcc (expected 4 digits, e.g., '5411').")
fail_fast_checks.append({"name": "mcc_code_is_4_digits", "passed": True})

# ------------------------------------------------------------
# 4) Standard metrics + Silver write (no partition)
# ------------------------------------------------------------
# Reuse the post-dedup count instead of launching another full Spark job.
# Between that count and this point df only goes through select() (a pure
# projection) and read-only fail-fast checks, so the row count is unchanged.
# If a row-filtering step is ever added after the dedup, recount here.
row_out = int(_post_dedup_count)
partition_count = 0  # table is not partitioned

write_silver_mcc(df, table_name="silver_mcc", mode="overwrite")

# ------------------------------------------------------------
# 5) Runtime contract v1.0 payload + exit
# ------------------------------------------------------------
_ended = datetime.now(timezone.utc)
duration_ms = int((time.time() - _t0) * 1000)

# Sub-sections are built first, then assembled; key order matches the
# serialized payload order.
_metrics = {
    "row_in": None if row_in is None else int(row_in),
    "row_out": int(row_out),
    "partition_count": int(partition_count),
    "dedup_dropped": int(dedup_dropped),
}
_target = {
    "target_table": "silver_mcc",
    "partition_cols": [],
}
_timing = {
    "started_utc": _iso_utc(_started),
    "ended_utc": _iso_utc(_ended),
    "duration_ms": duration_ms,
}

payload = {
    "contract_version": "1.0",
    "layer": "silver",
    "run_id": run_id,
    "entity_code": entity_code,
    "load_mode": load_mode,
    "as_of_date": as_of_date_norm,
    "status": "SUCCESS",
    "metrics": _metrics,
    "table": _target,
    "timing": _timing,
    "quality": {"fail_fast_checks": fail_fast_checks},
    "notes": {"message": None},
}

# Hand the JSON-serialized payload back to the caller as the notebook exit value.
mssparkutils.notebook.exit(json.dumps(payload))