In [None]:
#!/usr/bin/env python
# coding: utf-8

# ============================================================
# nb_silver_cards
# ------------------------------------------------------------
# Bronze -> Silver : bronze_card_raw -> silver_cards
# Pattern aligned with nb_silver_fx:
#   read -> structural assert -> rename -> normalize/cast -> tech cols
#   -> derives -> record hash -> dedupe -> project canonical -> fail fast -> write
#   -> exit payload v1.0
# ============================================================

from datetime import datetime, timezone
import time
import json
from pyspark.sql import functions as F

# ------------------------------------------------------------
# 0) Execution parameters (STANDARD ENTITY INTERFACE)
# ------------------------------------------------------------
# If `run_id` is not already defined in the namespace, the four run
# parameters are all given manual-run defaults. When `run_id` exists, the
# other three are expected to exist as well (nothing is defaulted then).
try:
    run_id
except NameError:
    _manual_stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    run_id = "manual-" + _manual_stamp
    # load_mode is "full" or "incremental"; as_of_date is YYYY-MM-DD or empty.
    entity_code, load_mode, as_of_date = "cards", "full", ""

def _iso_utc(dt: datetime) -> str:
    return dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# Start-of-run markers: an aware UTC datetime for the payload's
# "started_utc" field and an epoch float used to compute duration_ms.
_started = datetime.now(tz=timezone.utc)
_t0 = time.time()

# ------------------------------------------------------------
# 1) Import the Silver utilities
# ------------------------------------------------------------

In [None]:
%run ./nb_silver_utils

In [None]:
# ------------------------------------------------------------
# 2) Read the Bronze source
# ------------------------------------------------------------
df_bronze = spark.table("bronze_card_raw")

# ------------------------------------------------------------
# 2bis) Incremental filter (OPTIONAL) + row_in metric (STANDARD)
# ------------------------------------------------------------
# Normalize the run parameters once so that the filter and the exit payload
# use the same values (None and surrounding whitespace are tolerated).
_load_mode_norm = (load_mode or "").strip().lower()
_as_of_date_clean = (as_of_date or "").strip()

# Dimension / reference data: minimal incremental = ingestion_date == as_of_date.
# An empty as_of_date leaves the read unfiltered, even in incremental mode.
if _load_mode_norm == "incremental" and _as_of_date_clean:
    # Fail fast on a malformed date. Without this check, to_date() turns an
    # unparseable string into NULL (when ANSI mode is off), the equality below
    # matches no row, and an empty DataFrame silently flows to the write step.
    try:
        datetime.strptime(_as_of_date_clean, "%Y-%m-%d")
    except ValueError as exc:
        raise ValueError(
            f"Invalid as_of_date {as_of_date!r}: expected YYYY-MM-DD."
        ) from exc
    df_bronze = df_bronze.where(
        F.col("ingestion_date") == F.to_date(F.lit(_as_of_date_clean))
    )

# row_in is counted after the optional filter; this triggers a Spark job.
row_in = df_bronze.count()

# Value reported in the exit payload: the trimmed date, or None when empty.
as_of_date_norm = _as_of_date_clean or None

# ------------------------------------------------------------
# 3) Structural checks (fail fast) — frozen Bronze contract
# ------------------------------------------------------------
# Business columns, in contract order.
_bronze_business_cols = [
    "id",
    "client_id",
    "card_brand",
    "card_type",
    "card_number",
    "expires",
    "cvv",
    "has_chip",
    "num_cards_issued",
    "credit_limit",
    "acct_open_date",
    "year_pin_last_changed",
    "card_on_dark_web",
]
# Technical columns, in contract order.
_bronze_technical_cols = [
    "source_file",
    "ingestion_date",
    "ingestion_ts",
    "entity",
]
expected_bronze_cols = _bronze_business_cols + _bronze_technical_cols

# Neither helper is defined in this notebook (presumably loaded from
# nb_silver_utils). Judging by their names they reject missing and unexpected
# columns respectively — confirm the exact behaviour there.
assert_required_columns(df_bronze, expected_bronze_cols)
assert_no_additional_columns(df_bronze, expected_bronze_cols)

# ------------------------------------------------------------
# 4) Rename business columns
# ------------------------------------------------------------
# Bronze name -> Silver name. `expires` keeps its raw string under
# `expires_raw`; a typed `expires_month` is derived from it further down.
_business_renames = {
    "id": "card_id",
    "expires": "expires_raw",
}

df = df_bronze
for _bronze_name, _silver_name in _business_renames.items():
    df = df.withColumnRenamed(_bronze_name, _silver_name)

# ------------------------------------------------------------
# 5) Silver normalization / casting (per the Silver contract)
# ------------------------------------------------------------
# The normalize_text / parse_bool_yn / cast_decimal / parse_date_multi helpers
# are not defined in this notebook (presumably loaded from nb_silver_utils);
# the notes below restate the intent written in the original comments.

# 5.0 Identifiers are cast to BIGINT.
for _id_col in ("card_id", "client_id"):
    df = df.withColumn(_id_col, F.col(_id_col).cast("bigint"))

# 5.1 Text normalization (UPPER / TRIM).
df = normalize_text(df, cols=["card_brand", "card_type"])

# 5.2 Flags: STRING -> BOOLEAN for has_chip and card_on_dark_web.
for _flag_col in ("has_chip", "card_on_dark_web"):
    df = parse_bool_yn(df, col=_flag_col)

# 5.3 credit_limit: DOUBLE -> DECIMAL(18,2).
df = cast_decimal(df, col="credit_limit", precision=18, scale=2)

# 5.4 acct_open_date: STRING (Bronze contract) -> DATE (Silver contract),
#     trying several candidate formats.
_acct_open_date_formats = ["yyyy-MM-dd", "MM/dd/yyyy", "yyyy/MM/dd", "dd.MM.yyyy"]
df = parse_date_multi(
    df,
    col="acct_open_date",
    formats=_acct_open_date_formats,
)

# ------------------------------------------------------------
# 5bis) Silver derivations
# ------------------------------------------------------------
# expires_month: nullable DATE derived from expires_raw (STRING).
# Intended inputs are "MM/YYYY" or "MM/YY" (best effort, otherwise NULL).
def _expiry_first_of_month(year_width, year_prefix=""):
    """Build a Column parsing expires_raw as the first day of its month.

    The month is the first two characters of expires_raw (left-padded with
    "0" to width 2) and the year is its last `year_width` characters, with
    `year_prefix` prepended. The pieces are assembled as "MM/01/<year>" and
    parsed with the "MM/dd/yyyy" pattern.
    """
    raw = F.col("expires_raw")
    month_part = F.lpad(F.substring(raw, 1, 2), 2, "0")
    year_part = F.substring(raw, -year_width, year_width)
    candidate = F.concat(month_part, F.lit("/01/" + year_prefix), year_part)
    return F.to_date(candidate, "MM/dd/yyyy")


# Four-digit year first; the two-digit variant hard-codes the century "20".
mm_yyyy = _expiry_first_of_month(4)
mm_yy = _expiry_first_of_month(2, year_prefix="20")

# NOTE(review): the NULL fallback relies on to_date() returning NULL for a
# string that does not parse; with ANSI mode enabled it may raise instead —
# confirm against the cluster configuration.
df = df.withColumn("expires_month", F.coalesce(mm_yyyy, mm_yy))

# ------------------------------------------------------------
# 6) Technical columns & traceability
# ------------------------------------------------------------
# add_tech_columns is not defined in this notebook (presumably loaded from
# nb_silver_utils) — confirm there which columns it adds or rewrites, and how
# it uses the column named by source_file_col.
df = add_tech_columns(df, source_file_col="source_file")

# ------------------------------------------------------------
# 7) Business hash (audit)
# ------------------------------------------------------------
# Columns handed to add_record_hash: every business column of the Silver
# projection, including the derived expires_month. The technical columns
# (source_file, ingestion_date, ingestion_ts) are not part of the list.
_record_hash_cols = [
    "card_id",
    "client_id",
    "card_brand",
    "card_type",
    "card_number",
    "expires_raw",
    "expires_month",
    "cvv",
    "has_chip",
    "num_cards_issued",
    "credit_limit",
    "acct_open_date",
    "year_pin_last_changed",
    "card_on_dark_web",
]

# The `record_hash` column selected later is presumably produced by this
# helper — confirm in nb_silver_utils.
df = add_record_hash(df, cols=_record_hash_cols)

# ------------------------------------------------------------
# 8) Silver deduplication (keep latest ingestion_ts)
# ------------------------------------------------------------
# Row counts are taken on both sides of the deduplication so the exit payload
# can report how many rows were dropped. Each count() triggers a Spark job.
_dedup_key_cols = ["card_id"]

_pre_dedup_count = df.count()
df = deduplicate_latest(df, key_cols=_dedup_key_cols, order_col="ingestion_ts")
_post_dedup_count = df.count()

dedup_dropped = int(_pre_dedup_count - _post_dedup_count)

# ------------------------------------------------------------
# 9) Final projection of the Silver contract (frozen canonical order)
#    + "no additional columns" check on the Silver side
# ------------------------------------------------------------
_silver_business_cols = [
    "card_id",
    "client_id",
    "card_brand",
    "card_type",
    "card_number",
    "expires_raw",
    "expires_month",
    "cvv",
    "has_chip",
    "num_cards_issued",
    "credit_limit",
    "acct_open_date",
    "year_pin_last_changed",
    "card_on_dark_web",
]
_silver_technical_cols = [
    "source_file",
    "ingestion_date",
    "ingestion_ts",
    "record_hash",
]
canonical_silver_cols = _silver_business_cols + _silver_technical_cols

# The select fixes both the column set and the column order; any other
# column still present on df (e.g. the Bronze `entity` column) is dropped.
df = df.select(canonical_silver_cols)
assert_no_additional_columns(df, canonical_silver_cols)

# ------------------------------------------------------------
# 9bis) Minimal fail fast (project policy)
# ------------------------------------------------------------
# (column, check name, error message), evaluated in this order:
#   - card_id: non-nullable natural key per the Silver contract
#   - client_id: NOT NULL for cards per the agreed project policy
_not_null_rules = (
    (
        "card_id",
        "card_id_not_null",
        "Natural key NULL detected in cards (card_id).",
    ),
    (
        "client_id",
        "client_id_not_null",
        "Invalid cards detected: client_id IS NULL (required by Silver governance policy).",
    ),
)

fail_fast_checks = []
for _rule_col, _rule_name, _rule_message in _not_null_rules:
    # limit(1) keeps the probe to "is there at least one NULL row".
    _null_probe = df.filter(F.col(_rule_col).isNull()).limit(1)
    if _null_probe.count() > 0:
        raise ValueError(_rule_message)
    # Only reached when the check passed, so the recorded flag is always True.
    fail_fast_checks.append({"name": _rule_name, "passed": True})

# ------------------------------------------------------------
# 10) Silver write (no partitioning)
# ------------------------------------------------------------
# Metrics reported in the exit payload; the table is not partitioned, so the
# partition count is fixed at 0. count() triggers a Spark job.
partition_count = 0
row_out = df.count()

# NOTE(review): mode is "overwrite" whatever load_mode is. When the
# incremental filter is active, df only holds the rows of one ingestion_date —
# confirm in write_silver_cards that replacing the table with that slice is
# the intended behaviour.
_write_options = {
    "table_name": "silver_cards",
    "mode": "overwrite",
}
write_silver_cards(df, **_write_options)

# ------------------------------------------------------------
# 11) End of notebook — payload v1.0
# ------------------------------------------------------------
_ended = datetime.now(timezone.utc)
duration_ms = int((time.time() - _t0) * 1000)

# Sections of the exit payload, built separately and assembled below in the
# contract's key order.
_payload_metrics = {
    "row_in": None if row_in is None else int(row_in),
    "row_out": int(row_out),
    "partition_count": int(partition_count),
    "dedup_dropped": int(dedup_dropped),
}
_payload_table = {
    "target_table": "silver_cards",
    "partition_cols": [],
}
_payload_timing = {
    "started_utc": _iso_utc(_started),
    "ended_utc": _iso_utc(_ended),
    "duration_ms": duration_ms,
}
_payload_quality = {"fail_fast_checks": fail_fast_checks}
_payload_notes = {"message": None}

# This point is only reached when every earlier step succeeded, hence the
# constant "SUCCESS" status.
payload = {
    "contract_version": "1.0",
    "layer": "silver",
    "run_id": run_id,
    "entity_code": entity_code,
    "load_mode": load_mode,
    "as_of_date": as_of_date_norm,
    "status": "SUCCESS",
    "metrics": _payload_metrics,
    "table": _payload_table,
    "timing": _payload_timing,
    "quality": _payload_quality,
    "notes": _payload_notes,
}

# Ends the notebook run, passing the JSON-serialized payload as exit value.
mssparkutils.notebook.exit(json.dumps(payload))
