In [None]:
#!/usr/bin/env python
# coding: utf-8

# ============================================================
# nb_silver_transactions
# ------------------------------------------------------------
# Bronze -> Silver : bronze_transaction_raw -> silver_transactions
# Pattern aligné nb_silver_fx :
#   read -> structural assert -> rename -> normalize/cast -> derives -> tech cols
#   -> record hash -> dedupe -> project canonical -> fail fast -> partition validation -> write
#   -> exit payload v1.0
# ============================================================

from datetime import datetime, timezone
import time
import json
from pyspark.sql import functions as F

# ------------------------------------------------------------
# 0) Execution parameters (STANDARD ENTITY INTERFACE)
# ------------------------------------------------------------
# When no run_id is already defined in the notebook namespace (manual run),
# fall back to a timestamped manual run_id and the default parameter set.
if "run_id" not in globals():
    _manual_stamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    run_id = f"manual-{_manual_stamp}"
    entity_code = "transactions"
    load_mode = "full"     # full|incremental
    as_of_date = ""        # YYYY-MM-DD or empty

def _iso_utc(dt: datetime) -> str:
    """Return *dt* converted to UTC and rendered as ``YYYY-MM-DDTHH:MM:SSZ``."""
    as_utc = dt.astimezone(timezone.utc)
    return f"{as_utc:%Y-%m-%dT%H:%M:%SZ}"

# Start-of-run markers: _t0 is the wall-clock reference used to compute
# duration_ms at the end of the notebook; _started is the UTC start timestamp
# reported in the exit payload.
_t0 = time.time()
_started = datetime.now(timezone.utc)

# ------------------------------------------------------------
# 1) Import des utilitaires Silver
# ------------------------------------------------------------

In [None]:
%run ./nb_silver_utils

In [None]:
# ------------------------------------------------------------
# 2) Read the Bronze source
# ------------------------------------------------------------
df_bronze = spark.table("bronze_transaction_raw")

# ------------------------------------------------------------
# 2bis) Optional incremental filter + row_in metric (STANDARD)
# ------------------------------------------------------------
# Minimal incremental mode: when load_mode is "incremental" and as_of_date is
# provided, keep only the rows whose Bronze "date" column (renamed to txn_ts
# further down in this notebook) falls on as_of_date.
# NOTE(review): "incremental" with an empty as_of_date falls through to an
# unfiltered read, exactly like a full load — confirm this is intended.
_load_mode_clean = (load_mode or "").strip().lower()
_as_of_date_clean = (as_of_date or "").strip()

if _load_mode_clean == "incremental" and _as_of_date_clean:
    # Validate the parameter before handing it to Spark: an unparseable value
    # can turn into a NULL date, which makes the equality below match no row
    # and lets the run continue with an empty dataset instead of failing.
    try:
        _as_of_day = datetime.strptime(_as_of_date_clean, "%Y-%m-%d").date()
    except ValueError as exc:
        raise ValueError(
            f"Invalid as_of_date {as_of_date!r}: expected format YYYY-MM-DD."
        ) from exc
    df_bronze = df_bronze.where(
        F.to_date(F.col("date")) == F.to_date(F.lit(_as_of_day.isoformat()))
    )

# row_in: number of Bronze rows entering the Silver pipeline (after the filter).
row_in = df_bronze.count()

# Normalized as_of_date reported in the exit payload: stripped value, or None
# when the parameter is empty/blank.
as_of_date_norm = _as_of_date_clean or None

# ------------------------------------------------------------
# 3) Structural checks (fail fast) — frozen Bronze contract
# ------------------------------------------------------------
# Exact set of columns the Bronze table must expose: business columns first,
# then the ingestion/traceability columns.
expected_bronze_cols = [
    # business columns
    "id", "date", "client_id", "card_id", "amount", "use_chip",
    "merchant_id", "merchant_city", "merchant_state", "zip", "mcc", "errors",
    # technical columns
    "source_file", "ingestion_date", "ingestion_ts", "entity",
]

# Both helpers come from nb_silver_utils; they are called in the same order
# as before (required columns first, then no extra columns).
for _structural_check in (assert_required_columns, assert_no_additional_columns):
    _structural_check(df_bronze, expected_bronze_cols)

# ------------------------------------------------------------
# 4) Rename business columns (Bronze name -> Silver name)
# ------------------------------------------------------------
df = df_bronze
for _bronze_name, _silver_name in (
    ("id", "transaction_id"),
    ("date", "txn_ts"),
    ("errors", "error_code"),
):
    df = df.withColumnRenamed(_bronze_name, _silver_name)

# ------------------------------------------------------------
# 5) Silver normalization / casting (per the Silver contract)
# ------------------------------------------------------------
# 5.0 Identifier columns -> BIGINT, error_code -> INT
for _id_col in ("transaction_id", "client_id", "card_id", "merchant_id"):
    df = df.withColumn(_id_col, F.col(_id_col).cast("bigint"))
df = df.withColumn("error_code", F.col("error_code").cast("int"))

# 5.1 txn_ts -> TIMESTAMP (the original note says Bronze already stores a
#     TIMESTAMP; the cast is applied anyway to enforce the type)
df = df.withColumn("txn_ts", F.col("txn_ts").cast("timestamp"))

# 5.2 amount -> DECIMAL(18,2) through the cast_decimal helper from
#     nb_silver_utils (Bronze contract: STRING; Silver contract: decimal(18,2))
df = cast_decimal(df, col="amount", precision=18, scale=2)

# 5.3 zip : DOUBLE -> STRING (Bronze contract: DOUBLE; Silver contract: string)
#     A direct DOUBLE -> STRING cast renders 58523.0 as "58523.0". Going
#     through DECIMAL(10,0) first keeps only the integer digits ("58523").
#     NOTE(review): leading zeros are already lost in a DOUBLE; if zip must be
#     a fixed-width postal code, an lpad may be needed — confirm with the
#     Silver contract.
df = df.withColumn("zip", F.col("zip").cast("decimal(10,0)").cast("string"))

# 5.4 mcc : INT -> mcc_code (string left-padded with "0" to 4 characters;
#     Silver contract: mcc_code is a non-nullable string)
df = df.withColumn("mcc_code", F.lpad(F.col("mcc").cast("string"), 4, "0"))

# ------------------------------------------------------------
# 5bis) Silver derived columns (dates + success flag)
# ------------------------------------------------------------
_txn_ts = F.col("txn_ts")

df = (
    df
    # txn_date: calendar date of txn_ts
    .withColumn("txn_date", F.to_date(_txn_ts))
    # txn_month: txn_ts truncated to the month, as a DATE (first day of month)
    .withColumn("txn_month", F.date_trunc("month", _txn_ts).cast("date"))
    # is_success: true when error_code equals 0
    .withColumn("is_success", F.col("error_code") == F.lit(0))
)

# ------------------------------------------------------------
# 6) Technical & traceability columns
# ------------------------------------------------------------
df = add_tech_columns(df, source_file_col="source_file")

# ------------------------------------------------------------
# 7) Business record hash (audit)
# ------------------------------------------------------------
# Columns fed to the add_record_hash helper, in this exact order.
_record_hash_cols = [
    "transaction_id",
    "txn_ts", "txn_date", "txn_month",
    "client_id", "card_id", "merchant_id",
    "mcc_code",
    "amount",
    "use_chip",
    "merchant_city", "merchant_state", "zip",
    "error_code", "is_success",
]
df = add_record_hash(df, cols=_record_hash_cols)

# ------------------------------------------------------------
# 8) Silver deduplication (keep the latest ingestion_ts per key)
# ------------------------------------------------------------
# Row counts are taken on both sides of the dedup so that the number of
# dropped rows can be reported in the exit payload.
_pre_dedup_count = df.count()
df = deduplicate_latest(df, key_cols=["transaction_id"], order_col="ingestion_ts")
_post_dedup_count = df.count()

dedup_dropped = int(_pre_dedup_count - _post_dedup_count)

# ------------------------------------------------------------
# 9) Final projection onto the Silver contract (frozen canonical order)
#    + "no additional columns" check on the Silver side
# ------------------------------------------------------------
canonical_silver_cols = [
    # keys and time columns
    "transaction_id", "txn_ts", "txn_date", "txn_month",
    # parties
    "client_id", "card_id", "merchant_id", "mcc_code",
    # transaction attributes
    "amount", "use_chip", "merchant_city", "merchant_state", "zip",
    # outcome
    "error_code", "is_success",
    # technical / audit columns
    "source_file", "ingestion_date", "ingestion_ts", "record_hash",
]

# Keep only the contract columns, in contract order, then re-assert the shape.
df = df.select(canonical_silver_cols)
assert_no_additional_columns(df, canonical_silver_cols)

# ------------------------------------------------------------
# 9bis) Minimal fail-fast + key quality rules (Silver contract)
# ------------------------------------------------------------
# Every rule below is a NOT NULL rule. The NULL counts of all checked columns
# are computed in a single aggregation (one Spark job) instead of one job per
# column; rules are then evaluated in the same order, with the same error
# messages and the same fail_fast_checks entries as before.

# (column, check name reported in the payload, error message), in order.
_not_null_rules = [
    ("transaction_id", "transaction_id_not_null",
     "Natural key NULL detected in transactions (transaction_id)."),
    ("txn_ts", "txn_ts_not_null",
     "Invalid transaction detected: txn_ts IS NULL."),
    # txn_month is the partition column
    ("txn_month", "txn_month_not_null",
     "Invalid transaction detected: txn_month IS NULL."),
    ("amount", "amount_not_null",
     "Invalid transaction detected: amount IS NULL."),
    ("mcc_code", "mcc_code_not_null",
     "Invalid transaction detected: mcc_code IS NULL."),
]
# Business keys share one grouped check entry in the payload.
_business_key_cols = ["client_id", "card_id", "merchant_id", "error_code", "is_success"]

_checked_cols = [_rule[0] for _rule in _not_null_rules] + _business_key_cols

# count(when(isNull, 1)) counts the NULL values of each column; a global
# aggregation returns exactly one row, with 0 counts on an empty DataFrame.
_null_counts = df.agg(
    *[
        F.count(F.when(F.col(_c).isNull(), F.lit(1))).alias(_c)
        for _c in _checked_cols
    ]
).first()

fail_fast_checks = []

for _col_name, _check_name, _message in _not_null_rules:
    if _null_counts[_col_name] > 0:
        raise ValueError(_message)
    fail_fast_checks.append({"name": _check_name, "passed": True})

# business keys NOT NULL (Silver contract)
for k in _business_key_cols:
    if _null_counts[k] > 0:
        raise ValueError(f"Invalid transaction detected: {k} IS NULL.")
fail_fast_checks.append({"name": "required_business_keys_not_null", "passed": True})

# ------------------------------------------------------------
# 9ter) Partition validation (policy)
# ------------------------------------------------------------
# According to the original note, the Silver contract requires the partition
# cardinality to stay under 240 months (the helper's default threshold).
# NOTE(review): the threshold lives in validate_partitions (nb_silver_utils)
# and is not visible here — confirm.
validate_partitions(df, partition_cols=["txn_month"])

# Metrics
# Only a column projection is applied to df after the deduplication step, so
# the output row count equals the post-dedup count already computed; reusing
# it avoids one more full Spark job. Keep this in sync if a row-level filter
# is ever added between the dedup and the write.
row_out = _post_dedup_count
partition_count = df.select("txn_month").distinct().count()

# ------------------------------------------------------------
# 10) Silver write (the exit payload reports txn_month as partition column)
# ------------------------------------------------------------
# The write is delegated to the write_silver_transactions helper from
# nb_silver_utils, always with mode="overwrite".
# NOTE(review): in incremental mode df only holds the rows of as_of_date;
# whether "overwrite" replaces the whole table or only the touched
# partition(s) depends on the helper's implementation — confirm before
# relying on incremental runs.
write_silver_transactions(
    df,
    table_name="silver_transactions",
    mode="overwrite"
)

# ------------------------------------------------------------
# 11) End of notebook — exit payload v1.0
# ------------------------------------------------------------
_ended = datetime.now(timezone.utc)
duration_ms = int((time.time() - _t0) * 1000)

# Row/partition metrics collected along the pipeline.
_payload_metrics = {
    "row_in": None if row_in is None else int(row_in),
    "row_out": int(row_out),
    "partition_count": int(partition_count),
    "dedup_dropped": int(dedup_dropped),
}

# Run timing, with timestamps rendered as UTC ISO strings.
_payload_timing = {
    "started_utc": _iso_utc(_started),
    "ended_utc": _iso_utc(_ended),
    "duration_ms": duration_ms,
}

# Key order is kept identical so the serialized JSON is unchanged.
payload = {
    "contract_version": "1.0",
    "layer": "silver",
    "run_id": run_id,
    "entity_code": entity_code,
    "load_mode": load_mode,
    "as_of_date": as_of_date_norm,
    "status": "SUCCESS",
    "metrics": _payload_metrics,
    "table": {
        "target_table": "silver_transactions",
        "partition_cols": ["txn_month"],
    },
    "timing": _payload_timing,
    "quality": {"fail_fast_checks": fail_fast_checks},
    "notes": {"message": None},
}

# Hand the JSON payload back to the caller and stop the notebook.
mssparkutils.notebook.exit(json.dumps(payload))
