In [None]:
#!/usr/bin/env python
# coding: utf-8

# ============================================================
# nb_silver_users
# ------------------------------------------------------------
# Bronze -> Silver : bronze_user_raw -> silver_users
# Pattern aligné nb_silver_fx :
#   read -> structural assert -> rename -> normalize/cast -> tech cols
#   -> record hash -> dedupe -> project canonical -> fail fast -> write
#   -> exit payload v1.0
# ============================================================

from datetime import datetime, timezone
import time
import json
from pyspark.sql import functions as F

# ------------------------------------------------------------
# 0) Execution parameters (STANDARD ENTITY INTERFACE)
# ------------------------------------------------------------
# The caller may inject any subset of these names before this cell runs.
# Each missing name is defaulted independently, so a caller that supplies
# only `run_id` still gets usable values for the remaining parameters.
_param_defaults = {
    "run_id": f"manual-{datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')}",
    "entity_code": "users",
    "load_mode": "full",   # full|incremental
    "as_of_date": "",      # YYYY-MM-DD or empty
}
for _param_name, _param_default in _param_defaults.items():
    # setdefault keeps an injected value and only fills in absent names.
    globals().setdefault(_param_name, _param_default)

def _iso_utc(dt: datetime) -> str:
    """Return *dt* converted to UTC, formatted as ``YYYY-MM-DDTHH:MM:SSZ``.

    Sub-second precision is not part of the format and is therefore dropped.
    """
    as_utc = dt.astimezone(timezone.utc)
    return f"{as_utc:%Y-%m-%dT%H:%M:%SZ}"

# Capture the start of the run in two forms: an epoch float used later to
# compute the elapsed duration, and a timezone-aware UTC datetime that is
# reported as the start timestamp in the exit payload.
_t0 = time.time()
_started = datetime.now(timezone.utc)

# ------------------------------------------------------------
# 1) Import des utilitaires Silver
# ------------------------------------------------------------

In [None]:
%run ./nb_silver_utils

In [None]:
# ------------------------------------------------------------
# 2) Read the Bronze source
# ------------------------------------------------------------
df_bronze = spark.table("bronze_user_raw")

# ------------------------------------------------------------
# 2bis) Incremental filter (OPTIONAL) + row_in metric (STANDARD)
# ------------------------------------------------------------
# Minimal incremental load for USERS: keep ingestion_date == as_of_date.
# Both parameters are optional: a None or blank value disables the filter.
_load_mode_key = (load_mode or "").strip().lower()
_has_as_of_date = bool((as_of_date or "").strip())

if _load_mode_key == "incremental" and _has_as_of_date:
    _as_of = F.to_date(F.lit(as_of_date))
    df_bronze = df_bronze.where(F.col("ingestion_date") == _as_of)

# Number of rows entering the Silver transformation (after the optional filter).
row_in = df_bronze.count()

# A blank or missing as_of_date is reported as None rather than "".
as_of_date_norm = as_of_date if _has_as_of_date else None

# ------------------------------------------------------------
# 3) Structural checks (fail fast) — Bronze contract (frozen)
# ------------------------------------------------------------
expected_bronze_cols = [
    # identity / demographics
    "id", "current_age", "retirement_age", "birth_year", "birth_month", "gender",
    # location
    "address", "latitude", "longitude",
    # finances
    "per_capita_income", "yearly_income", "total_debt",
    "credit_score", "num_credit_cards",
    # ingestion metadata
    "source_file", "ingestion_date", "ingestion_ts", "entity",
]

# The same list drives both checks. Presumably the first rejects missing
# columns and the second rejects unexpected ones — helper semantics are
# defined outside this notebook; confirm there.
assert_required_columns(df_bronze, expected_bronze_cols)
assert_no_additional_columns(df_bronze, expected_bronze_cols)

# ------------------------------------------------------------
# 4) Rename business columns
# 5) Silver normalizations (typing + standardization)
# ------------------------------------------------------------
# 4 / 5.0  Bronze `id` becomes `client_id`, cast to BIGINT (Silver contract).
df = (
    df_bronze
    .withColumnRenamed("id", "client_id")
    .withColumn("client_id", F.col("client_id").cast("bigint"))
)

# 5.1 Text normalization
df = normalize_text(df, cols=["gender", "address"])

# 5.2 Geo typing (DOUBLE -> DECIMAL(9,6))
# 5.3 Financial typing (DOUBLE -> DECIMAL(18,2))
# Each entry is (column, precision, scale); the casts are applied in order.
_decimal_specs = [
    ("latitude", 9, 6),
    ("longitude", 9, 6),
    ("per_capita_income", 18, 2),
    ("yearly_income", 18, 2),
    ("total_debt", 18, 2),
]
for _col_name, _precision, _scale in _decimal_specs:
    df = cast_decimal(df, col=_col_name, precision=_precision, scale=_scale)

# ------------------------------------------------------------
# 6) Technical columns & traceability
# ------------------------------------------------------------
df = add_tech_columns(df, source_file_col="source_file")

# ------------------------------------------------------------
# 7) Business hash (audit)
# ------------------------------------------------------------
# Only the business attributes feed the hash; the ingestion metadata
# columns (source_file, ingestion_date, ingestion_ts) are not listed.
_hash_input_cols = [
    "client_id",
    "current_age",
    "retirement_age",
    "birth_year",
    "birth_month",
    "gender",
    "address",
    "latitude",
    "longitude",
    "per_capita_income",
    "yearly_income",
    "total_debt",
    "credit_score",
    "num_credit_cards",
]
df = add_record_hash(df, cols=_hash_input_cols)

# ------------------------------------------------------------
# 8) Silver deduplication (keep latest ingestion_ts)
# ------------------------------------------------------------
# Row counts are taken on both sides of the step so the number of dropped
# rows can be reported as a metric.
_dedup_keys = ["client_id"]

_pre_dedup_count = df.count()
df = deduplicate_latest(df, key_cols=_dedup_keys, order_col="ingestion_ts")
_post_dedup_count = df.count()

dedup_dropped = int(_pre_dedup_count - _post_dedup_count)

# ------------------------------------------------------------
# 9) Final projection of the Silver contract (frozen canonical order)
#    + "no additional columns" check on the Silver side (constraint)
# ------------------------------------------------------------
_business_cols = [
    "client_id",
    "current_age",
    "retirement_age",
    "birth_year",
    "birth_month",
    "gender",
    "address",
    "latitude",
    "longitude",
    "per_capita_income",
    "yearly_income",
    "total_debt",
    "credit_score",
    "num_credit_cards",
]
_lineage_cols = ["source_file", "ingestion_date", "ingestion_ts"]

# Canonical order: business attributes, then lineage columns, then the hash.
canonical_silver_cols = _business_cols + _lineage_cols + ["record_hash"]

df = df.select(*canonical_silver_cols)
assert_no_additional_columns(df, canonical_silver_cols)

# ------------------------------------------------------------
# 9bis) Minimal fail fast (contract)
# ------------------------------------------------------------
fail_fast_checks = []

# limit(1) keeps the probe cheap: one offending row is enough to abort.
_null_key_probe = df.where(F.col("client_id").isNull()).limit(1)
if _null_key_probe.count() > 0:
    raise ValueError("Natural key NULL detected in users (client_id).")

# Reaching this point means the check passed; record it for the payload.
fail_fast_checks.append({"name": "client_id_not_null", "passed": True})

# ------------------------------------------------------------
# 10) Silver write (no partitioning)
# ------------------------------------------------------------
# The target table is not partitioned, so the partition metric is a constant.
partition_count = 0
row_out = df.count()

# NOTE(review): the write mode is "overwrite" regardless of load_mode —
# confirm that an incremental run is not expected to preserve rows already
# present in silver_users.
write_silver_users(df, table_name="silver_users", mode="overwrite")

# ------------------------------------------------------------
# 11) End of notebook — payload v1.0
# ------------------------------------------------------------
_ended = datetime.now(timezone.utc)
duration_ms = int((time.time() - _t0) * 1000)

# Row-level metrics collected along the pipeline.
_metrics = {
    "row_in": None if row_in is None else int(row_in),
    "row_out": int(row_out),
    "partition_count": int(partition_count),
    "dedup_dropped": int(dedup_dropped),
}

# Start/end timestamps in UTC plus the elapsed time in milliseconds.
_timing = {
    "started_utc": _iso_utc(_started),
    "ended_utc": _iso_utc(_ended),
    "duration_ms": duration_ms,
}

# Key order is kept stable so the serialized payload layout does not change.
payload = {
    "contract_version": "1.0",
    "layer": "silver",
    "run_id": run_id,
    "entity_code": entity_code,
    "load_mode": load_mode,
    "as_of_date": as_of_date_norm,
    "status": "SUCCESS",
    "metrics": _metrics,
    "table": {"target_table": "silver_users", "partition_cols": []},
    "timing": _timing,
    "quality": {"fail_fast_checks": fail_fast_checks},
    "notes": {"message": None},
}

# Hand the JSON payload back to the caller as the notebook exit value.
mssparkutils.notebook.exit(json.dumps(payload))
