In [None]:
# ============================================================
# nb_silver_users
# ============================================================

# %run ./nb_silver_utils
from pyspark.sql import functions as F

# Load the raw bronze user table.
df_bronze = spark.table("bronze_user_raw")

# Declare every bronze column this notebook consumes.
# `current_age`, `retirement_age` and `address` are selected unconditionally
# by the Silver projection further down, so they belong to the input contract:
# listing them here lets a schema drift be reported by the validation helper
# instead of surfacing later as an unresolved-column error in `select`.
# NOTE(review): assumes assert_required_columns raises on a missing column —
# confirm against nb_silver_utils.
assert_required_columns(
    df_bronze,
    ["id", "current_age", "retirement_age", "birth_year", "birth_month",
     "gender", "address", "latitude", "longitude",
     "per_capita_income", "yearly_income", "total_debt",
     "credit_score", "num_credit_cards",
     "source_file", "ingestion_ts", "ingestion_date"]
)

# Rename the bronze key to its Silver name.
df = df_bronze.withColumnRenamed("id", "client_id")

# Text normalisation is delegated to the shared helper.
df = normalize_text(df, cols=["gender"])

# Decimal typing, applied in the order listed:
#   - coordinates       -> DECIMAL(9,6)
#   - monetary amounts  -> DECIMAL(18,2)
decimal_specs = [
    ("latitude", 9, 6),
    ("longitude", 9, 6),
    ("per_capita_income", 18, 2),
    ("yearly_income", 18, 2),
    ("total_debt", 18, 2),
]
for c, precision, scale in decimal_specs:
    df = cast_decimal(df, c, precision=precision, scale=scale)

# Technical columns, driven by the `source_file` column.
df = add_tech_columns(df, source_file_col="source_file")

# Record hash over all relevant business attributes.
hash_cols = [
    "client_id",
    "current_age",
    "retirement_age",
    "birth_year",
    "birth_month",
    "gender",
    "address",
    "latitude",
    "longitude",
    "per_capita_income",
    "yearly_income",
    "total_debt",
    "credit_score",
    "num_credit_cards",
]
# Only columns actually present on the DataFrame take part in the hash;
# the declared order of `hash_cols` is preserved.
available_cols = set(df.columns)
present_hash_cols = [name for name in hash_cols if name in available_cols]
df = add_record_hash(df, cols=present_hash_cols)

# Deduplicate on the natural key (bronze `id`, renamed to `client_id`),
# ordering candidates by `ingestion_ts`.
# NOTE(review): presumably keeps the most recent row per key — confirm in
# the deduplicate_latest helper.
df = deduplicate_latest(
    df,
    key_cols=["client_id"],
    order_col="ingestion_ts",
)

# Silver projection: business attributes first, then lineage/technical
# columns, in the exact order of the target layout.
silver_business_cols = [
    "client_id", "current_age", "retirement_age",
    "birth_year", "birth_month", "gender", "address",
    "latitude", "longitude",
    "per_capita_income", "yearly_income", "total_debt",
    "credit_score", "num_credit_cards",
]
silver_tech_cols = [
    "source_file", "ingestion_date", "ingestion_ts", "record_hash",
]
df = df.select(*silver_business_cols, *silver_tech_cols)

# Fail fast: the key must never be null. `limit(1)` lets the check stop at
# the first offending row instead of counting all of them.
null_key_rows = df.filter(F.col("client_id").isNull())
has_null_key = null_key_rows.limit(1).count() > 0
if has_null_key:
    raise ValueError("Null client_id detected in silver_users.")

# Persist the Silver table (full overwrite), then report success.
write_silver_users(df, table_name="silver_users", mode="overwrite")
print("silver_users successfully written.")

