In [1]:
# --------------------------------------------------
# Gold notebook parameters (Fabric-safe)
# --------------------------------------------------

def get_widget_or_default(widget_name: str, default_value: str) -> str:
    """Return the value of a notebook widget, or a fallback when it cannot be read.

    Args:
        widget_name: Name of the widget looked up through ``dbutils.widgets``.
        default_value: Value handed back when the lookup raises.

    Returns:
        The widget's value, or ``default_value`` if the lookup raised any
        ``Exception`` (this includes ``dbutils`` not being defined at all).
    """
    try:
        widget_value = dbutils.widgets.get(widget_name)
    except Exception:
        # Deliberate best-effort: any failure to read the widget means "use the default".
        widget_value = default_value
    return widget_value

# Run identifier: taken from the "gold_run_id" widget, "MANUAL_RUN" when it cannot be read.
gold_run_id = get_widget_or_default("gold_run_id", "MANUAL_RUN")

# Tables (aligned with your gold_* naming convention)
SILVER_TABLE = "silver_users"
GOLD_TABLE   = "gold_dim_user"

# Echo the effective parameters, one per line, so they appear in the cell output.
print(
    f"gold_run_id = {gold_run_id}",
    f"SILVER_TABLE = {SILVER_TABLE}",
    f"GOLD_TABLE   = {GOLD_TABLE}",
    sep="\n",
)


StatementMeta(, 836c83dd-791a-4892-aed0-df65acdd0082, 3, Finished, Available, Finished)

gold_run_id = MANUAL_RUN
SILVER_TABLE = silver_users
GOLD_TABLE   = gold_dim_user


In [2]:
# --------------------------------------------------
# Silver contract (from silver_users.yaml)
# --------------------------------------------------

# Column entries are (column name, Spark SQL type, nullable flag) tuples.
CONTRACT = dict(
    # Column(s) identifying one user.
    natural_key=["client_id"],
    # Business attributes, in the order they are projected.
    business_columns=[
        ("client_id", "BIGINT", False),
        ("current_age", "INT", True),
        ("retirement_age", "INT", True),
        ("birth_year", "INT", True),
        ("birth_month", "INT", True),
        ("gender", "STRING", True),
        ("address", "STRING", True),
        ("latitude", "DECIMAL(9,6)", True),
        ("longitude", "DECIMAL(9,6)", True),
        ("per_capita_income", "DECIMAL(18,2)", True),
        ("yearly_income", "DECIMAL(18,2)", True),
        ("total_debt", "DECIMAL(18,2)", True),
        ("credit_score", "INT", True),
        ("num_credit_cards", "INT", True),
    ],
    # Technical / lineage attributes, all declared non-nullable.
    technical_columns=[
        ("source_file", "STRING", False),
        ("ingestion_date", "DATE", False),
        ("ingestion_ts", "TIMESTAMP", False),
        ("record_hash", "STRING", False),
    ],
    # When False, any column outside the two lists above is rejected by the schema check.
    allow_additional_columns=False,
    # Deduplication rule: partition on `keys`, rank by `order_by`.
    dedup=dict(
        keys=["client_id"],
        order_by="ingestion_ts",
        strategy="keep_latest",
    ),
)

# Flat name lists derived from the contract; EXPECTED_COLS is business columns then technical ones.
BUSINESS_COLS = [col_name for col_name, _spark_type, _nullable in CONTRACT["business_columns"]]
TECH_COLS     = [col_name for col_name, _spark_type, _nullable in CONTRACT["technical_columns"]]
EXPECTED_COLS = [*BUSINESS_COLS, *TECH_COLS]


StatementMeta(, 836c83dd-791a-4892-aed0-df65acdd0082, 4, Finished, Available, Finished)

In [3]:
# --------------------------------------------------
# Silver read + structural validations
# --------------------------------------------------

from pyspark.sql import functions as F

# Load the Silver table named by SILVER_TABLE.
df_silver = spark.table(SILVER_TABLE)

# Compare the table's column names with the contract in both directions.
actual_cols = df_silver.columns
expected_names = set(EXPECTED_COLS)
actual_names = set(actual_cols)
missing = sorted(expected_names - actual_names)
extra   = sorted(actual_names - expected_names)

# Fail fast: a contract column absent from the table is always an error.
if missing:
    raise ValueError(f"[FAIL FAST] Missing columns in {SILVER_TABLE}: {missing}")

# Fail fast: columns outside the contract are an error unless the contract allows them.
extras_forbidden = not CONTRACT["allow_additional_columns"]
if extras_forbidden and extra:
    raise ValueError(f"[FAIL FAST] Additional columns not allowed in {SILVER_TABLE}: {extra}")

print(f"Schema validation OK for {SILVER_TABLE}. Column count = {len(actual_cols)}")


StatementMeta(, 836c83dd-791a-4892-aed0-df65acdd0082, 5, Finished, Available, Finished)

Schema validation OK for silver_users. Column count = 18


In [4]:
# --------------------------------------------------
# Canonical projection + contractual cast
# --------------------------------------------------

def cast_expr(col_name: str, spark_type: str):
    """Build a column expression that casts ``col_name`` to ``spark_type``, keeping its name.

    Args:
        col_name: Name of the source column; also used as the alias of the result.
        spark_type: Target type string handed to ``Column.cast``, e.g. ``BIGINT``,
            ``INT``, ``STRING``, ``DATE``, ``TIMESTAMP``, ``DECIMAL(18,2)``, ``DECIMAL(9,6)``.

    Returns:
        The cast column expression, aliased back to ``col_name``.
    """
    source_column = F.col(col_name)
    casted_column = source_column.cast(spark_type)
    return casted_column.alias(col_name)

# One cast expression per contract column: business columns first, then technical ones.
# The nullable flag of each entry is not used for the projection.
contract_columns = CONTRACT["business_columns"] + CONTRACT["technical_columns"]
select_exprs = [
    cast_expr(column_name, column_type)
    for column_name, column_type, _nullable in contract_columns
]

# Project Silver onto exactly the contract columns, in contract order.
df_proj = df_silver.select(*select_exprs)


StatementMeta(, 836c83dd-791a-4892-aed0-df65acdd0082, 6, Finished, Available, Finished)

In [5]:
# --------------------------------------------------
# Deduplication keep latest per ingestion_ts
# --------------------------------------------------

from pyspark.sql.window import Window

# Deduplication settings come from the contract so the window and the
# uniqueness check below always use the same keys.
dedup_keys = CONTRACT["dedup"]["keys"]
dedup_order_col = CONTRACT["dedup"]["order_by"]

# Ordering only on the contract column leaves ties to be broken arbitrarily by
# row_number(), so the surviving row could change from run to run. Every other
# projected column is added as a tie-breaker: rows that still tie are identical
# in all columns, so whichever one is kept yields the same result.
tie_breakers = [
    F.col(col_name).desc()
    for col_name in df_proj.columns
    if col_name not in dedup_keys and col_name != dedup_order_col
]

w = (
    Window
    .partitionBy(*dedup_keys)
    .orderBy(F.col(dedup_order_col).desc(), *tie_breakers)
)

# Keep the first-ranked row per key (latest by the contract's order column).
df_dedup = (
    df_proj
    .withColumn("_rn", F.row_number().over(w))
    .filter(F.col("_rn") == 1)
    .drop("_rn")
)

# Uniqueness check (fail fast): count the key values that still appear more than once.
dup_cnt = (
    df_dedup
    .groupBy(*dedup_keys)
    .count()
    .filter(F.col("count") > 1)
    .count()
)

if dup_cnt != 0:
    raise ValueError(
        f"[FAIL FAST] Deduplication failed: still have duplicates on {', '.join(dedup_keys)} (count={dup_cnt})"
    )

print(f"Dedup OK. Rows = {df_dedup.count()}")


StatementMeta(, 836c83dd-791a-4892-aed0-df65acdd0082, 7, Finished, Available, Finished)

Dedup OK. Rows = 2000


In [6]:
# --------------------------------------------------
# Gold technical columns 
# --------------------------------------------------

# Gold audit columns: the run identifier as a literal, and the current timestamp expression.
run_id_column = F.lit(gold_run_id)
load_ts_column = F.current_timestamp()

# Append both audit columns to the deduplicated rows.
df_with_run_id = df_dedup.withColumn("gold_run_id", run_id_column)
df_gold = df_with_run_id.withColumn("gold_load_ts", load_ts_column)


StatementMeta(, 836c83dd-791a-4892-aed0-df65acdd0082, 8, Finished, Available, Finished)

In [7]:

# --------------------------------------------------
# Gold load
# --------------------------------------------------
# Full refresh of the Gold table in ONE Delta commit.
# A separate TRUNCATE followed by an append is two independent operations: if the
# write fails in between, the table is left empty, and readers can see an empty
# table while the load runs. Overwrite mode swaps the data atomically, so a failed
# load leaves the previous contents in place.
# `overwriteSchema` is deliberately not set, so the existing table schema keeps
# being enforced on write (NOTE(review): confirm against the Delta version in use).
# The table is created by saveAsTable if it does not exist yet.
(
    df_gold
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(GOLD_TABLE)
)

print(f"Loaded {GOLD_TABLE} successfully.")


StatementMeta(, 836c83dd-791a-4892-aed0-df65acdd0082, 9, Finished, Available, Finished)

Loaded gold_dim_user successfully.


In [8]:
# --------------------------------------------------
# Post Load Control
# --------------------------------------------------

# Control 1: row count plus ingestion / load timestamp bounds of the Gold table.
summary_sql = f"""
SELECT
  count(*) AS row_count,
  min(ingestion_ts) AS min_ingestion_ts,
  max(ingestion_ts) AS max_ingestion_ts,
  max(gold_load_ts) AS last_gold_load_ts
FROM {GOLD_TABLE}
"""
spark.sql(summary_sql).show(truncate=False)

# Control 2: first 20 rows by client_id, a few columns only, for a visual sanity check.
# NOTE(review): this prints row-level attributes; clear the output before sharing
# the notebook if the table holds real customer data.
sample_sql = f"SELECT client_id, gender, current_age, yearly_income FROM {GOLD_TABLE} ORDER BY client_id LIMIT 20"
spark.sql(sample_sql).show(truncate=False)


StatementMeta(, 836c83dd-791a-4892-aed0-df65acdd0082, 10, Finished, Available, Finished)

+---------+--------------------------+--------------------------+--------------------------+
|row_count|min_ingestion_ts          |max_ingestion_ts          |last_gold_load_ts         |
+---------+--------------------------+--------------------------+--------------------------+
|2000     |2025-12-07 21:59:50.515224|2025-12-07 21:59:50.515224|2026-01-05 18:03:07.381773|
+---------+--------------------------+--------------------------+--------------------------+

+---------+------+-----------+-------------+
|client_id|gender|current_age|yearly_income|
+---------+------+-----------+-------------+
|0        |MALE  |33         |59613.00     |
|1        |FEMALE|43         |45360.00     |
|2        |MALE  |48         |27447.00     |
|3        |MALE  |49         |27943.00     |
|4        |FEMALE|54         |76431.00     |
|5        |MALE  |65         |20614.00     |
|6        |FEMALE|19         |55854.00     |
|7        |MALE  |74         |32682.00     |
|8        |MALE  |21         |38497.00 