In [2]:
%run ./nb_gold_utils

StatementMeta(, 21a83038-4a12-420e-bafc-6d0da8e743e8, 5, Finished, Available, Finished)

In [3]:


from pyspark.sql import functions as F
import json
# --------------------------------------------------
# 0) Parameters (data driven from gold_log_steps)
# --------------------------------------------------

# Table queried by read_ctx_from_steps() to find this notebook's RUNNING step.
LOG_STEPS_TABLE = "gold_log_steps"
# Compared with the notebook_name column of that table and with ctx["notebook_name"].
THIS_NOTEBOOK = "nb_gold_dim_date"

def _json_load_safe(s: str) -> dict:
    try:
        return json.loads(s) if s else {}
    except Exception:
        return {}

def read_ctx_from_steps() -> dict:
    """Return the ``ctx`` object stored with this notebook's latest RUNNING step.

    Reads ``LOG_STEPS_TABLE``, keeps the rows whose ``notebook_name`` equals
    ``THIS_NOTEBOOK`` and whose ``status`` is ``"RUNNING"``, takes the one with
    the greatest ``start_ts`` and decodes its ``payload_json`` column.

    Returns:
        The ``ctx`` dict found in the payload, unmodified.

    Raises:
        ValueError: if no RUNNING row exists, if the payload carries no
            non-empty ``ctx`` dict, if ``ctx.notebook_name`` differs from
            ``THIS_NOTEBOOK``, or if ``gold_run_id``, ``step_exec_id`` or
            ``entity_code`` is absent, null or blank.
    """
    latest_running_step = (
        spark.table(LOG_STEPS_TABLE)
        .filter(
            (F.col("notebook_name") == THIS_NOTEBOOK) &
            (F.col("status") == "RUNNING")
        )
        .orderBy(F.col("start_ts").desc())
        .limit(1)
    )

    rows = latest_running_step.collect()
    if not rows:
        raise ValueError(
            f"[{THIS_NOTEBOOK}] No RUNNING step found in {LOG_STEPS_TABLE}. "
            "Dispatcher must write RUNNING ctx before execution."
        )

    payload = _json_load_safe(rows[0]["payload_json"])
    ctx = payload.get("ctx", {})
    if not isinstance(ctx, dict) or not ctx:
        raise ValueError(f"[{THIS_NOTEBOOK}] RUNNING step payload_json has no ctx.")

    # Hard validations (banking-grade)
    if str(ctx.get("notebook_name", "")).strip() != THIS_NOTEBOOK:
        raise ValueError(f"[{THIS_NOTEBOOK}] ctx.notebook_name mismatch: {ctx.get('notebook_name')}")

    for required_key in ("gold_run_id", "step_exec_id", "entity_code"):
        value = ctx.get(required_key)
        # A JSON null arrives as None; str(None) is "None", which would slip
        # past a plain blank-string check, so None is rejected explicitly.
        if value is None or str(value).strip() == "":
            raise ValueError(f"[{THIS_NOTEBOOK}] ctx.{required_key} missing")

    return ctx

# Execution context of the RUNNING step logged for this notebook.
ctx = read_ctx_from_steps()

# normalize_run_id is not defined in this notebook (presumably it comes from the
# %run of nb_gold_utils — confirm).
gold_run_id = normalize_run_id(ctx["gold_run_id"])
step_exec_id = ctx["step_exec_id"]
entity_code = ctx["entity_code"]
load_mode   = ctx.get("load_mode", "")
as_of_date  = ctx.get("as_of_date", "")

# dim_date specific
# dict.get(key, default) only falls back when the key is absent. A ctx that
# carries null or a blank string for a bound must also get the default,
# otherwise the bound reaches to_date() as 'None' / '' and yields NULL dates.
start_date  = str(ctx.get("start_date") or "").strip() or "2015-01-01"
end_date    = str(ctx.get("end_date") or "").strip() or "2035-12-31"

print(f"gold_run_id    = {gold_run_id}")
print(f"step_exec_id   = {step_exec_id}")
print(f"entity_code    = {entity_code}")
print(f"start_date     = {start_date}")
print(f"end_date       = {end_date}")

# Name used for the contract lookup, the target table and the error messages below.
ENTITY = "gold_dim_date"



StatementMeta(, 21a83038-4a12-420e-bafc-6d0da8e743e8, 6, Finished, Available, Finished)

gold_run_id = MANUAL_RUN
start_date  = 2015-01-01
end_date    = 2035-12-31


In [4]:
# --------------------------------------------------
# 1) Generate date spine (inclusive)
# --------------------------------------------------
# One row per calendar day from start_date to end_date, both bounds included.
# The bounds are passed as literal values (F.lit) instead of being spliced into
# SQL text, so a quote or other stray character in a ctx-supplied value cannot
# change the statement that is executed.
df_dates = spark.range(1).select(
    F.explode(
        F.sequence(
            F.to_date(F.lit(start_date)),
            F.to_date(F.lit(end_date)),
            F.expr("interval 1 day"),
        )
    ).alias("date_value")
)

StatementMeta(, 21a83038-4a12-420e-bafc-6d0da8e743e8, 7, Finished, Available, Finished)

In [5]:
# --------------------------------------------------
# 2) Derive calendar attributes
#    Notes:
#    - ISO weekday: Spark dayofweek() => 1=Sunday ... 7=Saturday
#      We convert to ISO (Mon=1..Sun=7)
# --------------------------------------------------
def _add_calendar_attributes(dates_df, run_id):
    """Append the calendar attributes and audit columns to ``dates_df``.

    ``dates_df`` must expose a ``date_value`` column. Columns are added one
    after another, in the listed order, so ``is_weekend`` can refer to the
    ``day_of_week_iso`` column created before it.
    """
    spark_dow = F.dayofweek("date_value")  # 1=Sunday ... 7=Saturday

    derived_columns = [
        ("date_id", F.date_format("date_value", "yyyyMMdd").cast("int")),
        ("day_of_month", F.dayofmonth("date_value").cast("int")),
        # Sunday (1) becomes 7; every other day shifts down by one => Mon=1..Sun=7.
        (
            "day_of_week_iso",
            F.when(spark_dow == 1, F.lit(7)).otherwise(spark_dow - 1).cast("int"),
        ),
        ("day_name", F.date_format("date_value", "EEEE")),
        ("week_of_year", F.weekofyear("date_value").cast("int")),
        ("month_number", F.month("date_value").cast("int")),
        ("month_name", F.date_format("date_value", "MMMM")),
        ("quarter_number", F.quarter("date_value").cast("int")),
        ("year_number", F.year("date_value").cast("int")),
        ("is_weekend", F.col("day_of_week_iso").isin(6, 7).cast("boolean")),
        ("gold_run_id", F.lit(run_id)),
        ("gold_load_ts", F.current_timestamp()),
    ]

    enriched = dates_df
    for column_name, expression in derived_columns:
        enriched = enriched.withColumn(column_name, expression)
    return enriched


df_dim_date = _add_calendar_attributes(df_dates, gold_run_id)

StatementMeta(, 21a83038-4a12-420e-bafc-6d0da8e743e8, 8, Finished, Available, Finished)

In [6]:
# --------------------------------------------------
# 3) Load contract + project/cast to contract
# --------------------------------------------------
# Fetch the contract registered for ENTITY, then run the derived frame through
# project_to_gold_contract to obtain df_final. Neither helper is defined in this
# notebook (presumably both come from the %run of nb_gold_utils — confirm there
# what the projection selects and casts).
contract = load_gold_contract(ENTITY)  # reads Files/governance/schema_registry/gold/gold_dim_date.yaml
df_final = project_to_gold_contract(df_dim_date, contract)

StatementMeta(, 21a83038-4a12-420e-bafc-6d0da8e743e8, 9, Finished, Available, Finished)

In [7]:
# --------------------------------------------------
# 4) Contract-first assertions (banking-grade)
# --------------------------------------------------
# Check df_final against the loaded contract with the type, not-null and
# uniqueness flags all switched on; ctx is the label passed along for this check.
# NOTE(review): the helper is defined outside this notebook; presumably it raises
# on the first violation — confirm in nb_gold_utils.
apply_gold_contract_assertions(
    df=df_final,
    contract=contract,
    ctx=f"{ENTITY} (final)",
    enforce_types=True,
    enforce_not_null=True,
    enforce_unique=True
)

StatementMeta(, 21a83038-4a12-420e-bafc-6d0da8e743e8, 10, Finished, Available, Finished)

In [8]:
# --------------------------------------------------
# 5) Row-count control (defensive)
# --------------------------------------------------
# Number of days in the inclusive range [start_date, end_date]. The bounds are
# passed as literal values (F.lit) rather than spliced into SQL text, so a stray
# quote in a ctx-supplied value cannot alter the statement.
expected_count = (
    spark.range(1)
    .select(
        (
            F.datediff(F.to_date(F.lit(end_date)), F.to_date(F.lit(start_date))) + 1
        ).alias("cnt")
    )
    .collect()[0]["cnt"]
)

# to_date() yields NULL for text it cannot parse, which makes the difference NULL
# (None here). Report that directly instead of as a misleading count mismatch.
if expected_count is None:
    raise ValueError(
        f"[{ENTITY}] cannot compute expected rowcount: "
        f"start_date={start_date!r} / end_date={end_date!r} did not parse as dates"
    )

actual_count = df_final.count()

if actual_count != expected_count:
    raise ValueError(f"[{ENTITY}] rowcount mismatch: expected {expected_count}, got {actual_count}")

StatementMeta(, 21a83038-4a12-420e-bafc-6d0da8e743e8, 11, Finished, Available, Finished)

In [9]:
# --------------------------------------------------
# 6) Idempotent rebuild (TRUNCATE + APPEND)
# --------------------------------------------------
# Pass df_final to rebuild_gold_table with the table named by ENTITY as target.
# NOTE(review): the truncate-then-append behaviour named in the header lives in
# the helper, which is not defined in this notebook — confirm in nb_gold_utils.
rebuild_gold_table(df_final, table_name=ENTITY)

StatementMeta(, 21a83038-4a12-420e-bafc-6d0da8e743e8, 12, Finished, Available, Finished)

In [None]:
# --------------------------------------------------
# 7) Post-load sanity query
# --------------------------------------------------
# Queries run in order: an aggregate profile of the loaded table (date range,
# row count, ISO weekday bounds), then its ten earliest and ten latest rows.
sanity_queries = (
    f"""
    SELECT
        min(date_value) AS min_date,
        max(date_value) AS max_date,
        count(*) AS row_count,
        min(day_of_week_iso) AS min_iso_dow,
        max(day_of_week_iso) AS max_iso_dow
    FROM {ENTITY}
""",
    f"SELECT * FROM {ENTITY} ORDER BY date_value LIMIT 10",
    f"SELECT * FROM {ENTITY} ORDER BY date_value DESC LIMIT 10",
)

for sanity_sql in sanity_queries:
    # truncate=False prints every value in full instead of shortening long ones.
    spark.sql(sanity_sql).show(truncate=False)

In [None]:
def exit_payload(**kwargs):
    """Serialise a status document and pass it to ``mssparkutils.notebook.exit``.

    The document opens with ``"status": "SUCCESS"`` followed by every keyword
    argument in call order; a ``status`` keyword replaces the default value.
    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    result = dict(status="SUCCESS")
    result.update(kwargs)
    serialized = json.dumps(result, ensure_ascii=False)
    mssparkutils.notebook.exit(serialized)

# IMPORTANT: `ENTITY` in anomalies = the gold table (e.g. gold_fact_transactions)
row_in = expected_count
row_out = actual_count

# Fields reported back on exit; key order here is the key order of the JSON document.
exit_metrics = {
    "gold_run_id": gold_run_id,
    "step_exec_id": step_exec_id,
    "entity_code": entity_code,
    "entity": ENTITY,
    "target_table": ENTITY,
    "row_in": row_in,
    "row_out": row_out,
    "row_rejected": 0,
    "dedup_dropped": 0,
    "partition_count": 0,
    "anom_count": 0,
}

exit_payload(**exit_metrics)

