In [None]:
#!/usr/bin/env python
# coding: utf-8

# ============================================================
# nb_gold_dim_card_v1_0_final — Gold Dimension: Card
#
# Contract-first + data-driven execution (banking-ready)
# - No runtime args / widgets
# - Reads execution context from gold_log_steps (latest RUNNING for this notebook)
# - Applies Gold YAML contract (schema/order/types/not-null/unique)
# - Dedup keep_latest by ingestion_ts
# - Conformance to gold_dim_user (orphan cards are excluded + logged as anomalies)
# - Idempotent rebuild (TRUNCATE + APPEND)
# ============================================================

# The %run command in the next cell is not a standard IPython magic command. It is designed for use within Fabric notebooks only.

In [None]:
%run ./nb_gold_utils

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import json

# -----------------------------
# 0) Data-driven execution context
# -----------------------------
# Log table queried by read_ctx_from_steps() to find this notebook's RUNNING step.
LOG_STEPS_TABLE = "gold_log_steps"
# Identifier of this notebook; matched against the `notebook_name` column of the
# log table and against ctx.notebook_name inside the step payload.
THIS_NOTEBOOK   = "nb_gold_dim_card"

def _json_load_safe(s: str) -> dict:
    try:
        return json.loads(s) if s else {}
    except Exception:
        return {}

def read_ctx_from_steps() -> dict:
    """Return the execution context of the latest RUNNING step of this notebook.

    Looks up LOG_STEPS_TABLE for rows whose ``notebook_name`` equals
    THIS_NOTEBOOK and whose ``status`` is ``RUNNING``, takes the one with the
    greatest ``start_ts`` and reads the ``ctx`` entry of its ``payload_json``.

    Returns:
        The non-empty ``ctx`` dict found in the step payload.

    Raises:
        ValueError: if no RUNNING step exists, if the payload carries no
            usable ``ctx``, if ``ctx.notebook_name`` differs from
            THIS_NOTEBOOK, or if ``gold_run_id``, ``step_exec_id`` or
            ``entity_code`` is missing/blank.
    """
    is_running_step = (F.col("notebook_name") == THIS_NOTEBOOK) & (F.col("status") == "RUNNING")
    latest_running = (
        spark.table(LOG_STEPS_TABLE)
        .filter(is_running_step)
        .orderBy(F.col("start_ts").desc())
        .limit(1)
        .collect()
    )
    if not latest_running:
        raise ValueError(
            f"[{THIS_NOTEBOOK}] No RUNNING step found in {LOG_STEPS_TABLE}. "
            "Dispatcher must write RUNNING ctx before execution."
        )

    step_payload = _json_load_safe(latest_running[0]["payload_json"])
    ctx = step_payload.get("ctx", {})
    if not (isinstance(ctx, dict) and ctx):
        raise ValueError(f"[{THIS_NOTEBOOK}] RUNNING step payload_json has no ctx.")

    def _ctx_text(key: str) -> str:
        # Values are compared as trimmed strings; an absent key counts as blank.
        return str(ctx.get(key, "")).strip()

    # Hard validations (banking-grade): identity first, then mandatory ids.
    if _ctx_text("notebook_name") != THIS_NOTEBOOK:
        raise ValueError(f"[{THIS_NOTEBOOK}] ctx.notebook_name mismatch: {ctx.get('notebook_name')}")
    for required_key in ("gold_run_id", "step_exec_id", "entity_code"):
        if _ctx_text(required_key) == "":
            raise ValueError(f"[{THIS_NOTEBOOK}] ctx.{required_key} missing")

    return ctx

# Resolve the execution context once and expose its fields as notebook variables.
ctx = read_ctx_from_steps()

gold_run_id = normalize_run_id(ctx["gold_run_id"])
step_exec_id, entity_code = ctx["step_exec_id"], ctx["entity_code"]
# load_mode / as_of_date are optional in ctx and default to an empty string.
load_mode, as_of_date = ctx.get("load_mode", ""), ctx.get("as_of_date", "")

# Echo the resolved context, one "name = value" line per field (names padded to 13 chars).
print("\n".join(
    f"{label:<13} = {value}"
    for label, value in (
        ("gold_run_id", gold_run_id),
        ("step_exec_id", step_exec_id),
        ("entity_code", entity_code),
        ("load_mode", load_mode),
        ("as_of_date", as_of_date),
    )
))

In [None]:
# -----------------------------
# 1) Tables / constants
# -----------------------------
# Input table read in step 2.
SILVER_TABLE    = "silver_cards"
# Target table rebuilt in step 7; also the name used to load the Gold contract.
GOLD_TABLE      = "gold_dim_card"
# User dimension whose client_id values are used for the conformance check in step 4.
GOLD_USER_TABLE = "gold_dim_user"

# Aliases handed to the shared helpers as entity name / source table name.
ENTITY       = GOLD_TABLE
SOURCE_TABLE = SILVER_TABLE

In [None]:
# -----------------------------
# 2) Read Silver (input sanity)
# -----------------------------
df_silver = spark.table(SILVER_TABLE)

# Columns the Silver table must provide; the working frame keeps exactly
# these columns, in this order.
INPUT_REQUIRED = [
    "card_id",
    "client_id",
    "card_brand",
    "card_type",
    "card_number",
    "expires_raw",
    "expires_month",
    "cvv",
    "has_chip",
    "num_cards_issued",
    "credit_limit",
    "acct_open_date",
    "year_pin_last_changed",
    "card_on_dark_web",
    "source_file",
    "ingestion_date",
    "ingestion_ts",
    "record_hash",
]
assert_required_columns(df_silver, INPUT_REQUIRED, ctx=f"{SILVER_TABLE}")

# Project by column name and record the input row count for the run metrics.
df_work = df_silver.select(*INPUT_REQUIRED)
row_in = df_work.count()

In [None]:
# -----------------------------
# 3) Dedup keep_latest (card_id by ingestion_ts)
# -----------------------------
# Keep one row per card_id: the one with the latest ingestion_ts.
#
# ingestion_ts alone is not a total order. When several rows of the same card
# share an ingestion_ts, row_number() would pick an arbitrary winner, and since
# df_dedup is not persisted it is re-evaluated by every downstream action
# (counts, the two client_id filters, the final write), so each evaluation
# could keep a different row. The extra sort keys make the winner deterministic.
# NOTE(review): rows that still tie on all four keys are assumed to be
# interchangeable (record_hash presumably fingerprints the record) — confirm.
w = Window.partitionBy("card_id").orderBy(
    F.col("ingestion_ts").desc_nulls_last(),
    F.col("ingestion_date").desc_nulls_last(),
    F.col("source_file").desc_nulls_last(),
    F.col("record_hash").desc_nulls_last(),
)

df_dedup = (
    df_work
    .withColumn("_rn", F.row_number().over(w))
    .filter(F.col("_rn") == 1)
    .drop("_rn")
)

# Rows removed by the dedup, reported in the exit payload.
row_after_dedup = df_dedup.count()
dedup_dropped = row_in - row_after_dedup

# banking-grade uniqueness post-dedup
assert_unique_key(df_dedup, ["card_id"], ctx=f"{ENTITY} (post-dedup)")

In [None]:
# -----------------------------
# 4) Conformance to gold_dim_user (client_id)
#    - Keep rows with NULL client_id (allowed)
#    - Exclude orphans where client_id is not null but missing in gold_dim_user
#    - Log anomalies centrally (gold_anomaly_event + gold_anomaly_kpi)
# -----------------------------
# Distinct client_id values of the user dimension, cast to BIGINT.
df_users = (
    spark.table(GOLD_USER_TABLE)
         .select(F.col("client_id").cast("BIGINT").alias("client_id"))
         .dropDuplicates()
)

# Only rows that carry a client_id go through the orphan check; rows with a
# NULL client_id bypass it and are added back at the end of this step.
df_with_client = df_dedup.filter(F.col("client_id").isNotNull())
df_null_client = df_dedup.filter(F.col("client_id").isNull())

# NOTE(review): split_orphans_left_anti is defined outside this section;
# presumably it returns (rows whose client_id exists in df_users, anomaly
# records for the rows that do not) — confirm against the helper.
df_ok_client, anom_orphans = split_orphans_left_anti(
    fact_df=df_with_client,
    dim_df=df_users,
    fact_key="client_id",
    dim_key="client_id",
    rule_id="GOLD.CARD.CONF.001",
    anom_type="ORPHAN_DIM_USER",
    entity=ENTITY,
    gold_run_id=gold_run_id,
    source_table=SOURCE_TABLE,
    severity="HIGH",
    natural_key_cols_for_event=["card_id", "client_id"]
)

# Count the orphans once; the number gates the anomaly writes below and is
# reused as both row_rejected and anom_count in the run metrics.
row_rejected = anom_orphans.count()
if row_rejected > 0:
    # Anomaly tables are only written when there is something to report.
    write_anomaly_events(anom_orphans, table_name="gold_anomaly_event")
    write_anomaly_kpis(
        anom_orphans,
        gold_run_id=gold_run_id,
        entity=ENTITY,
        table_name="gold_anomaly_kpi",
        sample_limit=10
    )

anom_count = row_rejected

# Conformed dataset = ok clients + null client rows.
# allowMissingColumns=False: both frames are expected to have the same columns.
df_conformed = df_ok_client.unionByName(df_null_client, allowMissingColumns=False)

In [None]:
# -----------------------------
# 5) Add Gold technical columns
# -----------------------------
# Stamp every row with the run identifier taken from the execution context
# and with the timestamp at which the load is evaluated.
df_gold_raw = (
    df_conformed
    .withColumn("gold_run_id", F.lit(gold_run_id))
    .withColumn("gold_load_ts", F.current_timestamp())
)

In [None]:
# -----------------------------
# 6) Contract load + canonical projection + assertions
# -----------------------------
# Load the contract registered under the Gold table name and project the raw
# Gold frame onto it.
# NOTE(review): load_gold_contract / project_to_gold_contract are defined
# outside this section; per the notebook header the contract drives
# schema/order/types — confirm against the helpers.
contract = load_gold_contract(GOLD_TABLE)  # Files/governance/gold/gold_dim_card.yaml
df_final = project_to_gold_contract(df_gold_raw, contract)

# Validate the projected frame against the contract with the type, not-null
# and uniqueness checks all switched on.
apply_gold_contract_assertions(
    df=df_final,
    contract=contract,
    ctx=f"{ENTITY} (final)",
    enforce_types=True,
    enforce_not_null=True,
    enforce_unique=True
)

In [None]:
# -----------------------------
# 7) Idempotent rebuild (TRUNCATE + APPEND)
# -----------------------------
rebuild_gold_table(df_final, table_name=GOLD_TABLE)

# Report the number of rows actually persisted. Counting df_final here would
# re-execute the whole lineage (Silver read, dedup window, conformance join)
# after the write and could differ from what was stored.
# NOTE(review): assumes rebuild_gold_table writes to the table that
# spark.table(GOLD_TABLE) resolves, the same way gold_dim_user is read in
# step 4 — confirm against the helper.
row_out = spark.table(GOLD_TABLE).count()
partition_count = 0  # dims are not partitioned in this model

print(f"Loaded {GOLD_TABLE} successfully. row_in={row_in}, row_out={row_out}, row_rejected={row_rejected}")

In [None]:
# -----------------------------
# 8) Exit payload (consumed by dispatcher)
# -----------------------------
def exit_payload(**kwargs):
    """Exit the notebook with a JSON result string.

    The result starts with ``status = "SUCCESS"``; every keyword argument is
    then added to it, and a ``status`` keyword replaces the default value.
    The dict is serialised with ``ensure_ascii=False`` and handed to
    ``mssparkutils.notebook.exit``.

    Args:
        **kwargs: JSON-serialisable entries to include in the result.
    """
    result = dict(status="SUCCESS")
    result.update(kwargs)
    serialized = json.dumps(result, ensure_ascii=False)
    mssparkutils.notebook.exit(serialized)
    
# Hand the run identifiers and row metrics collected by the previous steps
# back through exit_payload (identifiers from the execution context, counts
# from steps 2, 3, 4 and 7).
exit_payload(
    status="SUCCESS",
    gold_run_id=gold_run_id,
    step_exec_id=step_exec_id,
    entity_code=entity_code,
    entity=ENTITY,
    target_table=GOLD_TABLE,
    row_in=row_in,
    row_out=row_out,
    row_rejected=row_rejected,
    dedup_dropped=dedup_dropped,
    partition_count=partition_count,
    anom_count=anom_count
)
