In [0]:
# Notebook parameter: the catalog that is made current for the rest of this notebook.
dbutils.widgets.text("catalog", "etl_demo")
catalog = dbutils.widgets.get("catalog").strip()

# Accept a name that was already passed in backticks (`my-catalog`) by unwrapping it,
# so the value is quoted exactly once below.
if len(catalog) >= 2 and catalog.startswith("`") and catalog.endswith("`"):
    _catalog_name = catalog[1:-1].replace("``", "`")
else:
    _catalog_name = catalog

# Fail early with a clear message instead of sending "USE CATALOG " to the SQL parser.
if not _catalog_name:
    raise ValueError("Widget 'catalog' must not be empty")

# Identifiers cannot be bound as query parameters, so quote the name explicitly:
# wrap it in backticks and double any embedded backtick. This keeps the widget value
# from being interpreted as anything other than a single catalog name.
_catalog_ident = "`" + _catalog_name.replace("`", "``") + "`"
spark.sql(f"USE CATALOG {_catalog_ident}")
print("CATALOG:", catalog)

In [0]:
%sql
-- =========================================
-- 05 DATA QUALITY CHECKS (Spark SQL)
-- Fail-fast using assert_true()
-- =========================================

-- ---------- STAGING CHECKS ----------

-- STG_QA_1: exactly one snapshot_date in staging.
-- COUNT(DISTINCT ...) skips NULLs, so on its own it would accept rows with a NULL
-- snapshot_date sitting next to one valid date. Comparing COUNT(*) with
-- COUNT(snapshot_date) rejects that case as well.
-- An empty staging table still fails, because distinct_dates is then 0.
SELECT assert_true(
  s.distinct_dates = 1 AND s.total_rows = s.dated_rows,
  'STG_QA_1 FAILED: staging must contain exactly 1 snapshot_date'
)
FROM (
  SELECT
    COUNT(DISTINCT snapshot_date) AS distinct_dates,
    COUNT(*)                      AS total_rows,
    COUNT(snapshot_date)          AS dated_rows
  FROM silver_staging.customer_snapshot_stg
) AS s;

-- STG_QA_2: every staging row must carry a customer_id.
-- The inner query always yields one row (a global count), so the assertion
-- is evaluated exactly once.
SELECT assert_true(
  null_keys.row_count = 0,
  'STG_QA_2 FAILED: staging has NULL customer_id'
)
FROM (
  SELECT COUNT(*) AS row_count
  FROM silver_staging.customer_snapshot_stg
  WHERE customer_id IS NULL
) AS null_keys;

-- STG_QA_3: a customer_id may appear at most once in staging.
WITH duplicated_ids AS (
  -- one row per customer_id that occurs more than once
  SELECT customer_id
  FROM silver_staging.customer_snapshot_stg
  GROUP BY customer_id
  HAVING COUNT(*) > 1
),
duplicate_tally AS (
  SELECT COUNT(*) AS id_count
  FROM duplicated_ids
)
SELECT assert_true(
  id_count = 0,
  'STG_QA_3 FAILED: staging has duplicate customer_id'
)
FROM duplicate_tally;

-- STG_QA_4: every staging row must have a row_hash.
WITH missing_hash AS (
  SELECT COUNT(*) AS row_count
  FROM silver_staging.customer_snapshot_stg
  WHERE row_hash IS NULL
)
SELECT assert_true(
  row_count = 0,
  'STG_QA_4 FAILED: staging has NULL row_hash'
)
FROM missing_hash;

-- ---------- SILVER CHECKS ----------

-- SILVER_QA_1: exactly one current record per customer_id.
-- NOTE(review): non-current rows are filtered out before grouping, so a customer_id
-- with zero current rows never forms a group and is not flagged here. In effect the
-- check detects customer_ids with more than one current row. Confirm whether
-- customers without any current row are legitimate (e.g. closed-out customers)
-- before tightening it.
WITH current_rows_off_by_count AS (
  SELECT customer_id
  FROM silver.customer_dim
  WHERE is_current = true
  GROUP BY customer_id
  HAVING COUNT(*) != 1
),
offender_tally AS (
  SELECT COUNT(*) AS id_count
  FROM current_rows_off_by_count
)
SELECT assert_true(
  id_count = 0,
  'SILVER_QA_1 FAILED: not exactly 1 current row per customer_id'
)
FROM offender_tally;

-- SILVER_QA_2: a closed validity interval must not end before it starts.
-- Rows with valid_to IS NULL are excluded from this check.
SELECT assert_true(
  inverted.row_count = 0,
  'SILVER_QA_2 FAILED: valid_to < valid_from exists'
)
FROM (
  SELECT COUNT(*) AS row_count
  FROM silver.customer_dim
  WHERE valid_to IS NOT NULL
    AND valid_to < valid_from
) AS inverted;

-- SILVER_QA_3: a record without valid_to must be flagged as current.
WITH open_but_not_current AS (
  SELECT COUNT(*) AS row_count
  FROM silver.customer_dim
  WHERE valid_to IS NULL
    AND is_current = false
)
SELECT assert_true(
  row_count = 0,
  'SILVER_QA_3 FAILED: found open (valid_to IS NULL) but is_current=false'
)
FROM open_but_not_current;

-- ---------- GOLD CHECKS ----------

-- GOLD_QA_1: gold.customer_current has as many rows as silver has current rows.
-- Both derived tables are global counts (one row each), so the cross join
-- produces a single row and the assertion runs once.
SELECT assert_true(
  gold_side.row_count = silver_side.row_count,
  'GOLD_QA_1 FAILED: gold.customer_current count != silver current count'
)
FROM (
  SELECT COUNT(*) AS row_count
  FROM gold.customer_current
) AS gold_side
CROSS JOIN (
  SELECT COUNT(*) AS row_count
  FROM silver.customer_dim
  WHERE is_current = true
) AS silver_side;

-- GOLD_QA_2: the KPI table holds at most one row per kpi_date.
WITH repeated_dates AS (
  -- one row per kpi_date that occurs more than once
  SELECT kpi_date
  FROM gold.customer_daily_kpi
  GROUP BY kpi_date
  HAVING COUNT(*) > 1
),
repeat_tally AS (
  SELECT COUNT(*) AS date_count
  FROM repeated_dates
)
SELECT assert_true(
  date_count = 0,
  'GOLD_QA_2 FAILED: KPI has duplicate kpi_date rows'
)
FROM repeat_tally;

-- Helpful summaries (optional)
-- Returned as ONE result set with a row per metric. As three separate statements,
-- a notebook SQL cell would typically render only the final statement's output,
-- hiding the first two metrics. All three values are BIGINT counts, so the
-- UNION ALL branches line up without casts.
SELECT 'STAGING distinct snapshot_date' AS metric,
       COUNT(DISTINCT snapshot_date) AS value
FROM silver_staging.customer_snapshot_stg

UNION ALL

SELECT 'SILVER current rows' AS metric,
       COUNT(*) AS value
FROM silver.customer_dim
WHERE is_current = true

UNION ALL

SELECT 'GOLD current rows' AS metric,
       COUNT(*) AS value
FROM gold.customer_current

-- Sort by label so the row order is deterministic across runs.
ORDER BY metric;
