In [None]:
# ==============================
# Delta Load Script (Snowpark)
# ==============================
import getpass
import os
from datetime import datetime, timezone

from snowflake.snowpark import Session

# ---------- CONFIG ----------
# Stage sub-folder for today's drop, computed in UTC, e.g. 2025/Sep/24/
# (timezone-aware now(); datetime.utcnow() is deprecated since Python 3.12).
folder_path = datetime.now(timezone.utc).strftime("%Y/%b/%d/").title()
print("📌 Using folder path:", folder_path)

# SECURITY: the password must never be stored in the notebook. It is read from
# SNOWFLAKE_PASSWORD, falling back to an interactive prompt. The credential that
# was previously committed here should be treated as leaked and rotated.
# The remaining settings can be overridden through environment variables and
# default to the values this notebook has always used.
# NOTE(review): ACCOUNTADMIN is far broader than a load job needs - consider a
# dedicated least-privilege role.
connection_parameters = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", "MJ13681.ap-southeast-1"),
    "user": os.environ.get("SNOWFLAKE_USER", "mzs1988"),
    "password": os.environ.get("SNOWFLAKE_PASSWORD") or getpass.getpass("Snowflake password: "),
    "role": os.environ.get("SNOWFLAKE_ROLE", "ACCOUNTADMIN"),
    "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "COMPUTE_WH"),
    "database": os.environ.get("SNOWFLAKE_DATABASE", "DEV"),
    "schema": os.environ.get("SNOWFLAKE_SCHEMA", "DATASCIENCE"),
}

session = Session.builder.configs(connection_parameters).create()

# ---------- CONSTANTS (all NTZ, UTC, seconds precision) ----------
# SQL fragments (not Python values) that are interpolated into the statements below.
# NOW_NTZ: current UTC time truncated to whole seconds, as TIMESTAMP_NTZ.
NOW_NTZ = "TO_TIMESTAMP_NTZ(TO_CHAR(CONVERT_TIMEZONE('UTC', CURRENT_TIMESTAMP()), 'YYYY-MM-DD HH24:MI:SS'))"
# OPEN_END_DATE: sentinel end_date that marks the currently active SCD2 row.
OPEN_END_DATE = "TO_TIMESTAMP_NTZ('2099-12-31 23:59:59')"

In [None]:
# ---------- ENTITIES ---------- (adjust compare_cols if you change schema)
# One row per entity: (entity name, business key, columns written to silver and
# compared for change detection). Table names, the stage folder and the file
# pattern all follow the same naming convention, so they are derived below.
_ENTITY_TABLE = (
    ("customers", "customer_id", ["customer_name", "start_date"]),
    ("products", "product_id", ["product_name", "start_date"]),
    ("orders", "order_id", ["customer_id", "product_id", "start_date"]),
)

entities = {
    entity: {
        "bronze": f"bronze_{entity}_delta",
        "silver": f"silver_{entity}_delta",
        "stage_entity": entity,
        # e.g. .*Customers_.*?/part-.*\.parquet
        "pattern": rf".*{entity.capitalize()}_.*?/part-.*\.parquet",
        "key": key_col,
        "compare_cols": tracked_cols,
    }
    for entity, key_col, tracked_cols in _ENTITY_TABLE
}

In [None]:
# ---------- Helper: check stage for today's files ----------
def stage_has_files(stage_entity, pattern):
    """
    Return True if today's stage folder holds at least one file matching `pattern`.

    Lists @bronze_csv/{stage_entity}/Delta_load/csv_files/{folder_path} and, when a
    pattern is given, filters the listing with the same regex the COPY uses. Without
    that filter an unrelated object in the folder would count as "files present",
    and the caller would truncate the bronze table before a COPY that loads nothing.

    Args:
        stage_entity: sub-folder of the @bronze_csv stage for the entity.
        pattern: regex for the files to look for; None or "" lists every object.

    Returns:
        True when the listing is non-empty; False when it is empty or LIST fails.
    """
    stage_path = f"@bronze_csv/{stage_entity}/Delta_load/csv_files/{folder_path}"
    list_sql = f"LIST {stage_path}"
    if pattern:
        # Interpolated exactly as the COPY statement does, so check and load agree.
        list_sql += f" PATTERN = '{pattern}'"
    try:
        rows = session.sql(f"{list_sql};").collect()
    except Exception as e:
        # Deliberate best-effort: a failed LIST (e.g. missing path) means "no files".
        # NOTE(review): this also hides connection/permission errors - consider
        # narrowing the exception type.
        print(f"⚠️ LIST failed for {stage_path}: {e}")
        return False
    return len(rows) > 0

In [None]:
# ---------- STEP 1: Refresh Bronze tables (truncate + COPY) ----------
# For every entity whose stage folder for today is non-empty, wipe the bronze
# table and reload it from the parquet files matching the entity's pattern.
# Entities without files keep whatever the bronze table already contains.
for entity_name, spec in entities.items():
    target_table = spec["bronze"]
    stage_name = spec["stage_entity"]
    source_path = f"@bronze_csv/{stage_name}/Delta_load/csv_files/{folder_path}"
    file_regex = spec["pattern"]

    # Guard: nothing landed today, so leave the table untouched.
    if not stage_has_files(stage_name, file_regex):
        print(f"ℹ️ No new files for {entity_name}. Keeping existing {target_table}.")
        continue

    print(f"📥 Found parquet(s) for {entity_name}. Refreshing {target_table} from {source_path}")
    session.sql(f"TRUNCATE TABLE {target_table};").collect()
    # Columns are matched by name (case-insensitive), not by position.
    copy_stmt = f"""
        COPY INTO {target_table}
        FROM {source_path}
        FILE_FORMAT = (FORMAT_NAME = my_parquet_format)
        MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
        PATTERN = '{file_regex}';
        """
    session.sql(copy_stmt).collect()
    print(f"✅ Loaded {target_table}")

In [None]:
# Spot-check the customers bronze table: print its first 15 rows.
df = session.table("bronze_customers_delta")
df.show(n=15)


In [None]:
# ---------- STEP 2: Delta transformations: bronze_clean -> silver (SCD2) ----------
# Overview per entity:
# 0) Rebuild the cleaned bronze table ({bronze}_clean): trim names, cast types, drop
#    exact duplicate rows (SELECT DISTINCT), reject null keys and null/future
#    start_date, and lower-case delete_flag ('n' = live, 'y' = deleted, NULL -> 'n').
# A) Expire changed records (set end_date = new.start_date) when bronze_clean has a
#    live row (delete_flag = 'n') whose compare_cols differ from the active silver row.
# B) Insert live rows whose key has no active silver row (new keys and keys expired in A).
# C) Expire rows flagged deleted in bronze_clean where no live row exists for the key
#    (set end_date = NOW_NTZ).
# D) Expire active rows whose key is absent from bronze_clean - logical delete at NOW_NTZ.
#
# A-D run inside one explicit transaction per entity. Run as separate autocommitted
# statements, a failure after A but before B would leave changed keys with no active
# version in silver; now any failure rolls the entity back to its previous state.
#
# NOTE(review): SELECT DISTINCT only removes identical rows. Two different live rows
# for the same key would make A ambiguous and B insert two active versions - confirm
# the feed guarantees at most one live row per key.
# NOTE(review): D treats bronze_clean as a full snapshot; an empty or partial
# bronze_clean expires every key that is not in it - confirm this is intended.

for name, meta in entities.items():
    bronze = meta["bronze"]
    bronze_clean = bronze.replace("_delta", "_clean")  # e.g. bronze_customers_clean
    silver = meta["silver"]
    key = meta["key"]
    compare_cols = meta["compare_cols"]

    print(f"\n🧹 Cleaning bronze table for {name} -> {bronze_clean}")

    # Step 0: entity-specific cleaning rules.
    if name == "customers":
        clean_sql = f"""
        CREATE OR REPLACE TABLE {bronze_clean} AS
        SELECT DISTINCT
            TRY_CAST(customer_id AS INT) AS customer_id,
            TRIM(customer_name) AS customer_name,
            CAST(start_date AS TIMESTAMP_NTZ) AS start_date,
            LOWER(COALESCE(delete_flag,'N')) AS delete_flag
        FROM {bronze}
        WHERE customer_id IS NOT NULL
          AND TRY_CAST(customer_id AS INT) > 0
          AND start_date IS NOT NULL
          AND start_date <= CURRENT_TIMESTAMP();
        """
    elif name == "products":
        clean_sql = f"""
        CREATE OR REPLACE TABLE {bronze_clean} AS
        SELECT DISTINCT
            CAST(product_id AS STRING) AS product_id,
            TRIM(product_name) AS product_name,
            CAST(start_date AS TIMESTAMP_NTZ) AS start_date,
            LOWER(COALESCE(delete_flag,'N')) AS delete_flag
        FROM {bronze}
        WHERE product_id IS NOT NULL
          AND start_date IS NOT NULL
          AND start_date <= CURRENT_TIMESTAMP();
        """
    elif name == "orders":
        clean_sql = f"""
        CREATE OR REPLACE TABLE {bronze_clean} AS
        SELECT DISTINCT
            CAST(order_id AS STRING) AS order_id,
            TRY_CAST(customer_id AS INT) AS customer_id,
            CAST(product_id AS STRING) AS product_id,
            CAST(start_date AS TIMESTAMP_NTZ) AS start_date,
            LOWER(COALESCE(delete_flag,'N')) AS delete_flag
        FROM {bronze}
        WHERE order_id IS NOT NULL
          AND customer_id IS NOT NULL
          AND TRY_CAST(customer_id AS INT) > 0
          AND product_id IS NOT NULL
          AND start_date IS NOT NULL
          AND start_date <= CURRENT_TIMESTAMP();
        """
    else:
        raise ValueError(f"❌ Unknown entity {name}")

    # DDL, so it is executed before (and outside) the SCD2 transaction below.
    session.sql(clean_sql).collect()
    print(f"✅ Created cleaned table: {bronze_clean}")

    print(f"\n🔁 Processing entity: {name} (using {bronze_clean})")

    # A) Expire changed records.
    # IS DISTINCT FROM is NULL-safe, so a NULL <-> value change counts as a change.
    # src_new exposes only key + compare_cols, so start_date must be one of the
    # compare_cols for the SET clause to resolve (true for every entity above).
    change_condition = " OR ".join([f"(tgt.{c} IS DISTINCT FROM src_new.{c})" for c in compare_cols])
    expire_changed_sql = f"""
    UPDATE {silver} tgt
    SET end_date = TO_TIMESTAMP_NTZ(TO_CHAR(src_new.start_date, 'YYYY-MM-DD HH24:MI:SS'))
    FROM (
      SELECT {key}, {', '.join(compare_cols)}
      FROM {bronze_clean}   -- ✅ CHANGED: use bronze_clean
      WHERE delete_flag = 'n'
    ) src_new
    WHERE tgt.{key} = src_new.{key}
      AND tgt.end_date = {OPEN_END_DATE}
      AND ({change_condition});
    """

    # B) Insert new/updated rows: live rows whose key has no active silver row.
    insert_new_sql = f"""
    INSERT INTO {silver} ({key}, {', '.join(compare_cols)}, load_date, end_date)
    SELECT
      src.{key},
      {', '.join([f"src.{c}" for c in compare_cols])},
      {NOW_NTZ} AS load_date,
      {OPEN_END_DATE} AS end_date
    FROM {bronze_clean} src   -- ✅ CHANGED: use bronze_clean
    WHERE src.delete_flag = 'n'
      AND NOT EXISTS (
        SELECT 1 FROM {silver} tgt
        WHERE tgt.{key} = src.{key}
          AND tgt.end_date = {OPEN_END_DATE}
      );
    """

    # C) Expire rows marked deleted (delete_flag='y' with no 'n' row for the key).
    expire_flag_y_without_n_sql = f"""
    UPDATE {silver} tgt
    SET end_date = {NOW_NTZ}
    WHERE tgt.end_date = {OPEN_END_DATE}
      AND tgt.{key} IN (
         SELECT key_del.{key} FROM (
           SELECT {key}, delete_flag FROM {bronze_clean}   -- ✅ CHANGED: use bronze_clean
         ) key_del
         WHERE key_del.delete_flag = 'y'
         AND key_del.{key} NOT IN (
           SELECT {key} FROM {bronze_clean} WHERE delete_flag = 'n'   -- ✅ CHANGED: use bronze_clean
         )
      );
    """

    # D) Expire silver active rows missing from today's bronze_clean entirely (logical deletes).
    expire_missing_sql = f"""
    UPDATE {silver}
    SET end_date = {NOW_NTZ}
    WHERE end_date = {OPEN_END_DATE}
      AND {key} NOT IN (SELECT {key} FROM {bronze_clean});   -- ✅ CHANGED: use bronze_clean
    """

    # Order matters: B relies on A having closed the old versions first.
    scd2_steps = [
        (expire_changed_sql, "  ✅ Expired changed rows (set end_date = new.start_date)"),
        (insert_new_sql, "  ✅ Inserted new/updated rows (no active row existed)"),
        (
            expire_flag_y_without_n_sql,
            "  ✅ Expired rows marked delete_flag='Y' with no 'N' counterpart (set end_date = now)",
        ),
        (expire_missing_sql, "  ✅ Expired silver rows missing from bronze_clean (logical deletes)"),
    ]

    # Apply A-D atomically; on any failure undo the partial changes and re-raise.
    session.sql("BEGIN").collect()
    try:
        for step_sql, done_msg in scd2_steps:
            session.sql(step_sql).collect()
            print(done_msg)
        session.sql("COMMIT").collect()
    except Exception:
        session.sql("ROLLBACK").collect()
        raise

print("\n🎉 Delta load complete: silver_*_delta tables updated in Snowflake.")
session.close()
print("🔒 Session closed")


In [None]:
# Final check: print the first 15 silver customer rows ordered by customer_id.
# The delta-load cell closes `session` when it finishes, so querying through it
# here fails on Restart & Run All. Open a short-lived session for the preview
# and always close it again.
preview_session = Session.builder.configs(connection_parameters).create()
try:
    preview_session.table("silver_customers_delta").order_by("customer_id").show(15)
finally:
    preview_session.close()

