In [None]:
-- Define the external stage `bronze_xml` over the bronze S3 prefix.
-- CREATE OR REPLACE recreates the stage when one with this name already exists.
-- Access goes through the `aws_snowflake_connect` storage integration, and the
-- stage's default file format is the named format `my_parquet_format`.
CREATE OR REPLACE STAGE bronze_xml
  URL = 's3://datascience-output-bucket/bronze/xml/'
  STORAGE_INTEGRATION = aws_snowflake_connect
  FILE_FORMAT = my_parquet_format;

In [None]:
import os

from snowflake.snowpark import Session

# Connection settings come from the environment so that no secret lives in the
# notebook. Non-secret settings fall back to the values this notebook has
# always used; the password deliberately has no fallback.
# NOTE(review): the password that used to be hardcoded in this cell is visible
# to anyone with access to the notebook or its history and should be rotated.
_snowflake_password = os.environ.get("SNOWFLAKE_PASSWORD")
if not _snowflake_password:
    raise RuntimeError(
        "Set the SNOWFLAKE_PASSWORD environment variable before running this notebook."
    )

connection_parameters = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", "PL49411.ap-southeast-1"),
    "user": os.environ.get("SNOWFLAKE_USER", "ZUBAIR2216"),
    "password": _snowflake_password,
    # NOTE(review): ACCOUNTADMIN is a very broad role for a load job — confirm
    # whether a narrower role would do.
    "role": os.environ.get("SNOWFLAKE_ROLE", "ACCOUNTADMIN"),
    "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "COMPUTE_WH"),
    "database": os.environ.get("SNOWFLAKE_DATABASE", "DEV"),
    "schema": os.environ.get("SNOWFLAKE_SCHEMA", "DATASCIENCE"),
}

# Every later cell issues its SQL through this single session object.
session = Session.builder.configs(connection_parameters).create()
print("✅ Snowflake session started")

In [None]:
from datetime import datetime, timedelta, timezone

# Anchor the run to one UTC instant. datetime.now(timezone.utc) replaces the
# deprecated datetime.utcnow() and yields a timezone-aware value; the strftime
# output used below is the same.
today = datetime.now(timezone.utc)
yesterday = today - timedelta(days=1)

# Stage sub-folders are laid out as <year>/<abbreviated month>/<day>/.
# %b is the locale's abbreviated month name; .title() normalises its
# capitalisation (digits and slashes are unaffected).
folder_today = today.strftime("%Y/%b/%d/").title()
folder_yesterday = yesterday.strftime("%Y/%b/%d/").title()

# Stage-qualified paths for today's folder on the bronze and silver stages.
bronze_path_today = f"@bronze_xml/{folder_today}"
silver_path_today = f"@silver_xml/{folder_today}"

print("📂 Today’s folder:", bronze_path_today)
print("📂 Yesterday’s folder:", folder_yesterday)

In [None]:
import sys

# Name fragments that identify a bronze drop for each entity. The comparison is
# done on the lower-cased listing name, so it is case-insensitive.
expected_markers = ("customers_", "products_", "orders_")

# One row per file under today's bronze folder.
files_today = session.sql(f"LIST {bronze_path_today}").collect()

# A single file carrying any one of the markers is enough to proceed.
has_new_files = any(
    marker in entry["name"].lower()
    for entry in files_today
    for marker in expected_markers
)

if has_new_files:
    print("✅ Files found, proceeding with Day 1 processing")
else:
    print("❌ No new files found. Exiting.")
    # Release the connection before stopping the run.
    session.close()
    raise SystemExit


In [None]:
# The bronze tables hold a full daily snapshot: empty all three first, then
# reload each one from today's folder.
bronze_tables = ("customers", "products", "orders")

for table_name in bronze_tables:
    session.sql(f"TRUNCATE TABLE {table_name}").collect()

# COPY INTO with pattern matching: each table only picks up files whose name
# contains "<table>_" and ends in ".parquet".
for table_name in bronze_tables:
    # `[.]` is used for the literal dot. The previous `\\.` reached Snowflake as
    # `\.` inside a single-quoted constant, where the backslash is dropped,
    # leaving a bare `.` that matches any character.
    session.sql(f"""
    COPY INTO {table_name} FROM {bronze_path_today}
    FILE_FORMAT = (FORMAT_NAME = my_parquet_format)
    MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
    PATTERN = '.*{table_name}_.*[.]parquet';
    """).collect()

print("✅ Bronze tables truncated and overwritten with today's data")

In [None]:
# Sanity check: display at most 10 rows of the `orders` table.
df = session.table("orders").limit(10)
df.show()

In [None]:
from snowflake.snowpark.functions import col, lit, row_number, sql_expr
from snowflake.snowpark.window import Window

# Load timestamp for this run; the TO_CHAR round-trip keeps it at whole-second
# precision. `end_datetime` is the far-future value stamped on order rows.
now = sql_expr("TO_TIMESTAMP_NTZ(TO_CHAR(CURRENT_TIMESTAMP(), 'YYYY-MM-DD HH24:MI:SS'))")
end_datetime = sql_expr("TO_TIMESTAMP_NTZ('2099-12-31 23:59:59')")

# --- customers: drop rows missing an id or a name, then cast and stamp ---
customer_has_keys = col("customer_id").is_not_null() & col("customer_name").is_not_null()
customer_casts = {
    "customer_id": col("customer_id").cast("int"),
    "customer_name": col("customer_name").cast("string"),
    "start_date": col("start_date").cast("timestamp"),
    "load_date": now,
}
customers_today = (
    session.table("customers")
    .filter(customer_has_keys)
    .with_columns(list(customer_casts), list(customer_casts.values()))
)

# --- products: same treatment, with a string product id ---
product_has_keys = col("product_id").is_not_null() & col("product_name").is_not_null()
product_casts = {
    "product_id": col("product_id").cast("string"),
    "product_name": col("product_name").cast("string"),
    "start_date": col("start_date").cast("timestamp"),
    "load_date": now,
}
products_today = (
    session.table("products")
    .filter(product_has_keys)
    .with_columns(list(product_casts), list(product_casts.values()))
)

# --- orders: all three ids are required; rows also get the far-future end_date ---
order_has_keys = (
    col("order_id").is_not_null()
    & col("customer_id").is_not_null()
    & col("product_id").is_not_null()
)
order_casts = {
    "order_id": col("order_id").cast("string"),
    "customer_id": col("customer_id").cast("int"),
    "product_id": col("product_id").cast("string"),
    "start_date": col("start_date").cast("timestamp"),
    "load_date": now,
    "end_date": end_datetime,
}
orders_today = (
    session.table("orders")
    .filter(order_has_keys)
    .with_columns(list(order_casts), list(order_casts.values()))
)

print("✅ Today's data cleaned and casted")

In [None]:
# Handles on the three silver tables as they stand before this run's changes.
prev_customers, prev_products, prev_orders = (
    session.table(silver_name)
    for silver_name in ("customers_silver", "products_silver", "orders_silver")
)

print("✅ Loaded yesterday’s silver tables")

In [None]:
# Only customers that are in today's data but not yet in silver need a key.
new_customers = customers_today.join(prev_customers, "customer_id", "left_anti")

# Highest surrogate key issued so far; 1000 is the base when MAX() comes back
# falsy (e.g. NULL for an empty silver table).
max_cust_sk_row = prev_customers.agg({"customer_sk": "max"}).collect()[0]
max_cust_sk = max_cust_sk_row[0] or 1000

# Number the new customers 1..n in customer_id order and offset by that base.
customer_rank = row_number().over(Window.order_by("customer_id"))
new_customers = new_customers.with_column("customer_sk", customer_rank + max_cust_sk)

# Existing silver rows are kept untouched; new rows are appended in this column order.
customer_columns = ["customer_id", "customer_sk", "customer_name", "start_date", "load_date"]
final_customers = prev_customers.union_all(new_customers.select(customer_columns))

print("✅ Customer surrogate keys handled")

In [None]:
# Only products that are in today's data but not yet in silver need a key.
new_products = products_today.join(prev_products, "product_id", "left_anti")

# Highest surrogate key issued so far; 2000 is the base when MAX() comes back
# falsy (e.g. NULL for an empty silver table).
max_prod_sk_row = prev_products.agg({"product_sk": "max"}).collect()[0]
max_prod_sk = max_prod_sk_row[0] or 2000

# Number the new products 1..n in product_id order and offset by that base.
product_rank = row_number().over(Window.order_by("product_id"))
new_products = new_products.with_column("product_sk", product_rank + max_prod_sk)

# Existing silver rows are kept untouched; new rows are appended in this column order.
product_columns = ["product_id", "product_sk", "product_name", "start_date", "load_date"]
final_products = prev_products.union_all(new_products.select(product_columns))

print("✅ Product surrogate keys handled")

In [None]:
# SCD Type 2 maintenance for orders.
#
# Every branch below selects exactly these six columns in this order, because
# union_all combines frames by position.
order_columns = ["order_id", "customer_id", "product_id", "start_date", "load_date", "end_date"]
# Business key of an order version. The same key is used for the change join
# and for both anti-joins, so no row can fall between the branches.
order_key = ["order_id", "customer_id"]

# Only OPEN versions (end_date == far-future sentinel) take part in change
# detection. Versions that were closed on an earlier run are carried over
# untouched; comparing them again would re-close them with a new end_date and
# insert another copy of the current row on every run.
history_orders = prev_orders.filter(
    col("end_date").is_null() | (col("end_date") != end_datetime)
).select(order_columns)

# Re-alias tables for join
t = orders_today.alias("t")                                            # Today's orders
y = prev_orders.filter(col("end_date") == end_datetime).alias("y")     # Open silver versions

# Join on order_id and customer_id
joined_orders = t.join(
    y,
    (t["order_id"] == y["order_id"]) & (t["customer_id"] == y["customer_id"]),
    "left"
)

# 1) Identify product changes -> SCD2 update needed.
#    y.product_id IS NOT NULL means the left join found an open version.
product_changed = joined_orders.filter(
    (y["product_id"].is_not_null()) & (t["product_id"] != y["product_id"])
)

# 1a. Close the previous product with end_date = this run's timestamp
close_old = product_changed.select(
    y["order_id"], y["customer_id"], y["product_id"],
    y["start_date"], y["load_date"],
    lit(now).alias("end_date")
)

# 1b. Insert new product row with same order_id, customer_id but new product_id
insert_new = product_changed.select(
    t["order_id"], t["customer_id"], t["product_id"],
    t["start_date"], t["load_date"],
    lit(end_datetime).alias("end_date")
)

# 2) Keep unchanged orders as-is (the existing open version is retained)
unchanged_orders = joined_orders.filter(
    t["product_id"] == y["product_id"]
).select(y["order_id"], y["customer_id"], y["product_id"], y["start_date"], y["load_date"], y["end_date"])

# 3) New orders: no open version with this key -> insert as-is.
#    The explicit select pins the column order for the positional union.
new_orders = t.join(y, order_key, "left_anti").select(order_columns)

# 4) Deleted orders: open yesterday but missing today -> close with
#    end_date = load_date. Only open versions are touched, so the end_date of
#    already-closed history is never overwritten.
deleted_orders = y.join(t, order_key, "left_anti").select(
    "order_id", "customer_id", "product_id", "start_date", "load_date",
    col("load_date").alias("end_date")
)

# Combine all: untouched history + every branch computed for this run
final_orders = (
    history_orders
    .union_all(unchanged_orders)
    .union_all(close_old)
    .union_all(insert_new)
    .union_all(new_orders)
    .union_all(deleted_orders)
)

print("✅ Orders table updated with SCD Type 2 logic")


In [None]:
# Replace each silver table with the frame computed for it above.
silver_outputs = (
    (final_customers, "customers_silver"),
    (final_products, "products_silver"),
    (final_orders, "orders_silver"),
)
for silver_frame, silver_table in silver_outputs:
    silver_frame.write.mode("overwrite").save_as_table(silver_table)

print("✅ Final silver tables saved to Snowflake")

In [None]:
# Sanity check: display at most 10 rows of the `orders_silver` table.
df = session.table("orders_silver").limit(10)
df.show()

In [None]:
# Create the silver stage, replacing it if it already exists: the statement is
# CREATE OR REPLACE, so the stage definition is rebuilt on every run.
# It points at the silver S3 prefix and defaults to Snappy-compressed Parquet.
session.sql(f"""
CREATE OR REPLACE STAGE silver_xml
URL = 's3://datascience-output-bucket/silver/xml/'
STORAGE_INTEGRATION = aws_snowflake_connect
FILE_FORMAT = (TYPE = PARQUET COMPRESSION = SNAPPY)
""").collect()

# Helper export function
def export_to_s3(table_name, filename):
    """Unload all rows of a table into one Parquet file on the silver stage.

    The file is written to ``@silver_xml/<folder_today><filename>`` as a
    single Snappy-compressed Parquet file, overwriting any file already at
    that path. Relies on the notebook-level ``session`` and ``folder_today``.

    Args:
        table_name: Name of the table to unload; interpolated into the SQL.
        filename: File name to create under today's folder on the stage.
    """
    unload_statement = f"""
    COPY INTO @silver_xml/{folder_today}{filename}
    FROM (
        SELECT * FROM {table_name}
    )
    FILE_FORMAT = (TYPE = PARQUET COMPRESSION = SNAPPY)
    SINGLE = TRUE
    OVERWRITE = TRUE;
    """
    session.sql(unload_statement).collect()

# Each silver table is exported to "<Prefix>_<YYYYMMDD>.parquet", stamped with
# the run date.
export_stamp = today.strftime('%Y%m%d')
silver_exports = (
    ("customers_silver", "Customers"),
    ("products_silver", "Products"),
    ("orders_silver", "Orders"),
)
for export_table, file_prefix in silver_exports:
    export_to_s3(export_table, f"{file_prefix}_{export_stamp}.parquet")

# All work is done; release the connection.
session.close()
print("✅ Export complete. 🔒 Session closed.")
