In [None]:
-- Point the session at database DEV and schema DATASCIENCE in one statement;
-- a fully qualified schema name sets both the current database and schema.
USE SCHEMA DEV.DATASCIENCE;

In [None]:
-- Named Parquet file format; referenced by the bronze_xml stage and by the
-- COPY INTO statements in the load cell (FORMAT_NAME = my_parquet_format).
CREATE OR REPLACE FILE FORMAT my_parquet_format
  TYPE = 'PARQUET';

In [None]:
-- External stage over the bronze S3 prefix, authenticated through the
-- aws_snowflake_connect storage integration and defaulting to my_parquet_format.
-- NOTE(review): the stage/prefix is named "xml" but the default file format is
-- Parquet -- confirm the naming is intentional.
CREATE OR REPLACE STAGE bronze_xml
  URL = 's3://datascience-output-bucket/bronze/xml/'
  STORAGE_INTEGRATION = aws_snowflake_connect
  FILE_FORMAT = my_parquet_format;

In [None]:
-- Bronze landing tables: targets of the COPY INTO statements in the load cell.
-- CREATE OR REPLACE recreates each table, so rows from earlier runs are discarded.

-- One row per customer.
CREATE OR REPLACE TABLE customers (
    customer_id    INT,
    customer_name  STRING,
    start_date     TIMESTAMP_NTZ
);

-- One row per product.
CREATE OR REPLACE TABLE products (
    product_id     STRING,
    product_name   STRING,
    start_date     TIMESTAMP_NTZ
);

-- One row per order; customer_id / product_id carry the same types as the
-- matching columns in customers / products.
CREATE OR REPLACE TABLE orders (
    order_id       STRING,
    customer_id    INT,
    product_id     STRING,
    start_date     TIMESTAMP_NTZ
);

In [None]:
import os
import time
from datetime import datetime, timezone

from snowflake.snowpark import Session

# Step 1: Build the date-based stage sub-folder from the current UTC date so the
# path does not shift with the local timezone. `.title()` normalises the
# capitalisation of the month abbreviation produced by %b.
folder_path = datetime.now(timezone.utc).strftime("%Y/%b/%d/").title()

# Step 2: Snowflake connection config.
# The password is read from the environment instead of being stored in the
# notebook. SECURITY: the password that used to be committed here must be
# treated as leaked and rotated.
_snowflake_password = os.environ.get("SNOWFLAKE_PASSWORD")
if not _snowflake_password:
    raise RuntimeError(
        "SNOWFLAKE_PASSWORD is not set; export it before running this cell."
    )

connection_parameters = {
    # Non-secret settings keep their previous values as defaults and can be
    # overridden per environment.
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", "PL49411.ap-southeast-1"),
    "user": os.environ.get("SNOWFLAKE_USER", "ZUBAIR2216"),
    "password": _snowflake_password,
    # NOTE(review): ACCOUNTADMIN is far broader than a load job needs --
    # consider a dedicated least-privilege role.
    "role": os.environ.get("SNOWFLAKE_ROLE", "ACCOUNTADMIN"),
    "warehouse": "COMPUTE_WH",
    "database": "DEV",
    "schema": "DATASCIENCE",
}

# (target table, file-name prefix) for every bronze load; the prefix is also the
# label printed after a successful load.
BRONZE_LOADS = (
    ("customers", "Customers"),
    ("products", "Products"),
    ("orders", "Orders"),
)


def _copy_parquet_into(active_session, table_name, file_prefix):
    """Load today's `<file_prefix>_*.parquet` files from @bronze_xml into `table_name`.

    Columns are matched by name, case-insensitively. `[.]` is used for the
    literal dot: a `\\.` written in Python reaches Snowflake as `\\.`-less
    text because the backslash is consumed as a string escape inside the
    single-quoted SQL literal, which left a bare `.` that matches any character.
    """
    copy_sql = f"""
    COPY INTO {table_name}
    FROM @bronze_xml/{folder_path}
    FILE_FORMAT = (FORMAT_NAME = my_parquet_format)
    MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
    PATTERN = '.*{file_prefix}_.*[.]parquet';
    """
    active_session.sql(copy_sql).collect()


# Bound before `try` so the `finally` clause never hits an unbound name when
# session creation itself fails (which previously masked the real error).
session = None
try:
    # Step 3: Create session
    session = Session.builder.configs(connection_parameters).create()

    # Steps 4-6: COPY INTO customers, products and orders
    for table_name, file_prefix in BRONZE_LOADS:
        _copy_parquet_into(session, table_name, file_prefix)
        print(f"✅ {file_prefix} loaded")

finally:
    # Step 7: Always close the session, even if the COMMIT below fails.
    if session is not None:
        try:
            # Explicit COMMIT retained from the original flow; it runs on both
            # the success and the failure path.
            session.sql("COMMIT").collect()
        finally:
            session.close()
            print("🔒 Session closed")

In [None]:
# Show the date-based stage sub-folder built in the load cell ("%Y/%b/%d/" format)
# so the S3 path used by COPY INTO can be checked by eye.
print(folder_path)

In [None]:
# Show top 10 rows from customers table.
# The load cell closes its session in `finally`, so that session cannot be reused
# here; open a short-lived one for the preview and always close it.
preview_session = Session.builder.configs(connection_parameters).create()
try:
    df = preview_session.table("customers").limit(10)
    df.show()
finally:
    preview_session.close()

In [None]:
from snowflake.snowpark import Session
from snowflake.snowpark.functions import (
    col, lit, row_number, sql_expr
)
from snowflake.snowpark.window import Window
from datetime import datetime

# Prerequisites from the bronze-load cell earlier in this notebook:
#   folder_path            -> date-based sub-folder used in the stage paths below
#   connection_parameters  -> Snowflake connection settings

# Offsets added to row_number() when generating surrogate keys.
CUSTOMER_SK_OFFSET = 1000
PRODUCT_SK_OFFSET = 2000


def _require_not_null(frame, *column_names):
    """Return `frame` restricted to rows where every named column is non-NULL."""
    condition = col(column_names[0]).is_not_null()
    for column_name in column_names[1:]:
        condition = condition & col(column_name).is_not_null()
    return frame.filter(condition)


def _cast_columns(frame, column_types):
    """Replace each column named in `column_types` with itself cast to the mapped type.

    `column_types` maps column name -> type string; casts are applied in the
    mapping's insertion order.
    """
    for column_name, type_name in column_types.items():
        frame = frame.with_column(column_name, col(column_name).cast(type_name))
    return frame


def _export_single_parquet(active_session, source_table, file_name,
                           plain_columns, timestamp_columns):
    """Unload `source_table` to one Snappy Parquet file in today's silver folder.

    `plain_columns` are selected as-is; `timestamp_columns` are selected with an
    explicit CAST to TIMESTAMP_NTZ. SINGLE = TRUE writes exactly one file and
    OVERWRITE = TRUE replaces a file of the same name.

    NOTE(review): no HEADER option is set on the unload -- confirm the Parquet
    files carry the column names downstream readers expect.
    """
    select_items = list(plain_columns) + [
        f"CAST({name} AS TIMESTAMP_NTZ) AS {name}" for name in timestamp_columns
    ]
    select_list = ",\n        ".join(select_items)
    active_session.sql(f"""
COPY INTO @silver_xml/{folder_path}{file_name}
FROM (
    SELECT
        {select_list}
    FROM {source_table}
)
FILE_FORMAT = (TYPE = PARQUET COMPRESSION = SNAPPY)
SINGLE = TRUE
OVERWRITE = TRUE;
""").collect()


# Step 1: Create session
session = Session.builder.configs(connection_parameters).create()

try:
    # Step 2: Load bronze tables
    customers = session.table("customers")
    products = session.table("products")
    orders = session.table("orders")

    # Step 3: Filter NULLs in required fields
    customers_clean = _require_not_null(customers, "customer_id", "customer_name")
    products_clean = _require_not_null(products, "product_id", "product_name")
    orders_clean = _require_not_null(orders, "order_id", "customer_id", "product_id")

    # Step 4: Load date and end date.
    # The current timestamp is rendered to text at second precision and parsed
    # back, which drops fractional seconds; the end date is a fixed sentinel.
    now = sql_expr("TO_TIMESTAMP_NTZ(TO_CHAR(CURRENT_TIMESTAMP(), 'YYYY-MM-DD HH24:MI:SS'))")
    end_datetime = sql_expr("TO_TIMESTAMP_NTZ('2099-12-31 23:59:59')")

    # Step 5: Transform and cast columns
    customers_t = _cast_columns(
        customers_clean,
        {"customer_id": "int", "customer_name": "string", "start_date": "timestamp"},
    ).with_column("load_date", now)

    products_t = _cast_columns(
        products_clean,
        {"product_id": "string", "product_name": "string", "start_date": "timestamp"},
    ).with_column("load_date", now)

    orders_t = (
        _cast_columns(
            orders_clean,
            {
                "order_id": "string",
                "customer_id": "int",
                "product_id": "string",
                "start_date": "timestamp",
            },
        )
        .with_column("load_date", now)
        .with_column("end_date", end_datetime)
    )

    # Step 6: Add surrogate keys and reorder columns.
    # NOTE(review): the keys are recomputed from row_number() on every run and
    # ordered only by the business key, so they are not stable across runs and
    # duplicate business keys are numbered in arbitrary order -- confirm this
    # is acceptable for consumers of the silver tables.
    customers_final = customers_t.with_column(
        "customer_sk",
        row_number().over(Window.order_by("customer_id")) + CUSTOMER_SK_OFFSET,
    ).select("customer_id", "customer_sk", "customer_name", "start_date", "load_date")

    products_final = products_t.with_column(
        "product_sk",
        row_number().over(Window.order_by("product_id")) + PRODUCT_SK_OFFSET,
    ).select("product_id", "product_sk", "product_name", "start_date", "load_date")

    orders_final = orders_t.select(
        "order_id", "customer_id", "product_id", "start_date", "load_date", "end_date"
    )

    # Step 7: Save to silver Snowflake tables (each run overwrites the table)
    customers_final.write.mode("overwrite").save_as_table("customers_silver")
    products_final.write.mode("overwrite").save_as_table("products_silver")
    orders_final.write.mode("overwrite").save_as_table("orders_silver")

    print("✅ Silver tables created with correct column order and timestamp formatting")

    # Step 8: (Re)create the silver stage. CREATE OR REPLACE rebuilds the stage
    # definition on every run; it is not an "if not exists" guard.
    session.sql("""
CREATE OR REPLACE STAGE silver_xml
URL = 's3://datascience-output-bucket/silver/xml/'
STORAGE_INTEGRATION = aws_snowflake_connect
FILE_FORMAT = (TYPE = PARQUET COMPRESSION = SNAPPY)
""").collect()

    # Step 9: Export tables to S3 with TIMESTAMP_NTZ formatting
    _export_single_parquet(
        session, "customers_silver", "Customers.parquet",
        plain_columns=("customer_id", "customer_sk", "customer_name"),
        timestamp_columns=("start_date", "load_date"),
    )
    _export_single_parquet(
        session, "products_silver", "Products.parquet",
        plain_columns=("product_id", "product_sk", "product_name"),
        timestamp_columns=("start_date", "load_date"),
    )
    _export_single_parquet(
        session, "orders_silver", "Orders.parquet",
        plain_columns=("order_id", "customer_id", "product_id"),
        timestamp_columns=("start_date", "load_date", "end_date"),
    )

    print("✅ Parquet files exported with timestamps truncated to seconds")
finally:
    # Close the session on both the success and the failure path so a failed
    # step does not leave a connection open.
    session.close()
    print("🔒 Session closed")

In [None]:
# Preview the first 10 rows of orders_silver.
# The transform cell closes `session` once it finishes, so that session cannot be
# reused here; open a short-lived one for the preview and always close it.
preview_session = Session.builder.configs(connection_parameters).create()
try:
    df = preview_session.table("orders_silver").limit(10)
    df.show()
finally:
    preview_session.close()