In [None]:
-- Set the session context so that the unqualified object names used in the
-- SQL cells below (file format, stage, bronze tables) resolve to DEV.DATASCIENCE.
USE DATABASE DEV;
USE SCHEMA DATASCIENCE;

In [None]:
-- Named Parquet file format. It is referenced by the bronze_csv stage
-- definition and by the COPY INTO statements that load the bronze tables.
CREATE OR REPLACE FILE FORMAT my_parquet_format
  TYPE = 'PARQUET';

In [None]:
-- External stage over the bronze/ prefix of the S3 output bucket, accessed
-- through the aws_snowflake_connect storage integration.
-- NOTE: despite the "_csv" suffix, this stage is configured for Parquet files
-- (its default file format is my_parquet_format).
CREATE OR REPLACE STAGE bronze_csv
URL = 's3://datascience-output-bucket/bronze/'
STORAGE_INTEGRATION = aws_snowflake_connect
FILE_FORMAT = my_parquet_format;

In [None]:
-- Bronze landing tables, filled by the COPY INTO statements in the load cell.
-- Those statements use MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE, so the column
-- names declared here are what the Parquet fields are matched against.
-- NOTE: CREATE OR REPLACE recreates each table, so re-running this cell
-- discards any rows that were already loaded.

-- One row per customer record.
CREATE OR REPLACE TABLE bronze_customers (
  customer_id   INT,
  customer_name STRING,
  start_date TIMESTAMP_NTZ
);

-- One row per product record.
CREATE OR REPLACE TABLE bronze_products (
  product_id    STRING,         
  product_name  STRING,
  start_date    TIMESTAMP_NTZ
);

-- One row per order; customer_id / product_id use the same types as the
-- corresponding columns of bronze_customers / bronze_products.
CREATE OR REPLACE TABLE bronze_orders (
  order_id     STRING,        
  customer_id  INT,
  product_id   STRING,
  start_date   TIMESTAMP_NTZ
);

In [None]:
import os
from datetime import datetime, timedelta, timezone
from getpass import getpass

from snowflake.snowpark import Session

# ---- Settings ----
# Offset (in days) from today's UTC date of the S3 date folder to load.
# -9 reproduces the previously hard-coded "nine days ago"; use 0 for today.
FOLDER_OFFSET_DAYS = -9

# (target table, stage sub-directory, file-name prefix / log label)
BRONZE_LOADS = [
    ("bronze_customers", "customers", "Customers"),
    ("bronze_products", "products", "Products"),
    ("bronze_orders", "orders", "Orders"),
]


def build_bronze_copy_sql(table, entity_dir, file_prefix, folder_path):
    """Return the COPY INTO statement that loads one bronze table.

    Args:
        table: Name of the bronze table to load into.
        entity_dir: Sub-directory of the ``bronze_csv`` stage for this entity.
        file_prefix: Prefix of the exported folder names (e.g. ``Customers``).
        folder_path: Date folder in ``YYYY/Mon/DD/`` form, with trailing slash.

    Returns:
        The SQL text of the COPY INTO statement.
    """
    # The literal dot is written as [.] instead of a backslash escape:
    # Snowflake treats backslash as an escape character inside single-quoted
    # string literals, so a single backslash would not reach the regex engine.
    return f"""
    COPY INTO {table}
    FROM @bronze_csv/{entity_dir}/Full_load/csv_files/{folder_path}
    FILE_FORMAT = (FORMAT_NAME = my_parquet_format)
    MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
    PATTERN = '.*{file_prefix}_.*?/part-.*[.]parquet';
    """


# Step 1: Build the date folder from a UTC date to avoid local timezone mismatch
target_day = datetime.now(timezone.utc) + timedelta(days=FOLDER_OFFSET_DAYS)
folder_path = target_day.strftime("%Y/%b/%d/").title()  # e.g. 2025/Aug/20/
print("📌 Using folder path:", folder_path)

# Step 2: Snowflake connection config
# The password is never stored in the notebook: it is read from the
# SNOWFLAKE_PASSWORD environment variable, falling back to an interactive
# prompt. NOTE: a password was previously committed in this cell and should be
# rotated.
connection_parameters = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", "MJ13681.ap-southeast-1"),
    "user": os.environ.get("SNOWFLAKE_USER", "mzs1988"),
    "password": os.environ.get("SNOWFLAKE_PASSWORD") or getpass("Snowflake password: "),
    # NOTE(review): ACCOUNTADMIN is far broader than a load job needs —
    # consider a dedicated role.
    "role": os.environ.get("SNOWFLAKE_ROLE", "ACCOUNTADMIN"),
    "warehouse": "COMPUTE_WH",
    "database": "DEV",
    "schema": "DATASCIENCE"
}

# Step 3: Create session. This happens outside the try block so that a failed
# connection surfaces its own error instead of a NameError from the cleanup.
session = Session.builder.configs(connection_parameters).create()

try:
    # Step 4: Load each bronze table from its dated stage folder
    for table, entity_dir, file_prefix in BRONZE_LOADS:
        copy_sql = build_bronze_copy_sql(table, entity_dir, file_prefix, folder_path)
        session.sql(copy_sql).collect()
        print(f"✅ Bronze {file_prefix} loaded")
finally:
    # Step 5: Always close the session, even if the COMMIT itself fails.
    # The explicit COMMIT is kept from the original cell; it is presumably a
    # no-op when the session runs with AUTOCOMMIT enabled.
    try:
        session.sql("COMMIT").collect()
    finally:
        session.close()
        print("🔒 Session closed")

In [None]:
# Preview the first rows of bronze_orders.
# The load cell above closes its session in its `finally` block, so reusing
# that closed session would fail on a fresh "Run All". Open a short-lived
# session from the same connection settings and close it again afterwards.
session = Session.builder.configs(connection_parameters).create()
try:
    df = session.table("bronze_orders").limit(10)
    df.show()
finally:
    session.close()

In [None]:
import os
from datetime import datetime, timedelta, timezone
from getpass import getpass

from snowflake.snowpark import Session
from snowflake.snowpark.functions import col, sql_expr, lit

# ---- Settings ----
# Offset (in days) from today's UTC date of the date folder to export into.
# -9 reproduces the previously hard-coded "nine days ago"; use 0 for today.
FOLDER_OFFSET_DAYS = -9
SILVER_S3_URL = "s3://datascience-output-bucket/silver/"


def clean_bronze(frame, normalised, now, end_date, extra_filter=None):
    """Validate, normalise and de-duplicate one bronze table.

    Args:
        frame: Snowpark DataFrame over the bronze table.
        normalised: Mapping of column name -> normalising Column expression.
            Every key is treated as required (rows where it is NULL are
            dropped) and is then overwritten, in mapping order, with its
            expression.
        now: Column expression for the load timestamp. Rows whose
            ``start_date`` is later than it are dropped, and it is stored in
            ``load_date``.
        end_date: Column expression stored in ``end_date``.
        extra_filter: Optional additional Column predicate rows must satisfy.

    Returns:
        A new DataFrame holding the normalised columns plus ``load_date`` and
        ``end_date``.
    """
    keep = col("start_date") <= now
    for name in normalised:
        keep = keep & col(name).is_not_null()
    if extra_filter is not None:
        keep = keep & extra_filter

    cleaned = frame.filter(keep)
    for name, expression in normalised.items():
        cleaned = cleaned.with_column(name, expression)

    # De-duplicate AFTER normalising: running distinct() before TRIM would let
    # rows that differ only by surrounding whitespace survive as duplicates.
    return (
        cleaned.distinct()
        .with_column("load_date", now)
        .with_column("end_date", end_date)
    )


def build_export_sql(entity_dir, file_stem, leading_columns, source_table, folder_path):
    """Return the COPY INTO <location> statement exporting one silver table.

    Args:
        entity_dir: Sub-directory of the ``silver_csv`` stage for this entity.
        file_stem: Output file name without the ``.parquet`` extension.
        leading_columns: SQL select expressions placed before the three
            timestamp columns (start_date, load_date, end_date).
        source_table: Silver table to read from.
        folder_path: Date folder in ``YYYY/Mon/DD/`` form, with trailing slash.

    Returns:
        The SQL text of the COPY INTO statement.
    """
    select_list = ",\n        ".join(
        list(leading_columns)
        + [
            "TO_TIMESTAMP_NTZ(start_date) AS start_date",
            "TO_TIMESTAMP_NTZ(load_date)  AS load_date",
            "TO_TIMESTAMP_NTZ(end_date)   AS end_date",
        ]
    )
    return f"""
COPY INTO @silver_csv/{entity_dir}/Full_load/csv_files/{folder_path}{file_stem}.parquet
FROM (
    SELECT
        {select_list}
    FROM {source_table}
)
FILE_FORMAT = (TYPE = PARQUET COMPRESSION = SNAPPY)
SINGLE = TRUE
OVERWRITE = TRUE;
"""


# ---- Date folder (UTC) ----
target_day = datetime.now(timezone.utc) + timedelta(days=FOLDER_OFFSET_DAYS)
folder_path = target_day.strftime("%Y/%b/%d/").title()  # e.g. 2025/Aug/28/
print("📌 Using folder path:", folder_path)

# ---- Snowflake connection ----
# The password is never stored in the notebook: it is read from the
# SNOWFLAKE_PASSWORD environment variable, falling back to an interactive
# prompt. NOTE: a password was previously committed in this cell and should be
# rotated.
connection_parameters = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", "MJ13681.ap-southeast-1"),
    "user": os.environ.get("SNOWFLAKE_USER", "mzs1988"),
    "password": os.environ.get("SNOWFLAKE_PASSWORD") or getpass("Snowflake password: "),
    # NOTE(review): ACCOUNTADMIN is far broader than this job needs —
    # consider a dedicated role.
    "role": os.environ.get("SNOWFLAKE_ROLE", "ACCOUNTADMIN"),
    "warehouse": "COMPUTE_WH",
    "database": "DEV",
    "schema": "DATASCIENCE"
}

# ---- Session ----
session = Session.builder.configs(connection_parameters).create()

try:
    # ---- Current UTC timestamp (second precision) ----
    now = sql_expr("TO_TIMESTAMP_NTZ(TO_CHAR(CONVERT_TIMEZONE('UTC', CURRENT_TIMESTAMP()), 'YYYY-MM-DD HH24:MI:SS'))")
    end_date = lit("2099-12-31 23:59:59").cast("timestamp")

    # ---- Bronze -> Clean (required fields, normalisation, dedupe) ----
    customers = session.table("bronze_customers")
    products  = session.table("bronze_products")
    orders    = session.table("bronze_orders")

    customers_clean = clean_bronze(
        customers,
        {
            "customer_id": col("customer_id").cast("int"),
            "customer_name": sql_expr("TRIM(customer_name)"),
            "start_date": col("start_date").cast("timestamp"),
        },
        now,
        end_date,
        extra_filter=col("customer_id") > 0,
    )

    products_clean = clean_bronze(
        products,
        {
            "product_id": col("product_id").cast("string"),
            "product_name": sql_expr("TRIM(product_name)"),
            "start_date": col("start_date").cast("timestamp"),
        },
        now,
        end_date,
    )

    orders_clean = clean_bronze(
        orders,
        {
            "order_id": col("order_id").cast("string"),
            "customer_id": col("customer_id").cast("int"),
            "product_id": col("product_id").cast("string"),
            "start_date": col("start_date").cast("timestamp"),
        },
        now,
        end_date,
    )

    # ---- Write Silver DLT tables ----
    for clean_frame, silver_table in (
        (customers_clean, "silver_dlt_customers"),
        (products_clean, "silver_dlt_products"),
        (orders_clean, "silver_dlt_orders"),
    ):
        clean_frame.write.mode("overwrite").save_as_table(silver_table)
    print("✅ Silver DLT tables refreshed (deduped, cleansed, UTC load_date)")

    # ---- Stage for Silver Parquet (still Parquet, named silver_csv) ----
    session.sql(f"""
CREATE OR REPLACE STAGE silver_csv
URL = '{SILVER_S3_URL}'
STORAGE_INTEGRATION = aws_snowflake_connect
FILE_FORMAT = (TYPE = PARQUET COMPRESSION = SNAPPY)
""").collect()

    # ---- Export Silver -> S3 Parquet (one file per entity) ----
    # (stage sub-directory, file stem, leading select expressions, source table)
    exports = [
        ("customers", "Customers",
         ["customer_id", "TRIM(customer_name) AS customer_name"],
         "silver_dlt_customers"),
        ("products", "Products",
         ["product_id", "TRIM(product_name) AS product_name"],
         "silver_dlt_products"),
        ("orders", "Orders",
         ["order_id", "customer_id", "product_id"],
         "silver_dlt_orders"),
    ]
    for entity_dir, file_stem, leading_columns, source_table in exports:
        export_sql = build_export_sql(
            entity_dir, file_stem, leading_columns, source_table, folder_path
        )
        session.sql(export_sql).collect()

    print("✅ Parquet exported to:")
    for entity_dir, file_stem, _, _ in exports:
        print(f"   {SILVER_S3_URL}{entity_dir}/Full_load/csv_files/{folder_path}{file_stem}.parquet")
finally:
    # ---- Close (also when a step above failed, so no session is leaked) ----
    session.close()
    print("🔒 Session closed")


In [None]:
# Preview the first silver customers, ordered by customer_id.
# The silver cell above closes its session when it finishes, so reusing that
# closed session would fail on a fresh "Run All". Open a short-lived session
# from the same connection settings and close it again afterwards.
session = Session.builder.configs(connection_parameters).create()
try:
    df = session.table("silver_dlt_customers").order_by("customer_id").limit(10)
    df.show()
finally:
    session.close()