In [0]:
# Import DeltaTable for Delta Lake operations
from delta.tables import DeltaTable

# Import necessary PySpark SQL functions for DataFrame transformations
from pyspark.sql.functions import coalesce, current_timestamp, col, lit, max, sha2, concat_ws, round   

In [0]:
%run ../utils/config

In [0]:
def _qualify_table(schema, table):
    """Return ``table`` prefixed with ``schema`` as ``schema.table``.

    The qualification is idempotent: if ``table`` already starts with
    ``"<schema>."`` it is returned unchanged. This matters because the
    assignments below overwrite the very variables they read, so re-running
    this cell without re-running the config cell would otherwise produce
    ``schema.schema.table``.

    Args:
        schema: Schema (database) name to prefix with.
        table: Bare or already-qualified table name.

    Returns:
        The schema-qualified table name as a string.
    """
    prefix = "{}.".format(schema)
    table = str(table)
    return table if table.startswith(prefix) else prefix + table


# Schema and table names are not defined in this notebook's visible cells;
# presumably they are provided by the `%run ../utils/config` cell.
cleaned_orders_table = _qualify_table(enriched_uk_schema, cleaned_orders_table)
fact_sales_transactions_table = _qualify_table(curated_uk_schema, fact_sales_transactions_table)

In [0]:
# Watermark for the incremental load: the newest load_date already present in
# the target table, falling back to the literal '1900-01-01' when the
# aggregate is NULL (e.g. the target table has no rows yet).
# NOTE: `max` here is pyspark.sql.functions.max (imported above), not the builtin.
_watermark_expr = coalesce(
    max(col("load_date")),
    lit("1900-01-01"),
).alias("max_load_date")

# A global aggregate yields a single row; pull the scalar out of it.
_watermark_row = spark.table(fact_sales_transactions_table).agg(_watermark_expr).first()
max_load_date = _watermark_row["max_load_date"]

In [0]:
# Source: the cleaned orders table.
ord = spark.table(cleaned_orders_table)

# Incremental slice: keep only rows whose _processing_timestamp is newer than
# the watermark read from the target table (max_load_date).
filtered_source_df = ord.where(col("_processing_timestamp") > max_load_date)


def _sha256_key(column):
    """Build a SHA-256 hex-digest column from a single input column.

    Equivalent to ``sha2(concat_ws('^', column), 256)``.
    """
    return sha2(concat_ws('^', column), 256)


# Output projection for the fact table, in the order the columns are written.
_fact_columns = [
    col("order_id"),
    col("order_hash_key"),
    _sha256_key(col("customer_id")).alias("customer_hash_key"),                 # hash of customer_id
    _sha256_key(col("product_id")).alias("product_hash_key"),                   # hash of product_id
    _sha256_key(col('order_date').cast('string')).alias("calendar_hash_key"),   # hash of order_date as string
    col("quantity"),
    round(col("total_amount"), 2).alias("total_amount"),  # pyspark round, 2 decimal places
    col("source_system"),
    current_timestamp().alias("load_date"),  # load time; read back as the watermark on the next run
]

# Only delivered orders become fact rows.
fact_sales_transactions = (
    filtered_source_df
    .where(col("status") == "Delivered")
    .select(*_fact_columns)
)

# Append the new rows to the fact table.
# NOTE(review): insertInto resolves columns by position, not by name, so the
# order of _fact_columns must match the target table's column order — confirm
# against the table definition.
fact_sales_transactions.write.insertInto(fact_sales_transactions_table)