In [0]:
# Import necessary modules for working with Delta tables and Spark SQL functions
from delta.tables import DeltaTable
from pyspark.sql.functions import coalesce, current_timestamp, col, lit, max, sha2, concat_ws

In [0]:
%run ../utils/config

In [0]:
def _qualify(schema, table):
    """Return ``schema.table``, leaving ``table`` unchanged if it already has that prefix.

    The guard makes this cell idempotent: the variables below are rebuilt from
    themselves, so re-running the cell without re-running the config notebook
    would otherwise produce ``schema.schema.table``.
    """
    prefix = "{}.".format(schema)
    return table if table.startswith(prefix) else prefix + table


# Bare table names and schema names are expected to come from ../utils/config.
cleaned_products_table = _qualify(enriched_uk_schema, cleaned_products_table)
dim_products_table = _qualify(curated_uk_schema, dim_products_table)

In [0]:
# Sentinel "unknown member" row: every descriptive attribute is the placeholder
# 'UNKNOWN', the price is -1, and all date-like fields carry the far-future
# string '9999-12-01'.
# NOTE(review): load_date here is a far-future value, so any max(load_date)
# taken over the target table must exclude this row.
default_record = dict(
    product_id="UNKNOWN",
    product_name="UNKNOWN",
    category="UNKNOWN",
    unit_price=-1,
    start_date="9999-12-01",
    end_date="9999-12-01",
    is_current=True,
    load_date="9999-12-01",
    source_system="UNKNOWN",
)

# Single-row DataFrame for the sentinel, keyed by the SHA-256 hash of the
# literal 'UNKNOWN' (built with the '^' separator via concat_ws).
default_df = (
    spark.createDataFrame([default_record])
    .withColumn("product_hash_key", sha2(concat_ws("^", lit("UNKNOWN")), 256))
)

In [0]:
# High-water mark for the incremental load: the latest load_date among real
# product rows in the target table, or '1900-01-01' when there are none.
#
# The 'UNKNOWN' placeholder row is excluded on purpose. It is merged into the
# target with load_date '9999-12-01', so including it would pin the watermark
# to the far future after the first run and every later source row would be
# filtered out. eqNullSafe keeps rows whose product_id is NULL in scope.
max_load_date = (
    spark.table(dim_products_table)
    .filter(~col("product_id").eqNullSafe("UNKNOWN"))
    .agg(
        coalesce(
            max(col("load_date")),  # pyspark.sql.functions.max, not the builtin
            lit("1900-01-01")
        ).alias("max_load_date")
    )
    .collect()[0]["max_load_date"]
)

In [0]:
# Upsert the cleaned products into the products dimension (Delta MERGE keyed on
# product_hash_key): changed rows are updated in place, unseen rows are inserted.

# Load the source data from the enriched products table
source_df = spark.table(cleaned_products_table)

# Reference the target Delta table for dimension products
delta_table = DeltaTable.forName(spark, dim_products_table)

# Rename 'price' to 'unit_price' and stamp every row with this run's load time
source_df = (
    source_df
    .withColumnRenamed("price", "unit_price")
    .withColumn("load_date", current_timestamp())
)

# Keep only source rows processed after the latest load already in the target,
# drop the columns that are not merged into the dimension, and append the
# 'UNKNOWN' placeholder row so it is inserted on the first run.
filtered_source_df = (
    source_df
    .filter(col("_processing_timestamp") > max_load_date)
    .drop("stock_quantity", "row_hash", "_processing_timestamp", "_source_file_path")
    .unionByName(default_df)
)

# Rows are matched on the product hash key
merge_condition = "target.product_hash_key = source.product_hash_key"

# Attributes whose change triggers an update of a matched row
tracked_columns = [
    "product_name",
    "category",
    "unit_price",
    "start_date",
    "end_date",
    "is_current",
    "source_system",
]

# Null-safe change detection: `a <> b` evaluates to NULL (treated as "no
# change") when either side is NULL, so a value moving to or from NULL would
# never be picked up. `NOT (a <=> b)` is TRUE exactly when the values differ,
# NULLs included.
update_condition = " OR ".join(
    "NOT (target.{0} <=> source.{0})".format(name) for name in tracked_columns
)

# Columns written on update: every attribute except the merge key itself
updated_columns = ["product_id"] + tracked_columns + ["load_date"]
update_set = {name: "source.{}".format(name) for name in updated_columns}

# Columns written on insert: the merge key plus everything written on update
insert_values = {
    name: "source.{}".format(name)
    for name in ["product_hash_key"] + updated_columns
}

# Perform the merge operation: update matched records, insert new records
(
    delta_table.alias("target")
    .merge(
        filtered_source_df.alias("source"),
        merge_condition
    )
    .whenMatchedUpdate(
        condition=update_condition,
        set=update_set
    )
    .whenNotMatchedInsert(
        values=insert_values
    )
    .execute()
)