In [0]:
# Import DeltaTable for performing merge operations on Delta tables
from delta.tables import DeltaTable

# Import necessary Spark SQL functions for data manipulation
from pyspark.sql.functions import coalesce, current_timestamp, col, lit, max, sha2, concat_ws

In [0]:
%run ../utils/config

In [0]:
# Qualify the source and target table names with their schema
# (the unqualified names and schema names are presumably provided by the
# `%run ../utils/config` cell above -- confirm against that notebook).
#
# Each assignment overwrites the very variable it reads, so without the guard a
# second execution of this cell would produce "schema.schema.table". The
# startswith check makes the cell safe to re-run.
if not cleaned_customers_table.startswith(enriched_uk_schema + "."):
    cleaned_customers_table = "{}.{}".format(enriched_uk_schema, cleaned_customers_table)
if not dim_customers_table.startswith(curated_uk_schema + "."):
    dim_customers_table = "{}.{}".format(curated_uk_schema, dim_customers_table)

In [0]:
# Placeholder customer row: every text attribute is 'UNKNOWN' and both date
# attributes carry the sentinel value '9999-12-01'.
default_record = dict(
    customer_id="UNKNOWN",
    first_name="UNKNOWN",
    last_name="UNKNOWN",
    email="UNKNOWN",
    registration_date="9999-12-01",
    city="UNKNOWN",
    country="UNKNOWN",
    load_date="9999-12-01",
    source_system="UNKNOWN",
)

# Single-row DataFrame for the placeholder, with its customer_hash_key set to
# the SHA-256 hex digest of the literal 'UNKNOWN' (concat_ws over one value
# yields that value unchanged).
default_df = spark.createDataFrame([default_record]).withColumn(
    "customer_hash_key",
    sha2(concat_ws("^", lit("UNKNOWN")), 256),
)

In [0]:
# Incremental-load watermark: the latest load_date already present in the
# target dimension table, or '1900-01-01' when there are no real rows yet
# (so that the first run picks up everything).
#
# The default 'UNKNOWN' member is written to the target with the sentinel
# load_date '9999-12-01'. Including it here would push the watermark to year
# 9999 after the first run, and the later `_processing_timestamp > max_load_date`
# filter would then reject every source row. It is therefore excluded, identified
# by its hash key (this expression must stay in sync with the one used to build
# the default record).
unknown_member_hash_key = sha2(concat_ws("^", lit("UNKNOWN")), 256)

max_load_date = (
    spark.table(dim_customers_table)
    # Null-safe comparison so rows with a NULL hash key are still considered.
    .filter(~col("customer_hash_key").eqNullSafe(unknown_member_hash_key))
    .agg(
        coalesce(
            max(col("load_date")),
            lit("1900-01-01")
        ).alias("max_load_date")
    )
    .collect()[0]["max_load_date"]
)

In [0]:
# Columns of the cleaned table that are not carried into the dimension
columns_to_drop = ["start_date", "end_date", "is_current", "row_hash"]

# Candidate rows for the dimension, read from the cleaned customers table:
#   - only rows flagged is_current
#   - missing emails replaced by the literal 'UNKNOWN'
#   - load_date stamped with the current timestamp
source_df = (
    spark.table(cleaned_customers_table)
    .where(col("is_current") == lit(True))
    .withColumn("email", coalesce(col("email"), lit("UNKNOWN")))
    .withColumn("load_date", current_timestamp())
    .drop(*columns_to_drop)
)

In [0]:
# Handle on the target dimension table used for the merge below
target_table = DeltaTable.forName(spark, dim_customers_table)

# Attribute columns copied from source to target. The hash key is the match
# key, so it is only written when a new row is inserted.
attribute_columns = [
    "customer_id",
    "first_name",
    "last_name",
    "email",
    "registration_date",
    "city",
    "country",
    "load_date",
    "source_system",
]
update_mapping = {name: "source." + name for name in attribute_columns}
insert_mapping = {"customer_hash_key": "source.customer_hash_key"}
insert_mapping.update(update_mapping)

# Merge input:
#   1. keep only source rows processed after the current watermark
#   2. drop the technical columns that do not exist in the target
#   3. append the default 'UNKNOWN' record so it is always part of the merge
filtered_source_df = (
    source_df
    .filter(col("_processing_timestamp") > max_load_date)
    .drop("_processing_timestamp", "_source_file_path")
    .unionByName(default_df)
)

# Upsert into the target: rows with a matching hash key are updated,
# all others are inserted.
(
    target_table.alias("target")
    .merge(
        filtered_source_df.alias("source"),
        "target.customer_hash_key = source.customer_hash_key",
    )
    .whenMatchedUpdate(set=update_mapping)
    .whenNotMatchedInsert(values=insert_mapping)
    .execute()
)