In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable


In [None]:
# Declare the notebook's text widgets; each one starts out empty.
for widget_name in ("catalog", "schema"):
    dbutils.widgets.text(widget_name, "")

In [None]:
# Resolve the target catalog and schema from the notebook widgets.
catalog = dbutils.widgets.get("catalog").strip()
schema = dbutils.widgets.get("schema").strip()

# Both widgets default to "", which would otherwise surface as an opaque SQL
# parse error from `USE CATALOG ` / `USE SCHEMA `; fail early and clearly.
if not catalog or not schema:
    raise ValueError(
        "Both the 'catalog' and 'schema' widgets must be set before running this notebook."
    )

# Make the chosen catalog/schema current so unqualified table names
# used later in the notebook resolve against them.
spark.sql(f"USE CATALOG {catalog}")
spark.sql(f"USE SCHEMA {schema}")

In [0]:
# Streaming reader over the CRM customer CSV files, using the "cloudFiles" source.
stream_df = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "csv",
        # The inferred schema is stored at this location.
        "cloudFiles.schemaLocation": "/Volumes/gautham/gtk_scm/test_vlm/scm/crm_cust_info/",
        "header": "true",
        # Infer data types instead of reading every column as a string.
        "inferSchema": "true",
        "cloudFiles.inferColumnTypes": "true",
        "cloudFiles.maxFilesPerTrigger": 100,
    })
    .load("/Volumes/gautham/gtk_scm/test_vlm/src/crm_cust_info/")
)

In [0]:
def process_batch(df, batch_id):
    """Apply one micro-batch of CRM customer rows to `crm_cust_info` as SCD Type 2.

    Steps:
      1. Keep the latest row per `cst_id` (ordered by `cst_create_date`), drop
         rows without a key, and standardise names, marital status and gender.
      2. Compute `audit_checksum` over the business columns.
      3. Compare with the currently active target rows to find new and changed
         customers; unchanged rows are dropped.
      4. Stage the rows and run a single Delta MERGE that closes the old active
         version and inserts the new one.

    Args:
        df: Micro-batch DataFrame passed in by `foreachBatch`.
        batch_id: Micro-batch identifier passed in by `foreachBatch` (unused).
    """
    # ------------------------------
    # Step 1: De-duplicate and standardise the incoming rows
    # ------------------------------
    latest_first = Window.partitionBy(col("cst_id")).orderBy(col("cst_create_date").desc())
    cleaned_df = (
        df.withColumn("row_num", row_number().over(latest_first))
        .filter("row_num = 1")
        .filter("cst_id is not null")
        .drop("row_num")
        .withColumn("cst_firstname", trim(col("cst_firstname")))
        .withColumn("cst_lastname", trim(col("cst_lastname")))
        .withColumn("cst_marital_status", expr("""CASE
                    WHEN UPPER(TRIM(cst_marital_status)) = 'S' THEN 'Single'
                    WHEN UPPER(TRIM(cst_marital_status)) = 'M' THEN 'Married'
                    ELSE 'n/a'
                END"""))
        .withColumn("cst_gndr", expr("""CASE
                    WHEN UPPER(TRIM(cst_gndr)) = 'F' THEN 'Female'
                    WHEN UPPER(TRIM(cst_gndr)) = 'M' THEN 'Male'
                    ELSE 'n/a'
                END"""))
        .withColumn("dwh_create_date", current_timestamp())
    )

    # ------------------------------
    # Step 2: Checksum over the business columns
    # ------------------------------
    # NULLs are replaced by the text 'null' so a single NULL column does not
    # turn the whole concatenation (and therefore the hash input) into NULL.
    # NOTE(review): the values are concatenated without a separator, so
    # ('ab', 'c') and ('a', 'bc') produce the same checksum. The formula is
    # kept unchanged so checksums already stored in the target stay comparable.
    checksum_columns = [
        "cst_key",
        "cst_firstname",
        "cst_lastname",
        "cst_marital_status",
        "cst_gndr",
        "cst_create_date",
    ]
    checksum_inputs = [
        coalesce(col(name).cast("string"), lit("null")) for name in checksum_columns
    ]
    src_df = cleaned_df.withColumn("audit_checksum", xxhash64(concat(*checksum_inputs)))

    # ------------------------------
    # Step 3: Left join source with active target rows on the primary key
    # ------------------------------
    # The target columns are renamed so both sides stay addressable after the
    # join. The flag column is `is_active`, the name the MERGE below writes.
    tgt_active_df = spark.sql(
        "select cst_id as tgt_cst_id, audit_checksum as tgt_audit_checksum "
        "from crm_cust_info where is_active='Y'"
    )
    compared_df = src_df.join(
        tgt_active_df, col("cst_id") == col("tgt_cst_id"), "left"
    )

    # Rows whose checksum equals the active target checksum are unchanged and dropped.
    changed_df = compared_df.filter(
        col("tgt_audit_checksum").isNull()
        | (col("audit_checksum") != col("tgt_audit_checksum"))
    )

    # ------------------------------
    # Step 4: Stage new and changed records for the MERGE
    # ------------------------------
    src_columns = src_df.columns
    # No active target row for this key -> brand-new customer.
    new_rows_df = changed_df.filter(col("tgt_cst_id").isNull()).select(*src_columns)
    # Active target row exists but the checksum differs -> new version needed.
    changed_existing_df = changed_df.filter(col("tgt_cst_id").isNotNull()).select(*src_columns)

    key_type = src_df.schema["cst_id"].dataType
    final_merge_df = (
        # New customers: merge_key finds no active target row -> inserted.
        new_rows_df.withColumn("merge_key", col("cst_id"))
        # Changed customers: merge_key matches the active target row -> it is closed.
        .unionByName(changed_existing_df.withColumn("merge_key", col("cst_id")))
        # Changed customers again with a NULL merge_key: never matches -> new version inserted.
        .unionByName(changed_existing_df.withColumn("merge_key", lit(None).cast(key_type)))
    )

    # ------------------------------
    # Step 5: Perform MERGE in a single step
    # ------------------------------
    delta_tgt = DeltaTable.forName(spark, "crm_cust_info")

    (
        delta_tgt.alias("tgt")
        .merge(
            final_merge_df.alias("src"),
            # Only the active version may match; closed history rows stay untouched.
            "tgt.cst_id = src.merge_key AND tgt.is_active = 'Y'"
        )
        # update old record to inactive
        .whenMatchedUpdate(set={
            "is_active": "'N'",
            "effective_end_date": "current_timestamp()"
        })
        # insert new or changed version
        .whenNotMatchedInsert(values={
            "cst_id": "src.cst_id",
            "cst_key": "src.cst_key",
            "cst_firstname": "src.cst_firstname",
            "cst_lastname": "src.cst_lastname",
            "cst_marital_status": "src.cst_marital_status",
            "cst_gndr": "src.cst_gndr",
            "cst_create_date": "src.cst_create_date",
            "dwh_create_date": "src.dwh_create_date",
            "audit_checksum": "src.audit_checksum",
            "is_active": "'Y'",
            "effective_start_date": "current_timestamp()",
            "effective_end_date": "NULL"
        })
        .execute()
    )
        

     
        
        
        
        
    
    
    
    



In [0]:
# Run process_batch on every micro-batch, using the availableNow trigger,
# and block until the query terminates.
(
    stream_df.writeStream
    .foreachBatch(process_batch)
    .option("checkpointLocation", "/Volumes/gautham/gtk_scm/test_vlm/chkp/crm_cust_info/")
    .trigger(availableNow=True)
    .start()
    .awaitTermination()
)

In [0]:
%sql
-- Row count of the target customer table.
SELECT COUNT(*)
FROM gautham.gtk_scm.crm_cust_info
-- Disabled alternative check:
-- SELECT DISTINCT dwh_create_date FROM gautham.gtk_scm.crm_cust_info