In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Auto Loader ("cloudFiles") streaming reader over the erp_cust_az12 CSV folder.
# All reader options are passed in a single mapping; option order is irrelevant.
stream_df = (
    spark.readStream.format("cloudFiles")
    .options(
        **{
            "cloudFiles.format": "csv",
            # Directory in which Auto Loader stores the inferred schema.
            "cloudFiles.schemaLocation": "/Volumes/gautham/gtk_scm/test_vlm/src/erp_cust_az12/",
            "header": "true",
            # Infer column data types instead of reading every column as a string.
            "inferSchema": "true",
            "cloudFiles.inferColumnTypes": "true",
            "cloudFiles.maxFilesPerTrigger": 100,
        }
    )
    .load("/Volumes/gautham/gtk_scm/test_vlm/src/erp_cust_az12/")
)

In [0]:
def process_batch(df, batch_id):
    """Cleanse one micro-batch and apply it to the target Delta table as SCD Type 2.

    Steps:
      1. Standardise ``cid`` (strip a leading 'NAS'), ``bdate`` (future values
         become NULL) and ``gen`` ('Female' / 'Male' / 'n/a'), and stamp
         ``dwh_create_date``.
      2. Compute ``audit_checksum`` over the tracked attributes (``bdate``, ``gen``).
      3. Compare against the currently active target rows (``is_active = 'Y'``)
         to find brand-new keys and keys whose checksum changed.
      4. Run a single MERGE that closes the old active version of changed keys
         and inserts the new rows / new versions.

    Args:
        df: Micro-batch DataFrame handed over by ``foreachBatch``. It must
            contain the columns ``cid``, ``bdate`` and ``gen``.
        batch_id: Micro-batch identifier supplied by Structured Streaming.
            Not used by this function.

    Returns:
        None. The function only has the side effect of merging into
        ``gautham.gtk_scm.erp_cust_az12``.
    """
    target_table = "gautham.gtk_scm.erp_cust_az12"

    # Use the session bound to the micro-batch rather than the notebook-global
    # `spark`, as recommended for foreachBatch handlers.
    session = df.sparkSession

    # ------------------------------
    # Step 1: Cleanse / standardise the incoming rows
    # ------------------------------
    cleansed_df = (
        df
        # Remove 'NAS' prefix if present.
        .withColumn("cid", expr("""CASE
                    WHEN cid LIKE 'NAS%' THEN SUBSTRING(cid, 4, LENGTH(cid))
                    ELSE cid
                END"""))
        # A birth date in the future is treated as invalid.
        .withColumn("bdate", expr("""CASE
                    WHEN bdate > current_timestamp() THEN NULL
                    ELSE bdate
                END"""))
        .withColumn("gen", expr("""CASE
                    WHEN UPPER(TRIM(gen)) IN ('F', 'FEMALE') THEN 'Female'
                    WHEN UPPER(TRIM(gen)) IN ('M', 'MALE') THEN 'Male'
                    ELSE 'n/a'
                END"""))
        .withColumn("dwh_create_date", current_timestamp())
        # MERGE fails when several source rows match the same target row, so
        # keep one row per key.
        # NOTE(review): which duplicate survives is arbitrary - order by a
        # "latest" column first if the feed provides one.
        .dropDuplicates(["cid"])
    )

    # Checksum over the tracked attributes. bdate is cast to string explicitly
    # so the 'null' placeholder never has to be coerced to a date.
    src_df = cleansed_df.withColumn(
        "audit_checksum",
        xxhash64(
            concat(
                coalesce(col("bdate").cast("string"), lit("null")),
                coalesce(col("gen"), lit("null")),
            )
        ),
    )

    # Currently active version of every key in the target.
    # NOTE(review): the MERGE below writes `is_active`, so that column is used
    # here as well (the previous query filtered on `active_flag`) - confirm
    # against the table DDL.
    tgt_active_df = (
        session.table(target_table)
        .where(col("is_active") == "Y")
        .select("cid", "audit_checksum")
    )

    # ------------------------------
    # Step 2: Left join source with active target on primary key
    # ------------------------------
    # An explicit join condition keeps both `src.cid` and `tgt.cid` addressable.
    joined_df = src_df.alias("src").join(
        tgt_active_df.alias("tgt"),
        on=col("src.cid") == col("tgt.cid"),
        how="left",
    )

    # ------------------------------
    # Step 3: Classify rows; identical checksums (no change) are dropped
    # ------------------------------
    is_new = col("tgt.cid").isNull()
    is_changed = col("tgt.cid").isNotNull() & ~col("src.audit_checksum").eqNullSafe(
        col("tgt.audit_checksum")
    )

    new_rows_df = joined_df.filter(is_new).select("src.*")
    changed_rows_df = joined_df.filter(is_changed).select("src.*")

    # ------------------------------
    # Step 4: Stage rows for the MERGE
    # ------------------------------
    # merge_key = cid  -> new key: no active match, row is inserted
    #                  -> changed key: matches the active row, which gets closed
    # merge_key = NULL -> never matches, so the new version of a changed key
    #                     is inserted alongside the closed one
    null_key = lit(None).cast(src_df.schema["cid"].dataType)
    staged_df = (
        new_rows_df.withColumn("merge_key", col("cid"))
        .unionByName(changed_rows_df.withColumn("merge_key", col("cid")))
        .unionByName(changed_rows_df.withColumn("merge_key", null_key))
    )

    # ------------------------------
    # Step 5: Perform MERGE in a single step
    # ------------------------------
    (
        DeltaTable.forName(session, target_table).alias("tgt")
        .merge(
            staged_df.alias("src"),
            # Only the active version may be matched; historical rows stay untouched.
            "tgt.cid = src.merge_key AND tgt.is_active = 'Y'",
        )
        # update old record to inactive
        .whenMatchedUpdate(set={
            "is_active": "'N'",
            "effective_end_date": "current_timestamp()",
        })
        # insert new or changed version
        .whenNotMatchedInsert(values={
            "cid": "src.cid",
            "bdate": "src.bdate",
            "gen": "src.gen",
            "dwh_create_date": "src.dwh_create_date",
            "audit_checksum": "src.audit_checksum",
            "is_active": "'Y'",
            "effective_start_date": "current_timestamp()",
            "effective_end_date": "NULL",
        })
        .execute()
    )
        






    
    

In [0]:
# Feed every pending micro-batch through process_batch, then stop once the
# backlog is drained (availableNow trigger) and block until the run finishes.
(
    stream_df.writeStream
    .trigger(availableNow=True)
    .option("checkpointLocation", "/Volumes/gautham/gtk_scm/test_vlm/chkp/erp_cust_az12/")
    .foreachBatch(process_batch)
    .start()
    .awaitTermination()
)