In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Auto Loader ("cloudFiles") reader options for the ERP customer CSV feed.
# NOTE(review): the schema location below is the same directory as the source
# path passed to .load(); confirm that keeping Auto Loader's schema metadata
# next to the data files is intentional.
_erp_cust_az12_reader_options = {
    "cloudFiles.format": "csv",
    "cloudFiles.schemaLocation": "/Volumes/gautham/gtk_scm/test_vlm/src/erp_cust_az12/",
    "header": "true",
    "inferSchema": "true",
    "cloudFiles.inferColumnTypes": "true",
    "cloudFiles.maxFilesPerTrigger": 100,
}

# Streaming DataFrame over every CSV file that lands in the source folder.
stream_df = (
    spark.readStream
    .format("cloudFiles")
    .options(**_erp_cust_az12_reader_options)
    .load("/Volumes/gautham/gtk_scm/test_vlm/src/erp_cust_az12/")
)

In [0]:
def process_batch(df, batch_id):
    """Clean one micro-batch of ERP customer rows and upsert it into the Delta target.

    Transformations applied to the batch before the merge:
      * ``cid``   - a leading 'NAS' prefix is stripped.
      * ``bdate`` - values later than the current timestamp become NULL.
      * ``gen``   - 'F'/'FEMALE' -> 'Female', 'M'/'MALE' -> 'Male'
                    (case/whitespace-insensitive); anything else, including
                    NULL, becomes 'n/a'.
      * ``dwh_create_date`` - stamped with the current timestamp.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Micro-batch handed over by ``foreachBatch``; must expose the columns
        ``cid``, ``bdate`` and ``gen``.
    batch_id : int
        Micro-batch identifier supplied by Structured Streaming (unused).

    Returns
    -------
    None
        The only effect is the MERGE into ``gautham.gtk_scm.erp_cust_az12``.
    """
    df=df.withColumn('cid',expr("""CASE
				WHEN cid LIKE 'NAS%' THEN SUBSTRING(cid, 4, LEN(cid)) -- Remove 'NAS' prefix if present
				ELSE cid
			END """))
    df=df.withColumn('bdate',expr("""CASE
                    WHEN bdate > GETDATE() THEN NULL
                    ELSE bdate
                END"""))

    df=df.withColumn('gen',expr("""CASE
                    WHEN UPPER(TRIM(gen)) IN ('F', 'FEMALE') THEN 'Female'
                    WHEN UPPER(TRIM(gen)) IN ('M', 'MALE') THEN 'Male'
                    ELSE 'n/a'
                END"""))
    # current_timestamp() already returns a Column, so no lit() wrapper is needed.
    df = df.withColumn("dwh_create_date", current_timestamp())

    # Use the session attached to the micro-batch rather than the notebook
    # global, so the function does not depend on ambient notebook state.
    target_table = DeltaTable.forName(df.sparkSession, "gautham.gtk_scm.erp_cust_az12")

    # Define merge condition
    merge_condition = "tgt.cid = src.cid  "
    # Null-safe change detection: `<>` evaluates to NULL when either side is
    # NULL, which would silently skip rows whose bdate switches to or from NULL
    # (bdate is deliberately nulled above for future dates).
    update_condition = """
            NOT (tgt.bdate <=> src.bdate) OR
            NOT (tgt.gen <=> src.gen)
        """

    # NOTE(review): assumes each cid occurs at most once per micro-batch; Delta
    # MERGE fails when several source rows match the same target row - confirm,
    # or deduplicate the source before merging.
    # NOTE(review): whenMatchedUpdateAll copies every source column, so an
    # update also overwrites dwh_create_date - confirm this is intended.

    # Perform merge
    (
        target_table.alias("tgt")
        .merge(df.alias("src"), merge_condition)
        .whenMatchedUpdateAll(condition=update_condition)
        .whenNotMatchedInsertAll()
        .execute()
    )
    

In [0]:
# Start the upsert stream: each micro-batch is handed to process_batch, progress
# is tracked in the checkpoint folder, and the availableNow trigger is used so
# the cell blocks in awaitTermination() until the query ends.
erp_cust_az12_query = (
    stream_df.writeStream
    .foreachBatch(process_batch)
    .option("checkpointLocation", "/Volumes/gautham/gtk_scm/test_vlm/chkp/erp_cust_az12/")
    .trigger(availableNow=True)
    .start()
)
erp_cust_az12_query.awaitTermination()