In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Streaming source: Databricks Auto Loader ("cloudFiles") reading CSV files
# from the erp_loc_a101 landing directory.
# NOTE(review): the schema location below is the same directory as the load
# path — confirm that keeping schema metadata next to the input files is intended.
# NOTE(review): "inferSchema" looks redundant alongside
# "cloudFiles.inferColumnTypes" for Auto Loader — confirm before removing.
stream_df = (
    spark.readStream.format("cloudFiles")
    .options(
        **{
            "cloudFiles.format": "csv",
            # Where Auto Loader stores the schema it inferred.
            "cloudFiles.schemaLocation": "/Volumes/gautham/gtk_scm/test_vlm/src/erp_loc_a101/",
            # First line of each CSV file is a header row.
            "header": "true",
            # Infer data types.
            "inferSchema": "true",
            "cloudFiles.inferColumnTypes": "true",
            "cloudFiles.maxFilesPerTrigger": 100,
        }
    )
    .load("/Volumes/gautham/gtk_scm/test_vlm/src/erp_loc_a101/")
)

In [0]:
def process_batch(df, batch_id):
    """Clean one micro-batch and upsert it into ``gautham.gtk_scm.erp_loc_a101``.

    Meant to be passed to ``writeStream.foreachBatch``.

    Transformations applied to the batch:
      * ``cid``: every '-' character is removed.
      * ``cntry``: 'DE' -> 'Germany'; 'US' / 'USA' -> 'United States';
        blank or NULL -> 'n/a'; any other value is trimmed.
      * ``dwh_create_date``: set to the current timestamp.

    The batch is then merged into the Delta table on ``cid``:
      * matched rows are fully overwritten only when ``cntry`` differs
        (NULL-safe comparison, so a NULL target value is also corrected);
      * unmatched rows are inserted.

    Args:
        df: Micro-batch DataFrame; must expose ``cid`` and ``cntry`` columns.
        batch_id: Micro-batch id supplied by Structured Streaming (unused).

    Returns:
        None. The only effect is the MERGE into the target table.
    """
    cleaned = (
        df.withColumn('cid', expr("""REPLACE(cid, '-', '')"""))
        .withColumn('cntry', expr("""CASE
                    WHEN TRIM(cntry) = 'DE' THEN 'Germany'
                    WHEN TRIM(cntry) IN ('US', 'USA') THEN 'United States'
                    WHEN TRIM(cntry) = '' OR cntry IS NULL THEN 'n/a'
                    ELSE TRIM(cntry)
                END"""))
        # current_timestamp() is already a Column; wrapping it in lit() adds nothing.
        .withColumn("dwh_create_date", current_timestamp())
    )

    # Use the session attached to the micro-batch instead of the notebook
    # global, so the callback does not depend on REPL state.
    target_table = DeltaTable.forName(df.sparkSession, "gautham.gtk_scm.erp_loc_a101")

    # Rows are matched on the cleaned customer id.
    merge_condition = "tgt.cid = src.cid"
    # `<=>` is NULL-safe equality: with a plain `<>` a NULL tgt.cntry would
    # evaluate to NULL and the row would never be updated.
    update_condition = "NOT (tgt.cntry <=> src.cntry)"

    # NOTE(review): if a batch can contain several rows with the same cleaned
    # cid, the MERGE may fail on ambiguous matches or insert duplicates —
    # confirm upstream uniqueness or de-duplicate before merging.

    # Perform merge
    (
        target_table.alias("tgt")
        .merge(cleaned.alias("src"), merge_condition)
        .whenMatchedUpdateAll(condition=update_condition)
        .whenNotMatchedInsertAll()
        .execute()
    )
    

In [0]:
# Run the stream: each micro-batch is handed to process_batch, progress is
# tracked in the checkpoint directory, and the availableNow trigger is used;
# the cell blocks until the query terminates.
(
    stream_df.writeStream
    .foreachBatch(process_batch)
    .option("checkpointLocation", "/Volumes/gautham/gtk_scm/test_vlm/chkp/erp_loc_a101/")
    .trigger(availableNow=True)
    .start()
    .awaitTermination()
)