In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Incrementally ingest the CRM sales-details CSV files with Auto Loader
# (the "cloudFiles" streaming source). All reader settings are passed in a
# single options mapping.
stream_df = (
    spark.readStream.format("cloudFiles")
    .options(
        **{
            "cloudFiles.format": "csv",
            # Location where Auto Loader persists the inferred schema.
            "cloudFiles.schemaLocation": "/Volumes/gautham/gtk_scm/test_vlm/scm/crm_sales_details/",
            "header": "true",
            # Infer column data types rather than reading everything as string.
            "inferSchema": "true",
            "cloudFiles.inferColumnTypes": "true",
            "cloudFiles.maxFilesPerTrigger": 100,
        }
    )
    .load("/Volumes/gautham/gtk_scm/test_vlm/src/crm_sales_details/")
)

In [0]:
def _yyyymmdd_to_date(column_name):
    """Build a Column that converts a ``yyyyMMdd`` value into a date.

    The result is NULL when the value equals 0 or its textual length is not
    exactly 8; otherwise the value is cast to STRING and parsed with the
    ``yyyyMMdd`` pattern.

    Args:
        column_name: Name of the column holding the raw ``yyyyMMdd`` value.

    Returns:
        A Spark Column expression producing the cleansed date.
    """
    return expr(f"""CASE
                WHEN {column_name} = 0 OR LEN({column_name}) != 8 THEN NULL
                ELSE to_date(CAST({column_name} AS STRING), 'yyyyMMdd')
            END""")


def process_batch(df, batch_id):
    """Cleanse one micro-batch of sales details and merge it into the Delta table.

    Steps:
      1. Convert the three ``yyyyMMdd`` date columns to real dates
         (invalid values become NULL).
      2. Recalculate ``sls_sales`` when it is missing, non-positive, or
         inconsistent with ``sls_quantity * abs(sls_price)``.
      3. Recalculate ``sls_price`` from the corrected sales when it is missing
         or non-positive (NULL when the quantity is 0).
      4. Stamp the rows with ``dwh_create_date`` and MERGE them into
         ``gautham.gtk_scm.crm_sales_details`` keyed on
         ``(sls_ord_num, sls_prd_key)``.

    Args:
        df: The micro-batch DataFrame handed over by ``foreachBatch``.
        batch_id: Micro-batch identifier supplied by ``foreachBatch``; unused
            here but required by the callback signature.
    """
    # Step 1: cleanse the date columns. Each column is parsed from ITS OWN raw
    # value (previously sls_due_dt was mistakenly derived from sls_ship_dt).
    for date_column in ("sls_order_dt", "sls_ship_dt", "sls_due_dt"):
        df = df.withColumn(date_column, _yyyymmdd_to_date(date_column))

    # Step 2: recalculate sls_sales when it is missing, non-positive, or does
    # not agree with quantity * |price|.
    # NOTE: `abs` here is pyspark.sql.functions.abs (via the notebook's star
    # import), not the Python builtin.
    expected_sales = col("sls_quantity") * abs(col("sls_price"))
    df = df.withColumn(
        "sls_sales",
        when(
            col("sls_sales").isNull()
            | (col("sls_sales") <= 0)
            | (col("sls_sales") != expected_sales),
            expected_sales,
        ).otherwise(col("sls_sales")),
    )

    # Step 3: recalculate sls_price (uses the already-corrected sls_sales).
    df = df.withColumn(
        "sls_price",
        when(
            col("sls_price").isNull() | (col("sls_price") <= 0),
            when(col("sls_quantity") == 0, lit(None))  # avoid division by zero
            .otherwise(col("sls_sales") / col("sls_quantity")),
        ).otherwise(col("sls_price")),
    )

    # Step 4: audit column recording when the row was loaded.
    df = df.withColumn("dwh_create_date", current_timestamp())

    target_table = DeltaTable.forName(spark, "gautham.gtk_scm.crm_sales_details")

    # Rows are matched on the business key (order number + product key).
    merge_condition = (
        "tgt.sls_ord_num = src.sls_ord_num AND tgt.sls_prd_key = src.sls_prd_key"
    )

    # Update a matched row only when at least one tracked attribute differs.
    # The null-safe operator `<=>` is used because a plain `<>` evaluates to
    # NULL when either side is NULL, which would silently skip updates where a
    # value changes to or from NULL (common here, since the cleansing above
    # deliberately produces NULLs).
    tracked_columns = (
        "sls_cust_id",
        "sls_order_dt",
        "sls_ship_dt",
        "sls_due_dt",
        "sls_sales",
        "sls_quantity",
        "sls_price",
    )
    update_condition = " OR ".join(
        f"NOT (tgt.{name} <=> src.{name})" for name in tracked_columns
    )

    # Step 5: perform the merge (upsert).
    (
        target_table.alias("tgt")
        .merge(df.alias("src"), merge_condition)
        .whenMatchedUpdateAll(condition=update_condition)
        .whenNotMatchedInsertAll()
        .execute()
    )

In [0]:
# Start the streaming write: every micro-batch is handed to process_batch,
# progress is tracked in the checkpoint location, and the call blocks until
# the availableNow-triggered query terminates.
(
    stream_df.writeStream
    .foreachBatch(process_batch)
    .option("checkpointLocation", "/Volumes/gautham/gtk_scm/test_vlm/chkp/crm_sales_details/")
    .trigger(availableNow=True)
    .start()
    .awaitTermination()
)