In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable


In [0]:
# Auto Loader ("cloudFiles") streaming reader over the crm_prd_info CSV drop folder.
# NOTE(review): "inferSchema" is kept exactly as before; with Auto Loader the
# "cloudFiles.inferColumnTypes" option may already cover it — confirm before removing.
stream_df = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "csv",
        # The inferred schema is stored at this location.
        "cloudFiles.schemaLocation": "/Volumes/gautham/gtk_scm/test_vlm/scm/crm_prd_info/",
        "header": "true",
        # Infer data types.
        "inferSchema": "true",
        "cloudFiles.inferColumnTypes": "true",
        "cloudFiles.maxFilesPerTrigger": 100,
    })
    .load("/Volumes/gautham/gtk_scm/test_vlm/src/crm_prd_info/")
)

In [0]:
def process_batch(df, batch_id):
    """Clean one micro-batch of CRM product rows and upsert it into Delta.

    Derives ``cat_id`` from the first five characters of ``prd_key``, defaults a
    missing ``prd_cost`` to 0, expands the ``prd_line`` code into a label, casts
    ``prd_start_dt`` to a date, derives ``prd_end_dt`` as the next start date for
    the same (untrimmed) ``prd_key``, trims ``prd_key`` to start at character 7,
    stamps ``dwh_create_date``, and merges the result into
    ``gautham.gtk_scm.crm_prd_info``.

    Args:
        df: Micro-batch DataFrame passed in by ``foreachBatch``.
        batch_id: Micro-batch identifier passed in by ``foreachBatch`` (unused).
    """
    # Partition on the original prd_key: the window is applied before the key is
    # trimmed further down the chain.
    # NOTE(review): prd_end_dt only sees rows contained in this micro-batch; rows
    # for the same key arriving in different batches are not linked — confirm
    # this is acceptable for the feed.
    window_spec = Window.partitionBy(col("prd_key")).orderBy(col("prd_start_dt").asc())

    prepared = (
        df
        .withColumn("cat_id", expr("""REPLACE(SUBSTRING(prd_key, 1, 5), '-', '_')"""))
        .withColumn("prd_cost", coalesce("prd_cost", lit(0)))
        .withColumn("prd_line", expr("""CASE 
                    WHEN UPPER(TRIM(prd_line)) = 'M' THEN 'Mountain'
                    WHEN UPPER(TRIM(prd_line)) = 'R' THEN 'Road'
                    WHEN UPPER(TRIM(prd_line)) = 'S' THEN 'Other Sales'
                    WHEN UPPER(TRIM(prd_line)) = 'T' THEN 'Touring'
                    ELSE 'n/a'
                END"""))
        .withColumn("prd_start_dt", expr("cast(prd_start_dt as date)"))
        .withColumn("prd_end_dt", lead("prd_start_dt").over(window_spec))
        .withColumn("prd_key", expr("""SUBSTRING(prd_key, 7, LEN(prd_key))"""))
        # current_timestamp() is already a Column; wrapping it in lit() added nothing.
        .withColumn("dwh_create_date", current_timestamp())
    )

    # Use the session attached to the micro-batch rather than the notebook-global
    # `spark`, so the function also works where the global is not visible to
    # foreachBatch code.
    target_table = DeltaTable.forName(df.sparkSession, "gautham.gtk_scm.crm_prd_info")

    # Rows are matched on the product id plus the trimmed product key.
    merge_condition = "tgt.prd_id = src.prd_id AND tgt.prd_key = src.prd_key"

    # Update only when a tracked attribute really changed. The comparison must be
    # null-safe: `a <> b` evaluates to NULL when either side is NULL, and
    # prd_end_dt is NULL for the latest row of every key, so a plain `<>` never
    # detects a NULL -> date (or date -> NULL) transition.
    tracked_columns = (
        "prd_nm",
        "prd_cost",
        "prd_line",
        "prd_start_dt",
        "prd_end_dt",
        "cat_id",
    )
    update_condition = " OR ".join(
        f"NOT (tgt.{name} <=> src.{name})" for name in tracked_columns
    )

    # Perform the merge: update changed matches, insert unseen rows.
    (
        target_table.alias("tgt")
        .merge(prepared.alias("src"), merge_condition)
        .whenMatchedUpdateAll(condition=update_condition)
        .whenNotMatchedInsertAll()
        .execute()
    )


In [0]:
# Run the stream with an availableNow trigger, hand every micro-batch to
# process_batch, and block until the query terminates.
(
    stream_df.writeStream
    .foreachBatch(process_batch)
    .option("checkpointLocation", "/Volumes/gautham/gtk_scm/test_vlm/chkp/crm_prd_info/")
    .trigger(availableNow=True)
    .start()
    .awaitTermination()
)

In [0]:
%sql
-- Inspect the table this notebook merges into (crm_prd_info); the previous
-- query read crm_cust_info, which nothing in this notebook writes.
select * from gautham.gtk_scm.crm_prd_info