In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable


In [0]:
# Auto Loader (cloudFiles) streaming reader over the raw CRM customer CSV drop.
# All reader settings are passed as one mapping; keys and values are unchanged.
# NOTE(review): `inferSchema` is the plain CSV-reader switch; Auto Loader documents
# `cloudFiles.inferColumnTypes` for the same purpose, so the former is presumably
# redundant here — confirm before removing it.
stream_df = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "csv",
        # Directory where Auto Loader persists the schema it infers.
        "cloudFiles.schemaLocation": "/Volumes/gautham/gtk_scm/test_vlm/scm/crm_cust_info/",
        "header": "true",
        "inferSchema": "true",
        "cloudFiles.inferColumnTypes": "true",
        "cloudFiles.maxFilesPerTrigger": 100,
    })
    .load("/Volumes/gautham/gtk_scm/test_vlm/src/crm_cust_info/")
)

In [0]:
def process_batch(df, batch_id):
    """Clean one micro-batch of CRM customer rows and upsert it into the Delta target.

    Steps:
      1. Discard rows whose ``cst_id`` is null, then keep a single row per
         ``cst_id`` — the one ranked first by ``cst_create_date`` descending.
      2. Trim ``cst_firstname`` / ``cst_lastname`` and expand the one-letter
         marital-status and gender codes (anything unrecognised becomes ``'n/a'``).
      3. Add ``dwh_create_date`` (current timestamp) and MERGE the result into
         ``gautham.gtk_scm.crm_cust_info`` keyed on ``cst_id``.

    Args:
        df: Micro-batch DataFrame handed over by ``foreachBatch``.
        batch_id: Micro-batch identifier supplied by ``foreachBatch`` (unused).

    Returns:
        None. The only effect is the MERGE into the target table.
    """
    # Rows without a key are discarded anyway, so drop them before the window
    # instead of ranking them first.
    keyed = df.filter(col("cst_id").isNotNull())

    # NOTE(review): rows sharing both cst_id and cst_create_date are tie-broken
    # arbitrarily by row_number(); add a secondary sort key if that matters.
    newest_first = Window.partitionBy(col("cst_id")).orderBy(col("cst_create_date").desc())
    latest = (
        keyed
        .withColumn("row_num", row_number().over(newest_first))
        .filter(col("row_num") == 1)
        .drop("row_num")
    )

    cleaned = (
        latest
        .withColumn("cst_firstname", trim(col("cst_firstname")))
        .withColumn("cst_lastname", trim(col("cst_lastname")))
        .withColumn("cst_marital_status", expr("""CASE
                    WHEN UPPER(TRIM(cst_marital_status)) = 'S' THEN 'Single'
                    WHEN UPPER(TRIM(cst_marital_status)) = 'M' THEN 'Married'
                    ELSE 'n/a'
                END"""))
        .withColumn("cst_gndr", expr("""CASE
                    WHEN UPPER(TRIM(cst_gndr)) = 'F' THEN 'Female'
                    WHEN UPPER(TRIM(cst_gndr)) = 'M' THEN 'Male'
                    ELSE 'n/a'
                END"""))
        # current_timestamp() already yields a Column; no lit() wrapper is needed.
        .withColumn("dwh_create_date", current_timestamp())
    )

    # Use the session attached to the micro-batch rather than the notebook-global
    # `spark`, so the function does not depend on driver-side global state.
    target_table = DeltaTable.forName(df.sparkSession, "gautham.gtk_scm.crm_cust_info")

    merge_condition = "tgt.cst_id = src.cst_id"

    # Only rewrite a matched row when at least one business column differs.
    # `<=>` is the null-safe equality operator: a plain `<>` evaluates to NULL
    # when either side is NULL, which would silently skip NULL <-> value changes.
    compared_columns = [
        "cst_key",
        "cst_firstname",
        "cst_lastname",
        "cst_marital_status",
        "cst_gndr",
        "cst_create_date",
    ]
    update_condition = " OR ".join(
        f"NOT (tgt.{name} <=> src.{name})" for name in compared_columns
    )

    # Perform the merge (upsert).
    (
        target_table.alias("tgt")
        .merge(cleaned.alias("src"), merge_condition)
        .whenMatchedUpdateAll(condition=update_condition)
        .whenNotMatchedInsertAll()
        .execute()
    )



In [0]:
# Start the streaming write with an availableNow trigger, hand every micro-batch
# to process_batch, and block until the query terminates.
(
    stream_df.writeStream
    .foreachBatch(process_batch)
    .option("checkpointLocation", "/Volumes/gautham/gtk_scm/test_vlm/chkp/crm_cust_info/")
    .trigger(availableNow=True)
    .start()
    .awaitTermination()
)

In [0]:
%sql
-- Sanity check: total number of rows in the customer table.
SELECT COUNT(*)
FROM gautham.gtk_scm.crm_cust_info
-- Alternative check (disabled): list the distinct load timestamps.
-- SELECT DISTINCT dwh_create_date FROM gautham.gtk_scm.crm_cust_info