In [0]:
from pyspark.sql.functions import *
from pyspark.sql import Window


# Process CDC data using the SCD2 pattern
class SilverLayer_SCD2:
    def __init__(self):
        pass

    def load_raw_data(self):
        raw_df = (spark
                  .readStream
                  .table("DEV.scd_raw_layer.users_profile_raw")
                 )
        
        processed_df =  (raw_df
                 .selectExpr("user_id","update_type","timestamp","dob","sex","gender","first_name","last_name","address.*")
                 .withColumnRenamed("timestamp","event_timestamp")
                 .withColumn("event_timestamp",col("event_timestamp").cast("timestamp"))
                 .withColumn("dob",to_date(col("dob"),"mm/dd/yyyy"))
                 .withColumn("start_time",col("event_timestamp"))
                 .withColumn("end_time",lit(None))
              )

        return processed_df

    # Implement SCD2
    # Handle multiple events within the same file
    
    def upsert(self,df,batch_id):
    
        window_spec = Window.partitionBy("user_id").orderBy(col("event_timestamp"))
        df_rnk = df.withColumn("row_num",row_number().over(window_spec))

        #MERGE into DEV.scd_silver_layer.users_profile_scd2 t 
        #using users_profile_update_stage s
        #on s.user_id = t.user_id 
        #and s.start_time > t.start_time 
        #and t.end_time is null
        #when matched and s.update_type in ('update','delete') 
        #then update set end_time = current_timestamp

        merge_statment_update_existing_records = """
        MERGE into DEV.scd_silver_layer.users_profile_scd2 t 
        using users_profile_update_scd2_stage s
        on s.user_id = t.user_id
        and s.start_time > t.start_time
        and t.end_time is null
        when matched and s.update_type in ('update','delete') 
        then update set end_time = s.start_time
        """

        merge_statment_insert = """
        MERGE into DEV.scd_silver_layer.users_profile_scd2 t
        using users_profile_insert_scd2_stage s
        on s.user_id = t.user_id
        and t.end_time is null
        when not matched 
        then insert (
         user_id,
         event_timestamp,
         dob,
         sex,
         gender ,
         first_name ,
         last_name ,
         street_address, 
         city, 
         state, 
         zip,
         start_time,
         end_time
        ) 
        values(
            s.user_id,
            s.event_timestamp,
            s.dob,
            s.sex,
            s.gender,
            s.first_name,
            s.last_name,
            s.street_address, 
            s.city, 
            s.state, 
            s.zip,
            s.start_time,
            s.end_time
        )
        """

        # Find the max rank 
        max_rnk = df_rnk.select(max("row_num").alias("max_rnk")).collect()[0][0]

        for i in range(1,max_rnk+1):
            t = df_rnk.filter(col("row_num") == i).drop("row_num")
            t.createOrReplaceTempView("users_profile_update_scd2_stage")
            t._jdf.sparkSession().sql(merge_statment_update_existing_records)

            t1=t.filter(col("update_type") != 'delete')
            t1.createOrReplaceTempView("users_profile_insert_scd2_stage")
            t1._jdf.sparkSession().sql(merge_statment_insert)


    def implement_scd2(self,raw_df):
        streaming_query = (raw_df
                          .writeStream
                          .queryName("User_profile_silver_layer_scd2")
                          .option("checkpointLocation","/Volumes/dev/scd_silver_layer/users_profile_scd2")
                          .trigger(processingTime="1 seconds")
                          .outputMode("update")
                          .foreachBatch(self.upsert)
                          .start()
                          )
        return streaming_query
    
    def start_streaming_query(self):
        raw_df = self.load_raw_data()
        streaming_query = self.implement_scd2(raw_df)
        return streaming_query
    
    

In [0]:
silver_layer = SilverLayer_SCD2()
streaming_query_scd2 = silver_layer.start_streaming_query()

In [0]:
streaming_query_scd2.stop()

In [0]:
%sql
select * from DEV.scd_silver_layer.users_profile_scd2
order by user_id , event_timestamp;

user_id,event_timestamp,dob,sex,gender,first_name,last_name,street_address,city,state,zip,start_time,end_time
12140,2023-03-10T12:32:09Z,1999-01-02,M,M,Robert,Clark,68994 Steven Vista,Pearblossom,CA,93553,2023-03-10T12:32:09Z,2023-03-10T12:33:45Z
12140,2023-03-10T12:33:45Z,1999-01-02,M,M,Robert,Castillo,68994 Steven Vista,Pearblossom,CA,93553,2023-03-10T12:33:45Z,
12227,2023-03-10T12:40:28Z,1949-01-11,F,F,Courtney,Sheppard,47754 Angela Plaza Apt. 135,Los Angeles,CA,90010,2023-03-10T12:40:28Z,
14232,2023-03-11T12:51:27Z,1979-01-04,M,M,Edward,Simpson,92012 Bradley Shoals,Long Beach,CA,90815,2023-03-11T12:51:27Z,
14508,2023-03-11T12:50:25Z,1936-01-28,M,M,Justin,Eaton,04952 Lori Plain,Sierra Madre,CA,91024,2023-03-11T12:50:25Z,
14633,2023-03-11T12:47:10Z,1997-01-04,F,F,Hannah,Fuller,81346 Obrien Streets,Gardena,CA,90249,2023-03-11T12:47:10Z,
