In [0]:
from pyspark.sql.functions import *
from pyspark.sql import Window


class SilverLayer_SCD1:
    def __init__(self):
        pass

    def load_raw_data(self):
        raw_df = (spark
                  .readStream
                  .table("DEV.scd_raw_layer.users_profile_raw")
                 )
        
        processed_df =  (raw_df
                 .selectExpr("user_id","update_type","timestamp","dob","sex","gender","first_name","last_name","address.*")
                 .withColumnRenamed("timestamp","event_timestamp")
                 .withColumn("event_timestamp",col("event_timestamp").cast("timestamp"))
                 .withColumn("dob",to_date(col("dob"),"mm/dd/yyyy"))
              )

        return processed_df

    # Implement SCD1
    # Overwrite the changes - No need to maintain any history
    def upsert(self,df,batch_id):

        # Within microbatch pick the latest records.
        # Remove Duplicates within the microbatch
        window_spec = Window.partitionBy("user_id").orderBy(col("event_timestamp").desc())
        df_rank = df.withColumn("row_num",row_number().over(window_spec))
        dedup_df = df_rank.filter(col("row_num") == 1).drop("row_num")

        dedup_df.createOrReplaceTempView("users_profile_scd1_stage")

        # Overwrite the records across the microbatches
        merge_statment = """
        MERGE into DEV.scd_silver_layer.users_profile_scd1 t
        using users_profile_scd1_stage s
        on t.user_id = s.user_id
        when matched and s.update_type = 'update'  then update set *
        when matched and s.update_type = 'delete' then delete
        when not matched and s.update_type = 'new' then insert *
        when not matched then insert *
        """

        df._jdf.sparkSession().sql(merge_statment)

    def implement_scd1(self,raw_df):
        streaming_query = (raw_df
                          .writeStream
                          .queryName("User_profile_scd1_silver_layer")
                          .option("checkpointLocation","/Volumes/dev/scd_silver_layer/users_profile_scd1")
                          .trigger(processingTime="1 seconds")
                          .outputMode("update")
                          .foreachBatch(self.upsert)
                          .start()
                          )
        return streaming_query
    
    def start_streaming_query(self):
        raw_df = self.load_raw_data()
        streaming_query = self.implement_scd1(raw_df)
        return streaming_query
    
    

In [0]:
silver_layer = SilverLayer_SCD1()
streaming_query = silver_layer.start_streaming_query()

In [0]:
streaming_query.stop()

In [0]:
%sql
select * from DEV.scd_silver_layer.users_profile_scd1;

user_id,event_timestamp,dob,sex,gender,first_name,last_name,street_address,city,state,zip
12140,2023-03-10T12:33:45Z,1999-01-02,M,M,Robert,Castillo,68994 Steven Vista,Pearblossom,CA,93553
12227,2023-03-10T12:40:28Z,1949-01-11,F,F,Courtney,Sheppard,47754 Angela Plaza Apt. 135,Los Angeles,CA,90010
14232,2023-03-11T12:51:27Z,1979-01-04,M,M,Edward,Simpson,92012 Bradley Shoals,Long Beach,CA,90815
14508,2023-03-11T12:50:25Z,1936-01-28,M,M,Justin,Eaton,04952 Lori Plain,Sierra Madre,CA,91024
14633,2023-03-11T12:47:10Z,1997-01-04,F,F,Hannah,Fuller,81346 Obrien Streets,Gardena,CA,90249
