In [0]:
# Make sure the DEV catalog is present before any schema is created in it.
create_catalog_ddl = "create catalog if not exists DEV"
spark.sql(create_catalog_ddl)

In [0]:
# Reset step: removes the raw-layer schema and everything inside it (CASCADE),
# so every run of the notebook starts from an empty schema.
drop_raw_layer_ddl = "drop database if exists DEV.scd_raw_layer cascade"
spark.sql(drop_raw_layer_ddl)

DataFrame[]

In [0]:


# --- DDL for the raw (bronze) layer -----------------------------------------

# Schema that holds the raw table and the two external volumes.
raw_layer_ddl = "create database if not exists DEV.scd_raw_layer"

# Target Delta table; its columns mirror the JSON schema used by the stream reader.
users_profile_raw_ddl = """create table if not exists DEV.scd_raw_layer.users_profile_raw(
          user_id string,
          update_type string,
          timestamp bigint,
          dob string,
          sex string,
          gender string,
          first_name string,
          last_name string,
          address struct<street_address:string, city:string, state:string, zip:string>
       ) using delta
          """

# External volume over the directory the incoming JSON files land in.
input_volume_ddl = (
    "create external volume if not exists DEV.scd_raw_layer.input_data "
    "location 'abfss://scd-implementation@dbstorageact.dfs.core.windows.net/data/users/'"
)

# External volume used as the streaming checkpoint directory.
checkpoint_volume_ddl = (
    "create external volume if not exists DEV.scd_raw_layer.users_profile_raw_checkpoint "
    "location 'abfss://scd-implementation@dbstorageact.dfs.core.windows.net/checkpoint/users_profile_raw'"
)

# Run the statements in dependency order: schema, table, then the volumes.
spark.sql(raw_layer_ddl)
spark.sql(users_profile_raw_ddl)
spark.sql(input_volume_ddl)
spark.sql(checkpoint_volume_ddl)

DataFrame[]

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window

class BronzeLayer:
    """Stream raw user-profile JSON files into the bronze Delta table.

    The pipeline reads JSON files as a stream, and for every micro-batch
    de-duplicates the rows and MERGEs them into the target table so that the
    table holds at most one row per ``(user_id, timestamp)``.

    Args:
        input_path: Directory the JSON files are read from.
        checkpoint_path: Directory used as the streaming checkpoint location.
        target_table: Fully qualified name of the Delta table to merge into.

    All arguments default to the locations used by this notebook, so
    ``BronzeLayer()`` behaves as before.
    """

    def __init__(
        self,
        input_path="/Volumes/dev/scd_raw_layer/input_data",
        checkpoint_path="/Volumes/dev/scd_raw_layer/users_profile_raw_checkpoint",
        target_table="DEV.scd_raw_layer.users_profile_raw",
    ):
        self.input_path = input_path
        self.checkpoint_path = checkpoint_path
        self.target_table = target_table

    def getSchema(self):
        """Return the explicit schema of one incoming JSON record.

        Every field is nullable; ``address`` is a nested struct.
        """
        return StructType([
            StructField("user_id", StringType(), True),
            StructField("update_type", StringType(), True),
            StructField("timestamp", LongType(), True),
            StructField("dob", StringType(), True),
            StructField("sex", StringType(), True),
            StructField("gender", StringType(), True),
            StructField("first_name", StringType(), True),
            StructField("last_name", StringType(), True),
            StructField("address", StructType([
                StructField("street_address", StringType(), True),
                StructField("city", StringType(), True),
                StructField("state", StringType(), True),
                StructField("zip", StringType(), True),
            ]), True)
        ])

    def read_raw_data(self):
        """Return a streaming DataFrame over the JSON files in ``input_path``.

        Relies on the notebook-provided ``spark`` session.
        """
        raw_df = (
            spark
            .readStream
            .format("json")
            .schema(self.getSchema())
            .load(self.input_path)
        )

        return raw_df

    # Duplicates are handled at two levels:
    #   * within a micro-batch  -> dropDuplicates on the merge key
    #   * across micro-batches  -> MERGE against the target table
    #
    # Code inside the upsert method won't use the state store.
    def upsert(self, df, batch_id):
        """Merge one micro-batch into the target table (``foreachBatch`` callback).

        Args:
            df: The micro-batch DataFrame.
            batch_id: Micro-batch identifier supplied by ``foreachBatch``; unused.
        """
        # Within the micro-batch keep a single row per merge key. MERGE rejects
        # a source that matches the same target row more than once, so the key
        # used here must be the same as the key in the ON clause below.
        dedup_df = df.dropDuplicates(["user_id", "timestamp"])

        # Across micro-batches: rows already present in the target are updated,
        # new ones are inserted.
        dedup_df.createOrReplaceTempView("users_profile_stage")
        merge_statement = f"""
                  MERGE INTO {self.target_table} AS target
                  USING users_profile_stage AS source
                  ON (target.user_id = source.user_id AND target.timestamp = source.timestamp)
                  WHEN MATCHED THEN UPDATE SET *
                  WHEN NOT MATCHED THEN INSERT *
                  """
        # Run the MERGE on the session that owns the micro-batch (and therefore
        # the temp view), through the public API rather than the private
        # ``_jdf`` JVM handle.
        dedup_df.sparkSession.sql(merge_statement)

    def write_raw_df(self, raw_df):
        """Start the streaming query that upserts ``raw_df`` batch by batch.

        Args:
            raw_df: Streaming DataFrame to write.

        Returns:
            The started streaming query handle.
        """
        streamingQuery = (
            raw_df
            .writeStream
            .queryName("User_profile_bronze_layer")
            .format("delta")
            .option("checkpointLocation", self.checkpoint_path)
            .trigger(processingTime="1 seconds")
            .outputMode("update")
            .foreachBatch(self.upsert)
            .start()
        )

        return streamingQuery

    def start_streaming(self):
        """Build the source stream and start the upsert query; return its handle."""
        raw_df = self.read_raw_data()
        streamingQuery = self.write_raw_df(raw_df)
        return streamingQuery




In [0]:
# Bronze Layer Streaming Query
# Instantiate the bronze-layer pipeline and start its streaming query. The
# returned handle is kept in `streaming_query` so that a later cell can stop it.

bronze_layer = BronzeLayer()
streaming_query = bronze_layer.start_streaming()

In [0]:
# Stop the bronze-layer streaming query started in the cell above.
streaming_query.stop()

In [0]:
%sql
-- Inspect the bronze table: every row, grouped per user in event-time order.
SELECT *
FROM DEV.scd_raw_layer.users_profile_raw
ORDER BY user_id, timestamp;

user_id,update_type,timestamp,dob,sex,gender,first_name,last_name,address
12140,new,1678451529,02/02/1999,M,M,Robert,Clark,"List(68994 Steven Vista, Pearblossom, CA, 93553)"
12140,update,1678451625,02/02/1999,M,M,Robert,Castillo,"List(68994 Steven Vista, Pearblossom, CA, 93553)"
12227,new,1678452028,12/11/1949,F,F,Courtney,Sheppard,"List(47754 Angela Plaza Apt. 135, Los Angeles, CA, 90010)"
14232,new,1678539087,01/04/1979,M,M,Edward,Simpson,"List(92012 Bradley Shoals, Long Beach, CA, 90815)"
14508,new,1678539025,01/28/1936,M,M,Justin,Eaton,"List(04952 Lori Plain, Sierra Madre, CA, 91024)"
14633,new,1678538830,09/04/1997,F,F,Hannah,Fuller,"List(81346 Obrien Streets, Gardena, CA, 90249)"
