In [0]:
# Read the silver-layer orders table and render it for a quick visual check.
df_silver = spark.read.table('data_engineering_project.silver_orders')
display(df_silver)

Creating a widget to choose the load mode: 0 = full load, 1 = incremental load

In [0]:
# Notebook parameter that selects the load mode; the default "0" means full load.
dbutils.widgets.text('load_type', "0", "0 - Full Load, 1 - Incremental Load")

# The widget value is parsed to an int so it can be compared numerically later.
load_type = int(dbutils.widgets.get('load_type'))

# The load cell branches on `load_type == 0` and treats everything else as
# incremental, so a typo such as "2" would silently run an incremental load.
# Fail fast on anything other than the two documented values.
if load_type not in (0, 1):
    raise ValueError(
        f"load_type must be 0 (full load) or 1 (incremental load), got {load_type}"
    )

Creating the Dim Product table with SCD Type 2 audit columns (start_date, end_date, is_current)

In [0]:
%sql
-- Gold-layer product dimension; the audit columns at the end track row versions.
CREATE TABLE IF NOT EXISTS data_engineering_project.gold_dim_product (
    product_key   BIGINT,   -- surrogate key, assigned by the load cell
    product_id    STRING,   -- business key taken from the silver orders table
    category      STRING,
    sub_category  STRING,
    product_name  STRING,
    start_date    DATE,     -- date this version was loaded
    end_date      DATE,     -- 9999-12-31 while the version is still open
    is_current    INT       -- 1 = current version, 0 = expired version
)
USING DELTA;

Defining Source and Target 

In [0]:
# Fully-qualified names of the input (silver) and output (gold) tables that the
# SQL statements in this notebook interpolate.
source, target = (
    "data_engineering_project.silver_orders",
    "data_engineering_project.gold_dim_product",
)

Getting one row per product_id (the row with the most recent order_date) from the silver table

In [0]:
# Build a temp view with exactly one row per product_id: the row belonging to the
# most recent order_date.
#
# * The ORDER BY carries the dimension attributes as tie-breakers. With order_date
#   alone, two rows of the same product on the same date are ranked arbitrarily,
#   so the chosen attributes could differ between runs and make the SCD2 load
#   see a "change" that never happened.
# * Rows with a NULL product_id are dropped. A NULL key can never satisfy the
#   `s.product_id = t.product_id` join used by the incremental load, so such a
#   row would be inserted again on every run.
#
# The helper column `rnk` is still exposed by the view (always 1).
spark.sql(f"""
          CREATE OR REPLACE TEMPORARY VIEW source_dedup_product AS (
              SELECT *
              FROM (
                  SELECT *,
                         ROW_NUMBER() OVER (
                             PARTITION BY product_id
                             ORDER BY order_date DESC, product_name, category, sub_category
                         ) AS rnk
                  FROM {source}
                  WHERE product_id IS NOT NULL
              ) AS t
              WHERE rnk = 1
          )
          """)

In [0]:
# Preview the de-duplicated product rows exposed by the temp view.
display(spark.sql('''SELECT * FROM source_dedup_product'''))

Loading Dim_Product Target using SCD 2 logic

In [0]:
# Load the product dimension.
#   load_type == 0 -> full load: rebuild the whole table from source_dedup_product.
#   otherwise      -> incremental SCD Type 2 load:
#                       1. expire the current version of every changed product,
#                       2. insert a new current version for every source product
#                          that has no current version (new or just expired).
if load_type == 0:
    # INSERT OVERWRITE replaces all existing rows; ROW_NUMBER starts the
    # surrogate keys at 1 again.
    spark.sql(f"""
              INSERT OVERWRITE TABLE {target}
              SELECT ROW_NUMBER() OVER (ORDER BY product_id) AS product_key,
                     product_id,
                     category,
                     sub_category,
                     product_name,
                     CURRENT_DATE() AS start_date,
                     DATE('9999-12-31') AS end_date,
                     1 AS is_current
              FROM source_dedup_product
              """)
else:
    # Highest surrogate key handed out so far. MAX() over an empty table is NULL,
    # which Python would interpolate into the SQL below as the text "None";
    # COALESCE lets an incremental run against an empty target start at key 1.
    max_key = spark.sql(
        f"SELECT COALESCE(MAX(product_key), 0) FROM {target}"
    ).collect()[0][0]

    # Source rows paired with their current dimension row, if any. The MERGE
    # reads its source rows from this view. Its product_key column is not used
    # by the statements below; the INSERT assigns the keys itself.
    spark.sql(f'''
              CREATE OR REPLACE TEMPORARY VIEW src_updateorinsert AS
              (
                  SELECT COALESCE(t.product_key,
                                  {max_key} + ROW_NUMBER() OVER (ORDER BY s.product_id)) AS product_key,
                         s.product_id,
                         s.category,
                         s.sub_category,
                         s.product_name
                  FROM source_dedup_product AS s
                  LEFT JOIN {target} AS t
                    ON s.product_id = t.product_id
                   AND t.is_current = 1
              )
              ''')

    # Step 1 - expire the current version of products whose attributes changed.
    # `<=>` is NULL-safe equality: with plain `!=` a change from or to NULL
    # evaluates to NULL, so the row would neither be expired nor re-inserted.
    spark.sql(f'''
              MERGE INTO {target} tgt
              USING src_updateorinsert src
              ON tgt.product_id = src.product_id AND tgt.is_current = 1
              WHEN MATCHED AND NOT (
                  tgt.product_name <=> src.product_name AND
                  tgt.category     <=> src.category AND
                  tgt.sub_category <=> src.sub_category
              )
              THEN UPDATE SET
                   tgt.end_date = CURRENT_DATE() - 1,
                   tgt.is_current = 0
              ''')

    # Step 2 - insert a new current version for every source product without a
    # current row in the target. After step 1 that covers brand-new products and
    # the products that were just expired, so no separate "changed" predicate is
    # needed. The anti-join reads the source view directly, so the result does
    # not depend on how src_updateorinsert re-evaluates after the MERGE.
    spark.sql(f"""
              INSERT INTO {target}
              SELECT {max_key} + ROW_NUMBER() OVER (ORDER BY s.product_id) AS product_key,
                     s.product_id,
                     s.category,
                     s.sub_category,
                     s.product_name,
                     CURRENT_DATE() AS start_date,
                     DATE('9999-12-31') AS end_date,
                     1 AS is_current
              FROM source_dedup_product s
              LEFT ANTI JOIN {target} t
                ON s.product_id = t.product_id
               AND t.is_current = 1
              """)

In [0]:
    # Look up and print the highest surrogate key currently stored in the target
    # table (None when the table has no rows).
    max_key = spark.sql(f'''SELECT MAX(product_key) FROM {target}''').first()[0]
    print(max_key)

In [0]:
# Show the contents of the dimension table after the load.
display(spark.sql(f'''SELECT * FROM {target}'''))

Creating test data (one new product and one changed version of an existing product) to verify that the SCD Type 2 logic works

In [0]:
%skip
%sql
-- Two hand-written product rows for exercising the SCD Type 2 load.
-- They are identical in every column except product_id.
CREATE OR REPLACE TEMP VIEW silver_orders_scd2_test AS
SELECT *
FROM VALUES
    ('FUR-BO-10000112', 'Notebook', 'Bookcases', 'Bush Birmingham Collection Bookcase, Dark Cherry'),
    ('FUR-B1-10000112', 'Notebook', 'Bookcases', 'Bush Birmingham Collection Bookcase, Dark Cherry')
    AS t(product_id, category, sub_category, product_name)

In [0]:
%skip
spark.sql(f'''SELECT * FROM silver_orders_scd2_test''').display()