Decide whether to run a full load or an incremental load

In [0]:
# Notebook parameter selecting the load mode:
#   0 - full load (overwrite the dimension), 1 - incremental load (SCD Type 2 merge).
dbutils.widgets.text('load_type',"0","0 - Full Load, 1 - Incremental Load")

# Widget values arrive as strings; convert and validate so that a typo does not
# silently fall through to the incremental branch further down.
_load_type_raw = dbutils.widgets.get('load_type')
try:
    load_type = int(_load_type_raw)
except ValueError:
    raise ValueError(
        f"load_type must be 0 (full load) or 1 (incremental load), got {_load_type_raw!r}"
    ) from None
if load_type not in (0, 1):
    raise ValueError(
        f"load_type must be 0 (full load) or 1 (incremental load), got {load_type}"
    )

Creating Target Table

In [0]:
%sql
-- Target customer dimension (Delta table). Created only if it does not exist yet,
-- so re-running this cell never drops existing data.
-- The start_date / end_date / is_current columns carry the row-versioning metadata
-- written by the load logic later in this notebook.
CREATE TABLE IF NOT EXISTS data_engineering_project.gold_dim_customer(
  customer_key BIGINT,   -- surrogate key assigned by the load logic
  customer_id STRING,    -- business key coming from the source table
  customer_name STRING, 
  segment STRING, 
  country STRING,
  city STRING,
  state STRING,
  postal_code INTEGER,   -- NOTE(review): an INTEGER cannot keep leading zeros; confirm this suits the source data
  region STRING,
  start_date DATE,       -- date the row version was loaded
  end_date DATE,         -- the load logic writes DATE('9999-12-31') for open-ended rows
  is_current INT         -- the load logic writes 1 for the current version and 0 for expired ones
)

USING DELTA;

Source and target tables

In [0]:
# Fully qualified table names interpolated into the SQL statements below.
source = 'data_engineering_project.silver_orders'       # table read by the de-duplication view
target = 'data_engineering_project.gold_dim_customer'   # dimension table written by the load

Keep one row per customer from the source table

In [0]:
# Build the temporary view `source_dedup` with exactly one row per customer_id
# taken from the table named by `source`.
# ROW_NUMBER() numbers each customer's rows by order_year ascending and only the
# row numbered 1 is kept; because of SELECT *, the helper column `rnk` stays in the view.
# NOTE(review): ascending order keeps the row with the LOWEST order_year, i.e. the
# oldest attributes for a customer, and rows sharing the same order_year are picked
# arbitrarily. Confirm whether the most recent row was intended instead.
spark.sql(f"""
          CREATE OR REPLACE TEMPORARY VIEW source_dedup AS(
              SELECT * 
              FROM (
                  SELECT *, ROW_NUMBER() OVER(PARTITION BY customer_id order by order_year ) AS rnk
                  FROM {source}
              ) AS t 
              WHERE rnk = 1
          )
          """)

In [0]:
# Preview the de-duplicated source rows.
spark.sql("SELECT * FROM source_dedup").display()

In [0]:
# Load the customer dimension.
#   load_type == 0 -> full load: overwrite the target with one current row per customer.
#   otherwise      -> incremental load: SCD Type 2 merge (expire changed rows, insert
#                     new versions and brand-new customers).
if load_type == 0:
    # Full load: every customer gets a fresh surrogate key, today's start_date and
    # an open-ended end_date.
    spark.sql(f"""
        INSERT OVERWRITE TABLE {target}
        SELECT
            ROW_NUMBER() OVER (ORDER BY customer_id) AS customer_key,
            customer_id,
            customer_name,
            segment,
            country,
            city,
            state,
            postal_code,
            region,
            CURRENT_DATE()     AS start_date,
            DATE('9999-12-31') AS end_date,
            1                  AS is_current
        FROM source_dedup
    """)
else:
    # View that feeds the incremental load. It must exist on a fresh Run All, so it
    # points at `source_dedup`. To exercise the SCD2 path with the hand-made record,
    # run the (normally skipped) test cell at the end of this notebook and set this
    # to "silver_orders_scd2_test".
    incremental_source = "source_dedup"

    # MAX() over an empty table is NULL (None in Python); COALESCE keeps the key
    # arithmetic below valid when the target has not been loaded yet.
    max_key = spark.sql(
        f"SELECT COALESCE(MAX(customer_key), 0) FROM {target}"
    ).collect()[0][0]

    # Staging view with two kinds of rows:
    #   * merge_key IS NULL        -> rows to INSERT (new customers and the new version
    #                                 of changed customers), each with a new surrogate key
    #   * merge_key = customer_id  -> changed customers whose current row must be expired
    # A NULL merge_key can never satisfy the MERGE join condition, which is what lets a
    # single MERGE both expire the old version and insert the new one.
    # `<=>` is the null-safe equality operator, so a change to or from NULL is detected.
    spark.sql(f"""
        CREATE OR REPLACE TEMPORARY VIEW src_updateorinsert AS
        WITH compared AS (
            SELECT
                s.customer_id,
                s.customer_name,
                s.segment,
                s.country,
                s.city,
                s.state,
                s.postal_code,
                s.region,
                t.customer_id IS NULL AS is_new,
                t.customer_id IS NOT NULL AND NOT (
                    t.customer_name <=> s.customer_name AND
                    t.segment       <=> s.segment       AND
                    t.country       <=> s.country       AND
                    t.city          <=> s.city          AND
                    t.state         <=> s.state         AND
                    t.postal_code   <=> s.postal_code   AND
                    t.region        <=> s.region
                ) AS is_changed
            FROM {incremental_source} AS s
            LEFT JOIN {target} AS t
              ON s.customer_id = t.customer_id
             AND t.is_current = 1
        )
        SELECT
            CAST(NULL AS STRING) AS merge_key,
            {max_key} + ROW_NUMBER() OVER (ORDER BY customer_id) AS customer_key,
            customer_id, customer_name, segment, country, city, state, postal_code, region
        FROM compared
        WHERE is_new OR is_changed

        UNION ALL

        SELECT
            customer_id AS merge_key,
            CAST(NULL AS BIGINT) AS customer_key,
            customer_id, customer_name, segment, country, city, state, postal_code, region
        FROM compared
        WHERE is_changed
    """)

    # Merging logic:
    #   MATCHED     -> only "expire" rows can match; close the current version.
    #   NOT MATCHED -> insert the rows staged with a NULL merge_key as the new current version.
    spark.sql(f"""
        MERGE INTO {target} tgt
        USING src_updateorinsert src
        ON tgt.customer_id = src.merge_key AND tgt.is_current = 1
        WHEN MATCHED THEN UPDATE SET
            tgt.end_date = CURRENT_DATE() - 1,
            tgt.is_current = 0
        WHEN NOT MATCHED AND src.merge_key IS NULL THEN INSERT (
            customer_key,
            customer_id,
            customer_name,
            segment,
            country,
            city,
            state,
            postal_code,
            region,
            start_date,
            end_date,
            is_current
        )
        VALUES (
            src.customer_key,
            src.customer_id,
            src.customer_name,
            src.segment,
            src.country,
            src.city,
            src.state,
            src.postal_code,
            src.region,
            CURRENT_DATE(),
            DATE('9999-12-31'),
            1
        )
    """)

In [0]:
    # Show the highest surrogate key currently stored in the target table.
    # NOTE(review): this cell is indented although it is not inside any block; it
    # presumably only runs because the notebook strips the leading indent - TODO dedent.
    max_key = spark.sql(f"SELECT MAX(customer_key) FROM {target}").first()[0]
    print(max_key)

In [0]:
# Inspect the full contents of the target dimension table after the load.
spark.sql(f"SELECT * FROM {target}").display()

TESTING SCD TYPE 2 UPDATE LOGIC

In [0]:
%skip
%sql
-- Test fixture for the SCD Type 2 logic: a temp view holding every row of
-- source_dedup (customer attribute columns only) plus one hand-written record
-- for customer 'AB-10315'.
-- NOTE(review): the extra record presumably differs from that customer's existing
-- row so that an incremental load sees a changed customer - TODO confirm.
-- NOTE(review): the cell starts with %skip, so it presumably does not run during a
-- normal Run All; remove %skip to build the view - TODO confirm.
CREATE OR REPLACE TEMP VIEW silver_orders_scd2_test AS
SELECT
    customer_id,
    customer_name,
    segment,
    country,
    city,
    state,
    postal_code,
    region
FROM source_dedup

UNION ALL

-- Hand-written record appended to the real source rows.
SELECT
    'AB-10315'      AS customer_id,
    'Alejandro Avila'    AS customer_name,
    'Consumer'      AS segment,
    'United States' AS country,
    'Los Angeles'   AS city,        
    'California'    AS state,
    94122           AS postal_code,
    'West'          AS region;


In [0]:
%skip
spark.sql(f'''SELECT * FROM silver_orders_scd2_test''').display()