In [0]:
# Load the silver orders table and render it for a quick visual check.
df_silver = spark.table("data_engineering_project.silver_orders")
display(df_silver)

Create Fact Table

In [0]:
%sql
-- Gold-layer sales fact table. IF NOT EXISTS makes this cell safe to re-run.
CREATE TABLE IF NOT EXISTS data_engineering_project.gold_fact_sales (
    sales_key           BIGINT,     -- surrogate key assigned by the load cell
    order_id            STRING,     -- business key carried over from the silver table
    customer_key        BIGINT,     -- looked up from gold_dim_customer
    product_key         BIGINT,     -- looked up from gold_dim_product
    order_date_key      INT,        -- looked up from gold_dim_date (order date)
    ship_date_key       INT,        -- looked up from gold_dim_date (ship date)
    ship_mode           STRING,
    shipping_days       INT,
    sales               DOUBLE,
    quantity            INT,
    discount            DOUBLE,
    profit              DOUBLE,
    ingestion_timestamp TIMESTAMP,
    file_source         STRING
)
USING DELTA;

In [0]:
# Fully-qualified names of the source (silver) and target (gold) tables that the
# f-string SQL cells in this notebook interpolate.
src_fact_sales, tgt_fact_sales = (
    "data_engineering_project.silver_orders",
    "data_engineering_project.gold_fact_sales",
)

Remove duplicates from the silver table (keep one row per order_id) before inserting into the fact table

In [0]:
# Build a temporary view holding exactly one row per order_id from the silver table.
# Rows are ranked newest-first by order_date; ingestion_timestamp is the tie-breaker,
# so when an order_id appears several times with the same order_date the most
# recently ingested record is kept rather than an arbitrary one.
# CREATE VIEW returns no rows, so the result is intentionally not displayed.
spark.sql(f"""
          CREATE OR REPLACE TEMPORARY VIEW source_dedup_fact_sales AS (
              SELECT *
              FROM (
                  SELECT *,
                         ROW_NUMBER() OVER (
                             PARTITION BY order_id
                             ORDER BY order_date DESC, ingestion_timestamp DESC
                         ) AS rnk
                  FROM {src_fact_sales}
              ) AS t
              WHERE rnk = 1
          )
          """)

In [0]:
%sql
-- Preview the deduplicated source rows (every row in this view has rnk = 1).
SELECT *
FROM source_dedup_fact_sales

Get the current maximum sales_key in the fact table (0 if it is empty) so that new surrogate keys continue from it during incremental loads

In [0]:
# Highest sales_key currently in the fact table; COALESCE turns the NULL returned
# for an empty table into 0. The load cell adds ROW_NUMBER() on top of this value.
max_key_row = spark.sql(f"""
    SELECT COALESCE(MAX(sales_key),0)
    FROM {tgt_fact_sales}
    """).first()
max_key = max_key_row[0]
print(max_key)

In [0]:
%sql
-- Inspect the fact table's current contents.
SELECT *
FROM data_engineering_project.gold_fact_sales

Loading the Fact Table

In [0]:
# Incrementally load new orders from the deduplicated view into the fact table.
# - sales_key continues from max_key, which an earlier cell reads from the fact
#   table; re-run that cell first if the table has changed since it was computed.
# - The target columns are listed explicitly so the load does not depend on the
#   physical column order of the table.
# - The inner joins resolve the dimension keys; an order with no match in one of
#   the dimensions is not inserted by this statement.
# - The LEFT ANTI JOIN skips order_ids that already exist in the fact table.
spark.sql(f"""
         INSERT INTO {tgt_fact_sales} (
          sales_key,
          order_id,
          customer_key,
          product_key,
          order_date_key,
          ship_date_key,
          ship_mode,
          shipping_days,
          sales,
          quantity,
          discount,
          profit,
          ingestion_timestamp,
          file_source
         )
         SELECT
          {max_key} + ROW_NUMBER() OVER(ORDER BY s.order_id) AS sales_key,
          s.order_id AS order_id,
          c.customer_key AS customer_key,
          p.product_key AS product_key,
          d_o.date_key AS order_date_key,
          d_s.date_key AS ship_date_key,
          s.ship_mode AS ship_mode,
          s.shipping_days AS shipping_days,
          s.sales AS sales,
          s.quantity AS quantity,
          s.discount AS discount,
          s.profit AS profit,
          s.ingestion_timestamp AS ingestion_timestamp,
          s.file_source AS file_source
          FROM source_dedup_fact_sales AS s
          JOIN data_engineering_project.gold_dim_customer AS c
          ON s.customer_id = c.customer_id
          JOIN data_engineering_project.gold_dim_product AS p
          ON s.product_id = p.product_id
          JOIN data_engineering_project.gold_dim_date AS d_o
          ON s.order_date = d_o.full_date
          JOIN data_engineering_project.gold_dim_date AS d_s
          ON s.ship_date = d_s.full_date
          LEFT ANTI JOIN {tgt_fact_sales} AS t
          ON s.order_id = t.order_id
""")

In [0]:
# Show the fact table after the load so the inserted rows can be checked.
fact_sales_preview = spark.sql(f"""SELECT * FROM {tgt_fact_sales}""")
display(fact_sales_preview)

In [0]:
%skip
%sql
-- Ad-hoc lookup of a single gold_dim_date row by date_key (cell is marked %skip).
-- NOTE(review): date_key is compared with the string literal '248', while the fact
-- table stores date keys as INT; confirm the dimension column type.
SELECT * 
FROM data_engineering_project.gold_dim_date
WHERE date_key = '248'



In [0]:
%skip
# Alternative version of the fact load; this cell is marked %skip.
# It differs from the active load only in the customer and product joins, which
# additionally require s.order_date to fall between the dimension row's
# start_date and end_date.
# NOTE(review): presumably intended for versioned (SCD2-style) dimensions; confirm
# that start_date/end_date exist on both dimension tables before enabling this cell.
spark.sql(f"""
         INSERT INTO {tgt_fact_sales}
         SELECT
          {max_key} + ROW_NUMBER() OVER(ORDER BY s.order_id) AS sales_key,
          s.order_id AS order_id,
          c.customer_key AS customer_key,
          p.product_key AS product_key,
          d_o.date_key AS order_date_key,
          d_s.date_key AS ship_date_key,
          s.ship_mode AS ship_mode,
          s.shipping_days As shipping_days,
          s.sales AS sales,
          s.quantity AS quantity,
          s.discount AS discount,
          s.profit AS profit,
          s.ingestion_timestamp AS ingestion_timestamp,
          s.file_source AS file_source
          FROM source_dedup_fact_sales AS s 
          JOIN data_engineering_project.gold_dim_customer As c
          ON s.customer_id = c.customer_id
          AND s.order_date BETWEEN c.start_date AND c.end_date
          JOIN data_engineering_project.gold_dim_product As p 
          ON s.product_id = p.product_id
          AND s.order_date BETWEEN p.start_date AND p.end_date
          JOIN data_engineering_project.gold_dim_date As d_o
          ON s.order_date = d_o.full_date
          JOIN data_engineering_project.gold_dim_date As d_s
          ON s.ship_date = d_s.full_date
          LEFT ANTI JOIN {tgt_fact_sales} AS t
          ON s.order_id = t.order_id
""")

In [0]:
%skip
%sql
-- Ad-hoc inspection of the customer dimension (cell is marked %skip).
SELECT * FROM data_engineering_project.gold_dim_customer