In [0]:
%fs rm -r /tmp

In [0]:
# This lakehouse script covers many essential features of a lakehouse data reporting solution, including:
# 1. Schema Management: It starts by dropping any existing schemas and creating new ones, ensuring that the environment is clean and ready for data ingestion.
# 2. Data Ingestion: The script handles data ingestion for both initial (Day 1) and subsequent (Day 2) data loads. It creates DataFrames and writes them to Delta Lake in the bronze layer.
# 3. Transformation and Deduplication: It includes transformation and deduplication steps in the silver layer. For instance, it joins dimension tables with fact tables, removes duplicates, and handles customer segmentation.
# 4. Incremental Updates: It demonstrates incremental updates by appending new data (Day 2) to existing tables in the bronze, silver, and gold layers. This shows how the script can handle ongoing data ingestion and processing.
# 5. Layered Approach: The script follows the common lakehouse architecture with three layers:
#    - Bronze Layer: Raw data ingestion.
#    - Silver Layer: Data transformation and cleaning.
#    - Gold Layer: Aggregated and consolidated data.
# 6. Partitioning: It partitions bronze tables to optimize performance (customers by state, sales by order_date).
# 7. Data Storage: It writes the transformed data to Delta tables, ensuring efficient data storage and retrieval.

# Reset the catalog: drop the bronze, silver and gold schemas (CASCADE also drops their tables)
for drop_statement in (
    "DROP SCHEMA  IF EXISTS bronze CASCADE",
    "DROP SCHEMA  IF  EXISTS silver CASCADE",
    "DROP SCHEMA  IF  EXISTS gold CASCADE",
):
    spark.sql(drop_statement)

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, date_add, lit

# Obtain (or reuse) the Spark session for this pipeline
spark = SparkSession.builder.appName("LakehousePipeline").getOrCreate()

# Load dates are column expressions
ingestion_date = current_date()  # Day 1: the current date
ingestion_date_day2 = date_add(ingestion_date, 1)  # Day 2: one day after Day 1

# Day 1 sample rows: customers
raw_customer_data = [
    (1, "John Doe", "New York", "NY", "USA", "Segment A"),
    (2, "Jane Smith", "Los Angeles", "CA", "USA", "Segment B"),
    (3, "Michael Johnson", "San Francisco", "CA", "USA", "Segment A"),
    (4, "Emily Brown", "Chicago", "IL", "USA", "Segment C"),
]

# Day 1 sample rows: segment code -> description lookup
raw_customer_segment_data = [
    ("Segment A", "High Value Customers"),
    ("Segment B", "Medium Value Customers"),
    ("Segment C", "Low Value Customers"),
]

# Day 1 sample rows: sales facts
raw_sales_data = [
    (101, 1, 1, "2024-03-01", 2, 100),
    (102, 2, 2, "2024-03-02", 1, 50),
    (103, 3, 3, "2024-03-02", 3, 200),
    (104, 1, 1, "2024-03-03", 1, 50),
    (105, 2, 2, "2024-03-04", 2, 150),
]

# Make sure one schema exists per medallion layer
for layer_name in ("bronze", "silver", "gold"):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {layer_name}")

# Column layouts of the three raw feeds
customer_columns = ["customer_id", "customer_name", "city", "state", "country", "customer_segment"]
segment_columns = ["customer_segment", "segment_description"]
sales_columns = ["order_id", "customer_id", "product_id", "order_date", "quantity", "amount"]

# Bronze DataFrames (Day 1), each stamped with the Day 1 ingestion_date
raw_customer_df = (
    spark.createDataFrame(raw_customer_data, customer_columns)
    .withColumn("ingestion_date", lit(ingestion_date))
)
raw_customer_segment_df = (
    spark.createDataFrame(raw_customer_segment_data, segment_columns)
    .withColumn("ingestion_date", lit(ingestion_date))
)
raw_sales_df = (
    spark.createDataFrame(raw_sales_data, sales_columns)
    .withColumn("ingestion_date", lit(ingestion_date))
)

# Bronze Layer: append each feed to its own Delta location, then register it as a table.
# Each entry is (DataFrame, partition columns, table name); the path is /tmp/<table name>.
bronze_day1_loads = [
    (raw_customer_df, ["state"], "raw_customer"),
    (raw_customer_segment_df, [], "raw_customer_segment"),
    (raw_sales_df, ["order_date"], "raw_sales"),
]
for bronze_frame, partition_columns, bronze_table in bronze_day1_loads:
    bronze_path = f"/tmp/{bronze_table}"
    bronze_writer = bronze_frame.write.format("delta").mode("append")
    if partition_columns:
        bronze_writer = bronze_writer.partitionBy(*partition_columns)
    bronze_writer.save(bronze_path)
    spark.sql(f"CREATE TABLE IF NOT EXISTS bronze.{bronze_table} USING DELTA LOCATION '{bronze_path}'")

# Silver Layer (Day 1): attach the segment description to every bronze customer
bronze_customers = spark.read.format("delta").table("bronze.raw_customer").alias("cust")
bronze_segments = spark.read.format("delta").table("bronze.raw_customer_segment").alias("seg")
same_segment = col("cust.customer_segment") == col("seg.customer_segment")

silver_customer_df = (
    bronze_customers
    .join(bronze_segments, same_segment, "left")  # left join keeps customers without a matching segment
    .select(
        col("cust.customer_id"),
        col("cust.customer_name"),
        col("cust.city"),
        col("cust.state"),
        col("cust.country"),
        col("cust.customer_segment"),
        col("seg.segment_description").alias("customer_segment_description"),
        col("cust.ingestion_date"),
    )
    .dropDuplicates(["customer_id"])  # keep a single row per customer_id
)

# Append to the silver customer dimension and register the table
silver_customer_df.write.format("delta").mode("append").save("/tmp/silver_dim_customer")
spark.sql("CREATE TABLE IF NOT EXISTS silver.dim_customer USING DELTA LOCATION '/tmp/silver_dim_customer'")

# Silver sales: bronze sales joined to the silver customer rows (inner join on customer_id).
# The customer's ingestion_date is dropped so only the sales ingestion_date remains.
customer_ingestion_column = silver_customer_df["ingestion_date"]
silver_sales_df = (
    spark.read.format("delta").table("bronze.raw_sales")
    .join(silver_customer_df, "customer_id")
    .drop(customer_ingestion_column)
    .dropDuplicates(["order_id"])  # keep a single row per order_id
)

# One slice of the fact data per customer segment description
segment_description = col("customer_segment_description")
silver_sales_high_value_df = silver_sales_df.filter(segment_description == "High Value Customers")
silver_sales_medium_value_df = silver_sales_df.filter(segment_description == "Medium Value Customers")
silver_sales_low_value_df = silver_sales_df.filter(segment_description == "Low Value Customers")

# Append every slice to its own Delta location and register the matching silver table
silver_segment_outputs = (
    (silver_sales_high_value_df, "fact_sales_high_value"),
    (silver_sales_medium_value_df, "fact_sales_medium_value"),
    (silver_sales_low_value_df, "fact_sales_low_value"),
)
for segment_frame, segment_table in silver_segment_outputs:
    segment_path = f"/tmp/silver_{segment_table}"
    segment_frame.write.format("delta").mode("append").save(segment_path)
    spark.sql(f"CREATE TABLE IF NOT EXISTS silver.{segment_table} USING DELTA LOCATION '{segment_path}'")

# Gold Layer (Day 1): read the three silver segment tables back and stack them into one fact table
silver_sales_high_value_df = spark.read.format("delta").table("silver.fact_sales_high_value")
silver_sales_medium_value_df = spark.read.format("delta").table("silver.fact_sales_medium_value")
silver_sales_low_value_df = spark.read.format("delta").table("silver.fact_sales_low_value")

# unionByName matches columns by name rather than by position
high_and_medium_df = silver_sales_high_value_df.unionByName(silver_sales_medium_value_df)
gold_fact_sales_df = high_and_medium_df.unionByName(silver_sales_low_value_df)

# Append the combined rows to the gold location and register the gold table
gold_summary_path = "/tmp/gold_fact_sales_summary"
gold_fact_sales_df.write.format("delta").mode("append").save(gold_summary_path)
spark.sql(f"CREATE TABLE IF NOT EXISTS gold.fact_sales_summary USING DELTA LOCATION '{gold_summary_path}'")

# Day 2 sample rows: two new customers and one order for each of them
day2_customer_data = [
    (5, "Alice Green", "Seattle", "WA", "USA", "Segment A"),
    (6, "Bob White", "Miami", "FL", "USA", "Segment B"),
]
day2_sales_data = [
    (106, 5, 1, "2024-03-05", 2, 120),
    (107, 6, 2, "2024-03-06", 1, 80),
]

# Day 2 DataFrames, stamped with the Day 2 ingestion_date
day2_customer_columns = ["customer_id", "customer_name", "city", "state", "country", "customer_segment"]
day2_sales_columns = ["order_id", "customer_id", "product_id", "order_date", "quantity", "amount"]

day2_customer_df = (
    spark.createDataFrame(day2_customer_data, day2_customer_columns)
    .withColumn("ingestion_date", lit(ingestion_date_day2))
)
day2_sales_df = (
    spark.createDataFrame(day2_sales_data, day2_sales_columns)
    .withColumn("ingestion_date", lit(ingestion_date_day2))
)

# Bronze Layer: append Day 2 rows to the existing Delta locations (same partition columns as Day 1)
for day2_frame, partition_column, bronze_path in (
    (day2_customer_df, "state", "/tmp/raw_customer"),
    (day2_sales_df, "order_date", "/tmp/raw_sales"),
):
    day2_frame.write.format("delta").mode("append").partitionBy(partition_column).save(bronze_path)

# Silver Layer (Day 2): enrich only the Day 2 customers with their segment description
day2_customers = day2_customer_df.alias("cust")
segment_lookup = spark.read.format("delta").table("bronze.raw_customer_segment").alias("seg")

silver_customer_df_day2 = (
    day2_customers
    .join(segment_lookup, col("cust.customer_segment") == col("seg.customer_segment"), "left")
    .select(
        col("cust.customer_id"),
        col("cust.customer_name"),
        col("cust.city"),
        col("cust.state"),
        col("cust.country"),
        col("cust.customer_segment"),
        col("seg.segment_description").alias("customer_segment_description"),
        col("cust.ingestion_date"),
    )
    .dropDuplicates(["customer_id"])  # keep a single row per customer_id
)
silver_customer_df_day2.write.format("delta").mode("append").save("/tmp/silver_dim_customer")

# Day 2 sales joined to the Day 2 customers; the customer's ingestion_date is dropped
day2_customer_ingestion_column = silver_customer_df_day2["ingestion_date"]
silver_sales_df_day2 = (
    day2_sales_df
    .join(silver_customer_df_day2, "customer_id")
    .drop(day2_customer_ingestion_column)
    .dropDuplicates(["order_id"])  # keep a single row per order_id
)

# One Day 2 slice per customer segment description
day2_segment_description = col("customer_segment_description")
silver_sales_high_value_df_day2 = silver_sales_df_day2.filter(day2_segment_description == "High Value Customers")
silver_sales_medium_value_df_day2 = silver_sales_df_day2.filter(day2_segment_description == "Medium Value Customers")
silver_sales_low_value_df_day2 = silver_sales_df_day2.filter(day2_segment_description == "Low Value Customers")

# Append each Day 2 slice to the matching silver fact location
for day2_segment_frame, day2_segment_path in (
    (silver_sales_high_value_df_day2, "/tmp/silver_fact_sales_high_value"),
    (silver_sales_medium_value_df_day2, "/tmp/silver_fact_sales_medium_value"),
    (silver_sales_low_value_df_day2, "/tmp/silver_fact_sales_low_value"),
):
    day2_segment_frame.write.format("delta").mode("append").save(day2_segment_path)

# Gold Layer: stack the Day 2 slices and append them to the gold location
day2_high_and_medium_df = silver_sales_high_value_df_day2.unionByName(silver_sales_medium_value_df_day2)
gold_fact_sales_df_day2 = day2_high_and_medium_df.unionByName(silver_sales_low_value_df_day2)
gold_fact_sales_df_day2.write.format("delta").mode("append").save("/tmp/gold_fact_sales_summary")


In [0]:
# Reset the catalog: drop the bronze, silver and gold schemas (CASCADE also drops their tables)
schema_drop_statements = [
    "DROP SCHEMA  IF EXISTS bronze CASCADE",
    "DROP SCHEMA  IF  EXISTS silver CASCADE",
    "DROP SCHEMA  IF  EXISTS gold CASCADE",
]
for schema_drop_statement in schema_drop_statements:
    spark.sql(schema_drop_statement)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, current_date, date_add

# Obtain (or reuse) the Spark session for the dimensional-model pipeline
spark = SparkSession.builder.appName("DimensionalModeling").getOrCreate()

# Load date as a column expression: the current date plus zero days
ingestion_date = date_add(current_date(), 0)

# Sample rows: customers
raw_customer_data = [
    (1, "John Doe", "New York", "NY", "USA", "Segment A"),
    (2, "Jane Smith", "Los Angeles", "CA", "USA", "Segment B"),
    (3, "Michael Johnson", "San Francisco", "CA", "USA", "Segment A"),
    (4, "Emily Brown", "Chicago", "IL", "USA", "Segment C"),
]

# Sample rows: customer group code -> description lookup
raw_customer_group_data = [
    ("Segment A", "High Value Customers"),
    ("Segment B", "Medium Value Customers"),
    ("Segment C", "Low Value Customers"),
]

# Sample rows: products
raw_product_data = [
    (1, "Product A", "Category X", "Group 1"),
    (2, "Product B", "Category Y", "Group 2"),
    (3, "Product C", "Category Z", "Group 3"),
]

# Sample rows: product group code -> description lookup
raw_product_group_data = [
    ("Group 1", "Electronics"),
    ("Group 2", "Clothing"),
    ("Group 3", "Home & Kitchen"),
]

# Sample rows: date dimension (order_date, full_date, year, month)
raw_date_data = [
    ("2024-03-01", "2024-03-01", 2024, 3),
    ("2024-03-02", "2024-03-02", 2024, 3),
    ("2024-03-03", "2024-03-03", 2024, 3),
    ("2024-03-04", "2024-03-04", 2024, 3),
]

# Sample rows: sales facts
raw_sales_data = [
    (101, 1, 1, "2024-03-01", 2, 100),
    (102, 2, 2, "2024-03-02", 1, 50),
    (103, 3, 3, "2024-03-02", 3, 200),
    (104, 1, 1, "2024-03-03", 1, 50),
    (105, 2, 2, "2024-03-04", 2, 150),
]

# Make sure one schema exists per medallion layer
for layer_name in ("bronze", "silver", "gold"):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {layer_name}")

# Bronze DataFrames, one per raw feed, each stamped with ingestion_date
raw_customer_df = (
    spark.createDataFrame(
        raw_customer_data,
        ["customer_id", "customer_name", "city", "state", "country", "customer_group"],
    )
    .withColumn("ingestion_date", ingestion_date)
)
raw_customer_group_df = (
    spark.createDataFrame(raw_customer_group_data, ["customer_group", "group_description"])
    .withColumn("ingestion_date", ingestion_date)
)
raw_product_df = (
    spark.createDataFrame(
        raw_product_data,
        ["product_id", "product_name", "product_category", "product_group"],
    )
    .withColumn("ingestion_date", ingestion_date)
)
raw_product_group_df = (
    spark.createDataFrame(raw_product_group_data, ["product_group", "group_description"])
    .withColumn("ingestion_date", ingestion_date)
)
raw_date_df = (
    spark.createDataFrame(raw_date_data, ["order_date", "full_date", "year", "month"])
    .withColumn("ingestion_date", ingestion_date)
)
raw_sales_df = (
    spark.createDataFrame(
        raw_sales_data,
        ["order_id", "customer_id", "product_id", "order_date", "quantity", "amount"],
    )
    .withColumn("ingestion_date", ingestion_date)
)

# Bronze Layer: write each raw table to its own Delta location (no joins) and register it.
# Each entry is (DataFrame, table name, partition columns, Z-order columns);
# the Delta location is /tmp/<table name>.
bronze_loads = [
    (raw_customer_df, "raw_customer", ["state"], []),
    (raw_customer_group_df, "raw_customer_group", [], []),
    (raw_product_df, "raw_product", [], []),
    (raw_product_group_df, "raw_product_group", [], []),
    (raw_date_df, "raw_date", ["year", "month"], ["order_date"]),
    (raw_sales_df, "raw_sales", ["order_date"], ["customer_id"]),
]

for bronze_df, bronze_table, partition_cols, zorder_cols in bronze_loads:
    bronze_path = f"/tmp/{bronze_table}"

    # overwriteSchema lets the overwrite replace a table left at the same path with a
    # different column set (the lakehouse cell above writes /tmp/raw_customer with a
    # customer_segment column instead of customer_group); without it Delta rejects the
    # write with a schema-mismatch error.
    bronze_writer = (
        bronze_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    if partition_cols:
        bronze_writer = bronze_writer.partitionBy(*partition_cols)
    bronze_writer.save(bronze_path)

    spark.sql(f"CREATE TABLE IF NOT EXISTS bronze.{bronze_table} USING DELTA LOCATION '{bronze_path}'")

    # "zorder" is not a Delta writer option, so the original .option("zorder", ...) had no
    # effect. Z-ordering is applied to an existing table with OPTIMIZE ... ZORDER BY.
    if zorder_cols:
        spark.sql(f"OPTIMIZE bronze.{bronze_table} ZORDER BY ({', '.join(zorder_cols)})")

# Silver Layer: join each dimension to its lookup table, then publish all silver tables.

# Customer dimension: bronze customers left-joined to the customer-group description
silver_customer_df = spark.read.format("delta").table("bronze.raw_customer").alias("cust") \
    .join(spark.read.format("delta").table("bronze.raw_customer_group").alias("grp"), col("cust.customer_group") == col("grp.customer_group"), "left") \
    .select(col("cust.customer_id"), col("cust.customer_name"), col("cust.city"), col("cust.state"), col("cust.country"),
            col("cust.customer_group"), col("grp.group_description").alias("customer_group_description"),
            col("cust.ingestion_date"))

# Product dimension: bronze products left-joined to the product-group description
silver_product_df = spark.read.format("delta").table("bronze.raw_product").alias("prod") \
    .join(spark.read.format("delta").table("bronze.raw_product_group").alias("grp"), col("prod.product_group") == col("grp.product_group"), "left") \
    .select(col("prod.product_id"), col("prod.product_name"), col("prod.product_category"),
            col("grp.group_description").alias("product_group_description"),
            col("prod.ingestion_date"))

# Date and sales data are copied from bronze unchanged
silver_date_df = spark.read.format("delta").table("bronze.raw_date")
silver_sales_df = spark.read.format("delta").table("bronze.raw_sales")

# Publish: overwrite each silver location and register the table.
# overwriteSchema lets the overwrite replace a table left at the same path with a
# different column set (the lakehouse cell above writes /tmp/silver_dim_customer with
# customer_segment columns); without it Delta rejects the write with a schema-mismatch error.
silver_outputs = [
    (silver_customer_df, "dim_customer"),
    (silver_product_df, "dim_product"),
    (silver_date_df, "dim_date"),
    (silver_sales_df, "fact_sales"),
]
for silver_df, silver_table in silver_outputs:
    silver_path = f"/tmp/silver_{silver_table}"
    silver_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(silver_path)
    spark.sql(f"CREATE TABLE IF NOT EXISTS silver.{silver_table} USING DELTA LOCATION '{silver_path}'")






In [0]:
# Gold Layer: Aggregate and enrich data, then write to Delta Lake
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month

# Obtain (or reuse) the Spark session for the gold build
spark = SparkSession.builder.appName("GoldFactTableWithTimeDimension").getOrCreate()

# Silver inputs: the sales fact plus the three dimensions
silver_sales_df = spark.read.format("delta").table("silver.fact_sales")
dim_customer_df = spark.read.format("delta").table("silver.dim_customer")
dim_product_df = spark.read.format("delta").table("silver.dim_product")
dim_date_df = spark.read.format("delta").table("silver.dim_date")

# Left joins keep every sales row; columns from a dimension are NULL when no row matches
same_order_date = silver_sales_df["order_date"] == dim_date_df["order_date"]
sales_enriched_df = silver_sales_df.join(dim_customer_df, "customer_id", "left")
sales_enriched_df = sales_enriched_df.join(dim_product_df, "product_id", "left")
sales_enriched_df = sales_enriched_df.join(dim_date_df, same_order_date, "left")

# order_date and ingestion_date exist on several inputs, so they are taken from the fact table explicitly
gold_columns = [
    col("order_id"),
    col("customer_id"),
    col("product_id"),
    silver_sales_df["order_date"].alias("order_date"),
    col("quantity"),
    col("amount"),
    col("customer_group_description"),
    col("product_group_description"),
    col("product_category"),
    col("state"),
    col("year"),
    col("month"),
    silver_sales_df["ingestion_date"].alias("ingestion_date"),
]
fact_sales_with_dimensions_df = sales_enriched_df.select(*gold_columns)

# Overwrite the gold location with the enriched fact rows and register the gold table
fact_sales_with_dimensions_df.write.format("delta").mode("overwrite").save("/tmp/gold_fact_sales")
spark.sql("CREATE TABLE IF NOT EXISTS gold.fact_sales_summary  USING DELTA LOCATION '/tmp/gold_fact_sales'")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, date_add

# Obtain (or reuse) the Spark session for the append-style daily load
spark = SparkSession.builder.appName("DailyDeltaLoad").getOrCreate()

# Load date as a column expression: one day after the current date (simulates the next day's load)
ingestion_date = date_add(current_date(), 1)

# New customers for the next day.
# The customer_group values must match the keys of bronze.raw_customer_group
# ("Segment A/B/C"); the previous "Group A/B" codes matched nothing, so the silver
# left join produced a NULL customer_group_description for these customers.
next_day_customer_data = [
    (5, "Alice Williams", "Houston", "TX", "USA", "Segment B"),
    (6, "Bob Davis", "Seattle", "WA", "USA", "Segment A"),
]

# New product for the next day (product_group matches a key of bronze.raw_product_group)
next_day_product_data = [
    (4, "Product D", "Category W", "Group 2"),
]

# New sales for the next day
# NOTE(review): 2024-03-05 has no row in the raw_date sample data, so a left join to the
# date dimension yields NULL year/month for these orders - confirm whether dates should be loaded too.
next_day_sales_data = [
    (106, 5, 4, "2024-03-05", 1, 75),
    (107, 6, 4, "2024-03-05", 2, 100),
]

# DataFrames for the next day's data, stamped with ingestion_date
next_day_customer_df = spark.createDataFrame(next_day_customer_data, ["customer_id", "customer_name", "city", "state", "country", "customer_group"]) \
    .withColumn("ingestion_date", ingestion_date)

next_day_product_df = spark.createDataFrame(next_day_product_data, ["product_id", "product_name", "product_category", "product_group"]) \
    .withColumn("ingestion_date", ingestion_date)

next_day_sales_df = spark.createDataFrame(next_day_sales_data, ["order_id", "customer_id", "product_id", "order_date", "quantity", "amount"]) \
    .withColumn("ingestion_date", ingestion_date)

# Bronze Layer: append the new rows to the existing Delta locations
next_day_customer_df.write.format("delta").mode("append").partitionBy("state").save("/tmp/raw_customer")
next_day_product_df.write.format("delta").mode("append").save("/tmp/raw_product")
next_day_sales_df.write.format("delta").mode("append").partitionBy("order_date").save("/tmp/raw_sales")

# Silver Layer: rebuild every silver table from the current bronze tables

# Customer dimension: bronze customers left-joined to the customer-group description
appended_customers = spark.read.format("delta").table("bronze.raw_customer").alias("cust")
customer_group_lookup = spark.read.format("delta").table("bronze.raw_customer_group").alias("grp")
silver_customer_df = appended_customers.join(
    customer_group_lookup,
    col("cust.customer_group") == col("grp.customer_group"),
    "left",
).select(
    col("cust.customer_id"),
    col("cust.customer_name"),
    col("cust.city"),
    col("cust.state"),
    col("cust.country"),
    col("cust.customer_group"),
    col("grp.group_description").alias("customer_group_description"),
    col("cust.ingestion_date"),
)
silver_customer_df.write.format("delta").mode("overwrite").save("/tmp/silver_dim_customer")

# Product dimension: bronze products left-joined to the product-group description
appended_products = spark.read.format("delta").table("bronze.raw_product").alias("prod")
product_group_lookup = spark.read.format("delta").table("bronze.raw_product_group").alias("grp")
silver_product_df = appended_products.join(
    product_group_lookup,
    col("prod.product_group") == col("grp.product_group"),
    "left",
).select(
    col("prod.product_id"),
    col("prod.product_name"),
    col("prod.product_category"),
    col("grp.group_description").alias("product_group_description"),
    col("prod.ingestion_date"),
)
silver_product_df.write.format("delta").mode("overwrite").save("/tmp/silver_dim_product")

# Date dimension and sales fact are copied from bronze unchanged
silver_date_df = spark.read.format("delta").table("bronze.raw_date")
silver_date_df.write.format("delta").mode("overwrite").save("/tmp/silver_dim_date")

silver_sales_df = spark.read.format("delta").table("bronze.raw_sales")
silver_sales_df.write.format("delta").mode("overwrite").save("/tmp/silver_fact_sales")


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, date_add

# Obtain (or reuse) the Spark session for the full-reload daily load
spark = SparkSession.builder.appName("DailyDeltaLoadFull").getOrCreate()

# Load date as a column expression: one day after the current date (simulates the next day's load)
ingestion_date = date_add(current_date(), 1)

# Full customer snapshot for the next day (replaces the previous day's data).
# The customer_group values must match the keys of bronze.raw_customer_group
# ("Segment A/B/C"); the previous "Group A/B/C" codes matched nothing, so the silver
# left join produced a NULL customer_group_description for every customer.
next_day_customer_data = [
    (1, "John Doe", "New York", "NY", "USA", "Segment A"),
    (2, "Jane Smith", "Los Angeles", "CA", "USA", "Segment B"),
    (3, "Michael Johnson", "San Francisco", "CA", "USA", "Segment A"),
    (4, "Emily Brown", "Chicago", "IL", "USA", "Segment C"),
    (5, "Alice Williams", "Houston", "TX", "USA", "Segment B"),
    (6, "Bob Davis", "Seattle", "WA", "USA", "Segment A"),
]

# Full product snapshot (product_group matches the keys of bronze.raw_product_group)
next_day_product_data = [
    (1, "Product A", "Category X", "Group 1"),
    (2, "Product B", "Category Y", "Group 2"),
    (3, "Product C", "Category Z", "Group 3"),
    (4, "Product D", "Category W", "Group 2"),
]

# Full sales snapshot
next_day_sales_data = [
    (101, 1, 1, "2024-03-01", 2, 100),
    (102, 2, 2, "2024-03-02", 1, 50),
    (103, 3, 3, "2024-03-02", 3, 200),
    (104, 1, 1, "2024-03-03", 1, 50),
    (105, 2, 2, "2024-03-04", 2, 150),
    (106, 5, 4, "2024-03-05", 1, 75),
    (107, 6, 4, "2024-03-05", 2, 100),
]

# DataFrames for the full load, stamped with ingestion_date
next_day_customer_df = spark.createDataFrame(next_day_customer_data, ["customer_id", "customer_name", "city", "state", "country", "customer_group"]) \
    .withColumn("ingestion_date", ingestion_date)

next_day_product_df = spark.createDataFrame(next_day_product_data, ["product_id", "product_name", "product_category", "product_group"]) \
    .withColumn("ingestion_date", ingestion_date)

next_day_sales_df = spark.createDataFrame(next_day_sales_data, ["order_id", "customer_id", "product_id", "order_date", "quantity", "amount"]) \
    .withColumn("ingestion_date", ingestion_date)

# Bronze Layer: overwrite each table with the new snapshot and (re-)register it
next_day_customer_df.write.format("delta").mode("overwrite").partitionBy("state").save("/tmp/raw_customer")
spark.sql("CREATE TABLE IF NOT EXISTS bronze.raw_customer USING DELTA LOCATION '/tmp/raw_customer'")

next_day_product_df.write.format("delta").mode("overwrite").save("/tmp/raw_product")
spark.sql("CREATE TABLE IF NOT EXISTS bronze.raw_product USING DELTA LOCATION '/tmp/raw_product'")

next_day_sales_df.write.format("delta").mode("overwrite").partitionBy("order_date").save("/tmp/raw_sales")
spark.sql("CREATE TABLE IF NOT EXISTS bronze.raw_sales USING DELTA LOCATION '/tmp/raw_sales'")

# Silver Layer: recompute every silver table from the freshly overwritten bronze tables

# Customer dimension: bronze customers left-joined to the customer-group description
reloaded_customers = spark.read.format("delta").table("bronze.raw_customer").alias("cust")
customer_group_lookup = spark.read.format("delta").table("bronze.raw_customer_group").alias("grp")
silver_customer_df = reloaded_customers.join(
    customer_group_lookup,
    col("cust.customer_group") == col("grp.customer_group"),
    "left",
).select(
    col("cust.customer_id"),
    col("cust.customer_name"),
    col("cust.city"),
    col("cust.state"),
    col("cust.country"),
    col("cust.customer_group"),
    col("grp.group_description").alias("customer_group_description"),
    col("cust.ingestion_date"),
)
silver_customer_df.write.format("delta").mode("overwrite").save("/tmp/silver_dim_customer")
spark.sql("CREATE TABLE IF NOT EXISTS silver.dim_customer USING DELTA LOCATION '/tmp/silver_dim_customer'")

# Product dimension: bronze products left-joined to the product-group description
reloaded_products = spark.read.format("delta").table("bronze.raw_product").alias("prod")
product_group_lookup = spark.read.format("delta").table("bronze.raw_product_group").alias("grp")
silver_product_df = reloaded_products.join(
    product_group_lookup,
    col("prod.product_group") == col("grp.product_group"),
    "left",
).select(
    col("prod.product_id"),
    col("prod.product_name"),
    col("prod.product_category"),
    col("grp.group_description").alias("product_group_description"),
    col("prod.ingestion_date"),
)
silver_product_df.write.format("delta").mode("overwrite").save("/tmp/silver_dim_product")
spark.sql("CREATE TABLE IF NOT EXISTS silver.dim_product USING DELTA LOCATION '/tmp/silver_dim_product'")

# Date dimension and sales fact are copied from bronze unchanged
silver_date_df = spark.read.format("delta").table("bronze.raw_date")
silver_date_df.write.format("delta").mode("overwrite").save("/tmp/silver_dim_date")
spark.sql("CREATE TABLE IF NOT EXISTS silver.dim_date USING DELTA LOCATION '/tmp/silver_dim_date'")

silver_sales_df = spark.read.format("delta").table("bronze.raw_sales")
silver_sales_df.write.format("delta").mode("overwrite").save("/tmp/silver_fact_sales")
spark.sql("CREATE TABLE IF NOT EXISTS silver.fact_sales USING DELTA LOCATION '/tmp/silver_fact_sales'")


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, date_add

# Obtain (or reuse) the Spark session for the merge-style daily load
spark = SparkSession.builder.appName("DailyDeltaLoadIncrementalMerge").getOrCreate()

# Load date as a column expression: one day after the current date (simulates the next day's load)
ingestion_date = date_add(current_date(), 1)

# Incremental customer rows for the next day.
# The customer_group values must match the keys of bronze.raw_customer_group
# ("Segment A/B/C"); the previous "Group A/B/C" codes matched nothing, so the silver
# left join produced a NULL customer_group_description for these customers.
next_day_customer_data = [
    (5, "Alice Williams", "Chicago", "IL", "USA", "Segment C"),
    (6, "Bob Davis",  "Los Angeles", "CA", "USA", "Segment B"),
    (11, "James Williams", "Houston", "TX", "USA", "Segment B"),
    (12, "John Davis", "Seattle", "WA", "USA", "Segment A"),
]

# Incremental product rows (product_group matches the keys of bronze.raw_product_group)
next_day_product_data = [
    (3, "Product C", "Category X", "Group 3"),
    (4, "Product D", "Category Z", "Group 2"),
    (101, "Product 101", "Category X", "Group 1"),
    (102, "Product 102", "Category Y", "Group 3"),
]

# Incremental sales rows
next_day_sales_data = [
    (106, 5, 4, "2024-03-05", 1, 75),
    (107, 6, 4, "2024-03-05", 2, 100),
    (201, 11, 101, "2024-03-06", 1, 75),
    (202, 12, 102, "2024-03-06", 2, 100),
]

# DataFrames for the next day's data, stamped with ingestion_date
next_day_customer_df = spark.createDataFrame(next_day_customer_data, ["customer_id", "customer_name", "city", "state", "country", "customer_group"]) \
    .withColumn("ingestion_date", ingestion_date)

next_day_product_df = spark.createDataFrame(next_day_product_data, ["product_id", "product_name", "product_category", "product_group"]) \
    .withColumn("ingestion_date", ingestion_date)

next_day_sales_df = spark.createDataFrame(next_day_sales_data, ["order_id", "customer_id", "product_id", "order_date", "quantity", "amount"]) \
    .withColumn("ingestion_date", ingestion_date)

# Bronze Layer: upsert the next day's rows into the existing bronze tables.
# Each entry is (staged DataFrame, temp view name, target table, key column).
bronze_merge_plan = [
    (next_day_customer_df, "staging_customer", "bronze.raw_customer", "customer_id"),
    (next_day_product_df, "staging_product", "bronze.raw_product", "product_id"),
    (next_day_sales_df, "staging_sales", "bronze.raw_sales", "order_id"),
]

for staged_df, staging_view, bronze_table, key_column in bronze_merge_plan:
    # Expose the staged rows to SQL under the view name
    staged_df.createOrReplaceTempView(staging_view)

    # Rows whose key already exists are updated in full; all other rows are inserted
    spark.sql(f"""
MERGE INTO {bronze_table} AS target
USING {staging_view} AS source
ON target.{key_column} = source.{key_column}
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *
""")

# Silver Layer: rebuild every silver table from the merged bronze tables

# Customer dimension: bronze customers left-joined to the customer-group description
merged_customers = spark.read.format("delta").table("bronze.raw_customer").alias("cust")
customer_group_lookup = spark.read.format("delta").table("bronze.raw_customer_group").alias("grp")
silver_customer_df = merged_customers.join(
    customer_group_lookup,
    col("cust.customer_group") == col("grp.customer_group"),
    "left",
).select(
    col("cust.customer_id"),
    col("cust.customer_name"),
    col("cust.city"),
    col("cust.state"),
    col("cust.country"),
    col("cust.customer_group"),
    col("grp.group_description").alias("customer_group_description"),
    col("cust.ingestion_date"),
)
silver_customer_df.write.format("delta").mode("overwrite").save("/tmp/silver_dim_customer")
spark.sql("CREATE TABLE IF NOT EXISTS silver.dim_customer USING DELTA LOCATION '/tmp/silver_dim_customer'")

# Product dimension: bronze products left-joined to the product-group description
merged_products = spark.read.format("delta").table("bronze.raw_product").alias("prod")
product_group_lookup = spark.read.format("delta").table("bronze.raw_product_group").alias("grp")
silver_product_df = merged_products.join(
    product_group_lookup,
    col("prod.product_group") == col("grp.product_group"),
    "left",
).select(
    col("prod.product_id"),
    col("prod.product_name"),
    col("prod.product_category"),
    col("grp.group_description").alias("product_group_description"),
    col("prod.ingestion_date"),
)
silver_product_df.write.format("delta").mode("overwrite").save("/tmp/silver_dim_product")
spark.sql("CREATE TABLE IF NOT EXISTS silver.dim_product USING DELTA LOCATION '/tmp/silver_dim_product'")

# Date dimension and sales fact are copied from bronze unchanged
silver_date_df = spark.read.format("delta").table("bronze.raw_date")
silver_date_df.write.format("delta").mode("overwrite").save("/tmp/silver_dim_date")
spark.sql("CREATE TABLE IF NOT EXISTS silver.dim_date USING DELTA LOCATION '/tmp/silver_dim_date'")

silver_sales_df = spark.read.format("delta").table("bronze.raw_sales")
silver_sales_df.write.format("delta").mode("overwrite").save("/tmp/silver_fact_sales")
spark.sql("CREATE TABLE IF NOT EXISTS silver.fact_sales USING DELTA LOCATION '/tmp/silver_fact_sales'")


In [0]:
# Gold Layer: Aggregate and enrich data, then write to Delta Lake
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month

# Obtain (or reuse) the Spark session for the gold build
spark = SparkSession.builder.appName("GoldFactTableWithTimeDimension").getOrCreate()

# Silver inputs: the sales fact plus the three dimensions
silver_sales_df = spark.read.format("delta").table("silver.fact_sales")
dim_customer_df = spark.read.format("delta").table("silver.dim_customer")
dim_product_df = spark.read.format("delta").table("silver.dim_product")
dim_date_df = spark.read.format("delta").table("silver.dim_date")

# Left joins keep every sales row; columns from a dimension are NULL when no row matches
order_date_matches = silver_sales_df["order_date"] == dim_date_df["order_date"]
sales_with_customer_df = silver_sales_df.join(dim_customer_df, "customer_id", "left")
sales_with_product_df = sales_with_customer_df.join(dim_product_df, "product_id", "left")
sales_with_date_df = sales_with_product_df.join(dim_date_df, order_date_matches, "left")

# order_date and ingestion_date exist on several inputs, so they are taken from the fact table explicitly
fact_order_date = silver_sales_df["order_date"].alias("order_date")
fact_ingestion_date = silver_sales_df["ingestion_date"].alias("ingestion_date")

fact_sales_with_dimensions_df = sales_with_date_df.select(
    col("order_id"),
    col("customer_id"),
    col("product_id"),
    fact_order_date,
    col("quantity"),
    col("amount"),
    col("customer_group_description"),
    col("product_group_description"),
    col("product_category"),
    col("state"),
    col("year"),
    col("month"),
    fact_ingestion_date,
)

# Overwrite the gold location with the enriched fact rows and register the gold table
fact_sales_with_dimensions_df.write.format("delta").mode("overwrite").save("/tmp/gold_fact_sales")
spark.sql("CREATE TABLE IF NOT EXISTS gold.fact_sales_summary  USING DELTA LOCATION '/tmp/gold_fact_sales'")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, date_add, current_date

# Gold layer (Day 2 incremental load): enrich the Silver sales rows whose
# ingestion_date equals the load date and publish them to the Gold Delta path.
#
# The slice is written with a selective overwrite (replaceWhere) rather than a
# plain append. The full Gold build reads silver.fact_sales unfiltered, so any
# Day 2 rows found here are already present in /tmp/gold_fact_sales; appending
# them duplicated those rows, and every re-run of this cell duplicated them
# again. Replacing only the load date's slice makes the cell idempotent.

# Initialize Spark session
spark = SparkSession.builder.appName("GoldFactTableWithTimeDimension").getOrCreate()

# Define ingestion date (kept as a Column expression, as before)
ingestion_date = date_add(current_date(), 1)  # The date of the load

# replaceWhere needs a literal predicate, so evaluate the expression once in
# the Spark session (same clock/timezone as the filter below) to 'yyyy-MM-dd'.
ingestion_date_str = spark.range(1).select(ingestion_date.cast("string")).first()[0]

# Load Silver Layer data: only the rows belonging to this load
silver_sales_df = spark.read.format("delta").table("silver.fact_sales").filter(col("ingestion_date") == ingestion_date)
dim_customer_df = spark.read.format("delta").table("silver.dim_customer")
dim_product_df = spark.read.format("delta").table("silver.dim_product")
dim_date_df = spark.read.format("delta").table("silver.dim_date")

# Join sales with customer, product, and date dimensions (left joins keep
# sales rows that have no matching dimension row)
fact_sales_with_dimensions_df = silver_sales_df \
    .join(dim_customer_df, "customer_id", "left") \
    .join(dim_product_df, "product_id", "left") \
    .join(dim_date_df, silver_sales_df["order_date"] == dim_date_df["order_date"], "left") \
    .select(
        col("order_id"),
        col("customer_id"),
        col("product_id"),
        silver_sales_df["order_date"].alias("order_date"),  # Explicit reference to fact_sales order_date
        col("quantity"),
        col("amount"),
        col("customer_group_description"),
        col("product_group_description"),
        col("product_category"),
        col("state"),
        col("year"),
        col("month"),
        silver_sales_df["ingestion_date"].alias("ingestion_date")  # Explicit reference to fact_sales ingestion_date
    )

# Replace only the load date's slice of the gold table. Every row written
# satisfies the predicate because of the filter on silver_sales_df above.
# NOTE(review): replaceWhere on a non-partition column needs a reasonably
# recent Delta Lake / Databricks runtime - confirm for the target cluster.
fact_sales_with_dimensions_df.write.format("delta") \
    .mode("overwrite") \
    .option("replaceWhere", f"ingestion_date = '{ingestion_date_str}'") \
    .save("/tmp/gold_fact_sales")
spark.sql("CREATE TABLE IF NOT EXISTS gold.fact_sales_summary USING DELTA LOCATION '/tmp/gold_fact_sales'")


In [0]:
# Render whatever fact_sales_with_dimensions_df currently holds; when the
# notebook is run top to bottom that is the Day 2 result built in the
# preceding cell.
display(fact_sales_with_dimensions_df)

In [0]:
%sql
-- SQL script that (re)builds the Silver-layer tab_* tables from Bronze using CREATE OR REPLACE TABLE ... AS SELECT

-- Customer dimension: every bronze.raw_customer row, with the description of
-- its customer group attached (NULL when the group has no match).
CREATE OR REPLACE TABLE silver.tab_dim_customer
USING DELTA
LOCATION '/tmp/path/to/external/silver/tab_dim_customer'
AS
SELECT
    c.customer_id,
    c.customer_name,
    c.city,
    c.state,
    c.country,
    c.customer_group,
    g.group_description AS customer_group_description,
    c.ingestion_date
FROM bronze.raw_customer c
LEFT OUTER JOIN bronze.raw_customer_group g
    ON g.customer_group = c.customer_group;

-- Product dimension: every bronze.raw_product row, with the description of
-- its product group attached (NULL when the group has no match).
CREATE OR REPLACE TABLE silver.tab_dim_product
USING DELTA
LOCATION '/tmp/path/to/external/silver/tab_dim_product'
AS
SELECT
    p.product_id,
    p.product_name,
    p.product_category,
    g.group_description AS product_group_description,
    p.ingestion_date
FROM bronze.raw_product p
LEFT OUTER JOIN bronze.raw_product_group g
    ON g.product_group = p.product_group;

-- Date dimension: straight projection of four columns from bronze.raw_date.
CREATE OR REPLACE TABLE silver.tab_dim_date
USING DELTA
LOCATION '/tmp/path/to/external/silver/tab_dim_date'
AS
SELECT
    d.order_date,
    d.full_date,
    d.year,
    d.month
FROM bronze.raw_date d;

-- Sales fact: column-for-column copy of the order fields in bronze.raw_sales,
-- with no filtering or joins.
CREATE OR REPLACE TABLE silver.tab_fact_sales
USING DELTA
LOCATION '/tmp/path/to/external/silver/tab_fact_sales'
AS
SELECT
    s.order_id,
    s.customer_id,
    s.product_id,
    s.order_date,
    s.quantity,
    s.amount,
    s.ingestion_date
FROM bronze.raw_sales s;


In [0]:
%sql
-- Day 2 incremental load: stage the Bronze rows whose ingestion_date is the
-- load date (tomorrow, i.e. date_add(current_date(), 1)) and upsert them into
-- silver.dim_customer, silver.dim_product and silver.fact_sales.
--
-- Why the load date is written inline instead of as '${ingestion_date}':
-- SET stores the right-hand side as raw text, so '${ingestion_date}' expanded
-- to the string literal 'date_add(current_date(), 1)'. That never equals a
-- real ingestion date, so every staging view was empty and each MERGE did
-- nothing (and the same literal would have been written to ingestion_date).
-- The staging views now filter on the evaluated expression and carry
-- ingestion_date through, and the MERGE statements take it from the source row.

-- Kept for any later cell that reads this setting. Its value is the
-- expression text, so it is only usable unquoted: ${ingestion_date}.
SET ingestion_date = date_add(current_date(), 1);

-- Create staging_customer view by filtering the Bronze table on ingestion_date
CREATE OR REPLACE TEMP VIEW staging_customer AS
SELECT 
    customer_id,
    customer_name,
    city,
    state,
    country,
    customer_group,
    ingestion_date
FROM bronze.raw_customer
WHERE ingestion_date = date_add(current_date(), 1);

-- Create staging_product view by filtering the Bronze table on ingestion_date
CREATE OR REPLACE TEMP VIEW staging_product AS
SELECT 
    product_id,
    product_name,
    product_category,
    product_group,
    ingestion_date
FROM bronze.raw_product
WHERE ingestion_date = date_add(current_date(), 1);

-- Create staging_sales view by filtering the Bronze table on ingestion_date
CREATE OR REPLACE TEMP VIEW staging_sales AS
SELECT 
    order_id,
    customer_id,
    product_id,
    order_date,
    quantity,
    amount,
    ingestion_date
FROM bronze.raw_sales
WHERE ingestion_date = date_add(current_date(), 1);

-- Upsert into silver.dim_customer
-- NOTE(review): customer_group_description is not maintained by this MERGE;
-- confirm whether silver.dim_customer expects it to be refreshed here.
MERGE INTO silver.dim_customer AS target
USING staging_customer AS source
ON target.customer_id = source.customer_id
WHEN MATCHED THEN
  UPDATE SET 
    target.customer_name = source.customer_name, 
    target.city = source.city,
    target.state = source.state,
    target.country = source.country,
    target.customer_group = source.customer_group,
    target.ingestion_date = source.ingestion_date
WHEN NOT MATCHED THEN
  INSERT (customer_id, customer_name, city, state, country, customer_group, ingestion_date)
  VALUES (source.customer_id, source.customer_name, source.city, source.state, source.country, source.customer_group, source.ingestion_date);

-- Upsert into silver.dim_product
-- NOTE(review): product_group_description is not maintained by this MERGE;
-- confirm whether silver.dim_product expects it to be refreshed here.
MERGE INTO silver.dim_product AS target
USING staging_product AS source
ON target.product_id = source.product_id
WHEN MATCHED THEN
  UPDATE SET 
    target.product_name = source.product_name,
    target.product_category = source.product_category,
    target.product_group = source.product_group,
    target.ingestion_date = source.ingestion_date
WHEN NOT MATCHED THEN
  INSERT (product_id, product_name, product_category, product_group, ingestion_date)
  VALUES (source.product_id, source.product_name, source.product_category, source.product_group, source.ingestion_date);

-- Upsert into silver.fact_sales
MERGE INTO silver.fact_sales AS target
USING staging_sales AS source
ON target.order_id = source.order_id
WHEN MATCHED THEN
  UPDATE SET 
    target.customer_id = source.customer_id,
    target.product_id = source.product_id,
    target.order_date = source.order_date,
    target.quantity = source.quantity,
    target.amount = source.amount,
    target.ingestion_date = source.ingestion_date
WHEN NOT MATCHED THEN
  INSERT (order_id, customer_id, product_id, order_date, quantity, amount, ingestion_date)
  VALUES (source.order_id, source.customer_id, source.product_id, source.order_date, source.quantity, source.amount, source.ingestion_date);
