In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month

# --- Raw sample data ---------------------------------------------------------
# Every dataset below is a list of row tuples kept next to the list of column
# names it is loaded with. Dates are carried as "YYYY-MM-DD" strings.

# Customer dimension: one row per customer.
raw_customer_columns = ["customer_id", "customer_name", "city", "state", "country"]
raw_customer_data = [
    (1, "John Doe", "New York", "NY", "USA"),
    (2, "Jane Smith", "Los Angeles", "CA", "USA"),
    (3, "Michael Johnson", "San Francisco", "CA", "USA"),
    (4, "Emily Brown", "Chicago", "IL", "USA"),
]

# Product dimension: one row per product.
raw_product_columns = ["product_id", "product_name", "product_category"]
raw_product_data = [
    (1, "Product A", "Category X"),
    (2, "Product B", "Category Y"),
    (3, "Product C", "Category Z"),
]

# Date dimension: one row per calendar day, with year and month broken out.
raw_date_columns = ["order_date", "full_date", "year", "month"]
raw_date_data = [
    ("2024-03-01", "2024-03-01", 2024, 3),
    ("2024-03-02", "2024-03-02", 2024, 3),
    ("2024-03-03", "2024-03-03", 2024, 3),
    ("2024-03-04", "2024-03-04", 2024, 3),
]

# Sales fact: one row per order, keyed to the customer and product rows above.
raw_sales_columns = ["order_id", "customer_id", "product_id", "order_date", "quantity", "amount"]
raw_sales_data = [
    (101, 1, 1, "2024-03-01", 2, 100),
    (102, 2, 2, "2024-03-02", 1, 50),
    (103, 3, 3, "2024-03-02", 3, 200),
    (104, 1, 1, "2024-03-03", 1, 50),
    (105, 2, 2, "2024-03-04", 2, 150),
]

# Turn each (rows, column names) pair into a Spark DataFrame.
raw_customer_df = spark.createDataFrame(raw_customer_data, schema=raw_customer_columns)
raw_product_df = spark.createDataFrame(raw_product_data, schema=raw_product_columns)
raw_date_df = spark.createDataFrame(raw_date_data, schema=raw_date_columns)
raw_sales_df = spark.createDataFrame(raw_sales_data, schema=raw_sales_columns)

# Persist the raw DataFrames as Delta tables under /tmp, replacing any previous
# contents. Customers are partitioned by state, dates by (year, month) and
# sales by order_date; products are written unpartitioned.
raw_customer_df.write.format("delta").mode("overwrite").partitionBy("state").save("/tmp/raw_customer")
raw_product_df.write.format("delta").mode("overwrite").save("/tmp/raw_product")
raw_date_df.write.format("delta").mode("overwrite").partitionBy("year", "month").save("/tmp/raw_date")
raw_sales_df.write.format("delta").mode("overwrite").partitionBy("order_date").save("/tmp/raw_sales")

# Z-Ordering is applied by the OPTIMIZE command after the data has been
# written. A "zorder" option on the DataFrame writer is not a Delta writer
# option and does not cluster anything, so the clustering is requested
# explicitly here. The Z-Order columns are deliberately not partition columns.
spark.sql("OPTIMIZE delta.`/tmp/raw_date` ZORDER BY (order_date)")
spark.sql("OPTIMIZE delta.`/tmp/raw_sales` ZORDER BY (customer_id)")

# Bronze layer: load the four raw Delta tables back from their /tmp locations.
def _load_delta(path):
    """Return a DataFrame reading the Delta table stored at `path`."""
    return spark.read.format("delta").load(path)


bronze_customer_df = _load_delta("/tmp/raw_customer")
bronze_product_df = _load_delta("/tmp/raw_product")
bronze_date_df = _load_delta("/tmp/raw_date")
bronze_sales_df = _load_delta("/tmp/raw_sales")

# Silver layer: each bronze DataFrame is written to its silver Delta path.
# No transformation is applied at this step; the rows are copied as they are.
# The three dimension tables are written first, followed by the fact table.
silver_targets = [
    (bronze_customer_df, "/tmp/silver_dim_customer"),
    (bronze_product_df, "/tmp/silver_dim_product"),
    (bronze_date_df, "/tmp/silver_dim_date"),
    (bronze_sales_df, "/tmp/silver_fact_sales"),
]
for bronze_df, silver_path in silver_targets:
    # "overwrite" replaces whatever an earlier run left at the path.
    bronze_df.write.format("delta").mode("overwrite").save(silver_path)

# Gold layer: total sales amount per calendar month.

# Grouping keys are derived from the order_date column of the bronze sales data.
order_year = year("order_date").alias("year")
order_month = month("order_date").alias("month")

# The dict-style aggregation names its output column "sum(amount)", which is
# then renamed to the business-facing name "total_sales".
monthly_amounts = bronze_sales_df.groupBy(order_year, order_month).agg({"amount": "sum"})
gold_sales_df = monthly_amounts.withColumnRenamed("sum(amount)", "total_sales")

# Persist the monthly totals as the gold Delta table, replacing earlier output.
gold_sales_df.write.format("delta").mode("overwrite").save("/tmp/gold_sales")

# Register the Delta tables in the catalog, using one schema per layer
# (bronze, silver, gold). Every statement uses IF NOT EXISTS, so re-running
# the cell leaves existing schemas and tables in place.

# NOTE(review): my_database is created but none of the tables below are
# registered in it; they go into the bronze/silver/gold schemas instead.
spark.sql("CREATE DATABASE IF NOT EXISTS my_database")

# Bronze schema: tables over the raw Delta locations.
# USE switches the current schema so the unqualified table names that follow
# are created inside it.
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze")
spark.sql("USE bronze")
spark.sql("CREATE TABLE IF NOT EXISTS raw_customer USING DELTA LOCATION '/tmp/raw_customer'")
spark.sql("CREATE TABLE IF NOT EXISTS raw_product USING DELTA LOCATION '/tmp/raw_product'")
spark.sql("CREATE TABLE IF NOT EXISTS raw_date USING DELTA LOCATION '/tmp/raw_date'")
spark.sql("CREATE TABLE IF NOT EXISTS raw_sales USING DELTA LOCATION '/tmp/raw_sales'")

# Silver schema: dimension and fact tables over the silver Delta locations.
spark.sql("CREATE SCHEMA IF NOT EXISTS silver")
spark.sql("USE silver")
spark.sql("CREATE TABLE IF NOT EXISTS dim_customer USING DELTA LOCATION '/tmp/silver_dim_customer'")
spark.sql("CREATE TABLE IF NOT EXISTS dim_product USING DELTA LOCATION '/tmp/silver_dim_product'")
spark.sql("CREATE TABLE IF NOT EXISTS dim_date USING DELTA LOCATION '/tmp/silver_dim_date'")
spark.sql("CREATE TABLE IF NOT EXISTS fact_sales USING DELTA LOCATION '/tmp/silver_fact_sales'")

# Gold schema: the monthly sales aggregate. The session's current schema is
# left set to gold after this statement group.
spark.sql("CREATE SCHEMA IF NOT EXISTS gold")
spark.sql("USE gold")
spark.sql("CREATE TABLE IF NOT EXISTS sales USING DELTA LOCATION '/tmp/gold_sales'")



Out[1]: DataFrame[]

In [0]:
# Register the Delta tables in the catalog, one schema per layer.
def _register_layer(schema_name, table_locations):
    """Create `schema_name` if missing, switch to it and register its tables.

    `table_locations` maps each table name to the Delta path it points at;
    tables are registered in the mapping's insertion order. Returns the result
    of the last statement that was executed.
    """
    spark.sql("CREATE SCHEMA IF NOT EXISTS {}".format(schema_name))
    # USE makes the unqualified table names below land in this schema.
    last_result = spark.sql("USE {}".format(schema_name))
    for table_name, location in table_locations.items():
        last_result = spark.sql(
            "CREATE TABLE IF NOT EXISTS {} USING DELTA LOCATION '{}'".format(table_name, location)
        )
    return last_result


spark.sql("CREATE DATABASE IF NOT EXISTS my_database")

# Bronze: dim_*/fact_sales names registered over the raw Delta locations.
# NOTE(review): the first cell registers these same locations in bronze under
# raw_* names - confirm whether both sets of names are intended.
_register_layer("bronze", {
    "dim_customer": "/tmp/raw_customer",
    "dim_product": "/tmp/raw_product",
    "dim_date": "/tmp/raw_date",
    "fact_sales": "/tmp/raw_sales",
})

# Silver: the same table names over the silver Delta locations.
_register_layer("silver", {
    "dim_customer": "/tmp/silver_dim_customer",
    "dim_product": "/tmp/silver_dim_product",
    "dim_date": "/tmp/silver_dim_date",
    "fact_sales": "/tmp/silver_fact_sales",
})

# Gold: the monthly sales aggregate. Kept as the cell's final expression so
# the cell still displays the result of the last statement.
_register_layer("gold", {"sales": "/tmp/gold_sales"})


Out[2]: DataFrame[]

In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# --- Sample data for the optimisation walkthrough ------------------------------

# Customer dimension: one row per customer.
dim_customer_columns = ["customer_id", "customer_name", "city", "state", "country"]
dim_customer_data = [
    (1, "John Doe", "New York", "NY", "USA"),
    (2, "Jane Smith", "Los Angeles", "CA", "USA"),
    (3, "Michael Johnson", "San Francisco", "CA", "USA"),
    (4, "Emily Brown", "Chicago", "IL", "USA"),
]

# Sales fact: one row per order; order_date is a "YYYY-MM-DD" string and sits
# in third position, ahead of product_id.
fact_sales_columns = ["order_id", "customer_id", "order_date", "product_id", "quantity", "amount"]
fact_sales_data = [
    (101, 1, "2024-03-01", 1, 2, 100),
    (102, 2, "2024-03-02", 2, 1, 50),
    (103, 3, "2024-03-02", 3, 3, 200),
    (104, 1, "2024-03-03", 1, 1, 50),
    (105, 2, "2024-03-04", 2, 2, 150),
]

# Turn each (rows, column names) pair into a Spark DataFrame.
dim_customer_df = spark.createDataFrame(dim_customer_data, schema=dim_customer_columns)
fact_sales_df = spark.createDataFrame(fact_sales_data, schema=fact_sales_columns)

# Persist both DataFrames as Delta tables, replacing any previous contents.
# Customers are partitioned by state and sales by order_date.
dim_customer_df.write.format("delta").mode("overwrite").partitionBy("state").save("/tmp/dim_customer")
fact_sales_df.write.format("delta").mode("overwrite").partitionBy("order_date").save("/tmp/fact_sales")

# Z-Ordering is applied by the OPTIMIZE command after the data has been
# written. A "zorder" option on the DataFrame writer is not a Delta writer
# option and does not cluster anything, so it is requested explicitly here.
# customer_id is not a partition column, so it is a valid Z-Order column.
spark.sql("OPTIMIZE delta.`/tmp/fact_sales` ZORDER BY (customer_id)")

# Re-read from Delta so the techniques below operate on the stored tables
# rather than on the in-memory sample rows.
dim_customer_df = spark.read.format("delta").load("/tmp/dim_customer")
fact_sales_df = spark.read.format("delta").load("/tmp/fact_sales")

# 1. Predicate pushdown: filter as early as possible so less data is read and
#    moved. order_date is the column the fact table was partitioned by on
#    write, so this filter can be answered from a single partition directory.
is_target_day = col("order_date") == "2024-03-02"
filtered_df = fact_sales_df.filter(is_target_day)

# 2. Repartitioning: redistribute the rows so that rows sharing an order_date
#    end up in the same partition. The repartition itself is a shuffle; the
#    benefit is for later operations that are keyed on order_date.
partition_key = col("order_date")
partitioned_fact_sales_df = fact_sales_df.repartition(partition_key)

# 3. Broadcast join: ship the small customer dimension to every executor so
#    the larger fact table does not have to be shuffled for the join.
#    The "broadcast" hint makes the request explicit; without it the join is
#    only broadcast when the optimizer decides to on its own.
#    The result keeps every fact column plus the customer's name.
broadcasted_join_df = (
    fact_sales_df
    .join(dim_customer_df.hint("broadcast"), "customer_id")
    .select(fact_sales_df["*"], dim_customer_df["customer_name"])
)

# 4. Caching: mark the fact DataFrame for caching so repeated use does not
#    recompute it from storage. cache() is lazy; nothing is stored until an
#    action runs against the DataFrame.
fact_sales_df.cache()

# 5. Parallelism control: number of partitions Spark SQL uses when shuffling
#    data for joins and aggregations, sized down for this small dataset.
spark.conf.set("spark.sql.shuffle.partitions", "4")

# 6. Projection: keep only the columns a query needs so less data is read and
#    moved - here the order id, its date and its amount.
optimized_columns = ["order_id", "order_date", "amount"]
optimized_df = fact_sales_df.select(*optimized_columns)

# 7. Column pruning: same idea with a different column subset.
pruned_columns = ["order_id", "customer_id", "order_date"]
pruned_df = fact_sales_df.select(*pruned_columns)

# 8. Caching a derived result: cache the narrow customer_id projection so it
#    can be reused without recomputation.
customer_ids_df = fact_sales_df.select("customer_id")
cached_result = customer_ids_df.cache()

# 9. Temporary views: give both DataFrames a name that Spark SQL queries can
#    reference. A temp view is a session-scoped alias for the DataFrame; it
#    does not store query results the way a materialized view would.
#    Each view is registered once; createOrReplaceTempView replaces any
#    existing view of the same name.
fact_sales_df.createOrReplaceTempView("fact_sales_view")
dim_customer_df.createOrReplaceTempView("dim_customer_view")

# 10. Join through SQL: join the fact view to the customer view on customer_id.
#     The statement only declares the join; how it is ordered and executed is
#     left to Spark's optimizer.
sales_with_customer_sql = """
    SELECT *
    FROM fact_sales_view
    JOIN dim_customer_view ON fact_sales_view.customer_id = dim_customer_view.customer_id
"""
reordered_join_df = spark.sql(sales_with_customer_sql)

# 11. Filter pushdown: restrict the fact view to one order date and to amounts
#     above 100 in the WHERE clause, so the conditions can be applied at the
#     data source rather than after loading every row.
large_sales_on_day_sql = """
    SELECT *
    FROM fact_sales_view
    WHERE order_date = '2024-03-02' AND amount > 100
"""
pushed_down_filter_df = spark.sql(large_sales_on_day_sql)

# 12. Arrow-based columnar transfer: use Apache Arrow when converting between
#     Spark and pandas DataFrames, so data moves in columnar batches instead
#     of row by row. This affects Spark <-> pandas conversion, not how SQL
#     queries are executed.
#     "spark.sql.execution.arrow.pyspark.enabled" is the key used since
#     Spark 3.0; it replaces the deprecated "spark.sql.execution.arrow.enabled".
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# 13. Dynamic partition pruning: let Spark skip partitions of a partitioned
#     table at run time based on the other side of a join.
spark.conf.set("spark.sql.optimizer.dynamicPartitionPruning.enabled", "true")

# 14. Filtered join: join the fact view to the customer view on customer_id,
#     keeping only the sales of one order date. The date filter is on the fact
#     side, so only matching fact rows take part in the join.
sales_on_day_with_customer_sql = """
    SELECT *
    FROM fact_sales_view
    JOIN dim_customer_view ON fact_sales_view.customer_id = dim_customer_view.customer_id
    WHERE fact_sales_view.order_date = '2024-03-02'
"""
dynamic_runtime_filters_df = spark.sql(sales_on_day_with_customer_sql)

# 15. In-memory columnar compression: compress the columnar data Spark keeps
#     for cached DataFrames, reducing the memory the cache occupies.
spark.conf.set("spark.sql.inMemoryColumnarStorage.compressed", "true")
