Performance Optimization
- Explain Plans: Shows how Spark will execute the query before running code.
- Partitioning: Organizing data into dir level folders so Spark can skip entire partitions (Coarse-grained skipping).
- Z-Ordering: Organizing data inside the files, so Spark can skip unnecessary data blocks (Fine-grained skipping).
- Caching: Keeps frequently accessed data in memory to speed up repeated queries.


In [0]:
%sql
use ecom_catalog.ecom_schema;
show tables

database,tableName,isTemporary
ecom_schema,bronze_events,False
ecom_schema,gold_events,False
ecom_schema,gold_product_performance,False
ecom_schema,silver_events,False


In [0]:
spark.sql("select category_code,brand,count(*) from silver_events where event_type= 'purchase' group by category_code, brand").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- == Initial Plan ==
   ColumnarToRow
   +- PhotonResultStage
      +- PhotonGroupingAgg(keys=[category_code#13634, brand#13635], functions=[finalmerge_count(merge count#13659L) AS count(1)#13641L])
         +- PhotonShuffleExchangeSource
            +- PhotonShuffleMapStage ENSURE_REQUIREMENTS, [id=#8667]
               +- PhotonShuffleExchangeSink hashpartitioning(category_code#13634, brand#13635, 1024)
                  +- PhotonGroupingAgg(keys=[category_code#13634, brand#13635], functions=[partial_count(1) AS count#13659L])
                     +- PhotonProject [category_code#13634, brand#13635]
                        +- PhotonScan parquet ecom_catalog.ecom_schema.silver_events[event_type#13631,category_code#13634,brand#13635] DataFilters: [isnotnull(event_type#13631), (event_type#13631 = purchase)], DictionaryFilters: [(event_type#13631 = purchase)], Format: parquet, Location: PreparedDeltaFileIndex(1 paths)[s3://dbstorage

In [0]:
# applying partitioning based on event type.
# Partitioned table
spark.sql(f"USE CATALOG ecom_catalog")
spark.sql(f"USE SCHEMA ecom_schema")

spark.sql("""
  CREATE TABLE silver_events_partioned
  USING DELTA
  PARTITIONED BY (event_date, event_type)
  AS SELECT * FROM silver_events
""")

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Optimize
spark.sql("OPTIMIZE silver_events_partioned ZORDER BY (category_code, brand)")

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
# check explain after optimize
spark.sql("select category_code,brand,count(*) from silver_events_partioned where event_type= 'purchase' group by category_code,brand").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- == Initial Plan ==
   ColumnarToRow
   +- PhotonResultStage
      +- PhotonGroupingAgg(keys=[category_code#18314, brand#18315], functions=[finalmerge_count(merge count#18366L) AS count(1)#18321L])
         +- PhotonShuffleExchangeSource
            +- PhotonShuffleMapStage ENSURE_REQUIREMENTS, [id=#10315]
               +- PhotonShuffleExchangeSink hashpartitioning(category_code#18314, brand#18315, 1024)
                  +- PhotonGroupingAgg(keys=[category_code#18314, brand#18315], functions=[partial_count(1) AS count#18366L])
                     +- PhotonProject [category_code#18314, brand#18315]
                        +- PhotonScan parquet ecom_catalog.ecom_schema.silver_events_partioned[category_code#18314,brand#18315,event_date#18319,event_type#18311] DataFilters: [], DictionaryFilters: [], Format: parquet, Location: PreparedDeltaFileIndex(1 paths)[s3://dbstorage-prod-27rwr/uc/1563c012-b284-4c86-9688-8f8dbadfdda3..., Opt

In [0]:
# bechmarking
# Scenario: Measure performance of the GROUP BY query before and after OPTIMIZE + ZORDER.

from pyspark.sql.functions import col
import time

# original unoptimized query (Lazy)
df_original = (
    spark.table("silver_events")
         .filter(col("event_type") == "purchase")
         .groupBy("category_code", "brand")
         .count()
)

print("Running benchmark on original unoptimized table...")

start = time.time()
original_count = df_original.count()
print(f"   Rows: {original_count}")
print(f"   Time (Original): {time.time() - start:.4f}s\n")

# Define the optimized query  (Lazy). This uses your partitioned (silver_events_partioned) + ZORDERed table:

df_optimized = (
    spark.table("silver_events_partioned")
         .filter(col("event_type") == "purchase")
         .groupBy("category_code", "brand")
         .count()
)

print("Running benchmark on OPTIMIZED (partitioned + ZORDER) table...")

start = time.time()
optimized_count = df_optimized.count()
print(f"   Rows: {optimized_count}")
print(f"   Time (Optimized): {time.time() - start:.4f}s\n")

# benchmarkSummary
print("📊 Benchmark Summary")
print("---------------------")
print(f"Original Runtime : {original_count}")
print(f"Optimized Runtime: {optimized_count}")
print(" Faster performance expected due to:")
print("   - Partition pruning (event_type)")
print("   - File compaction (OPTIMIZE)")
print("   - Data skipping (ZORDER on category_code, brand)")

Running benchmark on original unoptimized table...
   Rows: 4632
   Time (Original): 1.0346s

Running benchmark on OPTIMIZED (partitioned + ZORDER) table...
   Rows: 4632
   Time (Optimized): 1.0031s

📊 Benchmark Summary
---------------------
Original Runtime : 4632
Optimized Runtime: 4632
âœ” Faster performance expected due to:
   - Partition pruning (event_type)
   - File compaction (OPTIMIZE)
   - Data skipping (ZORDER on category_code, brand)


In [0]:

# Caching is not supported on serverless compute hencce skipped.

#cached = spark.table("silver_events").cache()
#cached.count()  # Materialize