In [0]:
from pyspark.sql import SparkSession


In [0]:
# Build (or reuse) the SparkSession used by every cell below.
# NOTE(review): getOrCreate() returns an already-running session when one exists
# (e.g. on a managed notebook cluster); in that case launch-time settings such as
# executor/driver memory may not take effect -- confirm on the target cluster.
spark = (
    SparkSession.builder
    .appName("Olist ECommerce Performance Optimization")
    # Executor and driver sizing.
    .config('spark.executor.memory', '6g')
    .config('spark.executor.cores', '4')
    .config('spark.executor.instances', '2')
    .config('spark.driver.memory', '4g')
    .config('spark.driver.maxResultSize', '2g')
    # Shuffle / RDD parallelism.
    .config('spark.sql.shuffle.partitions', '64')
    .config('spark.default.parallelism', '64')
    # Adaptive query execution. The property name is `coalescePartitions` (plural);
    # the previous spelling `coalescePartition` is not a Spark property and was
    # silently ignored.
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    # Tables up to 20 MiB may be broadcast automatically in joins.
    .config('spark.sql.autoBroadcastJoinThreshold', 20 * 1024 * 1024)
    # File-scan partition sizing.
    .config('spark.sql.files.maxPartitionBytes', '64MB')
    .config('spark.sql.files.openCostInBytes', '2MB')
    # Unified memory manager split.
    .config('spark.memory.fraction', 0.8)
    .config('spark.memory.storageFraction', 0.2)
    .getOrCreate()
)

In [0]:
# Load each raw Olist CSV into its own Spark DataFrame.
def _read_csv_with_header(path):
    """Read one CSV file, taking column names from its header row."""
    return spark.read.option("header", True).csv(path)


df_customers = _read_csv_with_header(
    "dbfs:/Volumes/commerce_spark_workspace/default/ecommerce_raw/olist_customers_dataset.csv"
)
df_geolocation = _read_csv_with_header(
    "dbfs:/Volumes/commerce_spark_workspace/default/ecommerce_raw/olist_geolocation_dataset.csv"
)
df_order_items = _read_csv_with_header(
    "dbfs:/Volumes/commerce_spark_workspace/default/ecommerce_raw/olist_order_items_dataset.csv"
)
df_order_payments = _read_csv_with_header(
    "dbfs:/Volumes/commerce_spark_workspace/default/ecommerce_raw/olist_order_payments_dataset.csv"
)
df_order_reviews = _read_csv_with_header(
    "dbfs:/Volumes/commerce_spark_workspace/default/ecommerce_raw/olist_order_reviews_dataset.csv"
)
df_orders = _read_csv_with_header(
    "dbfs:/Volumes/commerce_spark_workspace/default/ecommerce_raw/olist_orders_dataset.csv"
)
df_products = _read_csv_with_header(
    "dbfs:/Volumes/commerce_spark_workspace/default/ecommerce_raw/olist_products_dataset.csv"
)
df_sellers = _read_csv_with_header(
    "dbfs:/Volumes/commerce_spark_workspace/default/ecommerce_raw/olist_sellers_dataset.csv"
)
df_product_category_name_translation = _read_csv_with_header(
    "dbfs:/Volumes/commerce_spark_workspace/default/ecommerce_raw/product_category_name_translation.csv"
)

## Optimized Join Strategies

In [0]:
# Broadcast join: orders joined to customers, with the customers side marked for broadcast.
from pyspark.sql.functions import broadcast, col

join_key = "customer_id"

# Make the join key an explicit string column on both sides.
df_customers = df_customers.withColumn(join_key, col(join_key).cast("string"))
df_orders = df_orders.withColumn(join_key, col(join_key).cast("string"))

# Wrap customers in a broadcast marker, then inner-join on the shared key.
customers_broadcast = broadcast(df_customers)
df_joined = df_orders.join(customers_broadcast, on=join_key, how="inner")

# Preview the first five joined rows.
df_joined.show(5)


+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|            order_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------+------------------------+--------------------+--------------+
|9ef432eb625129730...|e481f51cbdc54678b...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 

In [0]:
from pyspark.sql.functions import col

# Repartitioned join: order items and products are both repartitioned on the join key first.
product_key = "product_id"

# Make the join key an explicit string column on both sides.
df_order_items = df_order_items.withColumn(product_key, col(product_key).cast("string"))
df_products = df_products.withColumn(product_key, col(product_key).cast("string"))

# Repartition each side by the join key.
df_order_items_repart = df_order_items.repartition(product_key)
df_products_repart = df_products.repartition(product_key)

# Inner-join the two repartitioned DataFrames on the key.
df_joined = df_order_items_repart.join(
    df_products_repart,
    on=product_key,
    how="inner",
)

# Preview the first five joined rows.
df_joined.show(5)


+--------------------+--------------------+-------------+--------------------+-------------------+-----+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|            order_id|order_item_id|           seller_id|shipping_limit_date|price|freight_value|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+--------------------+-------------+--------------------+-------------------+-----+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|2b4609f8948be1887...|00bbc417955452474...|            1|cc419e0650a3c5ba7...|2017-08-18 03:15:35|89.99|         7.88|         beleza_saude|                 59|         

In [0]:
# Sort-merge join between orders and customers.
from pyspark.sql.functions import col

# Cast join keys to string so both sides share the same key type.
df_customers = df_customers.withColumn("customer_id", col("customer_id").cast("string"))
df_orders = df_orders.withColumn("customer_id", col("customer_id").cast("string"))

# Repartition customers by the join key and sort rows within each partition.
df_customers_sorted = df_customers.repartition("customer_id").sortWithinPartitions("customer_id")

# Ask the planner for a sort-merge join explicitly via the "merge" join hint.
# Without the hint the planner is free to choose another strategy; in particular,
# a customers table below spark.sql.autoBroadcastJoinThreshold may be broadcast
# instead, so the cell would not demonstrate a sort-merge join at all.
# (df_orders is still left un-repartitioned and unsorted, as before.)
df_joined = df_orders.join(
    df_customers_sorted.hint("merge"),
    on="customer_id",
    how="inner",
)

df_joined.show(5)


+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|            order_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------+------------------------+--------------------+--------------+
|9ef432eb625129730...|e481f51cbdc54678b...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 

In [0]:
# Bucket Join
# NOTE(review): despite the title, this cell does not write or read bucketed tables
# (there is no bucketBy call); it repartitions and sorts customers in memory and then
# runs a regular DataFrame join -- confirm whether real bucketing was intended.
from pyspark.sql.functions import col

bucket_key = "customer_id"

# Make the join key an explicit string column on both sides.
df_customers = df_customers.withColumn(bucket_key, col(bucket_key).cast("string"))
df_orders = df_orders.withColumn(bucket_key, col(bucket_key).cast("string"))

# Repartition customers by the key, then sort rows inside each partition.
customers_by_key = df_customers.repartition(bucket_key)
df_customers_sorted = customers_by_key.sortWithinPartitions(bucket_key)

# Orders are joined as they are (no repartition or sort on that side).
df_joined = df_orders.join(df_customers_sorted, on=bucket_key, how="inner")

# Preview the first five joined rows.
df_joined.show(5)


+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|            order_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------+------------------------+--------------------+--------------+
|9ef432eb625129730...|e481f51cbdc54678b...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 

In [0]:
# Look up the two bucketed tables by name from the catalog.
# NOTE(review): nothing in the visible notebook creates `bucketed_customers` or
# `bucketed_orders`; presumably they are written elsewhere -- confirm they exist
# before running this cell on a fresh workspace.
table_reader = spark.read
bucketed_customers = table_reader.table("bucketed_customers")
bucketed_orders = table_reader.table("bucketed_orders")


## Steps to create and manage Delta tables with optimized schemas:

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Explicit schema for the customers CSV: five columns, each a nullable string.
_CUSTOMER_COLUMNS = (
    "customer_id",
    "customer_unique_id",
    "customer_zip_code_prefix",
    "customer_city",
    "customer_state",
)

customer_schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in _CUSTOMER_COLUMNS]
)


In [0]:
# Re-read the customers CSV, this time applying customer_schema explicitly.
# Note: this rebinds df_customers, replacing the DataFrame used by the join cells above.
customers_csv_path = "dbfs:/Volumes/commerce_spark_workspace/default/ecommerce_raw/olist_customers_dataset.csv"

df_customers = (
    spark.read
    .schema(customer_schema)
    .option("header", True)
    .csv(customers_csv_path)
)

In [0]:
# Save customers as the Delta table `delta_olist_customers`, partitioned by state.
# "overwrite" mode replaces whatever the table currently holds.
customers_writer = (
    df_customers.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("customer_state")
)
customers_writer.saveAsTable("delta_olist_customers")


In [0]:
# Run OPTIMIZE on the Delta table, Z-ordering by customer_city.
optimize_statement = "OPTIMIZE delta_olist_customers ZORDER BY (customer_city)"
spark.sql(optimize_statement)

# Load the table back through the Delta reader and preview the first five rows.
delta_reader = spark.read.format("delta")
df = delta_reader.table("delta_olist_customers")
df.show(5)

+--------------------+--------------------+------------------------+-------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+--------------------+--------------------+------------------------+-------------+--------------+
|4fa19f7da692e6bf9...|a2b8841410cf77619...|                   72270|     brasilia|            DF|
|e50a30de3c32f9406...|b4d6e1b900d99b52e...|                   71540|     brasilia|            DF|
|9b7822c67a91b431e...|9d0ac1cdbfc919d67...|                   71928|     brasilia|            DF|
|8b47e5ba29a9cd994...|9bbfdf9f7f65b5848...|                   71665|     brasilia|            DF|
|d6ea00d4a2dca6a01...|1d4626b197f66aa61...|                   73020|     brasilia|            DF|
+--------------------+--------------------+------------------------+-------------+--------------+
only showing top 5 rows
