In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder \
.appName("Food Delivery Analytics") \
.getOrCreate()

In [3]:
orders_data = [
("O001","North","Delhi","Rest-01","Pizza","2024-02-01",450,35),
("O002","North","Delhi","Rest-01","Burger","2024-02-01",250,25),
("O003","North","Chandigarh","Rest-02","Pasta","2024-02-02",350,30),
("O004","South","Bangalore","Rest-03","Pizza","2024-02-01",500,40),
("O005","South","Chennai","Rest-04","Burger","2024-02-02",220,20),
("O006","South","Bangalore","Rest-03","Pasta","2024-02-03",380,32),
("O007","East","Kolkata","Rest-05","Pizza","2024-02-01",420,38),
("O008","East","Kolkata","Rest-05","Burger","2024-02-02",260,26),
("O009","East","Patna","Rest-06","Pasta","2024-02-03",300,28),
("O010","West","Mumbai","Rest-07","Pizza","2024-02-01",520,42),
("O011","West","Mumbai","Rest-07","Burger","2024-02-02",280,27),
("O012","West","Pune","Rest-08","Pasta","2024-02-03",340,31),
("O013","North","Delhi","Rest-01","Pizza","2024-02-04",480,37),
("O014","South","Chennai","Rest-04","Pizza","2024-02-04",510,41),
("O015","East","Patna","Rest-06","Burger","2024-02-04",240,24),
("O016","West","Pune","Rest-08","Pizza","2024-02-04",500,39),
("O017","North","Chandigarh","Rest-02","Burger","2024-02-05",260,26),
("O018","South","Bangalore","Rest-03","Burger","2024-02-05",290,29),
("O019","East","Kolkata","Rest-05","Pasta","2024-02-05",360,33),
("O020","West","Mumbai","Rest-07","Pasta","2024-02-05",390,34),
("O021","North","Delhi","Rest-01","Pasta","2024-02-06",370,30),
("O022","South","Chennai","Rest-04","Pasta","2024-02-06",330,29),
("O023","East","Patna","Rest-06","Pizza","2024-02-06",460,36),
("O024","West","Pune","Rest-08","Burger","2024-02-06",270,26)
]
columns = [
"order_id","region","city","restaurant_id",
"food_item","order_date","amount","delivery_time_min"
]
df_orders = spark.createDataFrame(orders_data, columns)
df_orders.show(5)
df_orders.printSchema()

+--------+------+----------+-------------+---------+----------+------+-----------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+----------+-------------+---------+----------+------+-----------------+
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|
|    O004| South| Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O005| South|   Chennai|      Rest-04|   Burger|2024-02-02|   220|               20|
+--------+------+----------+-------------+---------+----------+------+-----------------+
only showing top 5 rows
root
 |-- order_id: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- restaurant_id: string (nullable = true)
 |-- food_i

#EXERCISE SET 1 — SELECT OPERATIONS

1. Select only order_id , region , food_item , amount

In [5]:
df_orders.select("order_id", "region", "food_item", "amount").show()

+--------+------+---------+------+
|order_id|region|food_item|amount|
+--------+------+---------+------+
|    O001| North|    Pizza|   450|
|    O002| North|   Burger|   250|
|    O003| North|    Pasta|   350|
|    O004| South|    Pizza|   500|
|    O005| South|   Burger|   220|
|    O006| South|    Pasta|   380|
|    O007|  East|    Pizza|   420|
|    O008|  East|   Burger|   260|
|    O009|  East|    Pasta|   300|
|    O010|  West|    Pizza|   520|
|    O011|  West|   Burger|   280|
|    O012|  West|    Pasta|   340|
|    O013| North|    Pizza|   480|
|    O014| South|    Pizza|   510|
|    O015|  East|   Burger|   240|
|    O016|  West|    Pizza|   500|
|    O017| North|   Burger|   260|
|    O018| South|   Burger|   290|
|    O019|  East|    Pasta|   360|
|    O020|  West|    Pasta|   390|
+--------+------+---------+------+
only showing top 20 rows


2. Rename amount to order_value

In [6]:
df_orders.select(
    col("amount").alias("order_value")
).show()

+-----------+
|order_value|
+-----------+
|        450|
|        250|
|        350|
|        500|
|        220|
|        380|
|        420|
|        260|
|        300|
|        520|
|        280|
|        340|
|        480|
|        510|
|        240|
|        500|
|        260|
|        290|
|        360|
|        390|
+-----------+
only showing top 20 rows


3. Create a new column amount_in_hundreds

In [7]:
df_orders = df_orders.withColumn(
    "amount_in_hundreds", col("amount") / 1000
)
df_orders.show()

+--------+------+----------+-------------+---------+----------+------+-----------------+------------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|amount_in_hundreds|
+--------+------+----------+-------------+---------+----------+------+-----------------+------------------+
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|              0.45|
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|              0.25|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|              0.35|
|    O004| South| Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|               0.5|
|    O005| South|   Chennai|      Rest-04|   Burger|2024-02-02|   220|               20|              0.22|
|    O006| South| Bangalore|      Rest-03|    Pasta|2024-02-03|   380|               32|              0.38|
|    O007|  East|   Kolkata|

4. Select distinct combinations of region and food_item

In [8]:
df_orders.select("region", "food_item").distinct().show()

+------+---------+
|region|food_item|
+------+---------+
|  West|   Burger|
|  East|    Pizza|
|  West|    Pizza|
| North|    Pizza|
| South|    Pizza|
|  East|   Burger|
| North|    Pasta|
|  East|    Pasta|
| North|   Burger|
| South|    Pasta|
| South|   Burger|
|  West|    Pasta|
+------+---------+



5. Reorder columns in a logical reporting format

In [9]:
df_orders = df_orders.select(
    "order_id",
    "order_date",
    "region",
    "city",
    "restaurant_id",
    "food_item",
    "delivery_time_min",
    "amount",
    "amount_in_hundreds"
)
df_orders.show()

+--------+----------+------+----------+-------------+---------+-----------------+------+------------------+
|order_id|order_date|region|      city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|
+--------+----------+------+----------+-------------+---------+-----------------+------+------------------+
|    O001|2024-02-01| North|     Delhi|      Rest-01|    Pizza|               35|   450|              0.45|
|    O002|2024-02-01| North|     Delhi|      Rest-01|   Burger|               25|   250|              0.25|
|    O003|2024-02-02| North|Chandigarh|      Rest-02|    Pasta|               30|   350|              0.35|
|    O004|2024-02-01| South| Bangalore|      Rest-03|    Pizza|               40|   500|               0.5|
|    O005|2024-02-02| South|   Chennai|      Rest-04|   Burger|               20|   220|              0.22|
|    O006|2024-02-03| South| Bangalore|      Rest-03|    Pasta|               32|   380|              0.38|
|    O007|2024-02-01|  East|

6. Create a column order_day extracted from order_date

In [11]:
from pyspark.sql.functions import day, col
df_orders = df_orders.withColumn(
    "order_day", day(col("order_date")))
df_orders.show()

+--------+----------+------+----------+-------------+---------+-----------------+------+------------------+---------+
|order_id|order_date|region|      city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|
+--------+----------+------+----------+-------------+---------+-----------------+------+------------------+---------+
|    O001|2024-02-01| North|     Delhi|      Rest-01|    Pizza|               35|   450|              0.45|        1|
|    O002|2024-02-01| North|     Delhi|      Rest-01|   Burger|               25|   250|              0.25|        1|
|    O003|2024-02-02| North|Chandigarh|      Rest-02|    Pasta|               30|   350|              0.35|        2|
|    O004|2024-02-01| South| Bangalore|      Rest-03|    Pizza|               40|   500|               0.5|        1|
|    O005|2024-02-02| South|   Chennai|      Rest-04|   Burger|               20|   220|              0.22|        2|
|    O006|2024-02-03| South| Bangalore|      Rest-03|   

#EXERCISE SET 2 — FILTER OPERATIONS

1. Filter orders where amount > 400

In [12]:
df_orders.filter(col("amount") > 400).show()

+--------+----------+------+---------+-------------+---------+-----------------+------+------------------+---------+
|order_id|order_date|region|     city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|
+--------+----------+------+---------+-------------+---------+-----------------+------+------------------+---------+
|    O001|2024-02-01| North|    Delhi|      Rest-01|    Pizza|               35|   450|              0.45|        1|
|    O004|2024-02-01| South|Bangalore|      Rest-03|    Pizza|               40|   500|               0.5|        1|
|    O007|2024-02-01|  East|  Kolkata|      Rest-05|    Pizza|               38|   420|              0.42|        1|
|    O010|2024-02-01|  West|   Mumbai|      Rest-07|    Pizza|               42|   520|              0.52|        1|
|    O013|2024-02-04| North|    Delhi|      Rest-01|    Pizza|               37|   480|              0.48|        4|
|    O014|2024-02-04| South|  Chennai|      Rest-04|    Pizza|  

2. Filter only Pizza orders

In [13]:
df_orders.filter(col("food_item") == "Pizza").show()

+--------+----------+------+---------+-------------+---------+-----------------+------+------------------+---------+
|order_id|order_date|region|     city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|
+--------+----------+------+---------+-------------+---------+-----------------+------+------------------+---------+
|    O001|2024-02-01| North|    Delhi|      Rest-01|    Pizza|               35|   450|              0.45|        1|
|    O004|2024-02-01| South|Bangalore|      Rest-03|    Pizza|               40|   500|               0.5|        1|
|    O007|2024-02-01|  East|  Kolkata|      Rest-05|    Pizza|               38|   420|              0.42|        1|
|    O010|2024-02-01|  West|   Mumbai|      Rest-07|    Pizza|               42|   520|              0.52|        1|
|    O013|2024-02-04| North|    Delhi|      Rest-01|    Pizza|               37|   480|              0.48|        4|
|    O014|2024-02-04| South|  Chennai|      Rest-04|    Pizza|  

3. Filter orders from Delhi and Mumbai

In [14]:
df_orders = df_orders.filter(
    col("city").isin("Delhi", "Mumbai"
))
df_orders.show()

+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+
|order_id|order_date|region|  city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|
+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+
|    O001|2024-02-01| North| Delhi|      Rest-01|    Pizza|               35|   450|              0.45|        1|
|    O002|2024-02-01| North| Delhi|      Rest-01|   Burger|               25|   250|              0.25|        1|
|    O010|2024-02-01|  West|Mumbai|      Rest-07|    Pizza|               42|   520|              0.52|        1|
|    O011|2024-02-02|  West|Mumbai|      Rest-07|   Burger|               27|   280|              0.28|        2|
|    O013|2024-02-04| North| Delhi|      Rest-01|    Pizza|               37|   480|              0.48|        4|
|    O020|2024-02-05|  West|Mumbai|      Rest-07|    Pasta|               34|   390|    

4. Filter orders with delivery time greater than 35 minutes

In [15]:
df_orders.filter((col("delivery_time_min") >= 35)).show()

+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+
|order_id|order_date|region|  city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|
+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+
|    O001|2024-02-01| North| Delhi|      Rest-01|    Pizza|               35|   450|              0.45|        1|
|    O010|2024-02-01|  West|Mumbai|      Rest-07|    Pizza|               42|   520|              0.52|        1|
|    O013|2024-02-04| North| Delhi|      Rest-01|    Pizza|               37|   480|              0.48|        4|
+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+



5. Apply multiple conditions using AND and OR

In [16]:
df_orders.filter(col("amount") > 300).filter(
    col("delivery_time_min") < 40).show()

+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+
|order_id|order_date|region|  city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|
+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+
|    O001|2024-02-01| North| Delhi|      Rest-01|    Pizza|               35|   450|              0.45|        1|
|    O013|2024-02-04| North| Delhi|      Rest-01|    Pizza|               37|   480|              0.48|        4|
|    O020|2024-02-05|  West|Mumbai|      Rest-07|    Pasta|               34|   390|              0.39|        5|
|    O021|2024-02-06| North| Delhi|      Rest-01|    Pasta|               30|   370|              0.37|        6|
+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+



In [17]:
df_orders.filter((col("region")=="east")|(col("region")=="west")).show()

+--------+----------+------+----+-------------+---------+-----------------+------+------------------+---------+
|order_id|order_date|region|city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|
+--------+----------+------+----+-------------+---------+-----------------+------+------------------+---------+
+--------+----------+------+----+-------------+---------+-----------------+------+------------------+---------+



6. Apply filters in different orders and compare explain(True)

In [18]:
df_case1 = df_orders.filter(col("region")=="East").filter(col("city")=="Kolkata")
df_case1.show()
df_case1.explain(True)

df_case2 = df_orders.filter(col("city")=="Delhi").filter(col("region")=="North")
df_case2.show()
df_case2.explain(True)

+--------+----------+------+----+-------------+---------+-----------------+------+------------------+---------+
|order_id|order_date|region|city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|
+--------+----------+------+----+-------------+---------+-----------------+------+------------------+---------+
+--------+----------+------+----+-------------+---------+-----------------+------+------------------+---------+

== Parsed Logical Plan ==
'Filter '`=`('city, Kolkata)
+- Filter (region#1 = East)
   +- Filter city#2 IN (Delhi,Mumbai)
      +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51, day(cast(order_date#5 as date)) AS order_day#115]
         +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51]
            +- Project [order_id#0, region#1, city#2, restaurant_id#3, food_item#

7. Identify which filters are pushed down by Spark

In [20]:
df_orders.filter(col("region")=="East").filter(col("amount")>500).explain(True)

== Parsed Logical Plan ==
'Filter '`>`('amount, 500)
+- Filter (region#1 = East)
   +- Filter city#2 IN (Delhi,Mumbai)
      +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51, day(cast(order_date#5 as date)) AS order_day#115]
         +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51]
            +- Project [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_hundreds#51]
               +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
order_id: string, order_date: string, region: string, city: string, restaurant_id: string, food_item: string, delivery_time_min: bigint, amount: bigint

#EXERCISE SET 3 — TRANSFORMATIONS vs ACTIONS

1. Build a pipeline with:
select
filter
derived column

2. Do not call any action

In [21]:
df_pipeline = (
    df_orders.select("order_id", "region", "amount").filter(col("amount") > 400).withColumn("amount_in_hundreds", col("amount") *0.9)
)

3. Explain what Spark has done so far

4. Trigger count() and observe execution

In [23]:
df_pipeline.count()

3

5. Trigger show() and compare behavior

In [22]:
df_pipeline.show()

+--------+------+------+------------------+
|order_id|region|amount|amount_in_hundreds|
+--------+------+------+------------------+
|    O001| North|   450|             405.0|
|    O010|  West|   520|             468.0|
|    O013| North|   480|             432.0|
+--------+------+------+------------------+



#EXERCISE SET 4 — PARTITIONS & FILE LAYOUT

1. Check the number of partitions of df_orders

In [25]:
print(df_orders.rdd.getNumPartitions())

2


2. Repartition the DataFrame into 4 partitions

In [26]:
df_orders_repartitioned = df_orders.repartition(4)
print(df_orders_repartitioned.rdd.getNumPartitions())

4


3. Coalesce the DataFrame into 1 partition

In [27]:
df_orders_coalesced = df_orders_repartitioned.coalesce(1)
print(df_orders_coalesced.rdd.getNumPartitions())

1


4. Write repartitioned data to Parquet and count files

In [28]:
import os
import shutil

output_path = "/tmp/repartitioned_orders.parquet"

# Clean up previous run if any
if os.path.exists(output_path):
    shutil.rmtree(output_path)

# Write the repartitioned DataFrame to Parquet
df_orders_repartitioned.write.mode("overwrite").parquet(output_path)

# Count the number of Parquet files (part files)
parquet_files = [f for f in os.listdir(output_path) if f.startswith('part-') and f.endswith('.parquet')]
print(f"Number of Parquet files created: {len(parquet_files)}")

Number of Parquet files created: 4


5. Write coalesced data to Parquet and count files

In [29]:
import os
import shutil

output_path_coalesced = "/tmp/coalesced_orders.parquet"

# Clean up previous run if any
if os.path.exists(output_path_coalesced):
    shutil.rmtree(output_path_coalesced)

# Write the coalesced DataFrame to Parquet
df_orders_coalesced.write.mode("overwrite").parquet(output_path_coalesced)

# Count the number of Parquet files (part files)
parquet_files_coalesced = [f for f in os.listdir(output_path_coalesced) if f.startswith('part-') and f.endswith('.parquet')]
print(f"Number of Parquet files created for coalesced data: {len(parquet_files_coalesced)}")

Number of Parquet files created for coalesced data: 1


6. Explain why file counts differ

The number of output files when writing a Spark DataFrame to a file format like Parquet directly corresponds to the number of partitions the DataFrame has at that moment.
Here's why the counts differed:
df_orders_repartitioned (4 files):
We used df_orders.repartition(4). The repartition() operation creates a new DataFrame with the specified number of partitions (in this case, 4). This typically involves a full shuffle of the data across the cluster to distribute it evenly among the new partitions.
When df_orders_repartitioned was written to Parquet, each of its 4 partitions was written as a separate file, resulting in 4 Parquet files.
df_orders_coalesced (1 file):
We used df_orders_repartitioned.coalesce(1). The coalesce() operation is used to reduce the number of partitions. Unlike repartition(), coalesce() tries to minimize data shuffling by combining existing partitions where possible. It's often more efficient when decreasing the number of partitions.
In this case, coalesce(1) combined all the partitions of df_orders_repartitioned into a single partition.
When df_orders_coalesced (which had 1 partition) was written to Parquet, it produced only 1 Parquet file.

#EXERCISE SET 5 — GROUPBY & AGGREGATE FUNCTIONS

1. Total revenue per region

In [30]:
from pyspark.sql.functions import sum
revenue_per_region = df_orders.groupBy("region").agg(sum("amount").alias("total_revenue"))
revenue_per_region.show()

+------+-------------+
|region|total_revenue|
+------+-------------+
|  West|         1190|
| North|         1550|
+------+-------------+



2. Average order amount per food item

In [31]:
from pyspark.sql.functions import avg

average_amount_per_food_item = df_orders.groupBy("food_item").agg(avg("amount").alias("average_order_amount"))
average_amount_per_food_item.show()

+---------+--------------------+
|food_item|average_order_amount|
+---------+--------------------+
|   Burger|               265.0|
|    Pizza|   483.3333333333333|
|    Pasta|               380.0|
+---------+--------------------+



3. Maximum order amount per city

In [32]:
from pyspark.sql.functions import max

max_amount_per_city = df_orders.groupBy("city").agg(max("amount").alias("max_order_amount"))
max_amount_per_city.show()

+------+----------------+
|  city|max_order_amount|
+------+----------------+
|Mumbai|             520|
| Delhi|             480|
+------+----------------+



4. Minimum delivery time per restaurant

In [33]:
from pyspark.sql.functions import min

min_delivery_time_per_restaurant = df_orders.groupBy("restaurant_id").agg(min("delivery_time_min").alias("min_delivery_time"))
min_delivery_time_per_restaurant.show()

+-------------+-----------------+
|restaurant_id|min_delivery_time|
+-------------+-----------------+
|      Rest-01|               25|
|      Rest-07|               27|
+-------------+-----------------+



5. Count number of orders per region

In [34]:
from pyspark.sql.functions import count

orders_per_region = df_orders.groupBy("region").agg(count("order_id").alias("number_of_orders"))
orders_per_region.show()

+------+----------------+
|region|number_of_orders|
+------+----------------+
|  West|               3|
| North|               4|
+------+----------------+



6. Total revenue per restaurant

In [35]:
from pyspark.sql.functions import sum

total_revenue_per_restaurant = df_orders.groupBy("restaurant_id").agg(sum("amount").alias("total_revenue"))
total_revenue_per_restaurant.show()

+-------------+-------------+
|restaurant_id|total_revenue|
+-------------+-------------+
|      Rest-01|         1550|
|      Rest-07|         1190|
+-------------+-------------+



7. Region + food item wise total revenue

In [36]:
region_food_item_revenue = df_orders.groupBy("region", "food_item").agg(sum("amount").alias("total_revenue"))
region_food_item_revenue.show()

+------+---------+-------------+
|region|food_item|total_revenue|
+------+---------+-------------+
|  West|   Burger|          280|
|  West|    Pizza|          520|
| North|    Pizza|          930|
| North|   Burger|          250|
| North|    Pasta|          370|
|  West|    Pasta|          390|
+------+---------+-------------+



8. City wise average delivery time

In [37]:
from pyspark.sql.functions import avg

city_avg_delivery_time = df_orders.groupBy("city").agg(avg("delivery_time_min").alias("average_delivery_time"))
city_avg_delivery_time.show()

+------+---------------------+
|  city|average_delivery_time|
+------+---------------------+
|Mumbai|   34.333333333333336|
| Delhi|                31.75|
+------+---------------------+



9. Identify regions with revenue above a threshold

In [38]:
revenue_threshold = 2100
regions_above_threshold = revenue_per_region.filter(col("total_revenue") > revenue_threshold)
regions_above_threshold.show()

+------+-------------+
|region|total_revenue|
+------+-------------+
+------+-------------+



10. Use explain(True) and identify shuffle operators

In [39]:
revenue_per_region.explain(True)

== Parsed Logical Plan ==
'Aggregate ['region], ['region, 'sum('amount) AS total_revenue#417]
+- Filter city#2 IN (Delhi,Mumbai)
   +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51, day(cast(order_date#5 as date)) AS order_day#115]
      +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51]
         +- Project [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_hundreds#51]
            +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
region: string, total_revenue: bigint
Aggregate [region#1], [region#1, sum(amount#6L) AS total_revenue#417L]
+- Filter city#2 IN (Delhi,Mumbai)
   +- Pro

The explain(True) output for revenue_per_region shows the physical execution plan for calculating the total revenue per region. The key line indicating a shuffle operation is:
+- Exchange hashpartitioning(region#1, 200), ENSURE_REQUIREMENTS, [plan_id=575]
The Exchange operator, specifically with hashpartitioning, signifies a shuffle in Spark. This shuffle is necessary because the groupBy("region") operation requires all rows with the same region to be brought together on the same executor to correctly calculate the sum("amount") for each region. Spark redistributes the data across the network based on the region column, which is a resource-intensive operation.

#EXERCISE SET 6 — WINDOW FUNCTIONS(OVER)

1. Compute running total of revenue per region ordered by date

In [40]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, rank, row_number, dense_rank,col
window_spec_revenue = Window.partitionBy("region").orderBy("order_date")

running_total_revenue = df_orders.withColumn(
    "running_total_revenue",
    sum(col("amount")).over(window_spec_revenue)
)

running_total_revenue.show()

+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+---------------------+
|order_id|order_date|region|  city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|running_total_revenue|
+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+---------------------+
|    O001|2024-02-01| North| Delhi|      Rest-01|    Pizza|               35|   450|              0.45|        1|                  700|
|    O002|2024-02-01| North| Delhi|      Rest-01|   Burger|               25|   250|              0.25|        1|                  700|
|    O013|2024-02-04| North| Delhi|      Rest-01|    Pizza|               37|   480|              0.48|        4|                 1180|
|    O021|2024-02-06| North| Delhi|      Rest-01|    Pasta|               30|   370|              0.37|        6|                 1550|
|    O010|2024-02-01|  West|Mumbai|      Rest-07

2. Rank orders by amount within each region

In [41]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

window_spec_rank = Window.partitionBy("region").orderBy(col("amount").desc())

ranked_orders = df_orders.withColumn(
    "rank_by_amount_in_region",
    rank().over(window_spec_rank)
)

ranked_orders.show()

+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+------------------------+
|order_id|order_date|region|  city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|rank_by_amount_in_region|
+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+------------------------+
|    O013|2024-02-04| North| Delhi|      Rest-01|    Pizza|               37|   480|              0.48|        4|                       1|
|    O001|2024-02-01| North| Delhi|      Rest-01|    Pizza|               35|   450|              0.45|        1|                       2|
|    O021|2024-02-06| North| Delhi|      Rest-01|    Pasta|               30|   370|              0.37|        6|                       3|
|    O002|2024-02-01| North| Delhi|      Rest-01|   Burger|               25|   250|              0.25|        1|                       4|
|    O010|2024-02-01|  West

3. Assign row numbers per restaurant based on delivery time

In [43]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

window_spec_row_number = Window.partitionBy("restaurant_id").orderBy("delivery_time_min")

orders_with_row_number = df_orders.withColumn(
    "row_number_by_delivery_time",
    row_number().over(window_spec_row_number)
)

orders_with_row_number.show()

+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+---------------------------+
|order_id|order_date|region|  city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|row_number_by_delivery_time|
+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+---------------------------+
|    O002|2024-02-01| North| Delhi|      Rest-01|   Burger|               25|   250|              0.25|        1|                          1|
|    O021|2024-02-06| North| Delhi|      Rest-01|    Pasta|               30|   370|              0.37|        6|                          2|
|    O001|2024-02-01| North| Delhi|      Rest-01|    Pizza|               35|   450|              0.45|        1|                          3|
|    O013|2024-02-04| North| Delhi|      Rest-01|    Pizza|               37|   480|              0.48|        4|                          4|
|    O

4. Use dense rank to rank food items per region by revenue

In [44]:
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank, col

window_spec_dense_rank = Window.partitionBy("region").orderBy(col("total_revenue").desc())

dense_ranked_food_items = region_food_item_revenue.withColumn(
    "dense_rank_by_revenue_in_region",
    dense_rank().over(window_spec_dense_rank)
)

dense_ranked_food_items.show()

+------+---------+-------------+-------------------------------+
|region|food_item|total_revenue|dense_rank_by_revenue_in_region|
+------+---------+-------------+-------------------------------+
| North|    Pizza|          930|                              1|
| North|    Pasta|          370|                              2|
| North|   Burger|          250|                              3|
|  West|    Pizza|          520|                              1|
|  West|    Pasta|          390|                              2|
|  West|   Burger|          280|                              3|
+------+---------+-------------+-------------------------------+



5. Identify top 2 highest value orders per region

In [45]:
from pyspark.sql.functions import row_number, col

window_spec_top_orders = Window.partitionBy("region").orderBy(col("amount").desc())

top_2_orders_per_region = df_orders.withColumn(
    "rank_in_region",
    row_number().over(window_spec_top_orders)
).filter(col("rank_in_region") <= 2)

top_2_orders_per_region.show()

+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+--------------+
|order_id|order_date|region|  city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|rank_in_region|
+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+--------------+
|    O013|2024-02-04| North| Delhi|      Rest-01|    Pizza|               37|   480|              0.48|        4|             1|
|    O001|2024-02-01| North| Delhi|      Rest-01|    Pizza|               35|   450|              0.45|        1|             2|
|    O010|2024-02-01|  West|Mumbai|      Rest-07|    Pizza|               42|   520|              0.52|        1|             1|
|    O020|2024-02-05|  West|Mumbai|      Rest-07|    Pasta|               34|   390|              0.39|        5|             2|
+--------+----------+------+------+-------------+---------+-----------------+------+-------------

6. Compare rank , dense_rank , and row_number outputs

In [46]:
window_spec_compare = Window.partitionBy("region").orderBy(col("amount").desc())

comparison_df = df_orders.withColumn(
    "rank",
    rank().over(window_spec_compare)
).withColumn(
    "dense_rank",
    dense_rank().over(window_spec_compare)
).withColumn(
    "row_number",
    row_number().over(window_spec_compare)
)

comparison_df.select("region", "order_id", "amount", "rank", "dense_rank", "row_number").orderBy("region", "amount", "order_id").show()

+------+--------+------+----+----------+----------+
|region|order_id|amount|rank|dense_rank|row_number|
+------+--------+------+----+----------+----------+
| North|    O002|   250|   4|         4|         4|
| North|    O021|   370|   3|         3|         3|
| North|    O001|   450|   2|         2|         2|
| North|    O013|   480|   1|         1|         1|
|  West|    O011|   280|   3|         3|         3|
|  West|    O020|   390|   2|         2|         2|
|  West|    O010|   520|   1|         1|         1|
+------+--------+------+----+----------+----------+



7. Calculate cumulative delivery time per restaurant

In [47]:
window_spec_cumulative_delivery = Window.partitionBy("restaurant_id").orderBy("order_date")

cumulative_delivery_time = df_orders.withColumn(
    "cumulative_delivery_time",
    sum(col("delivery_time_min")).over(window_spec_cumulative_delivery)
)

cumulative_delivery_time.show()

+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+------------------------+
|order_id|order_date|region|  city|restaurant_id|food_item|delivery_time_min|amount|amount_in_hundreds|order_day|cumulative_delivery_time|
+--------+----------+------+------+-------------+---------+-----------------+------+------------------+---------+------------------------+
|    O001|2024-02-01| North| Delhi|      Rest-01|    Pizza|               35|   450|              0.45|        1|                      60|
|    O002|2024-02-01| North| Delhi|      Rest-01|   Burger|               25|   250|              0.25|        1|                      60|
|    O013|2024-02-04| North| Delhi|      Rest-01|    Pizza|               37|   480|              0.48|        4|                      97|
|    O021|2024-02-06| North| Delhi|      Rest-01|    Pasta|               30|   370|              0.37|        6|                     127|
|    O010|2024-02-01|  West

#EXERCISE SET 7 — GROUPBY vs WINDOW(CONCEPTUAL)

1. Calculate total revenue per region using GroupBy

In [48]:
total_revenue_groupby = df_orders.groupBy("region").agg(sum("amount").alias("total_revenue"))
total_revenue_groupby.show()

+------+-------------+
|region|total_revenue|
+------+-------------+
|  West|         1190|
| North|         1550|
+------+-------------+



2. Calculate total revenue per region using Window

In [49]:
window_spec_total_revenue = Window.partitionBy("region")
total_revenue_window = df_orders.withColumn(
    "total_revenue",
    sum(col("amount")).over(window_spec_total_revenue)
)
total_revenue_window.select("region", "order_id", "amount", "total_revenue").orderBy("region", "order_id").show()

+------+--------+------+-------------+
|region|order_id|amount|total_revenue|
+------+--------+------+-------------+
| North|    O001|   450|         1550|
| North|    O002|   250|         1550|
| North|    O013|   480|         1550|
| North|    O021|   370|         1550|
|  West|    O010|   520|         1190|
|  West|    O011|   280|         1190|
|  West|    O020|   390|         1190|
+------+--------+------+-------------+



3. Compare:
Row count
Output structure
Use case

### Comparison between GROUP BY and Window Functions

**1. Row Count**
*   **`groupBy`**: Reduces the number of rows in the DataFrame. It aggregates rows based on the grouping keys, resulting in one row per unique group.
    *   `total_revenue_groupby` has 2 rows (one for 'North' and one for 'West').

*   **`Window`**: Maintains the original number of rows in the DataFrame. It adds new columns with calculated values to each row, where calculations are performed over a defined window.
    *   `total_revenue_window` has the same number of rows as `df_orders` (which is 7 rows based on the filtered data, as it simply adds a `total_revenue` column to each existing row).

**2. Output Structure**
*   **`groupBy`**: The output DataFrame contains only the grouping columns and the aggregated columns.
    *   `total_revenue_groupby`: `region`, `total_revenue`

*   **`Window`**: The output DataFrame contains all the original columns from the input DataFrame, plus the new column(s) generated by the window function.
    *   `total_revenue_window`: All original columns of `df_orders` (`order_id`, `order_date`, `region`, etc.) plus `total_revenue`.

**3. Use Case**
*   **`groupBy`**: Use when you need to **summarize** data at a higher level of granularity, reducing the dataset to one record per group. Examples include calculating total sales per region, average amount per food item, or counting orders per city.

*   **`Window`**: Use when you need to perform calculations that involve a group of rows but want to **retain the original row-level detail**. This is often for contextual analysis, such as calculating running totals, rankings within groups, moving averages, or comparing a row's value to the group's average/max/min.

4. Explain why Window does not reduce rows

Window functions do not reduce the number of rows in a DataFrame because their primary purpose is to perform calculations over a defined window of rows while retaining the original row-level detail. Unlike groupBy aggregations, which collapse groups of rows into a single summary row, window functions add new columns to the existing DataFrame, where each new column contains the result of a calculation performed on a subset of rows (the 'window') that is somehow related to the current row. This allows you to enrich each row with contextual information without losing any of your original data.

#EXERCISE SET 8 — DAG & PERFORMANCE ANALYSIS

1. Run explain(True) for:
Simple select
Filter
GroupBy aggregation
Window function

### Explain(True) for Different Spark Operations

In [50]:
# Simple Select
print("\n--- Explain for Simple Select ---")
df_orders.select("order_id", "region", "food_item").explain(True)


--- Explain for Simple Select ---
== Parsed Logical Plan ==
'Project ['order_id, 'region, 'food_item]
+- Filter city#2 IN (Delhi,Mumbai)
   +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51, day(cast(order_date#5 as date)) AS order_day#115]
      +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51]
         +- Project [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_hundreds#51]
            +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
order_id: string, region: string, food_item: string
Project [order_id#0, region#1, food_item#4]
+- Filter city#2 IN (Delhi,Mumbai)
   +- Project

In [51]:
# Filter Operation
print("\n--- Explain for Filter Operation ---")
df_orders.filter(col("amount") > 400).explain(True)


--- Explain for Filter Operation ---
== Parsed Logical Plan ==
'Filter '`>`('amount, 400)
+- Filter city#2 IN (Delhi,Mumbai)
   +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51, day(cast(order_date#5 as date)) AS order_day#115]
      +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51]
         +- Project [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_hundreds#51]
            +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
order_id: string, order_date: string, region: string, city: string, restaurant_id: string, food_item: string, delivery_time_min: bigint, amount: bigint, amo

In [52]:
# GroupBy Aggregation
print("\n--- Explain for GroupBy Aggregation (revenue_per_region) ---")
revenue_per_region.explain(True)


--- Explain for GroupBy Aggregation (revenue_per_region) ---
== Parsed Logical Plan ==
'Aggregate ['region], ['region, 'sum('amount) AS total_revenue#417]
+- Filter city#2 IN (Delhi,Mumbai)
   +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51, day(cast(order_date#5 as date)) AS order_day#115]
      +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51]
         +- Project [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_hundreds#51]
            +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
region: string, total_revenue: bigint
Aggregate [region#1], [region#1, sum(amount#6L) AS to

In [53]:
# Window Function
print("\n--- Explain for Window Function (running_total_revenue) ---")
running_total_revenue.explain(True)


--- Explain for Window Function (running_total_revenue) ---
== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(running_total_revenue, 'sum('amount) windowspecdefinition('region, 'order_date ASC NULLS FIRST, unspecifiedframe$()), None)]
+- Filter city#2 IN (Delhi,Mumbai)
   +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51, day(cast(order_date#5 as date)) AS order_day#115]
      +- Project [order_id#0, order_date#5, region#1, city#2, restaurant_id#3, food_item#4, delivery_time_min#7L, amount#6L, amount_in_hundreds#51]
         +- Project [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_hundreds#51]
            +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==

2. Identify:
Exchange operators
Sort operations
Stage boundaries

Exchange Operators (Shuffles)

Purpose: Redistribute data across partitions; costly operation.
GroupBy Aggregation:
Exchange hashpartitioning(region, 200) → Shuffle by region to group records before HashAggregate.
Window Function:
Exchange hashpartitioning(region, 200) → Shuffle by region for Window.partitionBy("region").

Sort Operations

Purpose: Reorder data within partitions.
Triggered by: orderBy in window specs or global sort.
Example:
Sort [region ASC, order_date ASC] → After shuffle, sort within region for running total.

Stage Boundaries

Marked by: Exchange operators.
GroupBy Aggregation:

Stage 1: Scan, Project, Partial HashAggregate.
Stage 2: After shuffle → Final HashAggregate.


Window Function:

Stage 1: Scan.
Stage 2: After shuffle → Sort + Window calculations.

3. Explain why window functions require sorting

Order-Dependent Calculations:

Running Aggregates (sum, avg with frames): Need rows in correct order for cumulative totals.
Ranking Functions (rank(), row_number(), ntile()): Assign ranks based on row order.
Lag/Lead Functions: Depend on preceding/succeeding rows → requires consistent ordering.



Window Frame:

Defines rows relative to the current row (e.g., ROWS BETWEEN ...).
Only meaningful if rows are sorted.




Physical Plan Implications

Spark performs Sort after Exchange when ORDER BY is in window spec.
Ensures correct ordering within each partition before applying window logic.
Without sorting → results are non-deterministic or wrong.


Most Expensive Operations in Spark DAG

Shuffles (Exchange operators):

Data movement across network → highest cost.


Sort operations:

Reordering data → CPU/memory intensive.


Aggregations (post-shuffle):

Heavy on resources.


Cheap operations:

Simple Select and Filter (no shuffle/sort).

4. Identify expensive operations in each DAG

#EXERCISE SET 9 — THINKING QUESTIONS

1. Why does GroupBy introduce shuffle?

Answer: In Spark, groupBy aggregates data based on keys. To do this, all rows with the same key must end up on the same partition.
Since keys can be spread across multiple partitions, Spark shuffles data across the cluster so that identical keys are co-located.
Reason: Shuffle ensures correctness of aggregation but is expensive because it involves disk I/O, network transfer, and serialization.

2. Why does Window not reduce rows?

Answer: A window function computes values over a set of rows (a “window”) but does not collapse rows like groupBy does.
Each input row remains in the output; Spark just adds extra columns with computed values (e.g., rank, moving average).
Reason: Window functions are designed for analytics where you need row-level detail plus aggregated contex

3. Why does repartition always cause shuffle?

Answer: repartition(n) changes the number of partitions and redistributes data evenly across them.
To achieve even distribution, Spark must shuffle data across the cluster.
Reason: It’s a full shuffle operation because Spark cannot predict which partition each row should go to without redistributing.

4. Why is coalesce cheaper than repartition?

Answer: coalesce(n) reduces the number of partitions without a full shuffle by merging existing partitions.
It avoids moving data unnecessarily; only partitions that need merging are touched.
Reason: Coalesce is efficient for reducing partitions, but not for increasing them (which requires shuffle)

5. Why does Spark delay execution until an action?

Answer: Spark uses lazy evaluation for transformations (like map, filter, groupBy).
It builds a logical DAG of operations but does not execute until an action (like collect, count, save) is called.
Reason: This allows Spark to optimize the entire pipeline before execution (e.g., combine stages, minimize shuffles).

6. When would you avoid window functions?

Answer: Avoid window functions when:
Data is huge and window operations require sorting or shuffling (very expensive).
You only need aggregated results (use groupBy instead).
Performance is critical and simpler aggregations suffice.
Reason: Window functions often involve full partition sorting, which is costly in distributed systems.