#A food delivery company wants to improve delivery time, reduce cancellations, and understand which areas and restaurants are operationally weak.

In [80]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
    .appName("Food delivery")\
    .getOrCreate()


#PHASE 1 – Ingestion

## Read delivery_data.csv as all StringType

In [81]:

from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("restaurant_id", StringType(), True),
    StructField("restaurant_city", StringType(), True),
    StructField("delivery_city", StringType(), True),
    StructField("order_time", StringType(), True),
    StructField("pickup_time", StringType(), True),
    StructField("delivery_time", StringType(), True),
    StructField("delivery_status", StringType(), True),
    StructField("delivery_partner_id", StringType(), True),
    StructField("order_amount", StringType(), True),
    StructField("payment_mode", StringType(), True),
    StructField("rating", StringType(), True)
])


In [82]:
df = spark.read.csv(
    "delivery_data.csv",
    header=True,
    schema=schema
)

#Print schema

In [83]:
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- restaurant_id: string (nullable = true)
 |-- restaurant_city: string (nullable = true)
 |-- delivery_city: string (nullable = true)
 |-- order_time: string (nullable = true)
 |-- pickup_time: string (nullable = true)
 |-- delivery_time: string (nullable = true)
 |-- delivery_status: string (nullable = true)
 |-- delivery_partner_id: string (nullable = true)
 |-- order_amount: string (nullable = true)
 |-- payment_mode: string (nullable = true)
 |-- rating: string (nullable = true)



#  Print record count

In [84]:
df.count()

150000

# Show sample rows

In [85]:
df.show(10, truncate=False)

+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+
|order_id|customer_id|restaurant_id|restaurant_city|delivery_city|order_time         |pickup_time        |delivery_time      |delivery_status|delivery_partner_id|order_amount|payment_mode|rating|
+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+
|O100000 |C9467      |R972         |Bangalore      |Chennai      |29/01/2026 04:16:00|NULL               |NULL               |CANCELLED      |D505               |invalid     |COD         |NULL  |
|O100001 |C1081      |R675         |Pune           |Bangalore    |2026-01-29 17:48:00|2026-01-29 18:16:00|2026-01-29 18:16:00|PLACED         |D359               |304         |CreditCard  |NULL  |
|O100002 |C3136     

 # Identify obvious data issues

In [86]:
df.filter(df.pickup_time.isNull()).count()

13637

In [87]:
df.filter(df.delivery_time.isNull()).count()

11539

In [88]:
df.filter((df.delivery_status == "DELIVERED") & (df.delivery_time.isNull())).show(10, truncate=False)

+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------+---------------+-------------------+------------+------------+------+
|order_id|customer_id|restaurant_id|restaurant_city|delivery_city|order_time         |pickup_time        |delivery_time|delivery_status|delivery_partner_id|order_amount|payment_mode|rating|
+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------+---------------+-------------------+------------+------------+------+
|O100078 |C7435      |R541         |Kolkata        |Hyderabad    |2026-01-04 04:37:00|2026-01-04 04:47:00|NULL         |DELIVERED      |D180               |543         |DebitCard   |3.0   |
|O100104 |C9179      |R131         |Kolkata        |Bangalore    |2026-01-30 15:50:00|2026-01-30 16:18:00|NULL         |DELIVERED      |D598               |771         |COD         |2.8   |
|O100117 |C8727      |R688         |Chennai       

In [89]:
df.filter((df.delivery_status == "DELIVERED") & (df.delivery_time.isNull())).count()

2817

# PHASE 2 – Cleaning

In [90]:

from pyspark.sql.functions import (
    col, trim, when, regexp_replace, coalesce, to_timestamp, lit
)
from pyspark.sql.types import IntegerType


#  Trim all string columns

In [91]:
time_cols = ["order_time", "pickup_time", "delivery_time"]

In [92]:
string_cols = [name for (name, dtype) in df.dtypes if dtype == "string" and name not in time_cols]

In [93]:

for c in string_cols:
    df = df.withColumn(
        c,
        when(col(c).isNull() | (trim(col(c)) == ""), None).otherwise(trim(col(c)))
    )


#  Clean order_amount

## Remove commas

In [95]:
df = df.withColumn("order_amount_nocommas", regexp_replace(col("order_amount"), ",", ""))

## Convert to IntegerType

In [96]:

df = df.withColumn(
    "order_amount_clean",
    when(col("order_amount_nocommas").rlike(r"^\d+$"), col("order_amount_nocommas").cast(IntegerType()))
    .otherwise(lit(None).cast(IntegerType()))
)


## Handle invalid values safely

In [97]:

df = df.withColumn(
    "order_amount_clean",
    when(col("order_amount_clean") < 0, None).otherwise(col("order_amount_clean"))
)


In [98]:
df = df.withColumn("order_amount_valid", col("order_amount_clean").isNotNull())

In [99]:
df = df.drop("order_amount_nocommas")

In [100]:
df.count()

150000

# Parse timestamps into:
*   order_time_clean
*   pickup_time_clean
*   delivery_time_clean

In [102]:

from pyspark.sql.functions import (
    try_to_timestamp, upper
)


In [103]:
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- restaurant_id: string (nullable = true)
 |-- restaurant_city: string (nullable = true)
 |-- delivery_city: string (nullable = true)
 |-- order_time: string (nullable = true)
 |-- pickup_time: string (nullable = true)
 |-- delivery_time: string (nullable = true)
 |-- delivery_status: string (nullable = true)
 |-- delivery_partner_id: string (nullable = true)
 |-- order_amount: string (nullable = true)
 |-- payment_mode: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- order_amount_clean: integer (nullable = true)
 |-- order_amount_valid: boolean (nullable = false)



# Support multiple formats:
*   yyyy-MM-dd HH:mm:ss
*   dd/MM/yyyy HH:mm:ss
*   yyyy/MM/dd HH:mm:ss

# Keep original columns for audit

In [104]:
from pyspark.sql.functions import col, expr

formats = [
    "yyyy-MM-dd HH:mm:ss",
    "dd/MM/yyyy HH:mm:ss",
    "yyyy/MM/dd HH:mm:ss"
]

In [105]:
def parse_multi_format_ts_expr(colname):
    norm = f"""
      CASE
        WHEN {colname} IS NULL THEN NULL
        WHEN TRIM({colname}) = '' THEN NULL
        WHEN UPPER(TRIM({colname})) = 'NULL' THEN NULL
        ELSE TRIM({colname})
      END
    """
    tries = ", ".join([f"try_to_timestamp({norm}, '{fmt}')" for fmt in formats])
    return expr(f"coalesce({tries})")

In [106]:
df = df.withColumn("order_time_clean",    parse_multi_format_ts_expr("order_time")) \
       .withColumn("pickup_time_clean",   parse_multi_format_ts_expr("pickup_time")) \
       .withColumn("delivery_time_clean", parse_multi_format_ts_expr("delivery_time"))

# Create flags:
*   order_time_valid
*   pickup_time_valid  
*   delivery_time_valid

In [107]:
df = df.withColumn("order_time_valid",     col("order_time_clean").isNotNull()) \
       .withColumn("pickup_time_valid",    col("pickup_time_clean").isNotNull()) \
       .withColumn("delivery_time_valid",  col("delivery_time_clean").isNotNull())

In [108]:
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- restaurant_id: string (nullable = true)
 |-- restaurant_city: string (nullable = true)
 |-- delivery_city: string (nullable = true)
 |-- order_time: string (nullable = true)
 |-- pickup_time: string (nullable = true)
 |-- delivery_time: string (nullable = true)
 |-- delivery_status: string (nullable = true)
 |-- delivery_partner_id: string (nullable = true)
 |-- order_amount: string (nullable = true)
 |-- payment_mode: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- order_amount_clean: integer (nullable = true)
 |-- order_amount_valid: boolean (nullable = false)
 |-- order_time_clean: timestamp (nullable = true)
 |-- pickup_time_clean: timestamp (nullable = true)
 |-- delivery_time_clean: timestamp (nullable = true)
 |-- order_time_valid: boolean (nullable = false)
 |-- pickup_time_valid: boolean (nullable = false)
 |-- delivery_time_valid: boolean (nullable = false)



In [109]:

df.select(
    "order_id",
    "order_time", "order_time_clean", "order_time_valid",
    "pickup_time", "pickup_time_clean", "pickup_time_valid",
    "delivery_time", "delivery_time_clean", "delivery_time_valid"
).show(20, truncate=False)

print("Order time invalid:", df.filter(~col("order_time_valid")).count())
print("Pickup time invalid:", df.filter(~col("pickup_time_valid")).count())
print("Delivery time invalid:", df.filter(~col("delivery_time_valid")).count())


+--------+-------------------+-------------------+----------------+-------------------+-------------------+-----------------+-------------------+-------------------+-------------------+
|order_id|order_time         |order_time_clean   |order_time_valid|pickup_time        |pickup_time_clean  |pickup_time_valid|delivery_time      |delivery_time_clean|delivery_time_valid|
+--------+-------------------+-------------------+----------------+-------------------+-------------------+-----------------+-------------------+-------------------+-------------------+
|O100000 |29/01/2026 04:16:00|2026-01-29 04:16:00|true            |NULL               |NULL               |false            |NULL               |NULL               |false              |
|O100001 |2026-01-29 17:48:00|2026-01-29 17:48:00|true            |2026-01-29 18:16:00|2026-01-29 18:16:00|true             |2026-01-29 18:16:00|2026-01-29 18:16:00|true               |
|O100002 |2026-01-31 16:26:00|2026-01-31 16:26:00|true            |202

# PHASE 3 – Derived Metrics

In [110]:
from pyspark.sql.functions import col, when, lit, round as spark_round

def minutes_diff(end_col, start_col):
    return (col(end_col).cast("long") - col(start_col).cast("long")) / lit(60.0)

# Create time-based metrics:   
*   prep_time    = pickup_time - order_time
*   delivery_time_gap = delivery_time - pickup_time
*   total_fulfillment_time = delivery_time - order_time

In [116]:
df = df.withColumn(
    "prep_time_min",
    when(col("pickup_time_clean").isNotNull() & col("order_time_clean").isNotNull(),
         minutes_diff("pickup_time_clean", "order_time_clean")
    ).otherwise(lit(None).cast("double"))
)

In [117]:
df = df.withColumn(
    "delivery_time_gap_min",
    when(col("delivery_time_clean").isNotNull() & col("pickup_time_clean").isNotNull(),
         minutes_diff("delivery_time_clean", "pickup_time_clean")
    ).otherwise(lit(None).cast("double"))
)

In [119]:
df.select(
    "order_id",
    "order_time_clean", "pickup_time_clean", "delivery_time_clean",
    "prep_time_min", "delivery_time_gap_min", "total_fulfillment_time_min",
    "prep_time_negative", "delivery_time_gap_negative", "total_fulfillment_time_negative"
).show(20, truncate=False)


+--------+-------------------+-------------------+-------------------+-------------+---------------------+--------------------------+------------------+--------------------------+-------------------------------+
|order_id|order_time_clean   |pickup_time_clean  |delivery_time_clean|prep_time_min|delivery_time_gap_min|total_fulfillment_time_min|prep_time_negative|delivery_time_gap_negative|total_fulfillment_time_negative|
+--------+-------------------+-------------------+-------------------+-------------+---------------------+--------------------------+------------------+--------------------------+-------------------------------+
|O100000 |2026-01-29 04:16:00|NULL               |NULL               |NULL         |NULL                 |NULL                      |false             |false                     |false                          |
|O100001 |2026-01-29 17:48:00|2026-01-29 18:16:00|2026-01-29 18:16:00|28.0         |0.0                  |28.0                      |false             |

# PHASE 4 – Data Validation

# Count rows where:
*   pickup_time < order_time
*   delivery_time < pickup_time

In [120]:
pickup_before_order = df.filter(
    col("pickup_time_clean").isNotNull() &
    col("order_time_clean").isNotNull() &
    (col("pickup_time_clean") < col("order_time_clean"))
).count()

In [121]:
delivery_before_pickup = df.filter(
    col("delivery_time_clean").isNotNull() &
    col("pickup_time_clean").isNotNull() &
    (col("delivery_time_clean") < col("pickup_time_clean"))
).count()

# Count rows where delivery_status = DELIVERED but delivery_time is null

In [122]:
delivered_without_delivery_time = df.filter(
    (col("delivery_status") == "DELIVERED") &
    (col("delivery_time_clean").isNull())
).count()

In [123]:
delivered_without_delivery_time

2817

#  Count rows with invalid amounts

In [124]:
invalid_amount_rows = df.filter(~col("order_amount_valid")).count()

In [125]:
invalid_amount_rows

14073

#  Count CANCELLED orders

In [126]:
cancelled_orders = df.filter(col("delivery_status") == "CANCELLED").count()

In [127]:
cancelled_orders

37658

# OPERATIONAL DATA QUALITY REPORT

In [61]:
quality_report = {
    "pickup_before_order_time_rows": pickup_before_order,
    "delivery_before_pickup_time_rows": delivery_before_pickup,
    "delivered_but_no_delivery_time_rows": delivered_without_delivery_time,
    "invalid_order_amount_rows": invalid_amount_rows,
    "cancelled_orders_rows": cancelled_orders
}

print("\n=== OPERATIONAL DATA QUALITY REPORT ===")
for k, v in quality_report.items():
    print(f"{k}: {v}")


=== OPERATIONAL DATA QUALITY REPORT ===
pickup_before_order_time_rows: 19095
delivery_before_pickup_time_rows: 17650
delivered_but_no_delivery_time_rows: 2817
invalid_order_amount_rows: 14073
cancelled_orders_rows: 37658


# PHASE 5 – Operational Analytics

# Average prep_time per restaurant

In [128]:
from pyspark.sql.functions import col, avg, count, sum as spark_sum, round as spark_round

prep_valid_filter = col("prep_time_min").isNotNull() & ~col("prep_time_negative")
gap_valid_filter = col("delivery_time_gap_min").isNotNull() & ~col("delivery_time_gap_negative")
total_valid_filter = col("total_fulfillment_time_min").isNotNull() & ~col("total_fulfillment_time_negative")

In [129]:
avg_prep_per_restaurant = (
    df.filter(prep_valid_filter)
      .groupBy("restaurant_id", "restaurant_city")
      .agg(
          spark_round(avg(col("prep_time_min")), 2).alias("avg_prep_time_min"),
          count(lit(1)).alias("orders_count")
      )
      .orderBy(col("avg_prep_time_min").desc())
)

avg_prep_per_restaurant.show(20, truncate=False)

+-------------+---------------+-----------------+------------+
|restaurant_id|restaurant_city|avg_prep_time_min|orders_count|
+-------------+---------------+-----------------+------------+
|R597         |Kolkata        |24.09            |11          |
|R531         |Pune           |24.0             |8           |
|R568         |Delhi          |23.12            |17          |
|R280         |Delhi          |22.87            |15          |
|R618         |Kolkata        |22.47            |15          |
|R622         |Mumbai         |22.35            |17          |
|R924         |Chennai        |22.27            |22          |
|R583         |Pune           |22.0             |16          |
|R725         |Kolkata        |22.0             |14          |
|R539         |Delhi          |21.72            |18          |
|R885         |Pune           |21.64            |14          |
|R822         |Hyderabad      |21.5             |10          |
|R841         |Chennai        |21.36            |14    

#  Average delivery_time_gap per delivery city

In [131]:
avg_gap_per_city = (
    df.filter(gap_valid_filter)
      .groupBy("delivery_city")
      .agg(
          spark_round(avg(col("delivery_time_gap_min")), 2).alias("avg_delivery_time_gap_min"),
          count(lit(1)).alias("orders_count")
      )
      .orderBy(col("avg_delivery_time_gap_min").desc())
)

avg_gap_per_city.show(20, truncate=False)

+-------------+-------------------------+------------+
|delivery_city|avg_delivery_time_gap_min|orders_count|
+-------------+-------------------------+------------+
|Mumbai       |30.16                    |15259       |
|Delhi        |30.12                    |15527       |
|Kolkata      |30.11                    |15437       |
|Pune         |30.1                     |15358       |
|Bangalore    |30.02                    |15628       |
|Chennai      |30.02                    |15748       |
|Hyderabad    |29.91                    |15266       |
+-------------+-------------------------+------------+



# Top 10 slowest restaurants

In [132]:
slowest_restaurants_top10 = (
    df.filter(total_valid_filter)
      .groupBy("restaurant_id", "restaurant_city")
      .agg(
          spark_round(avg(col("total_fulfillment_time_min")), 2).alias("avg_total_fulfillment_time_min"),
          count(lit(1)).alias("orders_count")
      )
      .orderBy(col("avg_total_fulfillment_time_min").desc())
      .limit(10)
)

slowest_restaurants_top10.show(10, truncate=False)

+-------------+---------------+------------------------------+------------+
|restaurant_id|restaurant_city|avg_total_fulfillment_time_min|orders_count|
+-------------+---------------+------------------------------+------------+
|R614         |Chennai        |57.23                         |13          |
|R871         |Hyderabad      |56.47                         |17          |
|R769         |Delhi          |55.88                         |16          |
|R792         |Chennai        |55.82                         |17          |
|R977         |Bangalore      |55.8                          |20          |
|R904         |Mumbai         |55.0                          |18          |
|R638         |Hyderabad      |54.71                         |17          |
|R684         |Bangalore      |54.69                         |16          |
|R839         |Kolkata        |54.63                         |16          |
|R252         |Kolkata        |54.56                         |16          |
+-----------

# Cities with highest average delivery time

In [133]:
cities_highest_avg_total = (
    df.filter(total_valid_filter)
      .groupBy("delivery_city")
      .agg(
          spark_round(avg(col("total_fulfillment_time_min")), 2).alias("avg_total_fulfillment_time_min"),
          count(lit(1)).alias("orders_count")
      )
      .orderBy(col("avg_total_fulfillment_time_min").desc())
)

cities_highest_avg_total.show(20, truncate=False)

+-------------+------------------------------+------------+
|delivery_city|avg_total_fulfillment_time_min|orders_count|
+-------------+------------------------------+------------+
|Mumbai       |39.82                         |18653       |
|Kolkata      |39.71                         |18814       |
|Bangalore    |39.7                          |18997       |
|Pune         |39.68                         |18822       |
|Delhi        |39.66                         |18873       |
|Hyderabad    |39.65                         |18592       |
|Chennai      |39.56                         |19204       |
+-------------+------------------------------+------------+



#  Cancellation percentage per city

In [134]:
from pyspark.sql.functions import when, lit

city_cancel_stats = (
    df.groupBy("delivery_city")
      .agg(
          count(lit(1)).alias("total_orders"),
          spark_sum(when(col("delivery_status") == "CANCELLED", 1).otherwise(0)).alias("cancelled_orders")
      )
      .withColumn(
          "cancellation_pct",
          spark_round((col("cancelled_orders") / col("total_orders")) * 100.0, 2)
      )
      .orderBy(col("cancellation_pct").desc(), col("total_orders").desc())
)

city_cancel_stats.show(50, truncate=False)


+-------------+------------+----------------+----------------+
|delivery_city|total_orders|cancelled_orders|cancellation_pct|
+-------------+------------+----------------+----------------+
|Kolkata      |21401       |5488            |25.64           |
|Chennai      |21742       |5499            |25.29           |
|Mumbai       |21192       |5354            |25.26           |
|Hyderabad    |21173       |5343            |25.23           |
|Delhi        |21538       |5390            |25.03           |
|Pune         |21409       |5319            |24.84           |
|Bangalore    |21545       |5265            |24.44           |
+-------------+------------+----------------+----------------+



# PHASE 6 – Window Functions

# Rank restaurants by:
*   Average fulfillment time
*   Cancellation rate

In [135]:
from pyspark.sql import Window
from pyspark.sql.functions import (
   lit, dense_rank
)

In [136]:
restaurant_time_agg = (
    df.filter(total_valid_filter)
      .groupBy("restaurant_id", "restaurant_city")
      .agg(
          spark_round(avg(col("total_fulfillment_time_min")), 2).alias("avg_total_fulfillment_time_min"),
          count(lit(1)).alias("orders_with_valid_time")
      )
)

In [137]:
restaurant_cancel_agg = (
    df.groupBy("restaurant_id", "restaurant_city")
      .agg(
          count(lit(1)).alias("total_orders"),
          spark_sum(when(col("delivery_status") == "CANCELLED", 1).otherwise(0)).alias("cancelled_orders")
      )
      .withColumn(
          "cancellation_rate_pct",
          when(col("total_orders") > 0, (col("cancelled_orders") / col("total_orders")) * 100.0).otherwise(lit(None).cast("double"))
      )
      .withColumn("cancellation_rate_pct", spark_round(col("cancellation_rate_pct"), 2))
)

In [138]:
restaurants_ranked = (
    restaurant_time_agg.join(restaurant_cancel_agg, on=["restaurant_id", "restaurant_city"], how="outer")
)

In [139]:
w_avg_fulfillment = Window.orderBy(col("avg_total_fulfillment_time_min").asc())
w_cancel_rate     = Window.orderBy(col("cancellation_rate_pct").asc())

restaurants_ranked = (
    restaurants_ranked
      .withColumn("rank_by_avg_fulfillment", dense_rank().over(w_avg_fulfillment))
      .withColumn("rank_by_cancellation_rate", dense_rank().over(w_cancel_rate))
)

restaurants_ranked.select(
    "restaurant_id", "restaurant_city",
    "avg_total_fulfillment_time_min", "orders_with_valid_time",
    "total_orders", "cancelled_orders", "cancellation_rate_pct",
    "rank_by_avg_fulfillment", "rank_by_cancellation_rate"
).orderBy(col("rank_by_avg_fulfillment").asc()).show(20, truncate=False)


+-------------+---------------+------------------------------+----------------------+------------+----------------+---------------------+-----------------------+-------------------------+
|restaurant_id|restaurant_city|avg_total_fulfillment_time_min|orders_with_valid_time|total_orders|cancelled_orders|cancellation_rate_pct|rank_by_avg_fulfillment|rank_by_cancellation_rate|
+-------------+---------------+------------------------------+----------------------+------------+----------------+---------------------+-----------------------+-------------------------+
|R105         |Kolkata        |22.64                         |14                    |17          |6               |35.29                |1                      |147                      |
|R171         |Hyderabad      |22.64                         |14                    |14          |6               |42.86                |1                      |177                      |
|R116         |Pune           |24.0                         

# Rank delivery partners by:
*   Fastest average delivery
*   Highest ratings

In [140]:
partner_delivery_speed = (
    df.filter(gap_valid_filter)
      .groupBy("delivery_partner_id")
      .agg(
          spark_round(avg(col("delivery_time_gap_min")), 2).alias("avg_delivery_gap_min"),
          count(lit(1)).alias("orders_count_for_speed")
      )
)

In [141]:
partner_ratings = (
    df.filter((col("delivery_status") == "DELIVERED") & col("rating").isNotNull())
      .withColumn("rating_double", col("rating").cast("double"))
      .groupBy("delivery_partner_id")
      .agg(
          spark_round(avg(col("rating_double")), 2).alias("avg_rating"),
          count(lit(1)).alias("ratings_count")
      )
)

In [142]:
partners_joined = partner_delivery_speed.join(partner_ratings, on="delivery_partner_id", how="outer")

w_partner_speed  = Window.orderBy(col("avg_delivery_gap_min").asc())
w_partner_rating = Window.orderBy(col("avg_rating").desc())

partners_ranked = (
    partners_joined
      .withColumn("rank_fastest_delivery", dense_rank().over(w_partner_speed))
      .withColumn("rank_highest_rating",   dense_rank().over(w_partner_rating))
)

In [143]:
partners_ranked.select(
    "delivery_partner_id",
    "avg_delivery_gap_min", "orders_count_for_speed",
    "avg_rating", "ratings_count",
    "rank_fastest_delivery", "rank_highest_rating"
).orderBy(col("rank_fastest_delivery").asc(), col("rank_highest_rating").asc()).show(20, truncate=False)

+-------------------+--------------------+----------------------+----------+-------------+---------------------+-------------------+
|delivery_partner_id|avg_delivery_gap_min|orders_count_for_speed|avg_rating|ratings_count|rank_fastest_delivery|rank_highest_rating|
+-------------------+--------------------+----------------------+----------+-------------+---------------------+-------------------+
|D267               |24.54               |119                   |3.21      |48           |1                    |25                 |
|D417               |24.69               |104                   |3.36      |34           |2                    |10                 |
|D246               |25.06               |122                   |3.02      |44           |3                    |44                 |
|D570               |25.38               |130                   |2.86      |38           |4                    |60                 |
|D111               |25.95               |112                   |3.01

 # Identify top 3 and bottom 3 per city

In [144]:
city_restaurant_perf = (
    df.filter(total_valid_filter & col("delivery_city").isNotNull())
      .groupBy("delivery_city", "restaurant_id", "restaurant_city")
      .agg(
          spark_round(avg(col("total_fulfillment_time_min")), 2).alias("avg_total_fulfillment_time_min"),
          count(lit(1)).alias("orders_with_valid_time")
      )
)

In [145]:
w_city_fast  = Window.partitionBy("delivery_city").orderBy(col("avg_total_fulfillment_time_min").asc())
w_city_slow  = Window.partitionBy("delivery_city").orderBy(col("avg_total_fulfillment_time_min").desc())

top3_per_city = city_restaurant_perf.withColumn("rank_in_city_fast", dense_rank().over(w_city_fast)) \
                                    .filter(col("rank_in_city_fast") <= 3) \
                                    .orderBy(col("delivery_city").asc(), col("rank_in_city_fast").asc())

bottom3_per_city = city_restaurant_perf.withColumn("rank_in_city_slow", dense_rank().over(w_city_slow)) \
                                       .filter(col("rank_in_city_slow") <= 3) \
                                       .orderBy(col("delivery_city").asc(), col("rank_in_city_slow").asc())

In [148]:
top3_per_city.select(
    "delivery_city", "restaurant_id", "restaurant_city",
    "avg_total_fulfillment_time_min", "orders_with_valid_time", "rank_in_city_fast"
).show(100, truncate=False)

+-------------+-------------+---------------+------------------------------+----------------------+-----------------+
|delivery_city|restaurant_id|restaurant_city|avg_total_fulfillment_time_min|orders_with_valid_time|rank_in_city_fast|
+-------------+-------------+---------------+------------------------------+----------------------+-----------------+
|Bangalore    |R721         |Kolkata        |0.0                           |1                     |1                |
|Bangalore    |R995         |Hyderabad      |0.0                           |1                     |1                |
|Bangalore    |R548         |Bangalore      |0.0                           |1                     |1                |
|Bangalore    |R322         |Delhi          |0.0                           |1                     |1                |
|Bangalore    |R535         |Mumbai         |0.0                           |1                     |1                |
|Bangalore    |R414         |Kolkata        |0.5        

In [149]:

bottom3_per_city.select(
    "delivery_city", "restaurant_id", "restaurant_city",
    "avg_total_fulfillment_time_min", "orders_with_valid_time", "rank_in_city_slow"
).show(100, truncate=False)

+-------------+-------------+---------------+------------------------------+----------------------+-----------------+
|delivery_city|restaurant_id|restaurant_city|avg_total_fulfillment_time_min|orders_with_valid_time|rank_in_city_slow|
+-------------+-------------+---------------+------------------------------+----------------------+-----------------+
|Bangalore    |R259         |Bangalore      |89.0                          |1                     |1                |
|Bangalore    |R416         |Kolkata        |87.0                          |1                     |2                |
|Bangalore    |R966         |Pune           |87.0                          |1                     |2                |
|Bangalore    |R743         |Kolkata        |87.0                          |1                     |2                |
|Bangalore    |R997         |Pune           |86.0                          |1                     |3                |
|Chennai      |R296         |Bangalore      |88.0       

# PHASE 7 – Customer Experience

In [150]:

from pyspark.sql.functions import col

delivered = (
    df.filter((col("delivery_status") == "DELIVERED") & col("rating").isNotNull())).withColumn("rating_double", col("rating").cast("double"))

delivered_total = delivered.filter(
    col("total_fulfillment_time_min").isNotNull() & ~col("total_fulfillment_time_negative")
)

delivered_gap = delivered.filter(
    col("delivery_time_gap_min").isNotNull() & ~col("delivery_time_gap_negative")
)


# Average rating per restaurant

In [151]:

from pyspark.sql.functions import avg, round as spark_round, count, desc

avg_rating_per_restaurant = (
    delivered.groupBy("restaurant_id", "restaurant_city")
    .agg(
        spark_round(avg("rating_double"), 2).alias("avg_rating"),
        count("*").alias("delivered_count")
    )
    .orderBy(desc("avg_rating"), desc("delivered_count"))
)

avg_rating_per_restaurant.show(30, truncate=False)


+-------------+---------------+----------+---------------+
|restaurant_id|restaurant_city|avg_rating|delivered_count|
+-------------+---------------+----------+---------------+
|R660         |Chennai        |4.95      |2              |
|R641         |Delhi          |4.9       |1              |
|R768         |Chennai        |4.9       |1              |
|R789         |Hyderabad      |4.9       |1              |
|R806         |Hyderabad      |4.9       |1              |
|R420         |Pune           |4.9       |1              |
|R864         |Pune           |4.9       |1              |
|R746         |Hyderabad      |4.85      |2              |
|R209         |Hyderabad      |4.8       |2              |
|R922         |Mumbai         |4.8       |1              |
|R486         |Bangalore      |4.8       |1              |
|R514         |Bangalore      |4.8       |1              |
|R166         |Pune           |4.7       |2              |
|R961         |Delhi          |4.7       |1             

# Relationship between:
*   Delivery time
*   Rating

In [152]:
from pyspark.sql.functions import corr

delivered_total.agg(
    corr("total_fulfillment_time_min", "rating_double").alias("corr_total_vs_rating")
).show()

delivered_gap.agg(
    corr("delivery_time_gap_min", "rating_double").alias("corr_gap_vs_rating")
).show()


+--------------------+
|corr_total_vs_rating|
+--------------------+
|-6.13801103559660...|
+--------------------+

+--------------------+
|  corr_gap_vs_rating|
+--------------------+
|-0.00185129337774...|
+--------------------+



#  Identify if slower deliveries reduce ratings

In [153]:

q25, q50, q75 = delivered_total.approxQuantile("total_fulfillment_time_min", [0.25, 0.5, 0.75], 0.01)


fast_vs_slow = (
    delivered_total
    .withColumn(
        "speed_bucket",
        when(col("total_fulfillment_time_min") <= q25, "FAST (<= P25)")
        .when(col("total_fulfillment_time_min") >= q75, "SLOW (>= P75)")
        .otherwise(None)
    )
    .filter(col("speed_bucket").isNotNull())
    .groupBy("speed_bucket")
    .agg(
        spark_round(avg("rating_double"), 2).alias("avg_rating"),
        count("*").alias("orders")
    )
    .orderBy("speed_bucket")
)

fast_vs_slow.show(truncate=False)


+-------------+----------+------+
|speed_bucket |avg_rating|orders|
+-------------+----------+------+
|FAST (<= P25)|3.0       |8686  |
|SLOW (>= P75)|2.99      |8618  |
+-------------+----------+------+



# PHASE 8 – Performance Engineering

# Cache the cleaned DataFrame

In [159]:
df_cached = df.cache()

# Use explain(True) on:
*   Restaurant ranking query
*   City delay analysis query

In [155]:

restaurant_time_agg = (
    df_cached.filter(total_valid_filter)
             .groupBy("restaurant_id", "restaurant_city")
             .agg(
                 spark_round(avg(col("total_fulfillment_time_min")), 2).alias("avg_total_fulfillment_time_min"),
                 count("*").alias("orders_with_valid_time")
             )
)

restaurant_cancel_agg = (
    df_cached.groupBy("restaurant_id", "restaurant_city")
             .agg(
                 count("*").alias("total_orders"),
                 spark_sum(when(col("delivery_status") == "CANCELLED", 1).otherwise(0)).alias("cancelled_orders")
             )
             .withColumn("cancellation_rate_pct",
                         spark_round((col("cancelled_orders") / col("total_orders")) * 100.0, 2))
)

restaurants_ranked_q = (
    restaurant_time_agg.join(restaurant_cancel_agg, ["restaurant_id", "restaurant_city"], "outer")
                       .withColumn("rank_by_avg_fulfillment",
                                   dense_rank().over(Window.orderBy(col("avg_total_fulfillment_time_min").asc())))
                       .withColumn("rank_by_cancellation_rate",
                                   dense_rank().over(Window.orderBy(col("cancellation_rate_pct").asc())))
)

avg_gap_per_city_q = (
    df_cached.filter(gap_valid_filter)
             .groupBy("delivery_city")
             .agg(
                 spark_round(avg(col("delivery_time_gap_min")), 2).alias("avg_delivery_time_gap_min"),
                 count("*").alias("orders_count")
             )
)



In [156]:
print("Restaurant Ranking Query")
restaurants_ranked_q.explain(True)

Restaurant Ranking Query
== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(rank_by_cancellation_rate, 'dense_rank() windowspecdefinition('cancellation_rate_pct ASC NULLS FIRST, unspecifiedframe$()), None)]
+- Project [restaurant_id#8745, restaurant_city#8746, avg_total_fulfillment_time_min#8640, orders_with_valid_time#8641L, total_orders#8671L, cancelled_orders#8672L, cancellation_rate_pct#8702, rank_by_avg_fulfillment#8747]
   +- Project [restaurant_id#8745, restaurant_city#8746, avg_total_fulfillment_time_min#8640, orders_with_valid_time#8641L, total_orders#8671L, cancelled_orders#8672L, cancellation_rate_pct#8702, rank_by_avg_fulfillment#8747, rank_by_avg_fulfillment#8747]
      +- Window [dense_rank(avg_total_fulfillment_time_min#8640) windowspecdefinition(avg_total_fulfillment_time_min#8640 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rank_by_avg_fulfillment#8747], [avg_total_fulfillment_time_min#8640 ASC NULLS FIRST]
      

In [157]:
print("City Delay Analysis Query")
avg_gap_per_city_q.explain(True)

City Delay Analysis Query
== Parsed Logical Plan ==
'Aggregate ['delivery_city], ['delivery_city, 'round('avg('delivery_time_gap_min), 2) AS avg_delivery_time_gap_min#8765, 'count(*) AS orders_count#8766]
+- Filter (isnotnull(delivery_time_gap_min#6971) AND NOT delivery_time_gap_negative#6968)
   +- Project [order_id#6811, customer_id#6812, restaurant_id#6813, restaurant_city#6814, delivery_city#6815, order_time#6624, pickup_time#6625, delivery_time#6626, delivery_status#6816, delivery_partner_id#6817, order_amount#6818, payment_mode#6819, rating#6820, order_amount_clean#6824, order_amount_valid#6825, order_time_clean#6845, pickup_time_clean#6846, delivery_time_clean#6847, order_time_valid#6848, pickup_time_valid#6849, delivery_time_valid#6850, prep_time_min#6970, CASE WHEN (isnotnull(delivery_time_clean#6847) AND isnotnull(pickup_time_clean#6846)) THEN (cast((cast(delivery_time_clean#6847 as bigint) - cast(pickup_time_clean#6846 as bigint)) as double) / cast(60.0 as double)) ELSE cast

In [158]:
df_cached.unpersist()

DataFrame[order_id: string, customer_id: string, restaurant_id: string, restaurant_city: string, delivery_city: string, order_time: string, pickup_time: string, delivery_time: string, delivery_status: string, delivery_partner_id: string, order_amount: string, payment_mode: string, rating: string, order_amount_clean: int, order_amount_valid: boolean, order_time_clean: timestamp, pickup_time_clean: timestamp, delivery_time_clean: timestamp, order_time_valid: boolean, pickup_time_valid: boolean, delivery_time_valid: boolean, prep_time_min: double, delivery_time_gap_min: double, total_fulfillment_time_min: double, prep_time_negative: boolean, delivery_time_gap_negative: boolean, total_fulfillment_time_negative: boolean]

#  Identify shuffle stages


*   Exchange SinglePartition
*   SortMergeJoin
*   Sort
*   Exchange hashpartitioning
*   HashAggregate + Exchange
*   Exchange
*   Window + Exchange (due to global window ordering)

 # Repartition by delivery_city.

In [160]:
df_city = df_cached.repartition("delivery_city").cache()

In [161]:

restaurant_time_agg_after = (
    df_city.filter(total_valid_filter)
           .groupBy("restaurant_id", "restaurant_city")
           .agg(
               spark_round(avg(col("total_fulfillment_time_min")), 2).alias("avg_total_fulfillment_time_min"),
               count("*").alias("orders_with_valid_time")
           )
)

restaurant_cancel_agg_after = (
    df_city.groupBy("restaurant_id", "restaurant_city")
           .agg(
               count("*").alias("total_orders"),
               spark_sum(when(col("delivery_status") == "CANCELLED", 1).otherwise(0)).alias("cancelled_orders")
           )
           .withColumn("cancellation_rate_pct",
                       spark_round((col("cancelled_orders") / col("total_orders")) * 100.0, 2))
)

restaurants_ranked_after = (
    restaurant_time_agg_after.join(restaurant_cancel_agg_after, ["restaurant_id", "restaurant_city"], "outer")
                             .withColumn("rank_by_avg_fulfillment",
                                         dense_rank().over(Window.orderBy(col("avg_total_fulfillment_time_min").asc())))
                             .withColumn("rank_by_cancellation_rate",
                                         dense_rank().over(Window.orderBy(col("cancellation_rate_pct").asc())))
)

avg_gap_per_city_after = (
    df_city.filter(gap_valid_filter)
           .groupBy("delivery_city")
           .agg(
               spark_round(avg(col("delivery_time_gap_min")), 2).alias("avg_delivery_time_gap_min"),
               count("*").alias("orders_count")
           )
)


# Compare plans before and after

In [162]:

print("(Restaurant Ranking) — AFTER")
restaurants_ranked_after.explain(True)



(Restaurant Ranking) — AFTER
== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(rank_by_cancellation_rate, 'dense_rank() windowspecdefinition('cancellation_rate_pct ASC NULLS FIRST, unspecifiedframe$()), None)]
+- Project [restaurant_id#10819, restaurant_city#10820, avg_total_fulfillment_time_min#10714, orders_with_valid_time#10715L, total_orders#10745L, cancelled_orders#10746L, cancellation_rate_pct#10776, rank_by_avg_fulfillment#10821]
   +- Project [restaurant_id#10819, restaurant_city#10820, avg_total_fulfillment_time_min#10714, orders_with_valid_time#10715L, total_orders#10745L, cancelled_orders#10746L, cancellation_rate_pct#10776, rank_by_avg_fulfillment#10821, rank_by_avg_fulfillment#10821]
      +- Window [dense_rank(avg_total_fulfillment_time_min#10714) windowspecdefinition(avg_total_fulfillment_time_min#10714 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rank_by_avg_fulfillment#10821], [avg_total_fulfillment_time_min#1071

In [163]:

print("EXPLAIN (City Delay) — AFTER")
avg_gap_per_city_after.explain(True)

EXPLAIN (City Delay) — AFTER
== Parsed Logical Plan ==
'Aggregate ['delivery_city], ['delivery_city, 'round('avg('delivery_time_gap_min), 2) AS avg_delivery_time_gap_min#10839, 'count(*) AS orders_count#10840]
+- Filter (isnotnull(delivery_time_gap_min#6971) AND NOT delivery_time_gap_negative#6968)
   +- RepartitionByExpression [delivery_city#6815]
      +- Project [order_id#6811, customer_id#6812, restaurant_id#6813, restaurant_city#6814, delivery_city#6815, order_time#6624, pickup_time#6625, delivery_time#6626, delivery_status#6816, delivery_partner_id#6817, order_amount#6818, payment_mode#6819, rating#6820, order_amount_clean#6824, order_amount_valid#6825, order_time_clean#6845, pickup_time_clean#6846, delivery_time_clean#6847, order_time_valid#6848, pickup_time_valid#6849, delivery_time_valid#6850, prep_time_min#6970, CASE WHEN (isnotnull(delivery_time_clean#6847) AND isnotnull(pickup_time_clean#6846)) THEN (cast((cast(delivery_time_clean#6847 as bigint) - cast(pickup_time_clean#68

# PHASE 9 – RDD

# Convert delivered orders to RDD

In [164]:

delivered_paid_df = df.filter(
    (col("delivery_status") == "DELIVERED") &
    col("order_amount_clean").isNotNull()
)

delivered_rdd = delivered_paid_df.rdd


# Compute:
*   Total revenue using reduce
*   Order count per city using map-reduce

In [165]:
amounts_rdd = delivered_rdd.map(lambda row: int(row["order_amount_clean"]))

if amounts_rdd.isEmpty():
    total_revenue = 0
else:
    total_revenue = amounts_rdd.reduce(lambda a, b: a + b)

total_revenue


50591938

In [166]:

city_counts_rdd = (
    delivered_rdd
    .map(lambda row: (row["delivery_city"], 1))
    .reduceByKey(lambda a, b: a + b)
)

city_counts = city_counts_rdd.collect()
print("Delivered order count per city:")
for city, cnt in sorted(city_counts, key=lambda x: (-x[1], x[0] or "")):
    print(f"{city}: {cnt}")


Delivered order count per city:
Bangalore: 4959
Delhi: 4926
Pune: 4924
Chennai: 4849
Hyderabad: 4805
Kolkata: 4764
Mumbai: 4740


# Explain why DataFrames are better here


*   Catalyst optimizer (auto logical/physical plan optimizations)
*   Fewer shuffles, faster execution
*   Built‑in aggregations/joins/window funcs (vectorized, efficient)
*   Less code, easier maintenance and debugging
*   Schema-aware, better type safety than Python lambdas
*   SQL compatibility
*   Better memory management vs Python RDD overhead


