In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("Food delivery")
    .getOrCreate()
)


**PHASE 1 – Ingestion**
1. Read delivery_data.csv as all StringType.

2. Print schema and record count.
3. Show sample rows.
4. Identify obvious data issues.

In [None]:

from pyspark.sql.types import StructType,StructField,StringType,DateType,IntegerType,LongType
from pyspark.sql.functions import col, lit, when,regexp_replace, split, trim, array_compact, transform, get_json_object, lower

# Raw column names, in the order used to build the ingestion schema.
columns = [
    "order_id", "customer_id", "restaurant_id",
    "restaurant_city", "delivery_city",
    "order_time", "pickup_time", "delivery_time",
    "delivery_status", "delivery_partner_id",
    "order_amount", "payment_mode", "rating",
]

# Every field is read as a nullable string so that malformed values survive
# ingestion and can be inspected and cleaned explicitly in later phases.
delivery_schema = StructType(
    [StructField(name, StringType(), nullable=True) for name in columns]
)

path = '/content/delivery_data.csv'

# The first line of the file is treated as a header, not as data.
delivery_df = spark.read.csv(path, schema=delivery_schema, header="true")


In [None]:

# Print the column names/types of the raw DataFrame.
delivery_df.printSchema()
# count() is an action: it triggers a read of the CSV to count the rows.
record_count = delivery_df.count()
print(f"Total records: {record_count}")


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- restaurant_id: string (nullable = true)
 |-- restaurant_city: string (nullable = true)
 |-- delivery_city: string (nullable = true)
 |-- order_time: string (nullable = true)
 |-- pickup_time: string (nullable = true)
 |-- delivery_time: string (nullable = true)
 |-- delivery_status: string (nullable = true)
 |-- delivery_partner_id: string (nullable = true)
 |-- order_amount: string (nullable = true)
 |-- payment_mode: string (nullable = true)
 |-- rating: string (nullable = true)

Total records: 150000


In [None]:
# Display the first 10 raw rows to eyeball data-quality problems.
delivery_df.show(10)

+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+
|order_id|customer_id|restaurant_id|restaurant_city|delivery_city|         order_time|        pickup_time|      delivery_time|delivery_status|delivery_partner_id|order_amount|payment_mode|rating|
+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+
| O100000|      C9467|         R972|      Bangalore|      Chennai|29/01/2026 04:16:00|               NULL|               NULL|      CANCELLED|               D505|     invalid|         COD|  NULL|
| O100001|      C1081|         R675|           Pune|    Bangalore|2026-01-29 17:48:00|2026-01-29 18:16:00|2026-01-29 18:16:00|         PLACED|               D359|         304|  CreditCard|  NULL|
| O100002|      C313

**Identify obvious data issues**

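1. Mixed datetime formats (e.g. `dd/MM/yyyy HH:mm:ss` and `yyyy-MM-dd HH:mm:ss`) and blank timestamps.

2. `order_amount` contains commas (thousands separators) and non-numeric strings such as `invalid`.

3. Some rows have status DELIVERED but a blank `delivery_time`.

4. `order_id` should be checked for potential duplicates.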

**PHASE 2 – Cleaning**

**1. Trim all string columns.**

**2. Clean order_amount:**

Remove commas
Convert to IntegerType
Handle invalid values safely

**3. Parse timestamps into:**

order_time_clean
pickup_time_clean
delivery_time_clean

Support multiple formats:
yyyy-MM-dd HH:mm:ss
dd/MM/yyyy HH:mm:ss
yyyy/MM/dd HH:mm:ss
Keep original columns for audit.

**4. Create flags:**

order_time_valid
pickup_time_valid
delivery_time_valid

In [None]:
from pyspark.sql import functions as f

# Trim leading/trailing whitespace from every raw column. All columns were
# ingested as strings, so trim() applies to each of them.
#
# The column names are taken from the DataFrame itself instead of a second
# hand-maintained list bound to the name `col`: that name would shadow the
# `col` function imported from pyspark.sql.functions earlier in the notebook,
# and the duplicate list could drift out of sync with the schema.
trimmed_df = delivery_df.select(
    [f.trim(f.col(name)).alias(name) for name in delivery_df.columns]
)
trimmed_df.show(10)


+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+
|order_id|customer_id|restaurant_id|restaurant_city|delivery_city|         order_time|        pickup_time|      delivery_time|delivery_status|delivery_partner_id|order_amount|payment_mode|rating|
+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+
| O100000|      C9467|         R972|      Bangalore|      Chennai|29/01/2026 04:16:00|               NULL|               NULL|      CANCELLED|               D505|     invalid|         COD|  NULL|
| O100001|      C1081|         R675|           Pune|    Bangalore|2026-01-29 17:48:00|2026-01-29 18:16:00|2026-01-29 18:16:00|         PLACED|               D359|         304|  CreditCard|  NULL|
| O100002|      C313

In [None]:

# Drop commas from the raw amount, then keep the value only when what is left
# consists solely of digits; anything else (e.g. "invalid") becomes NULL.
# The raw order_amount column is left untouched for audit.
amount_no_commas = f.regexp_replace(f.col("order_amount"), ",", "")
trimmed_df = trimmed_df.withColumn(
    "order_amount_new",
    # when() without otherwise() yields NULL for non-matching rows.
    f.when(amount_no_commas.rlike("^[0-9]+$"), amount_no_commas.cast("int")),
)
trimmed_df.show(10)

+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+----------------+
|order_id|customer_id|restaurant_id|restaurant_city|delivery_city|         order_time|        pickup_time|      delivery_time|delivery_status|delivery_partner_id|order_amount|payment_mode|rating|order_amount_new|
+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+----------------+
| O100000|      C9467|         R972|      Bangalore|      Chennai|29/01/2026 04:16:00|               NULL|               NULL|      CANCELLED|               D505|        NULL|         COD|  NULL|            NULL|
| O100001|      C1081|         R675|           Pune|    Bangalore|2026-01-29 17:48:00|2026-01-29 18:16:00|2026-01-29 18:16:00|         PLACED|      

In [None]:

from pyspark.sql import functions as F

# Accepted timestamp layouts, tried in this order; coalesce() keeps the first
# attempt that parses and yields NULL when none of them does.
timestamp_formats = (
    "yyyy-MM-dd HH:mm:ss",
    "dd/MM/yyyy HH:mm:ss",
    "yyyy/MM/dd HH:mm:ss",
)

# Add <name>_clean next to each raw column; the raw strings are kept for audit.
for raw_name in ("order_time", "pickup_time", "delivery_time"):
    parse_attempts = [
        F.expr(f"try_to_timestamp({raw_name}, '{fmt}')")
        for fmt in timestamp_formats
    ]
    trimmed_df = trimmed_df.withColumn(
        f"{raw_name}_clean", F.coalesce(*parse_attempts)
    )

trimmed_df.printSchema()
trimmed_df.show(10)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- restaurant_id: string (nullable = true)
 |-- restaurant_city: string (nullable = true)
 |-- delivery_city: string (nullable = true)
 |-- order_time: string (nullable = true)
 |-- pickup_time: string (nullable = true)
 |-- delivery_time: string (nullable = true)
 |-- delivery_status: string (nullable = true)
 |-- delivery_partner_id: string (nullable = true)
 |-- order_amount: integer (nullable = true)
 |-- payment_mode: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- order_amount_new: integer (nullable = true)
 |-- order_time_clean: timestamp (nullable = true)
 |-- pickup_time_clean: timestamp (nullable = true)
 |-- delivery_time_clean: timestamp (nullable = true)

+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+----------------+-

In [None]:
# A <name>_valid flag is true exactly when the corresponding <name>_clean
# timestamp is non-NULL, i.e. when one of the supported formats parsed.
clean_df = trimmed_df
for stage in ("order_time", "pickup_time", "delivery_time"):
    clean_df = clean_df.withColumn(
        f"{stage}_valid", F.col(f"{stage}_clean").isNotNull()
    )
clean_df.show(10)

+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+----------------+-------------------+-------------------+-------------------+----------------+-----------------+-------------------+
|order_id|customer_id|restaurant_id|restaurant_city|delivery_city|         order_time|        pickup_time|      delivery_time|delivery_status|delivery_partner_id|order_amount|payment_mode|rating|order_amount_new|   order_time_clean|  pickup_time_clean|delivery_time_clean|order_time_valid|pickup_time_valid|delivery_time_valid|
+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+----------------+-------------------+-------------------+-------------------+----------------+-----------------+-------------------+
| O100000|      

**PHASE 3 – Derived Metrics**

Create time-based metrics:

prep_time = pickup_time - order_time

delivery_time_gap = delivery_time - pickup_time

total_fulfillment_time = delivery_time - order_time

All in minutes.

Rows with negative times must be flagged.

In [None]:

from pyspark.sql import functions as F

# Time-based metrics, all in minutes. Casting a timestamp to long gives epoch
# seconds, so the difference of two casts divided by 60 is a duration in
# minutes. A metric is NULL whenever either of its timestamps is NULL.
clean_df = (
    clean_df
    .withColumn("prep_time_min",
        (F.col("pickup_time_clean").cast("long") - F.col("order_time_clean").cast("long")) / 60
    )
    .withColumn("delivery_time_gap_min",
        (F.col("delivery_time_clean").cast("long") - F.col("pickup_time_clean").cast("long")) / 60
    )
    .withColumn("total_fulfillment_time_min",
        (F.col("delivery_time_clean").cast("long") - F.col("order_time_clean").cast("long")) / 60
    )
    .withColumn(
        "negative_time_flag",
        # Comparing NULL with 0 yields NULL, and NULL OR false is NULL, so
        # without the coalesce a row with a missing timestamp and no negative
        # metric would get a NULL flag instead of False. Rows with at least
        # one negative metric are still flagged True.
        F.coalesce(
            (F.col("prep_time_min") < 0) |
            (F.col("delivery_time_gap_min") < 0) |
            (F.col("total_fulfillment_time_min") < 0),
            F.lit(False),
        )
    )
)
clean_df.show(10)

+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+------------+------+----------------+-------------------+-------------------+-------------------+----------------+-----------------+-------------------+-------------+---------------------+--------------------------+------------------+
|order_id|customer_id|restaurant_id|restaurant_city|delivery_city|         order_time|        pickup_time|      delivery_time|delivery_status|delivery_partner_id|order_amount|payment_mode|rating|order_amount_new|   order_time_clean|  pickup_time_clean|delivery_time_clean|order_time_valid|pickup_time_valid|delivery_time_valid|prep_time_min|delivery_time_gap_min|total_fulfillment_time_min|negative_time_flag|
+--------+-----------+-------------+---------------+-------------+-------------------+-------------------+-------------------+---------------+-------------------+------------+-----

**PHASE 4 – Data Validation**

1. Count rows where:

      pickup_time < order_time

      delivery_time < pickup_time

2. Count rows where delivery_status = DELIVERED but delivery_time
is null.

3. Count rows with invalid amounts.

4. Count CANCELLED orders.

Deliverable: Operational Data Quality Report.

In [None]:

from pyspark.sql import functions as F

df = clean_df

# Status comparison is case-insensitive.
status = F.lower(F.col("delivery_status"))

# All five data-quality counts are computed in a single pass over the data:
# count(when(cond, 1)) counts exactly the rows for which cond is true.
quality_counts = df.agg(
    F.count(
        F.when(F.col("pickup_time_clean") < F.col("order_time_clean"), 1)
    ).alias("pickup_before_order"),
    F.count(
        F.when(F.col("delivery_time_clean") < F.col("pickup_time_clean"), 1)
    ).alias("delivery_before_pickup"),
    F.count(
        F.when((status == "delivered") & F.col("delivery_time_clean").isNull(), 1)
    ).alias("delivered_missing_time"),
    # Use the cleaned integer amount: the raw order_amount column is a string,
    # so values such as "invalid" or "1,200" would neither be NULL nor compare
    # as <= 0 and would therefore be missed. In order_amount_new every
    # unparseable amount is NULL.
    F.count(
        F.when(
            F.col("order_amount_new").isNull() | (F.col("order_amount_new") <= 0), 1
        )
    ).alias("invalid_amounts"),
    F.count(F.when(status == "cancelled", 1)).alias("cancelled"),
).first()

cnt_pickup_before_order = quality_counts["pickup_before_order"]
cnt_delivery_before_pickup = quality_counts["delivery_before_pickup"]
cnt_delivered_missing_time = quality_counts["delivered_missing_time"]
cnt_invalid_amounts = quality_counts["invalid_amounts"]
cnt_cancelled = quality_counts["cancelled"]

print("pickup_time < order_time:", cnt_pickup_before_order)
print("delivery_time < pickup_time:", cnt_delivery_before_pickup)
print("DELIVERED but delivery_time is NULL:", cnt_delivered_missing_time)
print("Invalid order_amount:", cnt_invalid_amounts)
print("CANCELLED orders:", cnt_cancelled)


pickup_time < order_time: 19095
delivery_time < pickup_time: 17650
DELIVERED but delivery_time is NULL: 2817
Invalid order_amount: 14073
CANCELLED orders: 37658


**PHASE 5 – Operational Analytics**

1. Average prep_time per restaurant.

2. Average delivery_time_gap per delivery city.

3. Top 10 slowest restaurants.

4. Cities with highest average delivery time.

5. Cancellation percentage per city.

In [None]:

# Mean order-to-pickup time, in minutes, for each restaurant.
(
    clean_df
    .groupBy("restaurant_id")
    .agg(F.avg("prep_time_min").alias("avg_prep_time"))
    .show()
)


+-------------+------------------+
|restaurant_id|     avg_prep_time|
+-------------+------------------+
|         R810|10.602649006622517|
|         R791|11.434782608695652|
|         R159|13.357575757575757|
|         R708|13.153846153846153|
|         R906| 13.87741935483871|
|         R143| 12.98857142857143|
|         R469|13.335616438356164|
|         R767|12.972222222222221|
|         R605|12.807692307692308|
|         R589|11.654545454545454|
|         R667|13.335365853658537|
|         R778|14.018987341772151|
|         R847|13.453416149068323|
|         R635|12.033783783783784|
|         R725|13.013986013986013|
|         R167|12.878306878306878|
|         R100|11.740963855421686|
|         R945| 12.89090909090909|
|         R177|12.105960264900663|
|         R484|12.201438848920864|
+-------------+------------------+
only showing top 20 rows


In [None]:

# Mean pickup-to-delivery time, in minutes, for each delivery city.
(
    clean_df
    .groupBy("delivery_city")
    .agg(F.avg("delivery_time_gap_min").alias("avg_delivery_time"))
    .show()
)


+-------------+------------------+
|delivery_city| avg_delivery_time|
+-------------+------------------+
|    Bangalore|25.101736972704714|
|      Chennai| 25.08549251120586|
|       Mumbai| 25.13103953147877|
|      Kolkata| 25.09600935047587|
|         Pune| 25.02926229965935|
|        Delhi| 25.15737868380235|
|    Hyderabad|24.943524774774776|
+-------------+------------------+



In [None]:

# Ten restaurants with the highest mean order-to-delivery time (minutes).
(
    clean_df
    .groupBy("restaurant_id")
    .agg(F.avg("total_fulfillment_time_min").alias("avg_total_time"))
    .orderBy(F.col("avg_total_time").desc())
    .limit(10)
    .show()
)


+-------------+------------------+
|restaurant_id|    avg_total_time|
+-------------+------------------+
|         R301|44.264900662251655|
|         R850|              42.9|
|         R487| 42.73376623376623|
|         R710|              42.5|
|         R559| 42.34228187919463|
|         R736| 42.26896551724138|
|         R201|42.206896551724135|
|         R164|42.203821656050955|
|         R857|42.091503267973856|
|         R483| 42.08441558441559|
+-------------+------------------+



In [None]:

# Delivery cities ordered from slowest to fastest mean pickup-to-delivery time.
(
    clean_df
    .groupBy("delivery_city")
    .agg(F.avg("delivery_time_gap_min").alias("avg_delivery_time"))
    .orderBy(F.col("avg_delivery_time").desc())
    .show()
)


+-------------+------------------+
|delivery_city| avg_delivery_time|
+-------------+------------------+
|        Delhi| 25.15737868380235|
|       Mumbai| 25.13103953147877|
|    Bangalore|25.101736972704714|
|      Kolkata| 25.09600935047587|
|      Chennai| 25.08549251120586|
|         Pune| 25.02926229965935|
|    Hyderabad|24.943524774774776|
+-------------+------------------+



In [None]:

# Share of orders per delivery city whose status is "cancelled"
# (case-insensitive), as a percentage of all orders in that city.
# NOTE: `df` is the notebook-level name bound in an earlier cell.
cancelled_flag = (F.lower("delivery_status") == "cancelled").cast("int")
(
    df
    .groupBy("delivery_city")
    .agg((F.sum(cancelled_flag) / F.count("*") * 100).alias("cancellation_pct"))
    .show()
)


+-------------+------------------+
|delivery_city|  cancellation_pct|
+-------------+------------------+
|    Bangalore|24.437224414017173|
|      Chennai| 25.29206144788888|
|       Mumbai| 25.26425066062665|
|      Kolkata|25.643661511144337|
|         Pune|24.844691484889534|
|        Delhi|25.025536261491315|
|    Hyderabad| 25.23496906437444|
+-------------+------------------+



**PHASE 6 – Window Functions**

1. Rank restaurants by:
Average fulfillment time
Cancellation rate

2. Rank delivery partners by:
Fastest average delivery
Highest ratings

3. Identify top 3 and bottom 3 per city.

In [None]:
from pyspark.sql.window import Window

# Rank 1 is the restaurant with the lowest mean fulfillment time. The window
# has no partitionBy, so there is one ranking across all restaurants.
(
    clean_df
    .groupBy("restaurant_id")
    .agg(F.avg("total_fulfillment_time_min").alias("avg_time"))
    .withColumn("rank", F.rank().over(Window.orderBy("avg_time")))
    .show()
)



+-------------+------------------+----+
|restaurant_id|          avg_time|rank|
+-------------+------------------+----+
|         R344|31.937062937062937|   1|
|         R234| 32.66901408450704|   2|
|         R553|32.701388888888886|   3|
|         R956|  32.9559748427673|   4|
|         R625| 32.97452229299363|   5|
|         R934| 33.08227848101266|   6|
|         R194| 33.11184210526316|   7|
|         R941|33.136986301369866|   8|
|         R741| 33.13907284768212|   9|
|         R173|33.266304347826086|  10|
|         R767| 33.31111111111111|  11|
|         R290|33.314465408805034|  12|
|         R248|33.396449704142015|  13|
|         R730| 33.48344370860927|  14|
|         R339| 33.48795180722892|  15|
|         R218|33.556338028169016|  16|
|         R457| 33.56849315068493|  17|
|         R128| 33.57142857142857|  18|
|         R552|33.642857142857146|  19|
|         R822| 33.73006134969325|  20|
+-------------+------------------+----+
only showing top 20 rows


In [None]:

# Fraction of each restaurant's orders that were cancelled; rank 1 is the
# highest cancellation rate. Tied rates share a rank.
cancelled_as_int = (F.lower("delivery_status") == "cancelled").cast("int")
(
    clean_df
    .groupBy("restaurant_id")
    .agg((F.sum(cancelled_as_int) / F.count("*")).alias("cancel_rate"))
    .withColumn("rank", F.rank().over(Window.orderBy(F.desc("cancel_rate"))))
    .show()
)


+-------------+-------------------+----+
|restaurant_id|        cancel_rate|rank|
+-------------+-------------------+----+
|         R617| 0.3708609271523179|   1|
|         R163|0.35443037974683544|   2|
|         R939|0.35428571428571426|   3|
|         R994|0.35195530726256985|   4|
|         R896|0.34705882352941175|   5|
|         R198|0.34408602150537637|   6|
|         R138| 0.3419354838709677|   7|
|         R759| 0.3375796178343949|   8|
|         R392| 0.3357142857142857|   9|
|         R540| 0.3352601156069364|  10|
|         R105| 0.3333333333333333|  11|
|         R438| 0.3333333333333333|  11|
|         R679|0.32967032967032966|  13|
|         R166| 0.3277777777777778|  14|
|         R531|0.32679738562091504|  15|
|         R164| 0.3216374269005848|  16|
|         R674|0.32142857142857145|  17|
|         R961|0.32075471698113206|  18|
|         R836|            0.31875|  19|
|         R239|            0.31875|  19|
+-------------+-------------------+----+
only showing top

In [None]:

# Rank 1 is the delivery partner with the lowest mean pickup-to-delivery time.
(
    clean_df
    .groupBy("delivery_partner_id")
    .agg(F.avg("delivery_time_gap_min").alias("avg_delivery"))
    .withColumn("rank", F.rank().over(Window.orderBy("avg_delivery")))
    .show()
)


+-------------------+------------------+----+
|delivery_partner_id|      avg_delivery|rank|
+-------------------+------------------+----+
|               D417|18.899224806201552|   1|
|               D267|19.158620689655173|   2|
|               D246| 19.36241610738255|   3|
|               D540|20.071428571428573|   4|
|               D108|20.078014184397162|   5|
|               D682|20.441860465116278|   6|
|               D624| 20.49645390070922|   7|
|               D664|20.515384615384615|   8|
|               D711|20.679738562091504|   9|
|               D396|20.729166666666668|  10|
|               D401| 20.74342105263158|  11|
|               D564|20.764227642276424|  12|
|               D426|            20.768|  13|
|               D387|20.870967741935484|  14|
|               D918|20.942622950819672|  15|
|               D718|              21.0|  16|
|               D220|21.074324324324323|  17|
|               D920|21.085271317829456|  18|
|               D497| 21.090909090

In [None]:

# rating is stored as a string, so it is cast to double before averaging.
# Rank 1 is the delivery partner with the highest mean rating.
(
    clean_df
    .groupBy("delivery_partner_id")
    .agg(F.avg(F.col("rating").cast("double")).alias("avg_rating"))
    .withColumn("rank", F.rank().over(Window.orderBy(F.desc("avg_rating"))))
    .show()
)


+-------------------+------------------+----+
|delivery_partner_id|        avg_rating|rank|
+-------------------+------------------+----+
|               D951| 3.489743589743589|   1|
|               D149| 3.479411764705882|   2|
|               D118| 3.472727272727273|   3|
|               D751| 3.448717948717949|   4|
|               D774| 3.423809523809523|   5|
|               D397|  3.42093023255814|   6|
|               D814|3.4099999999999993|   7|
|               D735|3.4047619047619047|   8|
|               D141|3.4027027027027024|   9|
|               D368|3.3879999999999995|  10|
|               D264| 3.386111111111111|  11|
|               D647|3.3769230769230774|  12|
|               D417|3.3647058823529403|  13|
|               D319| 3.350000000000001|  14|
|               D293| 3.344117647058823|  15|
|               D601|3.3428571428571425|  16|
|               D413|3.3404761904761906|  17|
|               D991| 3.336363636363636|  18|
|               D345|3.33421052631

In [None]:

# Two explicit windows per delivery city: fastest first and slowest first.
w = Window.partitionBy("delivery_city").orderBy("avg_time")
w_desc = Window.partitionBy("delivery_city").orderBy(F.desc("avg_time"))

clean_df.groupBy("delivery_city","restaurant_id") \
  .agg(F.avg("total_fulfillment_time_min").alias("avg_time")) \
  .filter(F.col("avg_time").isNotNull()) \
  .withColumn("rn_fast", F.row_number().over(w)) \
  .withColumn("rn_slow", F.row_number().over(w_desc)) \
  .filter((F.col("rn_fast")<=3) | (F.col("rn_slow")<=3)) \
  .show()
# The isNotNull filter drops (city, restaurant) pairs with no computable
# fulfillment time: a NULL average sorts first in ascending order and would
# otherwise take one of the three "fastest" slots.



+-------------+-------------+------------------+-------+-------+
|delivery_city|restaurant_id|          avg_time|rn_fast|rn_slow|
+-------------+-------------+------------------+-------+-------+
|    Bangalore|         R222|53.266666666666666|    900|      1|
|    Bangalore|         R600| 53.23529411764706|    899|      2|
|    Bangalore|         R884| 51.45454545454545|    898|      3|
|    Bangalore|         R267| 25.26086956521739|      3|    898|
|    Bangalore|         R189|             24.45|      2|    899|
|    Bangalore|         R134|            21.625|      1|    900|
|      Chennai|         R163| 51.94117647058823|    900|      1|
|      Chennai|         R324|              51.5|    899|      2|
|      Chennai|         R182|50.888888888888886|    898|      3|
|      Chennai|         R869|22.714285714285715|      3|    898|
|      Chennai|         R899|21.818181818181817|      2|    899|
|      Chennai|         R428|21.142857142857142|      1|    900|
|        Delhi|         R

**PHASE 7 – Customer Experience**

1. Average rating per restaurant.

2. Relationship between:

     Delivery time

     Rating

3. Identify if slower deliveries reduce ratings.

In [None]:


# rating is a string column; add a numeric copy for aggregation.
rating_as_double = F.col("rating").cast("double")
df = clean_df.withColumn("rating_num", rating_as_double)

# Mean rating per (restaurant, restaurant city), best first, top 20 rows.
(
    df
    .groupBy("restaurant_id", "restaurant_city")
    .agg(F.avg("rating_num").alias("avg_rating"))
    .orderBy(F.col("avg_rating").desc())
    .show(20, truncate=False)
)



+-------------+---------------+-----------------+
|restaurant_id|restaurant_city|avg_rating       |
+-------------+---------------+-----------------+
|R660         |Chennai        |4.95             |
|R641         |Delhi          |4.9              |
|R768         |Chennai        |4.9              |
|R789         |Hyderabad      |4.9              |
|R806         |Hyderabad      |4.9              |
|R420         |Pune           |4.9              |
|R864         |Pune           |4.9              |
|R746         |Hyderabad      |4.85             |
|R209         |Hyderabad      |4.8              |
|R486         |Bangalore      |4.8              |
|R922         |Mumbai         |4.8              |
|R514         |Bangalore      |4.8              |
|R166         |Pune           |4.7              |
|R961         |Delhi          |4.7              |
|R752         |Chennai        |4.666666666666667|
|R651         |Delhi          |4.65             |
|R905         |Mumbai         |4.64             |


In [None]:
import math

# Correlation between pickup-to-delivery time and the numeric rating.
df = clean_df.withColumn("rating_num", F.col("rating").cast("double"))
corr = df.stat.corr("delivery_time_gap_min", "rating_num")
print(f"Correlation (delivery_time_gap_min vs rating): {corr}")

# A NaN result fails every comparison, so without an explicit check it would
# fall through `corr < 0` and be reported as "corr >= 0".
# NOTE(review): NaN is presumably what Spark returns for a constant column or
# too few usable rows — confirm against the Spark version in use.
if corr is None or math.isnan(corr):
    print("No correlation computed (insufficient data).")
elif corr < 0:
    print(" Slower deliveries are associated with LOWER ratings.")
else:
    print(" No evidence that slower deliveries reduce ratings (corr >= 0).")



Correlation (delivery_time_gap_min vs rating): 0.00625906888579824
 No evidence that slower deliveries reduce ratings (corr >= 0).


**PHASE 8 – Performance Engineering**
1. Cache the cleaned DataFrame.
2. Use explain(True) on:

      Restaurant ranking query

      City delay analysis query

3. Identify shuffle stages.
4. Repartition by delivery_city.
5. Compare plans before and after.

In [None]:

# Mark the cleaned DataFrame for caching and immediately run an action on it,
# so the cache is populated before the query plans below are inspected.
clean_df.cache().count()


150000

In [None]:

from pyspark.sql import functions as F

# Per (restaurant, restaurant city): number of orders and mean numeric rating.
order_count = F.count("*").alias("total_orders")
mean_rating = F.avg(F.col("rating").cast("double")).alias("avg_rating")

restaurant_rank_df = clean_df.groupBy("restaurant_id", "restaurant_city").agg(
    order_count, mean_rating
)

# explain(True) prints the parsed, analyzed and optimized logical plans
# followed by the physical plan.
restaurant_rank_df.explain(True)


== Parsed Logical Plan ==
'Aggregate ['restaurant_id, 'restaurant_city], ['restaurant_id, 'restaurant_city, 'count(*) AS total_orders#2723, 'avg(cast('rating as double)) AS avg_rating#2724]
+- Project [order_id#194, customer_id#195, restaurant_id#196, restaurant_city#197, delivery_city#198, order_time#199, pickup_time#200, delivery_time#201, delivery_status#202, delivery_partner_id#203, order_amount#273, payment_mode#205, rating#206, order_amount_new#327, order_time_clean#447, pickup_time_clean#448, delivery_time_clean#449, order_time_valid#518, pickup_time_valid#519, delivery_time_valid#520, prep_time_min#616, delivery_time_gap_min#617, total_fulfillment_time_min#618, (((prep_time_min#616 < cast(0 as double)) OR (delivery_time_gap_min#617 < cast(0 as double))) OR (total_fulfillment_time_min#618 < cast(0 as double))) AS negative_time_flag#619]
   +- Project [order_id#194, customer_id#195, restaurant_id#196, restaurant_city#197, delivery_city#198, order_time#199, pickup_time#200, delive

In [None]:


from pyspark.sql import functions as F

# Mean pickup-to-delivery delay (minutes) per delivery city.
#
# The delay is computed from the parsed *_clean timestamp columns. The raw
# pickup_time / delivery_time columns are strings in three different formats;
# unix_timestamp() with its default 'yyyy-MM-dd HH:mm:ss' pattern would only
# understand one of them and silently drop the rest via the NULL filter.
city_delay_df = (
    clean_df
      .withColumn("delay_min",
          (F.col("delivery_time_clean").cast("long") - F.col("pickup_time_clean").cast("long")) / 60
      )
      # Rows where either timestamp is missing have a NULL delay; exclude them.
      .filter(F.col("delay_min").isNotNull())
      .groupBy("delivery_city")
      .agg(F.avg("delay_min").alias("avg_delay_min"))
)

city_delay_df.explain(True)



== Parsed Logical Plan ==
'Aggregate ['delivery_city], ['delivery_city, 'avg('delay_min) AS avg_delay_min#3118]
+- Filter isnotnull(delay_min#3117)
   +- Project [order_id#194, customer_id#195, restaurant_id#196, restaurant_city#197, delivery_city#198, order_time#199, pickup_time#200, delivery_time#201, delivery_status#202, delivery_partner_id#203, order_amount#273, payment_mode#205, rating#206, order_amount_new#327, order_time_clean#447, pickup_time_clean#448, delivery_time_clean#449, order_time_valid#518, pickup_time_valid#519, delivery_time_valid#520, prep_time_min#616, delivery_time_gap_min#617, total_fulfillment_time_min#618, negative_time_flag#619, (cast((unix_timestamp(delivery_time#201, yyyy-MM-dd HH:mm:ss, Some(Etc/UTC), true) - unix_timestamp(pickup_time#200, yyyy-MM-dd HH:mm:ss, Some(Etc/UTC), true)) as double) / cast(60 as double)) AS delay_min#3117]
      +- Project [order_id#194, customer_id#195, restaurant_id#196, restaurant_city#197, delivery_city#198, order_time#199, p

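#  Identify shuffle stages

A shuffle appears in the physical plan as an `Exchange` operator. Operators to look for:

*   Exchange SinglePartition – all rows are moved to one partition (e.g. a window with no `partitionBy`)
*   Exchange hashpartitioning – rows are redistributed by key (e.g. `groupBy`, `repartition("delivery_city")`)
*   HashAggregate + Exchange – partial aggregate, shuffle, then final aggregate
*   Window + Exchange (due to global window ordering)
*   Sort and SortMergeJoin – not shuffles themselves, but they normally sit directly above an Exchange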

In [None]:
# Redistribute rows so that each delivery_city lands in a single partition,
# and mark the result for caching. Both steps are lazy: nothing is shuffled
# or cached until an action runs on df_city.
df_city = (
    clean_df
    .repartition("delivery_city")
    .cache()
)

In [None]:


from pyspark.sql import functions as F

# City delay analysis on the DataFrame repartitioned by delivery_city.
# df_city is deliberately NOT reassigned here: rebinding it to clean_df would
# discard the repartitioned DataFrame and make this "after" plan identical to
# the "before" plan.
city_delay_df_2 = (
    df_city
      # Keep only rows where both timestamps parsed, so the delay is defined.
      .filter(F.col("pickup_time_clean").isNotNull() & F.col("delivery_time_clean").isNotNull())
      .withColumn(
          "delay_min",
          (F.col("delivery_time_clean").cast("long") - F.col("pickup_time_clean").cast("long")) / 60
      )
      .groupBy("delivery_city")
      .agg(F.avg("delay_min").alias("avg_delay_min"))
)

city_delay_df_2.explain(True)
city_delay_df_2.show(truncate=False)




== Parsed Logical Plan ==
'Aggregate ['delivery_city], ['delivery_city, 'avg('delay_min) AS avg_delay_min#3990]
+- Project [order_id#194, customer_id#195, restaurant_id#196, restaurant_city#197, delivery_city#198, order_time#199, pickup_time#200, delivery_time#201, delivery_status#202, delivery_partner_id#203, order_amount#273, payment_mode#205, rating#206, order_amount_new#327, order_time_clean#447, pickup_time_clean#448, delivery_time_clean#449, order_time_valid#518, pickup_time_valid#519, delivery_time_valid#520, prep_time_min#616, delivery_time_gap_min#617, total_fulfillment_time_min#618, negative_time_flag#619, (cast((cast(delivery_time_clean#449 as bigint) - cast(pickup_time_clean#448 as bigint)) as double) / cast(60 as double)) AS delay_min#3989]
   +- Filter (isnotnull(pickup_time_clean#448) AND isnotnull(delivery_time_clean#449))
      +- Project [order_id#194, customer_id#195, restaurant_id#196, restaurant_city#197, delivery_city#198, order_time#199, pickup_time#200, delivery

**PHASE 9 – RDD**
1. Convert delivered orders to RDD.
2. Compute:

    Total revenue using reduce

    Order count per city using map-reduce
  
3. Explain why DataFrames are better here.

In [None]:
from pyspark.sql import functions as F

# Delivered orders only (case-insensitive status match), reduced to the two
# fields the RDD exercises need, then exposed as an RDD of Row objects.
is_delivered = F.lower(F.col("delivery_status")) == "delivered"
delivered_rdd = (
    clean_df
    .where(is_delivered)
    .select("delivery_city", "order_amount_new")
    .rdd
)
delivered_rdd.take(10)

[Row(delivery_city='Chennai', order_amount_new=2318),
 Row(delivery_city='Hyderabad', order_amount_new=1054),
 Row(delivery_city='Hyderabad', order_amount_new=856),
 Row(delivery_city='Hyderabad', order_amount_new=None),
 Row(delivery_city='Bangalore', order_amount_new=872),
 Row(delivery_city='Delhi', order_amount_new=1692),
 Row(delivery_city='Kolkata', order_amount_new=872),
 Row(delivery_city='Kolkata', order_amount_new=602),
 Row(delivery_city='Mumbai', order_amount_new=1268),
 Row(delivery_city='Bangalore', order_amount_new=924)]

In [None]:

# Sum of the cleaned amounts over delivered orders. Rows whose amount could
# not be parsed (None) are skipped before the reduce.
total_revenue = (
    delivered_rdd
    .filter(lambda row: row.order_amount_new is not None)
    .map(lambda row: float(row.order_amount_new))
    .reduce(lambda left, right: left + right)
)

print("Total revenue:", total_revenue)


Total revenue: 50591938.0


In [None]:

# Classic map-reduce count: emit (city, 1) per delivered order, then add up
# the ones for each city.
city_ones = delivered_rdd.map(lambda row: (row.delivery_city, 1))
orders_per_city = city_ones.reduceByKey(lambda x, y: x + y)

print(orders_per_city.collect())


[('Hyderabad', 5284), ('Delhi', 5456), ('Mumbai', 5278), ('Pune', 5448), ('Chennai', 5352), ('Bangalore', 5448), ('Kolkata', 5296)]


**Explain why DataFrames are better here.**

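DataFrames provide automatic query optimization through Catalyst and efficient memory usage and execution through Tungsten. RDDs give low-level control, but their Python lambdas run row by row outside the optimizer, so they are slower and harder to optimize for aggregation workloads such as the revenue sum and per-city counts above.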