In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName("Capstone").getOrCreate()

###PHASE 1 — SCHEMA DESIGN & INGESTION

1. Define explicit schemas for all datasets
2. Load raw delivery data using schema enforcement
3. Identify and flag corrupt records
4. Validate schema correctness

DATASET 1 — DELIVERY TRANSACTIONS (CSV)

In [5]:
delivery_data = [
("DLV001","Delhi ","D001","Delivered","120","2024-01-05 10:30"),
("DLV002","Mumbai","D002","Delivered","90","05/01/2024 11:00"),
("DLV003","Bangalore","D003","In Transit","200","2024/01/06 09:45"),
("DLV004","Delhi","D004","Cancelled","","2024-01-07 14:00"),
("DLV005","Chennai","D002","Delivered","invalid","2024-01-08 16:20"),
("DLV006","Mumbai","D005","Delivered",None,"2024-01-08 18:10"),
("DLV007","Delhi","D001","Delivered","140","09-01-2024 12:30"),
("DLV008","Bangalore","D003","Delivered","160","2024-01-09 15:45"),
("DLV009","Mumbai","D004","Delivered","110","2024-01-10 13:20"),
("DLV009","Mumbai","D004","Delivered","110","2024-01-10 13:20")
]

delivery_schema = StructType([
    StructField("delivery_id",StringType(),False),
    StructField("city",StringType(),True),
    StructField("driver_id",StringType(),True),
    StructField("status",StringType(),True),
    StructField("delivery_time_minutess",StringType(),True),
    StructField("delivery_timestamp",StringType(),True)
])

delivery_df = spark.createDataFrame(delivery_data,delivery_schema)
delivery_df.show()

+-----------+---------+---------+----------+----------------------+------------------+
|delivery_id|     city|driver_id|    status|delivery_time_minutess|delivery_timestamp|
+-----------+---------+---------+----------+----------------------+------------------+
|     DLV001|   Delhi |     D001| Delivered|                   120|  2024-01-05 10:30|
|     DLV002|   Mumbai|     D002| Delivered|                    90|  05/01/2024 11:00|
|     DLV003|Bangalore|     D003|In Transit|                   200|  2024/01/06 09:45|
|     DLV004|    Delhi|     D004| Cancelled|                      |  2024-01-07 14:00|
|     DLV005|  Chennai|     D002| Delivered|               invalid|  2024-01-08 16:20|
|     DLV006|   Mumbai|     D005| Delivered|                  NULL|  2024-01-08 18:10|
|     DLV007|    Delhi|     D001| Delivered|                   140|  09-01-2024 12:30|
|     DLV008|Bangalore|     D003| Delivered|                   160|  2024-01-09 15:45|
|     DLV009|   Mumbai|     D004| Delivered

DATASET 2 — DRIVER MASTER (JSON)

In [6]:
driver_data = [
("D001","Ravi","Senior"),
("D002","Amit","Junior"),
("D003","Sneha","Senior"),
("D004","Karan","Junior"),
("D005","Neha","Senior")
]

driver_schema = StructType([
    StructField("driver_id",StringType(),False),
    StructField("driver_name",StringType(),True),
    StructField("position",StringType(),True)
])

driver_df = spark.createDataFrame(driver_data,driver_schema)
driver_df.show()

+---------+-----------+--------+
|driver_id|driver_name|position|
+---------+-----------+--------+
|     D001|       Ravi|  Senior|
|     D002|       Amit|  Junior|
|     D003|      Sneha|  Senior|
|     D004|      Karan|  Junior|
|     D005|       Neha|  Senior|
+---------+-----------+--------+



DATASET 3 — CITY ZONE LOOKUP (REFERENCE)

In [7]:
city_zone_data = [
("Delhi","North"),
("Mumbai","West"),
("Bangalore","South"),
("Chennai","South")
]

city_schema = StructType([
    StructField("city",StringType(),False),
    StructField("zone",StringType(),True)
])

city_df = spark.createDataFrame(city_zone_data,city_schema)
city_df.show()

+---------+-----+
|     city| zone|
+---------+-----+
|    Delhi|North|
|   Mumbai| West|
|Bangalore|South|
|  Chennai|South|
+---------+-----+



In [9]:
#3
corrupt_df = delivery_df.filter(
    (col("delivery_time_minutess").isNull()) |
    (col("delivery_time_minutess") == "invalid"))

corrupt_df.show()

+-----------+-------+---------+---------+----------------------+------------------+
|delivery_id|   city|driver_id|   status|delivery_time_minutess|delivery_timestamp|
+-----------+-------+---------+---------+----------------------+------------------+
|     DLV005|Chennai|     D002|Delivered|               invalid|  2024-01-08 16:20|
|     DLV006| Mumbai|     D005|Delivered|                  NULL|  2024-01-08 18:10|
+-----------+-------+---------+---------+----------------------+------------------+



PHASE 2 — DATA CLEANING &
STANDARDIZATION

5. Trim all string columns
6. Standardize status values
7. Convert delivery_time_minutes to IntegerType
8. Handle invalid and null delivery times
9. Parse multiple timestamp formats into TimestampType
10. Remove duplicate delivery IDs

In [10]:
#5
for c in delivery_df.columns:
    delivery_df = delivery_df.withColumn(c, trim(col(c)))
delivery_df.show()

+-----------+---------+---------+----------+----------------------+------------------+
|delivery_id|     city|driver_id|    status|delivery_time_minutess|delivery_timestamp|
+-----------+---------+---------+----------+----------------------+------------------+
|     DLV001|    Delhi|     D001| Delivered|                   120|  2024-01-05 10:30|
|     DLV002|   Mumbai|     D002| Delivered|                    90|  05/01/2024 11:00|
|     DLV003|Bangalore|     D003|In Transit|                   200|  2024/01/06 09:45|
|     DLV004|    Delhi|     D004| Cancelled|                      |  2024-01-07 14:00|
|     DLV005|  Chennai|     D002| Delivered|               invalid|  2024-01-08 16:20|
|     DLV006|   Mumbai|     D005| Delivered|                  NULL|  2024-01-08 18:10|
|     DLV007|    Delhi|     D001| Delivered|                   140|  09-01-2024 12:30|
|     DLV008|Bangalore|     D003| Delivered|                   160|  2024-01-09 15:45|
|     DLV009|   Mumbai|     D004| Delivered

In [13]:
#6
from pyspark.sql.functions import col,lower
delivery_df = delivery_df.withColumn("status",lower(col("status")))
delivery_df.show()

+-----------+---------+---------+----------+----------------------+------------------+
|delivery_id|     city|driver_id|    status|delivery_time_minutess|delivery_timestamp|
+-----------+---------+---------+----------+----------------------+------------------+
|     DLV001|    Delhi|     D001| delivered|                   120|  2024-01-05 10:30|
|     DLV002|   Mumbai|     D002| delivered|                    90|  05/01/2024 11:00|
|     DLV003|Bangalore|     D003|in transit|                   200|  2024/01/06 09:45|
|     DLV004|    Delhi|     D004| cancelled|                      |  2024-01-07 14:00|
|     DLV005|  Chennai|     D002| delivered|               invalid|  2024-01-08 16:20|
|     DLV006|   Mumbai|     D005| delivered|                  NULL|  2024-01-08 18:10|
|     DLV007|    Delhi|     D001| delivered|                   140|  09-01-2024 12:30|
|     DLV008|Bangalore|     D003| delivered|                   160|  2024-01-09 15:45|
|     DLV009|   Mumbai|     D004| delivered

In [15]:
#7
delivery_df = delivery_df.withColumn("delivery_time_minutes",when(col("delivery_time_minutess").rlike("^[0-9]+$"),
         col("delivery_time_minutess").cast("int")).otherwise(None))
delivery_df.show()

+-----------+---------+---------+----------+----------------------+------------------+---------------------+
|delivery_id|     city|driver_id|    status|delivery_time_minutess|delivery_timestamp|delivery_time_minutes|
+-----------+---------+---------+----------+----------------------+------------------+---------------------+
|     DLV001|    Delhi|     D001| delivered|                   120|  2024-01-05 10:30|                  120|
|     DLV002|   Mumbai|     D002| delivered|                    90|  05/01/2024 11:00|                   90|
|     DLV003|Bangalore|     D003|in transit|                   200|  2024/01/06 09:45|                  200|
|     DLV004|    Delhi|     D004| cancelled|                      |  2024-01-07 14:00|                 NULL|
|     DLV005|  Chennai|     D002| delivered|               invalid|  2024-01-08 16:20|                 NULL|
|     DLV006|   Mumbai|     D005| delivered|                  NULL|  2024-01-08 18:10|                 NULL|
|     DLV007|    De

In [16]:
#8
delivery_df = delivery_df.filter(col("delivery_time_minutes").isNotNull())
delivery_df.show()

+-----------+---------+---------+----------+----------------------+------------------+---------------------+
|delivery_id|     city|driver_id|    status|delivery_time_minutess|delivery_timestamp|delivery_time_minutes|
+-----------+---------+---------+----------+----------------------+------------------+---------------------+
|     DLV001|    Delhi|     D001| delivered|                   120|  2024-01-05 10:30|                  120|
|     DLV002|   Mumbai|     D002| delivered|                    90|  05/01/2024 11:00|                   90|
|     DLV003|Bangalore|     D003|in transit|                   200|  2024/01/06 09:45|                  200|
|     DLV007|    Delhi|     D001| delivered|                   140|  09-01-2024 12:30|                  140|
|     DLV008|Bangalore|     D003| delivered|                   160|  2024-01-09 15:45|                  160|
|     DLV009|   Mumbai|     D004| delivered|                   110|  2024-01-10 13:20|                  110|
|     DLV009|   Mum

In [19]:
#9
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import lit
delivery_df = delivery_df.withColumn("delivery_ts",coalesce(
        try_to_timestamp(col("delivery_timestamp"), lit("yyyy-MM-dd HH:mm")),
        try_to_timestamp(col("delivery_timestamp"), lit("dd-MM-yyyy HH:mm")),
        try_to_timestamp(col("delivery_timestamp"), lit("yyyy/MM/dd HH:mm")),
        try_to_timestamp(col("delivery_timestamp"), lit("dd/MM/yyyy HH:mm"))
    ).cast(TimestampType()))
delivery_df.show()

+-----------+---------+---------+----------+----------------------+------------------+---------------------+-------------------+
|delivery_id|     city|driver_id|    status|delivery_time_minutess|delivery_timestamp|delivery_time_minutes|        delivery_ts|
+-----------+---------+---------+----------+----------------------+------------------+---------------------+-------------------+
|     DLV001|    Delhi|     D001| delivered|                   120|  2024-01-05 10:30|                  120|2024-01-05 10:30:00|
|     DLV002|   Mumbai|     D002| delivered|                    90|  05/01/2024 11:00|                   90|2024-01-05 11:00:00|
|     DLV003|Bangalore|     D003|in transit|                   200|  2024/01/06 09:45|                  200|2024-01-06 09:45:00|
|     DLV007|    Delhi|     D001| delivered|                   140|  09-01-2024 12:30|                  140|2024-01-09 12:30:00|
|     DLV008|Bangalore|     D003| delivered|                   160|  2024-01-09 15:45|           

In [20]:
#10
delivery_df = delivery_df.dropDuplicates(["delivery_id"])
delivery_df.show()

+-----------+---------+---------+----------+----------------------+------------------+---------------------+-------------------+
|delivery_id|     city|driver_id|    status|delivery_time_minutess|delivery_timestamp|delivery_time_minutes|        delivery_ts|
+-----------+---------+---------+----------+----------------------+------------------+---------------------+-------------------+
|     DLV001|    Delhi|     D001| delivered|                   120|  2024-01-05 10:30|                  120|2024-01-05 10:30:00|
|     DLV002|   Mumbai|     D002| delivered|                    90|  05/01/2024 11:00|                   90|2024-01-05 11:00:00|
|     DLV003|Bangalore|     D003|in transit|                   200|  2024/01/06 09:45|                  200|2024-01-06 09:45:00|
|     DLV007|    Delhi|     D001| delivered|                   140|  09-01-2024 12:30|                  140|2024-01-09 12:30:00|
|     DLV008|Bangalore|     D003| delivered|                   160|  2024-01-09 15:45|           

PHASE 3 — BUSINESS FILTERING

Tasks

11. Keep only Delivered deliveries
12. Remove cancelled and in-transit deliveries
13. Validate record counts before and after filtering

In [22]:
#11,12
before_count = delivery_df.count()
delivered_df = delivery_df.filter(col("status") == "delivered")
after_count = delivered_df.count()

#13
print("Before filtering:",before_count)
print("After filtering:",after_count)

delivered_df.show()

Before filtering: 6
After filtering: 5
+-----------+---------+---------+---------+----------------------+------------------+---------------------+-------------------+
|delivery_id|     city|driver_id|   status|delivery_time_minutess|delivery_timestamp|delivery_time_minutes|        delivery_ts|
+-----------+---------+---------+---------+----------------------+------------------+---------------------+-------------------+
|     DLV001|    Delhi|     D001|delivered|                   120|  2024-01-05 10:30|                  120|2024-01-05 10:30:00|
|     DLV002|   Mumbai|     D002|delivered|                    90|  05/01/2024 11:00|                   90|2024-01-05 11:00:00|
|     DLV007|    Delhi|     D001|delivered|                   140|  09-01-2024 12:30|                  140|2024-01-09 12:30:00|
|     DLV008|Bangalore|     D003|delivered|                   160|  2024-01-09 15:45|                  160|2024-01-09 15:45:00|
|     DLV009|   Mumbai|     D004|delivered|                   110

PHASE 4 — DATA ENRICHMENT & JOINS

14. Join delivery data with driver master
15. Join enriched data with city zone lookup
16. Use broadcast join where appropriate
17. Explain join strategy using explain(True)

In [25]:
#14
enriched_df = delivered_df.join(driver_df, "driver_id", "left")
enriched_df.show()

#15,16
final_df = enriched_df.join(broadcast(city_df), "city", "left")
final_df.show()


+---------+-----------+---------+---------+----------------------+------------------+---------------------+-------------------+-----------+--------+
|driver_id|delivery_id|     city|   status|delivery_time_minutess|delivery_timestamp|delivery_time_minutes|        delivery_ts|driver_name|position|
+---------+-----------+---------+---------+----------------------+------------------+---------------------+-------------------+-----------+--------+
|     D001|     DLV001|    Delhi|delivered|                   120|  2024-01-05 10:30|                  120|2024-01-05 10:30:00|       Ravi|  Senior|
|     D002|     DLV002|   Mumbai|delivered|                    90|  05/01/2024 11:00|                   90|2024-01-05 11:00:00|       Amit|  Junior|
|     D001|     DLV007|    Delhi|delivered|                   140|  09-01-2024 12:30|                  140|2024-01-09 12:30:00|       Ravi|  Senior|
|     D003|     DLV008|Bangalore|delivered|                   160|  2024-01-09 15:45|                  160

In [26]:
#17
final_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Project [driver_id#115, delivery_id#113, city#114, status#138, delivery_time_minutess#117, delivery_timestamp#118, delivery_time_minutes#159, delivery_ts#205, driver_name#73, position#74]
:  +- Join LeftOuter, (driver_id#115 = driver_id#72)
:     :- Filter (status#138 = delivered)
:     :  +- Deduplicate [delivery_id#113]
:     :     +- Project [delivery_id#113, city#114, driver_id#115, status#138, delivery_time_minutess#117, delivery_timestamp#118, delivery_time_minutes#159, cast(coalesce(try_to_timestamp(delivery_timestamp#118, Some(yyyy-MM-dd HH:mm), TimestampType, Some(Etc/UTC), false), try_to_timestamp(delivery_timestamp#118, Some(dd-MM-yyyy HH:mm), TimestampType, Some(Etc/UTC), false), try_to_timestamp(delivery_timestamp#118, Some(yyyy/MM/dd HH:mm), TimestampType, Some(Etc/UTC), false), try_to_timestamp(delivery_timestamp#118, Some(dd/MM/yyyy HH:mm), TimestampType, Some(Etc/UTC), false)) as timestamp) AS delivery_ts#

PHASE 5 — ANALYTICS & WINDOW
FUNCTIONS

Tasks

18. Average delivery time per city
19. Average delivery time per driver
20. Rank drivers by performance within each city
21. Identify fastest driver per zone
22. Identify top 2 drivers per city

In [31]:
#18
avg_city_df = final_df.groupBy("city").agg(avg("delivery_time_minutes").alias("avg_time"))
avg_city_df.show()

#19
avg_driver_df = final_df.groupBy("driver_name").agg(avg("delivery_time_minutes").alias("avg_time"))
avg_driver_df.show()

#20
from pyspark.sql.window import Window

window_city = Window.partitionBy("city").orderBy(col("delivery_time_minutes"))

rank_df = final_df.withColumn("rank", rank().over(window_city))
rank_df.show()

#21
window_zone = Window.partitionBy("zone").orderBy(col("delivery_time_minutes"))

fastest_zone_df = final_df.withColumn("rank", rank().over(window_zone)).filter(col("rank") == 1)
fastest_zone_df.show()

#22
top2_city_df = rank_df.filter(col("rank") <= 2)
top2_city_df.show()

+---------+--------+
|     city|avg_time|
+---------+--------+
|Bangalore|   160.0|
|   Mumbai|   100.0|
|    Delhi|   130.0|
+---------+--------+

+-----------+--------+
|driver_name|avg_time|
+-----------+--------+
|       Ravi|   130.0|
|      Sneha|   160.0|
|       Amit|    90.0|
|      Karan|   110.0|
+-----------+--------+

+---------+---------+-----------+---------+----------------------+------------------+---------------------+-------------------+-----------+--------+-----+----+
|     city|driver_id|delivery_id|   status|delivery_time_minutess|delivery_timestamp|delivery_time_minutes|        delivery_ts|driver_name|position| zone|rank|
+---------+---------+-----------+---------+----------------------+------------------+---------------------+-------------------+-----------+--------+-----+----+
|Bangalore|     D003|     DLV008|delivered|                   160|  2024-01-09 15:45|                  160|2024-01-09 15:45:00|      Sneha|  Senior|South|   1|
|    Delhi|     D001|     D

PHASE 6 — PERFORMANCE OPTIMIZATION

Tasks

23. Identify DataFrames reused multiple times
24. Apply caching appropriately
25. Compare execution plans with and without cache
26. Repartition data by city
27. Explain why repartitioning improves performance

In [32]:
#23
#final_df is reused multiple times
#24
final_df.cache()

DataFrame[city: string, driver_id: string, delivery_id: string, status: string, delivery_time_minutess: string, delivery_timestamp: string, delivery_time_minutes: int, delivery_ts: timestamp, driver_name: string, position: string, zone: string]

In [33]:
#25
final_df.explain(True)
final_df.unpersist()

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Project [driver_id#115, delivery_id#113, city#114, status#138, delivery_time_minutess#117, delivery_timestamp#118, delivery_time_minutes#159, delivery_ts#205, driver_name#73, position#74]
:  +- Join LeftOuter, (driver_id#115 = driver_id#72)
:     :- Filter (status#138 = delivered)
:     :  +- Deduplicate [delivery_id#113]
:     :     +- Project [delivery_id#113, city#114, driver_id#115, status#138, delivery_time_minutess#117, delivery_timestamp#118, delivery_time_minutes#159, cast(coalesce(try_to_timestamp(delivery_timestamp#118, Some(yyyy-MM-dd HH:mm), TimestampType, Some(Etc/UTC), false), try_to_timestamp(delivery_timestamp#118, Some(dd-MM-yyyy HH:mm), TimestampType, Some(Etc/UTC), false), try_to_timestamp(delivery_timestamp#118, Some(yyyy/MM/dd HH:mm), TimestampType, Some(Etc/UTC), false), try_to_timestamp(delivery_timestamp#118, Some(dd/MM/yyyy HH:mm), TimestampType, Some(Etc/UTC), false)) as timestamp) AS delivery_ts#

DataFrame[city: string, driver_id: string, delivery_id: string, status: string, delivery_time_minutess: string, delivery_timestamp: string, delivery_time_minutes: int, delivery_ts: timestamp, driver_name: string, position: string, zone: string]

In [35]:
#26
final_df = final_df.repartition("city")
final_df.show()


#27
#Reshuffle improves performance because it :
#reduces shuffle
#improves windows function

+---------+---------+-----------+---------+----------------------+------------------+---------------------+-------------------+-----------+--------+-----+
|     city|driver_id|delivery_id|   status|delivery_time_minutess|delivery_timestamp|delivery_time_minutes|        delivery_ts|driver_name|position| zone|
+---------+---------+-----------+---------+----------------------+------------------+---------------------+-------------------+-----------+--------+-----+
|Bangalore|     D003|     DLV008|delivered|                   160|  2024-01-09 15:45|                  160|2024-01-09 15:45:00|      Sneha|  Senior|South|
|   Mumbai|     D002|     DLV002|delivered|                    90|  05/01/2024 11:00|                   90|2024-01-05 11:00:00|       Amit|  Junior| West|
|   Mumbai|     D004|     DLV009|delivered|                   110|  2024-01-10 13:20|                  110|2024-01-10 13:20:00|      Karan|  Junior| West|
|    Delhi|     D001|     DLV001|delivered|                   120|  20

PHASE 7 — FILE FORMAT STRATEGY

Tasks

28. Write cleaned delivery data to Parquet
29. Write aggregated analytics to ORC
30. Compare file output structure
31. Explain why Avro is suitable for future real-time tracking

In [36]:
#28
final_df.write.mode("overwrite").parquet("/data/delivery_cleaned")

In [37]:
#29
avg_city_df.write.mode("overwrite").orc("/data/delivery_analytics")

In [None]:
#31
#AVRO is suitable for real-time tracking because
#it has compact row-based format
#ideal for Kafka & streaming pipelines

PHASE 8 — DEBUGGING & ERROR ANALYSIS

Tasks

32. Identify potential NoneType errors
33. Identify schema mismatch risks
34. Debug an intentionally broken transformation
35. Use explain plan to find inefficient operations

In [38]:
final_df.explain(True)

== Parsed Logical Plan ==
'RepartitionByExpression ['city]
+- RepartitionByExpression [city#114]
   +- Project [city#114, driver_id#115, delivery_id#113, status#138, delivery_time_minutess#117, delivery_timestamp#118, delivery_time_minutes#159, delivery_ts#205, driver_name#73, position#74, zone#86]
      +- Join LeftOuter, (city#114 = city#85)
         :- Project [driver_id#115, delivery_id#113, city#114, status#138, delivery_time_minutess#117, delivery_timestamp#118, delivery_time_minutes#159, delivery_ts#205, driver_name#73, position#74]
         :  +- Join LeftOuter, (driver_id#115 = driver_id#72)
         :     :- Filter (status#138 = delivered)
         :     :  +- Deduplicate [delivery_id#113]
         :     :     +- Project [delivery_id#113, city#114, driver_id#115, status#138, delivery_time_minutess#117, delivery_timestamp#118, delivery_time_minutes#159, cast(coalesce(try_to_timestamp(delivery_timestamp#118, Some(yyyy-MM-dd HH:mm), TimestampType, Some(Etc/UTC), false), try_to_t