In [2]:
from pyspark.sql import SparkSession
# Build the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Read CSV example")
    .getOrCreate()
)

Exercise 1.1

Create a transformation pipeline that:
Filters only Completed rides
Selects ride_id, city, distance_km
Tasks:
Do not trigger any action
Explain whether Spark executed anything

In [3]:

# Sample ride records, one tuple per ride, in the column order given by rides_cols.
rides_data = [
    ("R001","U001","Hyderabad",12.5,240,"Completed"),
    ("R002","U002","Delhi",8.2,180,"Completed"),
    ("R003","U003","Mumbai",15.0,300,"Cancelled"),
    ("R004","U004","Bangalore",5.5,120,"Completed"),
    ("R005","U005","Hyderabad",20.0,360,"Completed"),
    ("R006","U006","Delhi",25.0,420,"Completed"),
    ("R007","U007","Mumbai",7.5,150,"Completed"),
    ("R008","U008","Bangalore",18.0,330,"Completed"),
    ("R009","U009","Delhi",6.0,140,"Cancelled"),
    ("R010","U010","Hyderabad",10.0,200,"Completed")
]
rides_cols = ["ride_id","user_id","city","distance_km","duration_seconds","status"]
rides_df = spark.createDataFrame(data=rides_data, schema=rides_cols)

# Transformations only: no action (count/show/collect/write) is called in this
# cell, so the filter and the projection are merely recorded on the DataFrames.
completed_rides_df = rides_df.where(rides_df["status"] == "Completed")
selected_cols_df = completed_rides_df.select(["ride_id", "city", "distance_km"])




Exercise 1.2

Trigger a single action on the pipeline.
Tasks:

Identify which line caused execution
Explain why previous lines did not execute

In [5]:
# The lazy pipeline (`selected_cols_df`) was already built in the Exercise 1.1
# cell, so it is reused here instead of re-declaring the same sample data and
# the same filter/select transformations a second time.

# count() is the only action in this cell: it is the line that asks Spark for a
# result, and therefore the line that causes the filter + select to be executed.
result_count = selected_cols_df.count()

print(f"Number of completed rides (with selected columns): {result_count}")


Number of completed rides (with selected columns): 8


EXERCISE SET 2 — DAG & LINEAGE

Exercise 2.1

Create a transformation chain with:
Multiple filters
A column selection
Tasks:
Run explain(True)
Identify:
Logical plan
Optimized logical plan
Physical plan

In [7]:



from pyspark.sql.functions import col


# Name each predicate so the three filters can be told apart in the plan output.
is_completed = col("status") == "Completed"        # Filter 1
is_long_distance = col("distance_km") > 10         # Filter 2
is_long_duration = col("duration_seconds") >= 200  # Filter 3

# The filters are deliberately applied as three separate calls (not AND-ed
# together) so the parsed plan shows one Filter node per call.
pipeline_df = rides_df.filter(is_completed)
pipeline_df = pipeline_df.filter(is_long_distance)
pipeline_df = pipeline_df.filter(is_long_duration)
pipeline_df = pipeline_df.select("ride_id", "city", "distance_km")  # Column selection

# extended=True prints the parsed, analyzed and optimized logical plans plus the physical plan.
pipeline_df.explain(extended=True)


== Parsed Logical Plan ==
'Project ['ride_id, 'city, 'distance_km]
+- Filter (duration_seconds#23L >= cast(200 as bigint))
   +- Filter (distance_km#22 > cast(10 as double))
      +- Filter (status#24 = Completed)
         +- LogicalRDD [ride_id#19, user_id#20, city#21, distance_km#22, duration_seconds#23L, status#24], false

== Analyzed Logical Plan ==
ride_id: string, city: string, distance_km: double
Project [ride_id#19, city#21, distance_km#22]
+- Filter (duration_seconds#23L >= cast(200 as bigint))
   +- Filter (distance_km#22 > cast(10 as double))
      +- Filter (status#24 = Completed)
         +- LogicalRDD [ride_id#19, user_id#20, city#21, distance_km#22, duration_seconds#23L, status#24], false

== Optimized Logical Plan ==
Project [ride_id#19, city#21, distance_km#22]
+- Filter (((isnotnull(status#24) AND isnotnull(distance_km#22)) AND isnotnull(duration_seconds#23L)) AND ((status#24 = Completed) AND ((distance_km#22 > 10.0) AND (duration_seconds#23L >= 200))))
   +- LogicalR

Exercise 2.2

Reorder transformations (filter after join vs before join).
Tasks:
Compare DAGs
Identify which plan is more efficient and why

In [9]:


# rides_df (built in the Exercise 1.1 cell) is reused as the rides side of the
# join; re-declaring the identical sample rows here added nothing. Only the
# per-city surge lookup table is new in this cell.
surge_data = [
    ("Hyderabad",1.2),
    ("Delhi",1.5),
    ("Mumbai",1.8),
    ("Bangalore",1.3)
]
surge_cols = ["city","surge_multiplier"]
surge_df = spark.createDataFrame(surge_data, surge_cols)


In [10]:

from pyspark.sql.functions import col


# Both pipelines share the same predicate and the same output columns; they
# differ only in whether the filter is written before or after the join.
long_ride = col("distance_km") > 10
output_cols = ["ride_id", "city", "distance_km", "surge_multiplier"]

# Pipeline A: filter first, then join.
filtered_rides = rides_df.filter(long_ride)
pipeline_A = filtered_rides.join(surge_df, on="city", how="inner").select(*output_cols)

# Pipeline B: join first, then filter.
joined_rides = rides_df.join(surge_df, on="city", how="inner")
pipeline_B = joined_rides.filter(long_ride).select(*output_cols)

# Print each banner followed by the full set of plans for that pipeline.
for banner, pipeline in (
    ("\n=== Pipeline A (Filter BEFORE Join) — explain(True) ===", pipeline_A),
    ("\n=== Pipeline B (Join BEFORE Filter) — explain(True) ===", pipeline_B),
):
    print(banner)
    pipeline.explain(True)




=== Pipeline A (Filter BEFORE Join) — explain(True) ===
== Parsed Logical Plan ==
'Project ['ride_id, 'city, 'distance_km, 'surge_multiplier]
+- Project [city#34, ride_id#32, user_id#33, distance_km#35, duration_seconds#36L, status#37, surge_multiplier#39]
   +- Join Inner, (city#34 = city#38)
      :- Filter (distance_km#35 > cast(10 as double))
      :  +- LogicalRDD [ride_id#32, user_id#33, city#34, distance_km#35, duration_seconds#36L, status#37], false
      +- LogicalRDD [city#38, surge_multiplier#39], false

== Analyzed Logical Plan ==
ride_id: string, city: string, distance_km: double, surge_multiplier: double
Project [ride_id#32, city#34, distance_km#35, surge_multiplier#39]
+- Project [city#34, ride_id#32, user_id#33, distance_km#35, duration_seconds#36L, status#37, surge_multiplier#39]
   +- Join Inner, (city#34 = city#38)
      :- Filter (distance_km#35 > cast(10 as double))
      :  +- LogicalRDD [ride_id#32, user_id#33, city#34, distance_km#35, duration_seconds#36L, stat

EXERCISE SET 3 — PARTITIONS & SHUFFLE

Exercise 3.1

Check the number of partitions of rides_df.
Tasks:
Repartition into 4 partitions

Coalesce into 1 partition
Observe number of output files when writing to Parquet

In [11]:
# Derive the two re-partitioned variants up front.
rides_4 = rides_df.repartition(4)
rides_1 = rides_4.coalesce(1)

# (label to print, DataFrame, Parquet output directory) for each variant.
partition_variants = [
    ("Initial partitions:", rides_df, "/tmp/initial"),
    ("After repartition(4):", rides_4, "/tmp/repartition4"),
    ("After coalesce(1):", rides_1, "/tmp/coalesce1"),
]

# Report the partition count of every variant first ...
for label, frame, out_dir in partition_variants:
    print(label, frame.rdd.getNumPartitions())

# ... then write each one out so the files in each directory can be compared.
for label, frame, out_dir in partition_variants:
    frame.write.mode("overwrite").parquet(out_dir)


Initial partitions: 2
After repartition(4): 4
After coalesce(1): 1


Exercise 3.2

Repartition rides by city.
Tasks:
Run explain(True)
Identify whether a shuffle is introduced

In [13]:
# Repartition by the values of the "city" column (no explicit partition count given).
rides_by_city = rides_df.repartition("city")

city_partition_count = rides_by_city.rdd.getNumPartitions()
print("Partitions after repartition by city:", city_partition_count)

# Show all plans; look for an Exchange operator in the physical plan.
rides_by_city.explain(extended=True)


Partitions after repartition by city: 1
== Parsed Logical Plan ==
'RepartitionByExpression ['city]
+- LogicalRDD [ride_id#32, user_id#33, city#34, distance_km#35, duration_seconds#36L, status#37], false

== Analyzed Logical Plan ==
ride_id: string, user_id: string, city: string, distance_km: double, duration_seconds: bigint, status: string
RepartitionByExpression [city#34]
+- LogicalRDD [ride_id#32, user_id#33, city#34, distance_km#35, duration_seconds#36L, status#37], false

== Optimized Logical Plan ==
RepartitionByExpression [city#34]
+- LogicalRDD [ride_id#32, user_id#33, city#34, distance_km#35, duration_seconds#36L, status#37], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   ResultQueryStage 1
   +- AQEShuffleRead coalesced
      +- ShuffleQueryStage 0
         +- Exchange hashpartitioning(city#34, 200), REPARTITION_BY_COL, [plan_id=349]
            +- *(1) Scan ExistingRDD[ride_id#32,user_id#33,city#34,distance_km#35,duration_seconds#36L,stat

EXERCISE SET 4 — JOIN WITHOUT BROADCAST (BAD DAG)

Exercise 4.1

Join rides_df with surge_df on city without using broadcast.
Tasks:
Run explain(True)
Identify:
Join type
Exchange operators
Sort operations
Stage boundaries

In [19]:

# Setting the threshold to -1 disables automatic broadcast joins for this
# session, so the plan below shows the join without a broadcast.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# Inner join of rides with the surge lookup table on the shared "city" column.
join_df = rides_df.join(surge_df, "city", "inner")
join_df.explain(extended=True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [city])
:- LogicalRDD [ride_id#32, user_id#33, city#34, distance_km#35, duration_seconds#36L, status#37], false
+- LogicalRDD [city#38, surge_multiplier#39], false

== Analyzed Logical Plan ==
city: string, ride_id: string, user_id: string, distance_km: double, duration_seconds: bigint, status: string, surge_multiplier: double
Project [city#34, ride_id#32, user_id#33, distance_km#35, duration_seconds#36L, status#37, surge_multiplier#39]
+- Join Inner, (city#34 = city#38)
   :- LogicalRDD [ride_id#32, user_id#33, city#34, distance_km#35, duration_seconds#36L, status#37], false
   +- LogicalRDD [city#38, surge_multiplier#39], false

== Optimized Logical Plan ==
Project [city#34, ride_id#32, user_id#33, distance_km#35, duration_seconds#36L, status#37, surge_multiplier#39]
+- Join Inner, (city#34 = city#38)
   :- Filter isnotnull(city#34)
   :  +- LogicalRDD [ride_id#32, user_id#33, city#34, distance_km#35, duration_seconds#36L, status#37], 

Exercise 4.2

Apply a filter (distance_km > 10) before the join.
Tasks:
Observe whether shuffle is removed
Explain why or why not

In [20]:
from pyspark.sql.functions import col
# Keep automatic broadcast joins disabled, as in Exercise 4.1, so the two plans
# can be compared like for like.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

filtered_join_df = (
    rides_df
        .filter(col("distance_km") > 10)      # selective filter applied before the join
        .join(surge_df, on="city", how="inner")
)

# Print the plans: without this call nothing is displayed, and the presence or
# absence of Exchange (shuffle) operators cannot be observed.
filtered_join_df.explain(True)


Exercise 5.1

Apply a broadcast hint to surge_df.
Tasks:
Run explain(True)
Identify:
Join type
BroadcastExchange
Disappearance of shuffles

In [21]:
from pyspark.sql.functions import col, broadcast

# Rides side of the join: completed rides only.
completed_only = rides_df.filter(col("status") == "Completed")

# broadcast() attaches a broadcast hint to surge_df before it is joined.
hinted_surge = broadcast(surge_df)
with_surge = completed_only.join(hinted_surge, "city", "inner")

broadcast_join_df = with_surge.select("ride_id", "city", "distance_km", "surge_multiplier")
broadcast_join_df.explain(extended=True)


== Parsed Logical Plan ==
'Project ['ride_id, 'city, 'distance_km, 'surge_multiplier]
+- Project [city#34, ride_id#32, user_id#33, distance_km#35, duration_seconds#36L, status#37, surge_multiplier#39]
   +- Join Inner, (city#34 = city#38)
      :- Filter (status#37 = Completed)
      :  +- LogicalRDD [ride_id#32, user_id#33, city#34, distance_km#35, duration_seconds#36L, status#37], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#38, surge_multiplier#39], false

== Analyzed Logical Plan ==
ride_id: string, city: string, distance_km: double, surge_multiplier: double
Project [ride_id#32, city#34, distance_km#35, surge_multiplier#39]
+- Project [city#34, ride_id#32, user_id#33, distance_km#35, duration_seconds#36L, status#37, surge_multiplier#39]
   +- Join Inner, (city#34 = city#38)
      :- Filter (status#37 = Completed)
      :  +- LogicalRDD [ride_id#32, user_id#33, city#34, distance_km#35, duration_seconds#36L, status#37], false
      +- ResolvedHint (st

Exercise 5.2

Compare physical plans from:
Exercise 4.1
Exercise 5.1
Tasks:
List operators that disappeared
Explain performance impact