In [1]:
# Configure the environment variables PySpark reads; this must run before the
# SparkSession is created.
# NOTE(review): SPARK_HOME is an absolute path specific to one machine —
# adjust it when running this notebook elsewhere.
import os

os.environ.update({
    'SPARK_HOME': "/home/user5/Downloads/spark",
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

Shuffling in Spark — In Short
What it is → Moving data between partitions/nodes so related records end up together.

Why it happens → Needed when:

Grouping (groupBy, reduceByKey)

Joining datasets

Sorting

Repartitioning


Why it’s “bad” →

Slow → network transfer between nodes.

Costly → records are serialized and shuffle files are written to local disk, with extra spills to disk when memory runs out.

Unbalanced → can leave some partitions empty or overloaded.



Mental model
No shuffle → Data stays where it is → fast.

Shuffle → Spark “mixes and sends” data across the cluster → slower.

Use only when necessary.

In [2]:
# Step 1: Create Spark Session

from pyspark.sql import SparkSession

# Build the session for this demo, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("ShuffleDemo")
    .getOrCreate()
)

# The SparkContext behind the session; the RDD examples below go through it.
sc = spark.sparkContext


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/06 12:08:38 WARN Utils: Your hostname, datumlabs-Latitude-5420, resolves to a loopback address: 127.0.1.1; using 192.168.1.167 instead (on interface wlp0s20f3)
25/08/06 12:08:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/06 12:08:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Step 2: Create Sample Data

# Sample sales records as (store_id, sales_amount) pairs
data = [
    ("store1", 100),
    ("store2", 50),
    ("store1", 200),
    ("store2", 75),
    ("store3", 60),
    ("store3", 90),
]

# Request exactly 3 partitions so the per-partition layout is easy to inspect.
rdd = sc.parallelize(data, numSlices=3)
print("Initial number of partitions:", rdd.getNumPartitions())


Initial number of partitions: 3


In [4]:
#Step 3: View Data by Partition

def show_partitions(rdd, label):
    """Print a heading followed by the contents of every partition of ``rdd``.

    Args:
        rdd: Object exposing ``glom().collect()``; the collected result is
            iterated as one entry per partition.
        label: Heading printed (after a blank line) above the partition list.

    Returns:
        None. The function only prints.
    """
    # Heading goes out first, before the collect triggers any Spark job output.
    print(f"\n{label}")
    partition_contents = rdd.glom().collect()
    for index, records in enumerate(partition_contents):
        print(f"Partition {index}: {records}")

show_partitions(rdd=rdd, label="Original data distribution")

# enumerate() is a Python built-in that loops over an iterable (list, tuple, string, ...) and yields each element together with its index.

# glom()
# Spark normally hides partition boundaries from you. glom() turns each partition into a list, so collecting the result shows which records live in which partition.



# How to read the console progress bar "[Stage 0:> (0 + 3) / 3]":
# The 3 at the end → total number of tasks in this stage.

# The 0 before the plus → number of tasks that have finished.

# The + 3 → number of tasks currently running.

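# Note: parallelize() does not deal records out round-robin. It cuts the list into contiguous slices, which is why partition 0 holds the first two records, partition 1 the next two, and partition 2 the last two.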


Original data distribution


[Stage 0:>                                                          (0 + 3) / 3]

Partition 0: [('store1', 100), ('store2', 50)]
Partition 1: [('store1', 200), ('store2', 75)]
Partition 2: [('store3', 60), ('store3', 90)]


                                                                                

In [5]:
# Step 4: No Shuffle Example

# No shuffle: pull the amount out of each record with map(), then sum them.
total_sales = (
    rdd
    .map(lambda record: record[1])
    .sum()
)
print("\nTotal Sales (no shuffle):", total_sales)

# Show the partition layout again (should be unchanged)
show_partitions(rdd, "After no-shuffle operation")

# map -> applies the given function to every element


Total Sales (no shuffle): 575

After no-shuffle operation
Partition 0: [('store1', 100), ('store2', 50)]
Partition 1: [('store1', 200), ('store2', 75)]
Partition 2: [('store3', 60), ('store3', 90)]


In [6]:
# Step 5: With Shuffle Example

# With shuffle: reduceByKey() combines all values that share a key using the
# given function. Records for one key can start out in different partitions,
# so they have to be brought together first.
sales_per_store = rdd.reduceByKey(lambda left, right: left + right)

# Per-store totals
print("\nSales per store (with shuffle):", sales_per_store.collect())

# Partition layout after the shuffle
show_partitions(sales_per_store, "After shuffle (reduceByKey) distribution")


# Why is partition 2 empty?
# Each key is placed in partition hash(key) % numPartitions (reduceByKey's
# default partitionFunc is PySpark's portable_hash). With 3 partitions this
# run produced:
#   hash("store1") % 3 → 1
#   hash("store2") % 3 → 1
#   hash("store3") % 3 → 0
# No key landed on 2, so that partition received no records.


Sales per store (with shuffle): [('store3', 150), ('store1', 300), ('store2', 125)]

After shuffle (reduceByKey) distribution
Partition 0: [('store3', 150)]
Partition 1: [('store1', 300), ('store2', 125)]
Partition 2: []


In [7]:
# Leave the SparkSession as the cell's last expression so the notebook displays it.
spark