In [1]:
import pyspark

In [2]:
# Reuse the active SparkContext if one already exists; otherwise create one.
sc = pyspark.SparkContext.getOrCreate()

In [3]:
# Load the file as an RDD of text lines (requesting at least 5 partitions)
# and parse every line as an integer.
numbers = sc.textFile("../Data/numbers.txt", minPartitions=5).map(int)

## Sort numbers in each partition

In [4]:
# glom() turns each partition into a single list; take(1) shows only the first one.
numbers.glom().take(1)

[[4, 0, 3, 6, 8, 7, 9, 9, 8, 5, 6, 5, 6, 3, 6, 5, 1, 8, 8, 1, 5]]

In [5]:
# Sort within each partition independently (no global ordering across
# partitions), then preview the first partition.
numbers.mapPartitions(sorted).glom().take(1)

[[0, 1, 1, 3, 3, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 8, 8, 8, 8, 9, 9]]

## What is the difference between .map() and .mapPartitions()?

In [6]:
import time
class Power:
    """Raise numbers to a fixed exponent, with a deliberately slow constructor.

    Construction sleeps for ``init_delay`` seconds, which makes the cost of
    creating an instance easy to observe when timing code that builds one.
    """

    def __init__(self, p, init_delay=2):
        """Store the exponent and pause to simulate slow initialisation.

        Args:
            p: Exponent applied by ``applyPower``.
            init_delay: Seconds to sleep during construction. Defaults to 2
                (the previously hard-coded delay); pass 0 to skip the pause.

        Raises:
            ValueError: If ``init_delay`` is negative (raised by ``time.sleep``).
        """
        self.p = p
        time.sleep(init_delay)

    def applyPower(self, x):
        """Return ``x`` raised to the stored exponent ``self.p``."""
        return x**self.p
        
# mapPartitions        
def power_map_partitions(nums):
    """Lazily yield the 5th power of every element of ``nums``.

    One ``Power`` instance is constructed when iteration begins and is shared
    by all elements, so the constructor cost is paid once per call rather
    than once per element.

    Args:
        nums: Iterable of numbers (with ``mapPartitions``, the elements of
            one partition).

    Yields:
        ``x**5`` for each ``x`` in ``nums``, in the original order.
    """
    raise_to_fifth = Power(5).applyPower
    for value in nums:
        yield raise_to_fifth(value)
        
# map
def power_map(num):
    """Return ``num`` raised to the 5th power.

    A brand-new ``Power`` instance is constructed on every call, so the
    constructor cost is paid once per element when this is used with ``map``.
    """
    return Power(5).applyPower(num)

In [7]:
# Time the per-element version. perf_counter() is a monotonic, high-resolution
# clock, so the interval is not affected by system wall-clock adjustments.
t = time.perf_counter()
numbers.map(power_map).collect()
time.perf_counter() - t  # elapsed seconds, displayed as the cell output

42.148236989974976

In [8]:
# Time the per-partition version. perf_counter() is a monotonic, high-resolution
# clock, so the interval is not affected by system wall-clock adjustments.
t = time.perf_counter()
numbers.mapPartitions(power_map_partitions).collect()
time.perf_counter() - t  # elapsed seconds, displayed as the cell output

2.051476001739502

In [9]:
# Shut down the SparkContext now that the notebook is finished with it.
sc.stop()