In [1]:
!pip install pyspark



In [2]:
from pyspark import SparkContext

# getOrCreate() reuses the context that is already active, so re-running this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
sc

In [3]:
# flatMap: every element n is expanded to range(1, 2n) and the pieces are
# flattened into a single RDD.
rdd = sc.parallelize([2, 3, 4])
newRDD = rdd.flatMap(lambda n: range(1, 2 * n))
newRDD.collect()


[1, 2, 3, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7]

In [4]:
# mapValues (lambda form): transform only the value of each (key, value) pair.
rdd = sc.parallelize([
    ("a", ["apple", "banana", "lemon"]),
    ("b", ["grapes"]),
])
rdd.mapValues(lambda fruits: len(fruits)).collect()


[('a', 3), ('b', 1)]

In [5]:
# mapValues, passing the builtin `len` directly instead of wrapping it in a lambda.
rdd = sc.parallelize([
    ("a", ["apple", "banana", "lemon"]),
    ("b", ["grapes"]),
])
(
    rdd
    .mapValues(len)
    .collect()
)

[('a', 3), ('b', 1)]

In [6]:
# map: same per-key counts as mapValues, but the key has to be carried through
# by hand. Uses the `rdd` of (key, list) pairs built in the previous cell.
rdd.map(lambda pair: (pair[0], len(pair[1]))).collect()

[('a', 3), ('b', 1)]

In [7]:
# filter: keep only the elements for which the predicate is true (even numbers).
rdd = sc.parallelize([1, 2, 3, 4, 5])
rdd.filter(lambda n: n % 2 == 0).collect()


[2, 4]

Reduce

In [8]:
# reduce: fold all elements into one value with a binary function (here: sum).
(
    sc.parallelize([1, 2, 3, 4, 5])
    .reduce(lambda total, n: total + n)
)

15

In [9]:
# reduceByKey: combine the values of each key (here: sum the counts per word).
rdd = sc.parallelize([
    ("to", 1), ("be", 1), ("or", 1),
    ("not", 1), ("to", 1), ("be", 1),
])
rdd.reduceByKey(lambda total, count: total + count).collect()


[('to', 2), ('be', 2), ('or', 1), ('not', 1)]

In [10]:
# union: concatenate two RDDs; duplicates are kept.
rdd = sc.parallelize([1, 1, 2, 3])
(
    rdd
    .union(rdd)
    .collect()
)

[1, 1, 2, 3, 1, 1, 2, 3]

In [11]:
# distinct: drop duplicate elements (the result order is not the input order).
(
    sc.parallelize([1, 1, 2, 3])
    .distinct()
    .collect()
)


[2, 1, 3]

In [12]:
# join (inner): only keys found in both RDDs survive (1 and 5 here).
rdd1 = sc.parallelize([
    (1, 'a'), (1, 'b'), (5, 'c'), (2, 'd'), (3, 'e'),
])
rdd2 = sc.parallelize([
    (1, 'AA'), (5, 'BB'), (5, 'CC'), (6, 'DD'),
])
rdd1.join(rdd2).collect()

[(1, ('a', 'AA')), (1, ('b', 'AA')), (5, ('c', 'BB')), (5, ('c', 'CC'))]

In [13]:
# leftOuterJoin: keep every key of rdd1; a missing right-hand value becomes None.
rdd1 = sc.parallelize([
    (1, 'a'), (1, 'b'), (5, 'c'), (2, 'd'), (3, 'e'),
])
rdd2 = sc.parallelize([
    (1, 'AA'), (5, 'BB'), (5, 'CC'), (6, 'DD'),
])
rdd1.leftOuterJoin(rdd2).collect()


[(1, ('a', 'AA')),
 (1, ('b', 'AA')),
 (5, ('c', 'BB')),
 (5, ('c', 'CC')),
 (2, ('d', None)),
 (3, ('e', None))]

In [14]:
# rightOuterJoin: keep every key of rdd2; a missing left-hand value becomes None.
rdd1 = sc.parallelize([
    (1, 'a'), (1, 'b'), (5, 'c'), (2, 'd'), (3, 'e'),
])
rdd2 = sc.parallelize([
    (1, 'AA'), (5, 'BB'), (5, 'CC'), (6, 'DD'),
])
rdd1.rightOuterJoin(rdd2).collect()

[(1, ('a', 'AA')),
 (1, ('b', 'AA')),
 (5, ('c', 'BB')),
 (5, ('c', 'CC')),
 (6, (None, 'DD'))]

In [15]:
# fullOuterJoin: keep the keys of both RDDs; whichever side is missing becomes None.
rdd1 = sc.parallelize([
    (1, 'a'), (1, 'b'), (5, 'c'), (2, 'd'), (3, 'e'),
])
rdd2 = sc.parallelize([
    (1, 'AA'), (5, 'BB'), (5, 'CC'), (6, 'DD'),
])
rdd1.fullOuterJoin(rdd2).collect()

[(1, ('a', 'AA')),
 (1, ('b', 'AA')),
 (5, ('c', 'BB')),
 (5, ('c', 'CC')),
 (2, ('d', None)),
 (6, (None, 'DD')),
 (3, ('e', None))]

In [16]:
# cartesian: every pairing of an element with an element (3 x 3 = 9 pairs).
rdd = sc.parallelize([1, 2, 3])
(
    rdd
    .cartesian(rdd)
    .collect()
)

[(1, 1), (1, 2), (1, 3), (2, 1), (3, 1), (2, 2), (2, 3), (3, 2), (3, 3)]

In [17]:
# groupByKey: the grouped values come back as ResultIterable objects rather
# than plain lists, so the collected output is not directly readable.
rdd = sc.parallelize([
    ("a", 1),
    ("b", 1),
    ("a", 1),
])
rdd.groupByKey().collect()


[('b', <pyspark.resultiterable.ResultIterable at 0x7fad233aa5d0>),
 ('a', <pyspark.resultiterable.ResultIterable at 0x7fad233aa9d0>)]

In [18]:
# groupByKey + mapValues(list): turn each group into a list so it can be read.
rdd = sc.parallelize([
    ("a", 1), ("b", 1), ("a", 1), ("a", 2), ("b", 3),
])
(
    rdd
    .groupByKey()
    .mapValues(list)
    .collect()
)


[('b', [1, 3]), ('a', [1, 1, 2])]

In [19]:
# sortByKey: returns a NEW RDD ordered by key; `rdd` itself is left unchanged,
# so the sorted result has to be the expression that the cell displays.
rdd = sc.parallelize([("c", 1), ("a", 1), ("b", 1)])
rdd.sortByKey().collect()

[('a', 1), ('b', 1), ('c', 1)]

In [20]:
# aggregateByKey: gather the distinct values of each key into a set
def mySequenceFunction(x, y):
    """Add the single value ``y`` to the accumulator ``x`` and return ``x``.

    ``x`` is modified in place (via its ``add`` method) and the very same
    object is handed back, so no new accumulator is allocated per element.
    """
    x.add(y)
    return x
 
def myCombinerFunction(x, y):
    """Merge the accumulator ``y`` into the accumulator ``x`` and return ``x``.

    ``x`` is modified in place (via its ``update`` method); ``y`` is only read.
    """
    x.update(y)
    return x
rdd = sc.parallelize([
    ("c1", "p1"), ("c2", "p1"), ("c1", "p1"), ("c2", "p2"), ("c2", "p3"),
])
# Every key starts from an empty set; mySequenceFunction folds one value into a
# set and myCombinerFunction merges two partial sets built for the same key.
rdd.aggregateByKey(set(), mySequenceFunction, myCombinerFunction).collect()

[('c1', {'p1'}), ('c2', {'p1', 'p2', 'p3'})]

In [21]:
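# treeAggregate is an action: it returns the aggregated value directly, so there
# is no .collect(), and the zero value must be a set to work with the set-based
# helpers defined above (not 0).
# rdd.treeAggregate(set(), mySequenceFunction, myCombinerFunction)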

In [22]:
# zip: pair the i-th element of one RDD with the i-th element of the other.
x = sc.parallelize(range(5))
y = sc.parallelize(range(1000, 1005))
(
    x
    .zip(y)
    .collect()
)

[(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]

In [23]:
# zipWithIndex: pair each element with its position in the RDD.
# Uses the RDD `y` created in the zip cell above.
y.zipWithIndex().collect()

[(1000, 0), (1001, 1), (1002, 2), (1003, 3), (1004, 4)]

In [24]:
# zipWithIndex on an RDD split over 3 partitions: as the output shows, the
# indices are still consecutive across partitions.
(
    sc.parallelize(["a", "b", "c", "d"], 3)
    .zipWithIndex()
    .collect()
)

[('a', 0), ('b', 1), ('c', 2), ('d', 3)]

# Glom Operation
The glom() operation returns an RDD created by coalescing all the elements within each partition into a list. Glom is a highly useful operation when you want to access an RDD in batches. Listing 1.34 is a simple example of how we can access the data batches of an RDD.

In [25]:
# glom: one list per partition (3 partitions requested here); sorted() only
# orders the collected lists on the driver before they are displayed.
rdd = sc.parallelize([1, 2, 3, 4], 3)
sorted(rdd.glom().collect())

[[1], [2], [3, 4]]

In [26]:
# glom: 8 elements over 4 partitions give one 2-element list per partition.
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8], 4)
(
    rdd
    .glom()
    .collect()
)

[[1, 2], [3, 4], [5, 6], [7, 8]]

In [27]:
# repartition: redistribute the data over 2 partitions (this always shuffles).
# The source RDD is rebuilt here and is not overwritten with its repartitioned
# version, so re-running this cell always starts from the same 4-partition input.
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8], 4)
rdd.repartition(2).glom().collect()

[[1, 2, 5, 6, 7, 8], [3, 4]]

In [28]:
# coalesce: reduce the number of partitions while moving less data between
# partitions than repartition() does.
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8], 4)
(
    rdd
    .coalesce(2)
    .glom()
    .collect()
)

[[1, 2, 3, 4], [5, 6, 7, 8]]

In [29]:
 # cache: mark the RDD for caching; cache() returns the RDD itself, which is
 # why the cell displays the RDD's repr.
 rdd = sc.parallelize([1, 2, 3, 4]).cache()
 rdd

ParallelCollectionRDD[116] at readRDDFromFile at PythonRDD.scala:274

In [30]:
# countByValue: action that returns a {value: number of occurrences} mapping.
(
    sc.parallelize([1, 2, 1, 2, 2], 2)
    .countByValue()
)

defaultdict(int, {1: 2, 2: 3})

In [31]:
# first: return the first element of the RDD (no sorting is applied).
(
    sc.parallelize([(4, 2), (1, 2), (3, 2)])
    .first()
)


(4, 2)

In [32]:
# take(n): return the first n elements of the RDD as a list.
(
    sc.parallelize([2, 3, 4, 5, 6])
    .take(2)
)

[2, 3]

In [33]:
# groupByKey: print the input pairs, then each key with its grouped values
# converted to a list so they are readable.
x = sc.parallelize([('B', 5), ('B', 4), ('A', 3), ('A', 2), ('A', 1)])
y = x.groupByKey()
print(x.collect())
print([(key, list(values)) for key, values in y.collect()])

[('B', 5), ('B', 4), ('A', 3), ('A', 2), ('A', 1)]
[('B', [5, 4]), ('A', [3, 2, 1])]


In [34]:
# aggregateByKey: build, for every key, a list of (value, value squared) pairs.
def mergeVal(aggregated, el):
    """Return the running list extended with the pair (el, el**2)."""
    return aggregated + [(el, el**2)]


def mergeComb(agg1, agg2):
    """Concatenate two partial lists built for the same key."""
    return agg1 + agg2


x = sc.parallelize([('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)])
zeroValue = []  # the empty list is the starting value for every key
y = x.aggregateByKey(zeroValue, mergeVal, mergeComb)
print(x.collect())
print(y.collect())

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
[('B', [(1, 1), (2, 4)]), ('A', [(3, 9), (4, 16), (5, 25)])]
