In [1]:
# ########## ONLY in Colab ##########
# !pip3 install pyspark
# ########## ONLY in Colab ##########

In [2]:
# Linking with Spark
from pyspark import SparkContext, SparkConf

In [3]:
# Initializing Spark
conf = SparkConf().setAppName("RDD_practice").setMaster("local[*]")
sc = SparkContext(conf=conf)
print(sc)

<SparkContext master=local[*] appName=RDD_practice>


In [4]:
# Generate random data:
import random
# Generate 10 random numbers between 0 and 40
randomlist = random.sample(range(0, 40), 10)
print(randomlist)

[1, 36, 3, 29, 19, 26, 39, 25, 15, 11]


In [5]:
# Create RDD:
rdd1 = sc.parallelize(randomlist, 4)
rdd1.collect()

[1, 36, 3, 29, 19, 26, 39, 25, 15, 11]

In [6]:
# Data distribution in partitions:
print(rdd1.getNumPartitions())
print(rdd1.glom().collect())
print("Two partitions: ", rdd1.glom().take(2))

4
[[1, 36], [3, 29], [19, 26], [39, 25, 15, 11]]
Two partitions:  [[1, 36], [3, 29]]


In [9]:
# Print last partition
for item in rdd1.glom().collect()[3]:
  print(item)

39
25
15
11


In [10]:
# count():
rdd1.count()

10

In [11]:
# first():
rdd1.first()

1

In [13]:
# top():
rdd1.top(2)

[39, 36]

In [14]:
# distinct():
rdd1.distinct().collect()

[36, 1, 29, 25, 26, 3, 19, 39, 15, 11]

In [15]:
# map():
rdd_map = rdd1.map(lambda x:(x+1)*3)
rdd_map.collect()

[6, 111, 12, 90, 60, 81, 120, 78, 48, 36]

In [16]:
# filter(): 
rdd_filter = rdd1.filter(lambda x : x%3==0)
rdd_filter.collect()

[36, 3, 39, 15]

In [17]:
# flatMap():
rdd_flatmap=rdd1.flatMap(lambda x: [x+2,x+5])
print(rdd_flatmap.collect())
print("The summation of elements =", rdd_flatmap.reduce(lambda a,b : a + b))

[3, 6, 38, 41, 5, 8, 31, 34, 21, 24, 28, 31, 41, 44, 27, 30, 17, 20, 13, 16]
The summation of elements = 478


In [18]:
# Descriptive statistics:
print([rdd1.max(), rdd1.min(), rdd1.mean(), rdd1.sum(), round(rdd1.stdev(),2), rdd1.top(2)])

[39, 1, 20.4, 204, 12.31, [39, 36]]


In [19]:
# mapPartitions():

def myfunc(partition):
  sum = 0
  for item in partition:
    sum = sum + item
  yield sum  # "return" causes a function to exit; "yield" is used to define generator and returns an intermediate results.

rdd_mapPartition = rdd1.mapPartitions(myfunc)
rdd_mapPartition.collect()

[37, 32, 45, 90]

In [20]:
# union():
print(rdd1.collect())
rdd2 = sc.parallelize([1, 14, 20, 20, 28, 10, 13, 3],2)
print(rdd2.collect())

rdd_union = rdd1.union(rdd2)
print(rdd_union.getNumPartitions())
print(rdd_union.collect())

[1, 36, 3, 29, 19, 26, 39, 25, 15, 11]
[1, 14, 20, 20, 28, 10, 13, 3]
6
[1, 36, 3, 29, 19, 26, 39, 25, 15, 11, 1, 14, 20, 20, 28, 10, 13, 3]


In [21]:
# intersection():
rdd_intersection = rdd1.intersection(rdd2)
print(rdd_intersection.getNumPartitions())
print(rdd_intersection.collect())

rdd_intersection.glom().collect()

6
[1, 3]


[[], [1], [], [3], [], []]

In [22]:
# Find empty partitions
counter = 0
for item in rdd_intersection.glom().collect():
  if len(item) == 0:
    counter = counter + 1
print(counter)

4


In [23]:
# coalesce(numPartitions):
rdd_intersection.coalesce(1).glom().collect()

[[1, 3]]

In [24]:
# takeSample(withReplacement, num, [seed])
# This method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver’s memory.

rdd1.takeSample(False, 5)

[25, 11, 36, 29, 39]

In [25]:
print(rdd1.collect())
print(rdd1.takeOrdered(5))
print(rdd1.takeOrdered(5, key=lambda x: -x))

[1, 36, 3, 29, 19, 26, 39, 25, 15, 11]
[1, 3, 11, 15, 19]
[39, 36, 29, 26, 25]


In [26]:
# reduce():
# A commutative and associative binary operator.
rdd1.reduce(lambda x,y: x+y)

204

In [27]:
# reduceByKey():
rdd_Rbk = sc.parallelize([(1,4),(7,10),(5,7),(1,12),(7,12),(7,1),(9,1),(7,4)], 2)
print(rdd_Rbk.collect())
rdd_Rbk.reduceByKey(lambda x,y: x+y).collect()


# tabular visualization
import pandas as pd
Counter = pd.DataFrame({'Key': rdd_Rbk.keys().collect(),
                 'Values': rdd_Rbk.values().collect()})
Counter

[(1, 4), (7, 10), (5, 7), (1, 12), (7, 12), (7, 1), (9, 1), (7, 4)]


Unnamed: 0,Key,Values
0,1,4
1,7,10
2,5,7
3,1,12
4,7,12
5,7,1
6,9,1
7,7,4


In [29]:
# sortByKey():
rdd_Rbk.reduceByKey(lambda x,y: x+y).sortByKey().collect()

[(1, 16), (5, 7), (7, 27), (9, 1)]

In [30]:
# countByKey()
rdd_Rbk.countByKey()
rdd_Rbk.countByKey().items()
sorted(rdd_Rbk.countByKey())
sorted(rdd_Rbk.countByKey().items())

[(1, 2), (5, 1), (7, 4), (9, 1)]

In [31]:
# groupByKey():
rdd_group = rdd_Rbk.groupByKey() # or assign a partition 4
rdd_group.getNumPartitions()

rdd_group.collect() # it executes at driver node, not recommended

for item in rdd_group.collect():
  print(item[0], [value for value in item[1]])

1 [4, 12]
7 [10, 12, 1, 4]
5 [7]
9 [1]


In [32]:
# lookup(key):
rdd_Rbk.lookup(7)

[10, 12, 1, 4]

In [33]:
# cache:
# By default, each transformed RDD may be recomputed each time you run an action on it.
# However, you may also persist an RDD in memory using the persist (or cache) method,
# in which case Spark will keep the elements around on the cluster for much faster access the next time you query it.

rdd_Rbk.persist() # OR rdd_Rbk.cache()

ParallelCollectionRDD[48] at readRDDFromFile at PythonRDD.scala:274

In [34]:
# Persistence (https://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence)
from pyspark import StorageLevel
rdd1.persist(StorageLevel.MEMORY_AND_DISK)

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

# New Section