In [1]:
import shutil

from pyspark import SparkConf, SparkContext

# Build the Spark configuration in two steps: create it, then set the app name.
conf = SparkConf()
conf.setAppName('RDD_Actions')

# Obtain the SparkContext through getOrCreate, handing it the configuration above.
sc = SparkContext.getOrCreate(conf=conf)

# RDD Actions

Unlike transformations, which produce new RDDs, actions return a value to the Spark driver program (or write data to external storage). Because RDDs are evaluated lazily, calling an action is what triggers the computation of a previously constructed RDD.

## Reduce Action

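Aggregates the elements of the dataset using a function that takes two arguments and returns one. The function should be commutative and associative so that it can be applied correctly in parallel.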

In [3]:
from operator import add
# Fold the integers 1..5 together with operator.add, giving their sum.
(
    sc.parallelize(range(1, 6))
    .reduce(add)
)

15

In [19]:
# Count the elements: replace every element with 1, then add the ones together.
(
    sc.parallelize(2 for _ in range(10))
    .map(lambda _element: 1)
    .reduce(add)
)

10

In [23]:
# Create the numbers 1..10, double each one, then add the doubled values together.
# The RDD feeds exactly one action, so it is not cached: cache() only pays off
# when the same RDD is reused by more than one action.
print(
    'After transformation and reduction : ',
    sc.parallelize(range(1, 11)).map(lambda c: c * 2).reduce(add),
)

After transformation and reduction :  110


## First Action

Returns the first element of the RDD.

In [24]:
# Fetch only the leading element of the RDD (10 for this input).
(
    sc.parallelize([10, 1, 4, 5])
    .first()
)

10

## takeOrdered Action

Returns a list containing the first *n* elements of the RDD in ascending order, where *n* is the number passed to `takeOrdered`.

In [28]:
# Ask for the 5 smallest elements in ascending order; the RDD holds exactly 5,
# so the whole dataset comes back sorted.
(
    sc.parallelize([10, 1, 5, 7, 1])
    .takeOrdered(5)
)

[1, 1, 5, 7, 10]

## Take Action

Returns a list containing the first *n* elements of the RDD, where *n* is the number passed to `take`.

E.g. `take(5)` returns the first 5 elements as they occur in the RDD, without sorting them.

In [29]:
# Pull the first 10 of the 100 elements back to the driver.
(
    sc.parallelize(range(0, 100))
    .take(10)
)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

## Count

Returns the number of elements in the RDD as an integer.

In [31]:
# Count the integers 20..162 (range excludes 163), i.e. 143 elements.
(
    sc.parallelize(range(20, 163))
    .count()
)

143

## Collect

- Returns all elements of the dataset to the driver program as a list.
- Should be used with care, because every worker sends its data to the driver program.
- If the dataset is huge, the driver may fail with an <strong>OutOfMemoryError</strong>.

In [32]:
# Bring every element back to the driver as a Python list, duplicates included.
(
    sc.parallelize(['Apple', 'Banana', 'Grapes', 'Apple'])
    .collect()
)

['Apple', 'Banana', 'Grapes', 'Apple']

In [33]:
# distinct() drops the duplicate 'Apple'. It shuffles the data, so the order in
# which collect() returns the remaining elements is not guaranteed; sort on the
# driver to keep the displayed result reproducible.
sorted(
    sc.parallelize(['Apple', 'Banana', 'Grapes', 'Apple'])
    .distinct()
    .collect()
)

['Apple', 'Banana', 'Grapes']

## saveAsTextFile

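Writes the entire RDD as text to the specified path on the local file system or HDFS. The path becomes a directory containing one part file per partition, and the call fails if the path already exists.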

In [34]:
# saveAsTextFile creates a *directory* at this path holding one part file per
# partition (8 here) and raises if the path already exists, which would make the
# cell fail on every re-run.
output_dir = './data/saveAsTextRDD.txt'

# Clear the output of a previous run; a missing directory is ignored.
# NOTE(review): assumes the relative path resolves to the local file system
# (local mode) -- if the default file system is HDFS, delete it through the
# Hadoop FileSystem API instead.
shutil.rmtree(output_dir, ignore_errors=True)

sc.parallelize(range(1, 100), 8).saveAsTextFile(output_dir)

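## ForEach

Applies a function to each element of the RDD for its side effects; nothing is returned to the driver. The function runs on the executors, so anything it prints goes to the executor's output (console or logs) and may not appear in the notebook.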

In [43]:
def f(x):
    """Print one RDD element; this runs in the executor process, not the driver."""
    print(x)


result = sc.parallelize([1, 2, 3, 4])

# foreach is an action that returns None and calls `f` on the executors, so the
# printed values end up in the executor's stdout (console or log files) rather
# than in this notebook cell.
result.foreach(f)

# To observe a foreach side effect from the driver, update an accumulator inside
# the function and read its value afterwards.
element_sum = sc.accumulator(0)
result.foreach(lambda x: element_sum.add(x))
element_sum.value  # expected: 10 (1 + 2 + 3 + 4)

## ForEachPartition

Executes the function once for each partition, passing it an iterator over that partition's elements.

In [40]:
# foreachPartition calls the function once per partition with an iterator over
# that partition's elements. It runs on the executors and returns None, so print
# output goes to the executor's stdout rather than the notebook, and printing the
# iterator itself would only show its repr. Materialise the elements for the
# executor log and count the calls with an accumulator so the driver has a
# result to display.
partitions_seen = sc.accumulator(0)


def show_partition(partition):
    """Print one partition's elements and record that the partition was visited."""
    print(list(partition))
    partitions_seen.add(1)


sc.parallelize(range(1, 11), 5).foreachPartition(show_partition)
partitions_seen.value  # expected: 5, one call per partition

## Mathematical Operations

In [44]:
# RDD of the integers 1..99 (range excludes 100); the statistics cells below read it.
numbers = sc.parallelize(range(1, 100))

In [49]:
# Smallest element of the RDD (1 for the range 1..99).
numbers.min()


1

In [50]:
# Largest element of the RDD (99 for the range 1..99).
numbers.max()

99

In [51]:
# Sum of all elements: 1 + 2 + ... + 99 = 4950.
numbers.sum()


4950

In [52]:
# Population variance (divides by N, not N - 1): (99**2 - 1) / 12, about 816.67, for 1..99.
numbers.variance()


816.6666666666666

In [53]:
# Arithmetic mean of the elements: 4950 / 99 = 50.0.
numbers.mean()


50.0

In [54]:
# Population standard deviation: the square root of the variance() result above.
numbers.stdev()

28.577380332470412