# RDD Transformation and Action Functions

In [1]:
import findspark 
findspark.init() 

Create a SparkSession to work with the Spark environment

In [2]:
# SparkSession must be imported explicitly; it is imported here, after
# findspark.init() has run in the previous cell.
from pyspark.sql import SparkSession

# Build (or reuse) a local session with 4 worker threads.
# The variable keeps its original name `pyspark` because it is the name this
# notebook exposes; note that it would be rebound by a later `import pyspark`.
pyspark = (
    SparkSession.builder
    .master("local[4]")
    .appName("TransformationAction")
    # The key is "spark.executor.memory"; the misspelled "executer" key was
    # stored but never used by Spark.
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Low-level entry point used by all RDD examples below.
sc = pyspark.sparkContext

In [3]:
listNumber = [1,2,3,5,7,9,3,11,5,13]

In [4]:
listRdd = sc.parallelize(listNumber)

## Single RDD Transformation Functions

In [5]:
listRdd.take(10)

[1, 2, 3, 5, 7, 9, 3, 11, 5, 13]

In [6]:
listRdd.distinct().take(10)

[1, 13, 2, 3, 5, 7, 9, 11]

#### Each number in the list is bound to x and squared

In [7]:
listRdd.map(lambda x: x*x).take(10)

[1, 4, 9, 25, 49, 81, 9, 121, 25, 169]

#### Keep only the numbers greater than 3

In [8]:
listRdd.filter(lambda x: x>3).take(10)

[5, 7, 9, 11, 5, 13]

#### Get all numbers except 3

In [9]:
listRdd.filter(lambda x: x != 3).take(10)

[1, 2, 5, 7, 9, 11, 5, 13]

#### Get only the numbers 1, 2 and 5 from the list

In [10]:
# Keep only the values 1, 2 and 5.
# Compare by value, not identity: `x is 1` tests object identity, which only
# works by accident of CPython's small-int caching and raises a SyntaxWarning.
listRdd.filter(lambda x: x in (1, 2, 5)).take(10)

[1, 2, 5, 5]

#### Keep only the numbers greater than 5 and smaller than 10

In [11]:
listRdd.filter(lambda x: x>5 and x<10).take(10)

[7, 9]

In [12]:
textList = ["I go to school", "Writing Homework", "Playing football"]

In [13]:
textRdd = sc.parallelize(textList)
textRdd.take(10)

['I go to school', 'Writing Homework', 'Playing football']

In [14]:
textRdd.map(lambda x: x.upper()).take(10)

['I GO TO SCHOOL', 'WRITING HOMEWORK', 'PLAYING FOOTBALL']

In [15]:
textRdd.map(lambda x: x.lower()).take(10)

['i go to school', 'writing homework', 'playing football']

In [16]:
textRdd.flatMap(lambda x: x.split(" ")).map(lambda x: x.lower()).take(10)

['i', 'go', 'to', 'school', 'writing', 'homework', 'playing', 'football']

In [17]:
textRdd.flatMap(lambda x: x.split(" ")).map(lambda x: x.upper()).take(10)

['I', 'GO', 'TO', 'SCHOOL', 'WRITING', 'HOMEWORK', 'PLAYING', 'FOOTBALL']

#### sample() draws a sample from the RDD given a with-replacement flag, a fraction and a seed

In [18]:
listRdd.sample(True,0.5,10).take(10)

[9, 3, 3, 11]

In [19]:
listRdd.sample(True,0.5,10).take(10)

[9, 3, 3, 11]

In [20]:
listRdd.sample(True,0.7,42).take(10)

[1, 2, 3, 3, 5, 7, 7, 11, 11, 11]

#### Getting the number of partitions in RDD

In [21]:
listRdd.getNumPartitions()

12

#### Getting maximum value

In [22]:
listRdd.max()

13

#### Getting minimum values

In [23]:
listRdd.min()

1

#### Counting of elements

In [24]:
listRdd.count()

10

#### Getting mean of list

In [25]:
listRdd.mean()

5.9

#### Getting variance of list

In [26]:
listRdd.variance()

14.49

#### Getting standard deviation of list

In [27]:
listRdd.stdev()

3.8065732621348563

#### Set and get name of RDD

In [28]:
listRdd.setName("NumbersRdd")
listRdd.name()

'NumbersRdd'

## Multi RDD Transformation Functions

In [29]:
listNumber1 = [1,3,5,7,9,11]

In [30]:
listNumber2 = [3,5,7,11,13,15]

In [31]:
rdd1 = sc.parallelize(listNumber1)

In [32]:
rdd2 = sc.parallelize(listNumber2)

In [33]:
rdd1.take(10)

[1, 3, 5, 7, 9, 11]

In [34]:
rdd2.take(10)

[3, 5, 7, 11, 13, 15]

In [35]:
# Union of the two RDDs: the elements of rdd1 followed by those of rdd2.
# This is not a join, and duplicates are kept (3, 5, 7 and 11 appear twice).
rdd1.union(rdd2).take(10)

[1, 3, 5, 7, 9, 11, 3, 5, 7, 11]

#### Get intersection between rdd1 and rdd2

In [36]:
rdd1.intersection(rdd2).take(10)

[3, 5, 7, 11]

#### Subtract rdd2 from rdd1 (elements of rdd1 that are not in rdd2)

In [37]:
rdd1.subtract(rdd2).take(10)

[1, 9]

#### Calculating the Cartesian product of rdd1 and rdd2

In [38]:
rdd1.cartesian(rdd2).take(20)

[(1, 3),
 (1, 5),
 (1, 7),
 (1, 11),
 (1, 13),
 (1, 15),
 (3, 3),
 (3, 5),
 (3, 7),
 (3, 11),
 (3, 13),
 (3, 15),
 (5, 3),
 (5, 5),
 (5, 7),
 (5, 11),
 (5, 13),
 (5, 15),
 (7, 3),
 (7, 5)]

# Basic RDD Action Functions

In [39]:
myList = [1,2,3,4,5,7,9,11,3,5,7,3]
actionRdd = sc.parallelize(myList)
actionRdd.take(20)

[1, 2, 3, 4, 5, 7, 9, 11, 3, 5, 7, 3]

#### It gets all elements in RDD

In [40]:
actionRdd.collect()

[1, 2, 3, 4, 5, 7, 9, 11, 3, 5, 7, 3]

#### Count RDD elements

In [41]:
actionRdd.count()

12

#### It counts elements by value

In [42]:
actionRdd.countByValue()

defaultdict(int, {1: 1, 2: 1, 3: 3, 4: 1, 5: 2, 7: 2, 9: 1, 11: 1})

#### It counts elements by key (key,count)

In [43]:
actionRdd.map(lambda x: (x,1)).countByKey().items()

dict_items([(1, 1), (2, 1), (3, 3), (4, 1), (5, 2), (7, 2), (9, 1), (11, 1)])

In [44]:
actionRdd.top(3)

[11, 9, 7]

In [45]:
actionRdd.takeOrdered(3)

[1, 2, 3]

#### takeSample() creates a sample from population with replacement false and sample size 10

In [46]:
actionRdd.takeSample(False,10)

[2, 3, 4, 5, 11, 1, 7, 7, 5, 3]

#### takeSample() creates a sample from population with replacement True and sample size 10

In [47]:
actionRdd.takeSample(True,10)

[3, 7, 11, 7, 3, 7, 3, 2, 3, 11]

#### reduce() aggregates the elements of the RDD using a function (here: the sum of the numbers)

In [48]:
actionRdd.reduce(lambda x,y: x+y)

60

#### Multiplication of RDD numbers

In [49]:
actionRdd.reduce(lambda x,y: x*y)

26195400

#### fold() is the same as reduce(), but it takes an initial value such as zero

In [50]:
actionRdd.fold(0,lambda x,y: x+y)

60

#### The first function (seqOp) aggregates the elements within each partition into a (sum, count) tuple
#### The second function (combOp) combines the per-partition tuples into the final result

In [51]:
def seqOp(acc, val):
    """Fold one element into a (sum, count) accumulator."""
    running_sum = acc[0] + val
    seen = acc[1] + 1
    return (running_sum, seen)


def combOp(acc1, acc2):
    """Merge two (sum, count) accumulators component-wise."""
    merged_sum = acc1[0] + acc2[0]
    merged_count = acc1[1] + acc2[1]
    return (merged_sum, merged_count)
actionRdd.aggregate((0,0), seqOp, combOp)

(60, 12)

#### Get first element of list

In [52]:
actionRdd.first()

1

#### Save our RDD as partitions in text format

In [54]:
# Write the RDD out in text format under the relative path "myList".
# NOTE(review): re-running this cell presumably fails if "myList" already
# exists — confirm, and remove the directory or pick a new path first.
actionRdd.saveAsTextFile("myList")