# 5. Pair RDD Transformations
A pair RDD is an RDD whose elements are key/value tuples. Spark provides transformation functions that operate specifically on such key/value pairs.

In [1]:
# findspark.init() has to run before the pyspark import in the next cell so that the
# local Spark installation is importable from this kernel.
import findspark 
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) a local SparkSession with 4 worker threads for this notebook.
# NOTE(review): the variable is called `pyspark`, which reads like the package name;
# it is left unchanged in case other cells refer to it.
pyspark = (
    SparkSession.builder
    .master("local[4]")
    .appName("PairRDD")
    # Key fixed: the original "spark.executer.memory" is not a Spark property,
    # so the intended executor memory setting never took effect.
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# The RDD cells below call `sc.parallelize(...)`; bind the SparkContext explicitly
# so they also work on a freshly started kernel.
sc = pyspark.sparkContext

#  Single PairRDD Transformations

In [4]:
# Sample (name, age) pairs: the name is the key, the age is the value.
age = [
    ("Ali", 40),
    ("Mehmet", 21),
    ("Veli", 26),
    ("Ayse", 33),
    ("Elif", 28),
]

In [5]:
# Distribute the local list of (name, age) tuples as an RDD.
ageRdd = sc.parallelize(age)

### Filtering by age (by value): entries with an age lower than 30

In [6]:
# pair[1] is the age (the value of the pair); show at most three matches.
ageRdd.filter(lambda pair: pair[1] < 30).take(3)

[('Mehmet', 21), ('Veli', 26), ('Elif', 28)]

### Filtering by name (by key)

In [7]:
# pair[0] is the name (the key of the pair); show at most three matches.
ageRdd.filter(lambda pair: pair[0] == "Ali").take(3)

[('Ali', 40)]

In [8]:
# Pair RDD used by the by-key examples; key 3 appears twice, so per-key
# operations have more than one value to merge for it.
rdd = sc.parallelize(
    [
        (1, 2),
        (3, 4),
        (3, 6),
        (4, 7),
        (5, 8),
    ]
)

### reduceByKey() merges the values of each key; here it calculates their sum

In [9]:
# Add up all values that share a key; rdd has only four distinct keys,
# so take(10) returns every result.
rdd.reduceByKey(lambda total, value: total + value).take(10)

[(1, 2), (3, 10), (4, 7), (5, 8)]

### groupByKey() returns a dataset of (K, Iterable<V>) pairs.

In [10]:
# The grouped values are returned as ResultIterable objects, so the output only
# shows their repr; wrap a group with list(...) to see the values inside it.
rdd.groupByKey().take(10)

[(1, <pyspark.resultiterable.ResultIterable at 0x7f8e0b1af2e8>),
 (3, <pyspark.resultiterable.ResultIterable at 0x7f8e0b1af160>),
 (4, <pyspark.resultiterable.ResultIterable at 0x7f8e0b1af208>),
 (5, <pyspark.resultiterable.ResultIterable at 0x7f8e0b1afeb8>)]

In [11]:
# Materialise every group as a sorted list: the bare ResultIterable repr only shows a
# memory address, which hides the grouped values and changes on every run.
rdd.groupByKey().mapValues(sorted).collect()

[(1, [2]), (3, [4, 6]), (4, [7]), (5, [8])]

### combineByKey() aggregates the values of each key; here the result is (key, (sum of values, number of values))

In [12]:
# Build a (sum, count) accumulator for every key.
result = rdd.combineByKey(
    # createCombiner: start an accumulator from the first value seen for a key
    lambda first: (first, 1),
    # mergeValue: fold one more value into an existing accumulator
    lambda acc, nxt: (acc[0] + nxt, acc[1] + 1),
    # mergeCombiners: merge two accumulators built for the same key
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)

In [13]:
# Bring the per-key (sum, count) pairs back to the driver for display.
result.collect()

[(1, (2, 1)), (3, (10, 2)), (4, (7, 1)), (5, (8, 1))]

### mapValues() transforms only the value of each pair and leaves the key unchanged; here every value is multiplied by 10

In [14]:
# Keys stay as they are; only the value of each pair is scaled.
rdd.mapValues(lambda value: value * 10).collect()

[(1, 20), (3, 40), (3, 60), (4, 70), (5, 80)]

#  Multiple PairRDD Transformations 

In [15]:
# Second pair RDD for the two-RDD examples: it shares key 3 with rdd
# and adds key 6, which rdd does not contain.
rdd2 = sc.parallelize(
    [
        (3, 9),
        (3, 11),
        (6, 36),
    ]
)

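### Subtraction of two RDDs: subtract() keeps the elements of the first RDD that do not appear in the second (whole (key, value) pairs are compared, not only keys)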

In [16]:
# rdd and rdd2 share key 3 but contain no identical (key, value) pair,
# so every element of rdd is kept.
rdd.subtract(rdd2).take(10)

[(3, 4), (1, 2), (4, 7), (5, 8), (3, 6)]

In [17]:
# Same comparison in the other direction: no pair of rdd2 occurs in rdd,
# so every element of rdd2 is kept.
rdd2.subtract(rdd).take(10)

[(6, 36), (3, 11), (3, 9)]

### join() performs an inner join of two RDDs on the key: (key, (value from left RDD, value from right RDD))

In [18]:
 rdd.join(rdd2).take(10)

[(3, (4, 9)), (3, (4, 11)), (3, (6, 9)), (3, (6, 11))]

### rightOuterJoin() (key, (value, corresponding value))
Every key of the right RDD (`rdd2`) is kept. If a key has no corresponding value in the left RDD (`rdd`), the left value is written as None.

In [19]:
# Keeps every key of rdd2 (the right side); key 6 has no match in rdd,
# so its left value is None.
rdd.rightOuterJoin(rdd2).collect()

[(3, (4, 9)), (3, (4, 11)), (3, (6, 9)), (3, (6, 11)), (6, (None, 36))]

### leftOuterJoin() (key, (value, corresponding value))
Every key of the left RDD (`rdd`) is kept. If a key has no corresponding value in the right RDD (`rdd2`), the right value is written as None.

In [20]:
# Keeps every key of rdd (the left side); keys 1, 4 and 5 have no match in rdd2,
# so their right value is None.
rdd.leftOuterJoin(rdd2).collect()

[(1, (2, None)),
 (3, (4, 9)),
 (3, (4, 11)),
 (3, (6, 9)),
 (3, (6, 11)),
 (4, (7, None)),
 (5, (8, None))]

### cogroup() groups the values of both RDDs by key: (Key, (Iterable < V >, Iterable < W >))

In [21]:
# cogroup() yields (key, (values from rdd, values from rdd2)). The two groups are
# ResultIterable objects whose repr is only a memory address, so turn each one into
# a sorted list to make the displayed result readable and reproducible.
rdd.cogroup(rdd2).mapValues(lambda groups: tuple(sorted(group) for group in groups)).collect()

[(1, ([2], [])),
 (3, ([4, 6], [9, 11])),
 (4, ([7], [])),
 (5, ([8], [])),
 (6, ([], [36]))]