In [2]:
# Make the local Spark installation importable; this runs before the
# pyspark imports in the next cell.
import findspark
findspark.init()

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext

In [2]:
# Build (or reuse) the notebook's SparkSession: local master "local[4]",
# application name "Create a RDD".
# The memory settings only matter when this call actually starts a new session;
# when a session already exists Spark warns that only runtime SQL
# configurations take effect.
# NOTE(review): the variable shares its name with the `pyspark` package;
# `spark` is the conventional name, but a later cell reads `pyspark.sparkContext`.
pyspark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Create a RDD")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

23/04/12 07:08:31 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


## 1. RDD Basic Transformations

In [3]:
# SparkContext behind the session; used below to create RDDs via parallelize().
sc = pyspark.sparkContext

In [4]:
# Sample integers for the basic transformations (note the repeated 8 and 4).
my_list = [1, 2, 3, 4, 5, 6, 7, 8, 8, 4, 42, 67]

In [5]:
# Distribute the local Python list as an RDD.
list_rdd = sc.parallelize(my_list)

In [6]:
# Bring the first 10 of the 12 elements back to the driver.
list_rdd.take(10)

                                                                                

[1, 2, 3, 4, 5, 6, 7, 8, 8, 4]

### map()

In [9]:
# map(): apply a function to every element -- here, cube each number.
list_rdd.map(lambda value: value ** 3).take(5)

[1, 8, 27, 64, 125]

### filter()

In [10]:
# filter(): keep only the elements for which the predicate is true.
list_rdd.filter(lambda value: value == 8).take(5)

[8, 8]

### flatMap()

In [11]:
# Short strings used to demonstrate flatMap(); some contain spaces.
txt = [
    "Hello",
    "How are you?",
    "CERN-LHC",
    "Data Engineering",
]

In [12]:
# Distribute the list of strings as an RDD.
txt_rdd = sc.parallelize(txt)

In [13]:
# flatMap(): split each string on spaces and flatten the pieces into a single
# RDD of words, then upper-case every word.
(
    txt_rdd
    .flatMap(lambda sentence: sentence.split(" "))
    .map(lambda word: word.upper())
    .take(9)
)

['HELLO', 'HOW', 'ARE', 'YOU?', 'CERN-LHC', 'DATA', 'ENGINEERING']

### distinct()

In [14]:
# distinct(): drop duplicate elements; the result order differs from the input order.
list_rdd.distinct().take(10)

[4, 8, 1, 5, 2, 6, 42, 3, 7, 67]

### sample()

In [15]:
# sample(): random sample with replacement (an element may appear more often
# than in the source), fraction 0.7, fixed seed 45.
list_rdd.sample(withReplacement=True, fraction=0.7, seed=45).take(10)

[5, 5, 6, 7, 7, 8, 8, 8, 8, 4]

## 2. RDD Pair Transformations (operations on two RDDs)

In [16]:
# Two small integer lists (they share only the value 1) ...
my_list_1 = [1, 5, 7, 9, 5, 7]
my_list_2 = [1, 4, 8, 19]

# ... and their RDD counterparts, used by the two-RDD operations below.
rdd_my_list_1 = sc.parallelize(my_list_1)
rdd_my_list_2 = sc.parallelize(my_list_2)

### union()

In [17]:
# union(): concatenate both RDDs; duplicates are kept.
rdd_my_list_1.union(rdd_my_list_2).take(10)

[1, 5, 7, 9, 5, 7, 1, 4, 8, 19]

### intersection()

In [18]:
# intersection(): elements present in both RDDs (only 1 here).
rdd_my_list_1.intersection(rdd_my_list_2).take(10)

[1]

### subtract()

In [19]:
# subtract(): elements of the first RDD that do not occur in the second;
# repeated values in the first RDD are kept.
rdd_my_list_1.subtract(rdd_my_list_2).take(10)

[9, 5, 5, 7, 7]

### cartesian()

In [20]:
# cartesian(): every (a, b) pair with a from the first RDD and b from the
# second; only the first 10 of the pairs are shown.
rdd_my_list_1.cartesian(rdd_my_list_2).take(10)

[(1, 1),
 (1, 4),
 (1, 8),
 (1, 19),
 (5, 1),
 (7, 1),
 (5, 4),
 (7, 4),
 (5, 8),
 (7, 8)]

## 3. RDD Basic Actions

### collect()

In [21]:
# collect(): return every element of the RDD to the driver as a Python list.
rdd_my_list_1.collect()

[1, 5, 7, 9, 5, 7]

### count() 

In [22]:
# count(): number of elements in the RDD (duplicates included).
rdd_my_list_1.count()

6

### countByValue()

In [23]:
# countByValue(): dictionary mapping each distinct value to its number of occurrences.
rdd_my_list_1.countByValue()

defaultdict(int, {1: 1, 5: 2, 7: 2, 9: 1})

### take(), top(), takeOrdered(), takeSample(), reduce(), fold()

In [25]:
# take(n): the first n elements, in RDD order.
rdd_my_list_1.take(3)

[1, 5, 7]

In [26]:
# top(n): the n largest elements, in descending order.
rdd_my_list_1.top(4)

[9, 7, 7, 5]

In [27]:
# takeOrdered(n): the n smallest elements in ascending order; asking for more
# than the RDD holds (20 > 6) simply returns all of them.
rdd_my_list_1.takeOrdered(20)

[1, 5, 5, 7, 7, 9]

In [30]:
# takeSample(): fixed-size random sample of 5 elements, drawn without
# replacement, with seed 33.
rdd_my_list_1.takeSample(withReplacement=False, num=5, seed=33)

[1, 7, 9, 7, 5]

In [31]:
# reduce(): combine all elements with a binary function -- here, their sum.
rdd_my_list_1.reduce(lambda total, value: total + value)

34

In [32]:
# fold(): like reduce(), but starts from an explicit zero value (0 for addition).
rdd_my_list_1.fold(zeroValue=0, op=lambda total, value: total + value)

34