In [3]:
from pyspark import SparkContext, SparkConf

In [4]:
# Configuration for a local Spark master named "RDD" using 4 worker threads.
conf = SparkConf().setAppName("RDD").setMaster("local[4]")
# getOrCreate() returns the already-running context when this cell is executed
# again; constructing SparkContext(conf=conf) a second time raises an error
# because only one SparkContext may be active at a time.
sc = SparkContext.getOrCreate(conf=conf)

24/10/01 09:46:18 WARN Utils: Your hostname, guilherme-linux resolves to a loopback address: 127.0.1.1; using 192.168.2.154 instead (on interface enp5s0)
24/10/01 09:46:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/01 09:46:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
sc

24/09/28 18:48:54 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [4]:
sc.defaultParallelism

4

In [8]:
import random

In [9]:
# Draw 10 distinct integers from [0, 100) with a dedicated, seeded generator so
# that a "Restart Kernel -> Run All" always produces the same list (and the
# same partition contents in the cells that follow) without touching the global
# random state.
RANDOM_SEED = 42
random_list = random.Random(RANDOM_SEED).sample(range(0, 100), 10)

In [10]:
print(random_list)

[69, 28, 30, 19, 89, 22, 77, 9, 8, 20]


In [11]:
rdd1 = sc.parallelize(random_list, 4)

In [23]:
num_partitions = rdd1.getNumPartitions()
print(f"O RDD foi dividido em {num_partitions} partições.")

O RDD foi dividido em 4 partições.


## ACTIONS

In [24]:
# glom() groups the elements of each partition into a list; collect() then
# returns one list per partition to the driver.
partitions_data = rdd1.glom().collect()
# Print every partition together with its index to show how the elements were
# distributed across partitions.
for idx, partition in enumerate(partitions_data):
    print(f"Partição {idx}: {partition}")

Partição 0: [80, 27]
Partição 1: [47, 53]
Partição 2: [74, 49]
Partição 3: [65, 23, 89, 11]


In [25]:
rdd1.glom().take(2)

[[80, 27], [47, 53]]

In [28]:
rdd1.glom().collect()[-1]

[65, 23, 89, 11]

In [29]:
rdd1.count()

10

In [30]:
rdd1.first()

80

In [32]:
rdd1.top(2)

[89, 80]

## TRANSFORMATIONS

In [33]:
rdd1.distinct()

PythonRDD[18] at RDD at PythonRDD.scala:53

In [34]:
rdd1.distinct().collect() # The collect() action triggers execution so the result of the distinct() transformation can be inspected

[80, 53, 49, 65, 89, 74, 27, 47, 23, 11]

In [37]:
rdd_map = rdd1.map(lambda x:x**2)

In [38]:
rdd_map.collect()

[6400, 729, 2209, 2809, 5476, 2401, 4225, 529, 7921, 121]

In [39]:
rdd_filter = rdd1.filter(lambda x: x % 2 ==0)

In [41]:
# filter() keeps the partitions of rdd1, so partitions without any even number
# remain as empty partitions. Nothing removes them automatically; use
# coalesce() or repartition() to get rid of them.
rdd_filter.glom().collect()

[[80], [], [74], []]

In [47]:
rdd_filter_repartitioned = rdd_filter.repartition(1)

In [48]:
rdd_filter_repartitioned.getNumPartitions()

1

In [49]:
rdd_filter_repartitioned.glom().collect() 

[[80, 74]]

In [50]:
rdd_filter_repartitioned.collect()

[80, 74]

In [60]:
rdd_flat_map = rdd1.flatMap(lambda x: range(x, x+2))

In [62]:
rdd_flat_map.glom().collect() 

[[80, 81, 27, 28],
 [47, 48, 53, 54],
 [74, 75, 49, 50],
 [65, 66, 23, 24, 89, 90, 11, 12]]

In [63]:
rdd_flat_map.reduce(lambda x,y: x+y)

1046

In [64]:
print([rdd1.max(), rdd1.min(), rdd1.mean(), rdd1.stdev(), rdd1.sum()])

[89, 11, 51.8, 24.4695729427385, 518]


In [68]:
def myFunc(partition):
    """Yield a single value: the sum of all elements of ``partition``.

    Written as a generator so it can be passed to ``mapPartitions``, which
    calls it once per partition with an iterable over that partition's
    elements.
    """
    partition_total = sum(partition)
    yield partition_total

In [69]:
rdd_map_partitions = rdd1.mapPartitions(myFunc)

In [70]:
rdd_map_partitions.collect()

[107, 100, 123, 188]

In [12]:
rdd2 = sc.parallelize([1, 14, 20, 20, 28, 10, 13, 3], 2)

### Union

In [33]:
# Tag the jobs submitted from here on so they are easy to find in the Spark UI.
# The description states what this group actually runs: a union of the two RDDs.
sc.setJobGroup("TESTE", "Union of rdd1 (4 partitions) and rdd2 (2 partitions).")
# union() concatenates both RDDs; duplicates are kept.
rdd_union = rdd1.union(rdd2)

In [35]:
rdd_union.getNumPartitions()

6

In [14]:
rdd_union.glom().collect()

[[69, 28],
 [30, 19],
 [89, 22],
 [77, 9, 8, 20],
 [1, 14, 20, 20],
 [28, 10, 13, 3]]

### Intersection

In [37]:
rdd_intersection = rdd1.intersection(rdd2)

In [47]:
# Elements common to both RDDs, the number of partitions of the result, and
# the contents of each partition.
print(rdd_intersection.collect())
print(rdd_intersection.getNumPartitions())
print(rdd_intersection.glom().collect())
# Redistribute the elements over 2 partitions: pair each element with its
# index, key the pair by index parity, partition by that key, then drop the key.
print(
    rdd_intersection
    .zipWithIndex()
    .map(lambda pair: (pair[1] % 2, pair[0]))
    .partitionBy(2)
    .map(lambda keyed: keyed[1])
    .glom()
    .collect()
)

[20, 28]
6
[[], [], [20], [], [28], []]


[[20], [28]]

### Find empty partitions

In [49]:
# Count the empty partitions on the executors: glom() turns each partition into
# a list, filter() keeps only the empty lists and count() returns how many
# remain. Only that single number is sent to the driver, instead of collecting
# every element just to measure partition lengths.
counter = rdd_intersection.glom().filter(lambda partition: len(partition) == 0).count()
counter

4

### coalesce

In [51]:
rdd_intersection.coalesce(1).collect()

                                                                                

[20, 28]

### takeSample

In [52]:
# Sample 5 elements without replacement. A fixed seed makes the sample
# repeatable across notebook runs.
rdd1.takeSample(False, 5, seed=42)

[9, 77, 22, 20, 69]

### takeOrdered

In [54]:
print(rdd1.takeOrdered(5))
rdd1.takeOrdered(5, key=lambda x: -x)

[8, 9, 19, 20, 22]


[89, 77, 69, 30, 28]

### reduce

In [59]:
# NOTE: reduce() expects a commutative and associative function. Subtraction is
# neither, so the value returned here depends on how rdd1 is partitioned and
# may differ from a left-to-right subtraction over the original list.
rdd1.reduce(lambda x,y: x-y)

-77

### reduceByKey

In [75]:
# Key/value pairs distributed over 2 partitions.
rdd_rbk = sc.parallelize([(1,4),(7,10),(5,7),(1,12),(7,12),(7,1),(9,1),(7,4)],2)
print(rdd_rbk.glom().collect())
# reduceByKey() merges the values of each key with the given function (a sum here).
# The reduced RDD is only collected, not assigned, so rdd_rbk keeps the raw pairs.
rdd_rbk.reduceByKey(lambda x,y: x+y).collect()

[[(1, 4), (7, 10), (5, 7), (1, 12)], [(7, 12), (7, 1), (9, 1), (7, 4)]]


[(1, 16), (7, 27), (5, 7), (9, 1)]

### sortByKey

In [73]:
# Sort the per-key sums by key in ascending order. reduceByKey() is applied
# explicitly because rdd_rbk holds the raw pairs; sorting rdd_rbk directly
# would return all 8 pairs instead of one aggregated pair per key.
rdd_rbk.reduceByKey(lambda x, y: x + y).sortByKey(True).collect()

[(1, 16), (5, 7), (7, 27), (9, 1)]

### countByKey

In [76]:
rdd_rbk.countByKey()

defaultdict(int, {1: 2, 7: 4, 5: 1, 9: 1})

### groupByKey

In [77]:
rdd_group = rdd_rbk.groupByKey()
rdd_group.getNumPartitions()

2

In [78]:
for item in rdd_group.collect():
    print(item[0], list(item[1]))

1 [4, 12]
7 [10, 12, 1, 4]
5 [7]
9 [1]


### lookup

In [79]:
rdd_rbk.lookup(7)

[10, 12, 1, 4]

### cache or persist

In [80]:
rdd_rbk.persist()

ParallelCollectionRDD[218] at readRDDFromFile at PythonRDD.scala:289

In [81]:
from pyspark import StorageLevel
rdd1.persist(StorageLevel.MEMORY_AND_DISK)

ParallelCollectionRDD[2] at readRDDFromFile at PythonRDD.scala:289