In [1]:
from pyspark import SparkConf, SparkContext

# Configuration for a local Spark application named 'reduce'.
conf = SparkConf().setMaster('local').setAppName('reduce')
# Use the classmethod form of getOrCreate. The previous
# SparkContext(conf=conf).getOrCreate() always constructed a context first, which
# raises ValueError when one is already active (e.g. when this cell is re-run),
# so the trailing getOrCreate() never had a chance to reuse it.
# NOTE: despite its name, `spark` is a SparkContext, not a SparkSession.
spark = SparkContext.getOrCreate(conf=conf)

24/12/05 10:57:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# reduce()
- 여러 개의 값을 하나로 축약
- reduce 연산은 순서 의존적: 전달하는 함수가 교환법칙·결합법칙을 만족하지 않으면 파티션 구성에 따라 결과가 달라짐
- 각 파티션 내에서 연산은 독립적으로 실행되고, 최종 결과는 파티션별 결과를 다시 결합하여 계산

In [2]:
# Distribute a small Python list as an RDD; the bare name on the last line displays the RDD's repr.
sample_rdd = spark.parallelize([1,2,3,4,5])
sample_rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [3]:
from operator import add

# Sum all elements with operator.add: 1 + 2 + 3 + 4 + 5 = 15.
sample_rdd.reduce(add)

                                                                                

15

In [5]:
# No partition count is passed here; the glom() output in the next cell shows that
# this RDD ends up in a single partition.
# The lambda is neither commutative nor associative, so the result depends on the
# partition layout. With one partition it is applied left to right:
# ((1*2 + 2)*2 + 3)*2 + 4 = 26.
sample_rdd1 = spark.parallelize([1,2,3,4])
sample_rdd1.reduce(lambda x, y:(x*2)+y)

26

In [6]:
# With 1 partition: glom() gathers the elements of each partition into a list.
sample_rdd1.glom().collect()

[[1, 2, 3, 4]]

In [7]:
# With 2 partitions: the second argument to parallelize() is the number of slices.
sample_rdd2 = spark.parallelize([1,2,3,4],2)
sample_rdd2.glom().collect()

[[1, 2], [3, 4]]

In [8]:
# Each partition is reduced first, then the partial results are combined:
# [1, 2] -> 1*2 + 2 = 4, [3, 4] -> 3*2 + 4 = 10, then 4*2 + 10 = 18
# (different from the 26 obtained with a single partition).
sample_rdd2.reduce(lambda x,y:(x*2)+y)

18

In [9]:
# With 3 partitions.
sample_rdd3 = spark.parallelize([1,2,3,4],3)
sample_rdd3.glom().collect()

[[1], [2], [3, 4]]

In [10]:
# Partial results per partition are 1, 2 and 3*2 + 4 = 10;
# combining them gives (1*2 + 2)*2 + 10 = 18.
sample_rdd3.reduce(lambda x,y:(x*2)+y)

18

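# fold(zeroValue, func)
- reduce()는 단순한 RDD 축약 연산 (초깃값이 없어 빈 RDD에서는 ValueError 발생)
- fold()는 zeroValue(초깃값)를 받아 오류 없이 축약 연산 가능. zeroValue는 각 파티션과 최종 결합 단계에 모두 적용되므로 연산의 항등원(덧셈은 0, 곱셈은 1)이어야 함
- 빈 RDD 처리 가능 (zeroValue를 그대로 반환)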

In [11]:
# 3 elements spread over 4 partitions, so at least one partition is empty;
# reduce() still returns the product 2*3*4 = 24.
sample_rdd4 = spark.parallelize([2,3,4],4)
sample_rdd4.reduce(lambda x,y:x*y)

24

In [12]:
# fold() starts from the zero value; 1 is the identity for multiplication,
# so the product is the same as with reduce() (24).
sample_rdd4.fold(1,lambda x,y:x*y)

24

In [13]:
# Handling an empty RDD: reduce() has no initial value, so it raises ValueError,
# which is caught and printed here.
sample_rdd5 = spark.parallelize([])
try:
    sample_rdd5.reduce(lambda x,y:x+y)
except ValueError as e:
    print(f"Reduce Error : {e}")

Reduce Error : Can not reduce() empty RDD


In [14]:
# fold() on the same empty RDD returns the zero value (0) instead of raising.
sample_rdd5.fold(0, lambda a,b:a+b)

0

In [15]:
# An RDD can hold elements of mixed Python types (int and str here).
sample_rdd6 = spark.parallelize([1,"a",2])
sample_rdd6.collect()

[1, 'a', 2]

In [20]:
# Convert both operands to str so the mixed int/str elements can be concatenated.
sample_rdd6.reduce(lambda x,y:str(x)+str(y))

'1a2'

In [16]:
# Same concatenation via fold(); the empty string is the identity for string concatenation.
sample_rdd6.fold("",lambda x,y:str(x)+str(y))

'1a2'

# groupBy, aggregate
- RDD.aggregate(zeroValue, seqOp, combOp): seqOp(아래 func1)은 각 파티션 안에서 값을 누적하고, combOp(아래 func2)는 파티션별 결과를 결합

In [26]:
# Sample data containing both odd and even numbers, with duplicates.
sample_rdd7 = spark.parallelize([1,2,1,1,3,5,5,8])
sample_rdd7.collect()

[1, 2, 1, 1, 3, 5, 5, 8]

In [29]:
# Group elements by parity: key 0 for even numbers, key 1 for odd numbers.
result = sample_rdd7.groupBy(lambda x:x%2).collect()

In [33]:
# Sort the groups by key, and the members inside each group, so the display is deterministic.
sorted((parity, sorted(members)) for parity, members in result)

[(0, [2, 8]), (1, [1, 1, 1, 3, 5, 5])]

In [34]:
def func1(x, y):
    """Per-partition step for aggregate(): fold one element into a (sum, count) pair.

    x is the running (sum, count) accumulator and y is the next element.
    """
    return (x[0] + y, x[1] + 1)


def func2(x, y):
    """Combine step for aggregate(): merge two (sum, count) accumulators."""
    return (x[0] + y[0], x[1] + y[1])
# Compute (sum, count) in one pass: func1 accumulates inside each partition starting
# from (0, 0), and func2 merges the per-partition pairs, giving (10, 4).
sample_rdd8 = spark.parallelize([1,2,3,4], 2)
sample_rdd8.aggregate((0,0), func1, func2)

(10, 4)

In [35]:
# Show how the four elements were split across the two partitions.
sample_rdd8.glom().collect()

[[1, 2], [3, 4]]

# reduceByKey()가 groupByKey()보다 빠름 (셔플 전에 각 파티션 안에서 먼저 값을 합치기 때문)

In [36]:
# Key-value pair RDD of (dish name, integer value); each key appears twice.
sample_rdd9 = spark.parallelize(
[
    ("짜장면",15),
    ("짬뽕", 10),
    ("짜장면", 5),
    ("짬뽕",20)
])

In [37]:
# groupByKey() is a transformation: it returns a new RDD (only its repr is displayed),
# not the grouped data itself.
sample_rdd9_g = sample_rdd9.groupByKey()
sample_rdd9_g

PythonRDD[46] at RDD at PythonRDD.scala:53

In [38]:
# Number of values grouped under each key.
sample_rdd9_g.mapValues(len).collect()

[('짜장면', 2), ('짬뽕', 2)]

In [39]:
# Materialize each key's grouped values as a list.
sample_rdd9_g.mapValues(list).collect()

[('짜장면', [15, 5]), ('짬뽕', [10, 20])]

In [40]:
# Average of the values per key (sum divided by count).
sample_rdd9_g.mapValues(lambda x:sum(x) / len(x)).collect()

[('짜장면', 10.0), ('짬뽕', 15.0)]

In [41]:
# Sum the values per key directly with reduceByKey(), without building the groups first.
sample_rdd9.reduceByKey(add).collect()

[('짜장면', 20), ('짬뽕', 30)]

# countByKey() : action
# keys() : transformation

In [42]:
# countByKey() is an action: it immediately returns the number of elements per key as a dict-like object.
sample_rdd9.countByKey()

defaultdict(int, {'짜장면': 2, '짬뽕': 2})

In [43]:
# keys() is a transformation: it returns an RDD, so only the RDD's repr is displayed.
sample_rdd9.keys()

PythonRDD[56] at RDD at PythonRDD.scala:53

In [44]:
# collect() brings the keys back as a list; duplicate keys are kept.
sample_rdd9.keys().collect()

['짜장면', '짬뽕', '짜장면', '짬뽕']

# join()
- inner join이 default, 양쪽 RDD 모두에 존재하는 key만 결과에 포함
- outer join은 leftOuterJoin 또는 rightOuterJoin 사용 (짝이 없는 쪽의 값은 None)

In [48]:
# Left-hand pair RDD with keys a, b, c.
join_rdd1 = spark.parallelize([
    ("a", 10),
    ("b", 20),
    ("c", 30)
])

# Right-hand pair RDD with keys a, b (twice), d.
join_rdd2 = spark.parallelize([
    ("a", 20),
    ("b", 20),
    ("b", 10),
    ("d",5)
])

# Inner join: only keys present in both RDDs (a, b) are kept;
# the duplicated key b produces one output row per matching pair.
join_rdd1.join(join_rdd2).collect()

[('b', (20, 20)), ('b', (20, 10)), ('a', (10, 20))]

In [49]:
# Left outer join: every key of join_rdd1 is kept; c has no match, so its right-hand value is None.
join_rdd1.leftOuterJoin(join_rdd2).collect()

[('b', (20, 20)), ('b', (20, 10)), ('c', (30, None)), ('a', (10, 20))]

In [50]:
# Right outer join: every key of join_rdd2 is kept; d has no match, so its left-hand value is None.
join_rdd1.rightOuterJoin(join_rdd2).collect()

[('b', (20, 20)), ('b', (20, 10)), ('d', (None, 5)), ('a', (10, 20))]

In [51]:
# Shut down the SparkContext now that the notebook is finished.
spark.stop()