### PySpark 설정

In [2]:
import findspark
# Locate the local Spark installation so that pyspark can be imported
findspark.init()
# [+] Import pyspark
from pyspark import SparkConf, SparkContext

In [7]:
# [+] Create the SparkConf and SparkContext objects
# (local master, application name "reduction_operations")
conf = SparkConf().setMaster("local").setAppName("reduction_operations")
sc = SparkContext(conf = conf)

In [9]:
# List every (key, value) setting of the active SparkContext's configuration
sc.getConf().getAll()

[('spark.master', 'local'),
 ('spark.driver.extraJavaOptions',
  '-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED'),
 ('spark.app.name', 'reduction_operations'),
 ('spark.driver.port', '62846'),
 ('spark.executor.id', 'driver'),
 ('spark.app.id', 'local-1680575441978'),
 ('spark.app.startTime', '1680575441014'),
 ('spark

### reduce( )

In [8]:
# reduce(): combine all elements with the given binary function (here, a sum)
sc.parallelize([1, 2, 3, 4, 5]).reduce(lambda x, y: x + y)

15

In [10]:
# reduce() is carried out separately within each partition
# Number of partitions: 2
sc.parallelize([1, 2, 3, 4, 5], 2).reduce(lambda x, y: x + y)

15

In [14]:
# reduce() caveat: the result can differ depending on how the data is partitioned
# Example: lambda x, y: (x * 2) + y  (neither associative nor commutative)
sc.parallelize([1, 2, 3, 4]).reduce(lambda x, y: (x*2) + y)

26

In [13]:
# Number of partitions: 2 (same function as above, different result)
sc.parallelize([1, 2, 3, 4], 2).reduce(lambda x, y: (x*2) + y)

18

### fold( )

In [15]:
# fold(): similar to reduce(), but takes an initial (zero) value
sc.parallelize([1, 2, 3, 4]).fold(1, lambda x,y: x + y)

12

In [16]:
# Number of partitions: 2
sc.parallelize([1, 2, 3, 4], 2).fold(1, lambda x,y: x + y)

13

### 10000개 정수 난수의 총합 계산하기

In [18]:
import numpy as np

In [19]:
# Draw 10,000 random integers from the half-open range [0, 100)
x = np.random.randint(0, 100, size = 10000)

In [20]:
# Peek at the first 100 values
x[:100]

array([91, 63, 18, 26, 28, 95, 75, 67, 79, 24, 45, 54, 40, 14, 45, 20,  3,
       85, 28, 26, 98, 40, 26, 53, 75, 14, 49, 21, 89,  3, 46,  1, 11,  8,
       67, 52, 11, 90, 78, 83, 13, 40, 61, 17, 73, 84, 64, 60, 54, 21, 48,
       26, 26, 15, 76, 40, 43, 32, 68, 46, 50,  2, 20, 37, 43, 98, 84, 20,
       46, 25, 54, 63, 50, 89, 93, 74, 30,  2, 35, 14, 69, 99, 26, 75,  4,
       82, 22, 16, 56, 81, 15, 78, 50, 95, 54, 91, 42, 51, 75, 46])

In [21]:
# Distribute the random integers as an RDD with 8 partitions
rdd = sc.parallelize(x, 8)

In [25]:
partitioned_data = rdd.glom().collect()        # glom(): gathers the elements of each partition into a single list
# Print the contents of every partition, labelled with its index.
# NOTE(review): "Partiton" in the label is a typo for "Partition"; left unchanged
# because it is runtime output and matches the recorded cell output.
for i, partition in enumerate(partitioned_data):
    print(f"Partiton {i}: {partition}\n")

Partiton 0: [91, 63, 18, 26, 28, 95, 75, 67, 79, 24, 45, 54, 40, 14, 45, 20, 3, 85, 28, 26, 98, 40, 26, 53, 75, 14, 49, 21, 89, 3, 46, 1, 11, 8, 67, 52, 11, 90, 78, 83, 13, 40, 61, 17, 73, 84, 64, 60, 54, 21, 48, 26, 26, 15, 76, 40, 43, 32, 68, 46, 50, 2, 20, 37, 43, 98, 84, 20, 46, 25, 54, 63, 50, 89, 93, 74, 30, 2, 35, 14, 69, 99, 26, 75, 4, 82, 22, 16, 56, 81, 15, 78, 50, 95, 54, 91, 42, 51, 75, 46, 21, 92, 17, 64, 14, 25, 73, 41, 41, 2, 57, 66, 21, 63, 46, 42, 36, 18, 67, 89, 82, 67, 20, 7, 41, 19, 60, 78, 54, 31, 49, 85, 22, 91, 8, 93, 58, 95, 5, 39, 57, 23, 48, 82, 27, 81, 59, 72, 81, 29, 45, 57, 77, 39, 29, 65, 0, 21, 31, 66, 0, 46, 30, 30, 44, 67, 50, 82, 55, 23, 0, 75, 86, 58, 20, 39, 71, 78, 6, 66, 81, 38, 18, 2, 4, 74, 89, 65, 19, 86, 37, 74, 0, 62, 82, 47, 95, 73, 28, 81, 3, 21, 53, 73, 40, 20, 38, 39, 26, 4, 40, 97, 48, 39, 6, 72, 18, 54, 7, 29, 83, 52, 69, 36, 59, 85, 35, 35, 53, 5, 56, 34, 46, 82, 82, 93, 81, 46, 48, 11, 97, 93, 10, 67, 10, 57, 51, 97, 54, 77, 76, 58, 38

In [26]:
# Total of all values; addition is associative and commutative,
# so the 8-way partitioning does not affect the result
rdd.reduce(lambda x,y : x + y)

489904

### groupBy()

In [30]:
# groupBy(): group the elements by the key returned from the given function
# (here x % 2, i.e. odd vs. even)
res = sc.parallelize([1, 1, 2, 3, 5, 8]).groupBy(lambda x: x % 2).collect()

In [31]:
# Display the groupBy() result: a list of (key, iterable-of-values) pairs
res

[(1, <pyspark.resultiterable.ResultIterable at 0x1b545e52af0>),
 (0, <pyspark.resultiterable.ResultIterable at 0x1b545ee1130>)]

In [34]:
# Materialize each group's iterable as a list so its members are visible
for k, v in res:
    print(k, list(v))

1 [1, 1, 3, 5]
0 [2, 8]


### aggregate( )

In [35]:
# reduce(), fold(): an error occurs when the result type differs from the RDD's element type
# (after the first step x is a tuple, so x + y raises the TypeError shown in the output)
sc.parallelize([1, 2, 3, 4]).reduce(lambda x, y: (x + y, x * y)) # error: tries to produce a pair from single values

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 55) (TaekwonLaptop executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Dev\Spark\spark-3.3.2-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 686, in main
  File "C:\Dev\Spark\spark-3.3.2-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 678, in process
  File "C:\Dev\Spark\spark-3.3.2-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "C:\Dev\Spark\spark-3.3.2-bin-hadoop3\python\pyspark\rdd.py", line 1248, in func
    yield reduce(f, iterator, initial)
  File "C:\Dev\Spark\spark-3.3.2-bin-hadoop3\python\pyspark\util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "C:\Users\midst\AppData\Local\Temp\ipykernel_9592\2799694531.py", line 2, in <lambda>
TypeError: can only concatenate tuple (not "int") to tuple

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1021)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2278)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1589)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2238)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2259)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2278)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:104)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Dev\Spark\spark-3.3.2-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 686, in main
  File "C:\Dev\Spark\spark-3.3.2-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 678, in process
  File "C:\Dev\Spark\spark-3.3.2-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "C:\Dev\Spark\spark-3.3.2-bin-hadoop3\python\pyspark\rdd.py", line 1248, in func
    yield reduce(f, iterator, initial)
  File "C:\Dev\Spark\spark-3.3.2-bin-hadoop3\python\pyspark\util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "C:\Users\midst\AppData\Local\Temp\ipykernel_9592\2799694531.py", line 2, in <lambda>
TypeError: can only concatenate tuple (not "int") to tuple

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1021)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2278)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more


In [36]:
# fold() with a (sum, count) tuple as the zero value also fails with a TypeError:
# presumably the same function is reused to merge per-partition results, where y
# is itself a tuple and x[0] + y becomes int + tuple — confirm against the fold() docs
sc.parallelize([1, 2, 3, 4]).fold((0, 0), lambda x, y: (x[0] + y, x[1] + 1))

TypeError: unsupported operand type(s) for +: 'int' and 'tuple'

In [None]:
"""
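    aggregate(): performs a reduction whose result type may differ from the RDD's element type
    - zeroValue: initial accumulator value for each partition
    - seqOp: reduction function applied to the elements within each partition
    - combOp: reduction function that merges the per-partition results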
""" 

def seqOp(x, y):
    """Fold one RDD element into a partition's (sum, count) accumulator.

    Args:
        x: Running ``(sum, count)`` pair for the current partition.
        y: The next element of the partition.

    Returns:
        A new ``(sum, count)`` pair that also accounts for ``y``.
    """
    return (x[0] + y, x[1] + 1)


def combOp(x, y):
    """Merge the (sum, count) accumulators of two partitions.

    Args:
        x: ``(sum, count)`` pair from one partition.
        y: ``(sum, count)`` pair from another partition.

    Returns:
        The element-wise sum of the two pairs.
    """
    return (x[0] + y[0], x[1] + y[1])

In [None]:
# Run aggregate(), e.g. sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp)
