In [12]:
from pyspark.sql import SparkSession  # Entry point for Spark SQL / DataFrame work
from pyspark.sql.functions import (  # Column functions used by the cells below
    avg,
    broadcast,
    col,
    current_date,
    date_sub,
    rand,
    when,
)

In [8]:
# Create a new SparkSession (or reuse the active one)
spark = (
    SparkSession.builder
    .appName("optimize-data-shuffles")  # Application name
    .master("spark://spark-master:7077")  # Spark master URL
    .config("spark.executor.memory", "512m")  # Executor memory setting
    .getOrCreate()  # Return the existing session if there is one, otherwise create it
)

# Set log level to ERROR so only errors are logged
spark.sparkContext.setLogLevel("ERROR")

In [9]:
# Create a sample DataFrame with 1 million rows.
# Columns: id (from range) plus randomly generated date, age, salary, gender and grade.
large_df = (
    spark.range(0, 1000000)
    # Today's date minus a random whole number of days in [0, 365)
    .withColumn("date", date_sub(current_date(), (rand() * 365).cast("int")))
    # Random integer age in [0, 100)
    .withColumn("age", (rand() * 100).cast("int"))
    # Random salary: 100 times a random integer in [0, 100)
    .withColumn("salary", 100 * (rand() * 100).cast("int"))
    # Random gender: "M" when the draw truncates to 0, otherwise "F"
    .withColumn("gender", when((rand() * 2).cast("int") == 0, "M").otherwise("F"))
    # NOTE(review): each when() below calls rand() separately, so every branch tests
    # a different random draw. The grades are therefore not uniformly distributed and
    # the "M3" fallback is reachable. Confirm whether a single shared draw was intended.
    .withColumn(
        "grade",
        when((rand() * 5).cast("int") == 0, "IC")
        .when((rand() * 5).cast("int") == 1, "IC-2")
        .when((rand() * 5).cast("int") == 2, "M1")
        .when((rand() * 5).cast("int") == 3, "M2")
        .when((rand() * 5).cast("int") == 4, "IC-3")
        .otherwise("M3"),  # Default when no branch matched
    )
)
large_df.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+---+----------+---+------+------+-----+
| id|      date|age|salary|gender|grade|
+---+----------+---+------+------+-----+
|  0|2023-06-22| 50|  3600|     F| IC-3|
|  1|2023-03-19| 35|  2500|     F|   IC|
|  2|2023-06-17| 77|  3400|     F|   M1|
|  3|2023-03-16| 97|  2600|     M|   IC|
|  4|2024-01-03| 87|  5400|     F|   M1|
+---+----------+---+------+------+-----+
only showing top 5 rows



                                                                                

In [10]:
# Filter the DataFrame, keeping rows with age >= 55
df_filtered = large_df.filter(col("age") >= 55)

# Add a "bonus" column equal to salary * 1.1 (the salary increased by 10%)
df_mapped = df_filtered.withColumn("bonus", col("salary") * 1.1)

# Aggregate: average bonus per age
df_aggregated = df_mapped.groupBy("age").agg(avg("bonus"))

# Show the first 5 rows of the result
df_aggregated.show(5)



+---+-----------------+
|age|       avg(bonus)|
+---+-----------------+
| 85|5462.653508771929|
| 65|5478.981668009669|
| 78|5457.682827459767|
| 81| 5482.96132596685|
| 76| 5465.81568744408|
+---+-----------------+
only showing top 5 rows



                                                                                

In [11]:
# Print the physical plan of df_aggregated (the per-age average bonus)
df_aggregated.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[age#9], functions=[avg(bonus#56)])
   +- Exchange hashpartitioning(age#9, 200), ENSURE_REQUIREMENTS, [plan_id=123]
      +- HashAggregate(keys=[age#9], functions=[partial_avg(bonus#56)])
         +- Project [age#9, (cast(salary#13 as double) * 1.1) AS bonus#56]
            +- Filter (isnotnull(age#9) AND (age#9 >= 55))
               +- Project [age#9, (cast((rand(900612033348343497) * 100.0) as int) * 100) AS salary#13]
                  +- Project [cast((rand(3280734957678084291) * 100.0) as int) AS age#9]
                     +- Range (0, 1000000, step=1, splits=2)




In [13]:
# Create a small lookup DataFrame that maps an age to a level
df2 = spark.createDataFrame(
    [
        (25, "A"), (30, "B"), (35, "C"), (40, "D"), (45, "E"),
        (50, "F"), (55, "G"), (60, "H"), (65, "I"), (70, "J"),
    ],
    ["age", "level"],
)

# Join the two DataFrames on age, marking the small one for broadcast
df_joined = large_df.join(broadcast(df2), "age")

# Aggregate the joined DataFrame: average salary per level
df_aggregated = df_joined.groupBy("level").avg("salary")

# Show the result
df_aggregated.show()

                                                                                

+-----+------------------+
|level|       avg(salary)|
+-----+------------------+
|    F| 5002.275473217882|
|    E|  4973.83709120455|
|    B| 4939.277204130262|
|    D| 4983.376623376624|
|    C| 4948.175987171778|
|    J|            4956.3|
|    A| 4930.878955298845|
|    G| 4923.778271405493|
|    I| 4980.892425463336|
|    H|4999.8611662038875|
+-----+------------------+



In [14]:
# Print the physical plan of df_aggregated (the per-level average salary after the join)
df_aggregated.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[level#91], functions=[avg(salary#13)])
   +- Exchange hashpartitioning(level#91, 200), ENSURE_REQUIREMENTS, [plan_id=311]
      +- HashAggregate(keys=[level#91], functions=[partial_avg(salary#13)])
         +- Project [salary#13, level#91]
            +- BroadcastHashJoin [cast(age#9 as bigint)], [age#90L], Inner, BuildRight, false
               :- Filter isnotnull(age#9)
               :  +- Project [age#9, (cast((rand(900612033348343497) * 100.0) as int) * 100) AS salary#13]
               :     +- Project [cast((rand(3280734957678084291) * 100.0) as int) AS age#9]
               :        +- Range (0, 1000000, step=1, splits=2)
               +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=306]
                  +- Filter isnotnull(age#90L)
                     +- Scan ExistingRDD[age#90L,level#91]




In [15]:
# Repartition the DataFrame by gender. No partition count is passed here, so the
# number of partitions comes from Spark's default shuffle setting, not from this call.
# To pin it explicitly, pass the count first, e.g. repartition(2, col("gender")).
df_repartitioned = large_df.repartition(col("gender"))

# Repartition the DataFrame by age range into 5 partitions
df_repartitioned_by_range = large_df.repartitionByRange(5, col("age"))

In [16]:
# Print the physical plan of the original, un-repartitioned DataFrame
large_df.explain()

== Physical Plan ==
*(1) Project [id#4L, date#6, age#9, salary#13, gender#18, CASE WHEN (cast((rand(-5258595656362598529) * 5.0) as int) = 0) THEN IC WHEN (cast((rand(6677297146942895454) * 5.0) as int) = 1) THEN IC-2 WHEN (cast((rand(6084707916817199194) * 5.0) as int) = 2) THEN M1 WHEN (cast((rand(7628756694394173931) * 5.0) as int) = 3) THEN M2 WHEN (cast((rand(-1822131519618029291) * 5.0) as int) = 4) THEN IC-3 ELSE M3 END AS grade#24]
+- *(1) Project [id#4L, date#6, age#9, salary#13, CASE WHEN (cast((rand(-1512255023260467776) * 2.0) as int) = 0) THEN M ELSE F END AS gender#18]
   +- *(1) Project [id#4L, date#6, age#9, (cast((rand(900612033348343497) * 100.0) as int) * 100) AS salary#13]
      +- *(1) Project [id#4L, date#6, cast((rand(3280734957678084291) * 100.0) as int) AS age#9]
         +- *(1) Project [id#4L, date_sub(2024-02-21, cast((rand(-5065184338059177050) * 365.0) as int)) AS date#6]
            +- *(1) Range (0, 1000000, step=1, splits=2)




In [17]:
# Print the physical plan of the DataFrame repartitioned by gender
df_repartitioned.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange hashpartitioning(gender#18, 200), REPARTITION_BY_COL, [plan_id=363]
   +- Project [id#4L, date#6, age#9, salary#13, gender#18, CASE WHEN (cast((rand(-5258595656362598529) * 5.0) as int) = 0) THEN IC WHEN (cast((rand(6677297146942895454) * 5.0) as int) = 1) THEN IC-2 WHEN (cast((rand(6084707916817199194) * 5.0) as int) = 2) THEN M1 WHEN (cast((rand(7628756694394173931) * 5.0) as int) = 3) THEN M2 WHEN (cast((rand(-1822131519618029291) * 5.0) as int) = 4) THEN IC-3 ELSE M3 END AS grade#24]
      +- Project [id#4L, date#6, age#9, salary#13, CASE WHEN (cast((rand(-1512255023260467776) * 2.0) as int) = 0) THEN M ELSE F END AS gender#18]
         +- Project [id#4L, date#6, age#9, (cast((rand(900612033348343497) * 100.0) as int) * 100) AS salary#13]
            +- Project [id#4L, date#6, cast((rand(3280734957678084291) * 100.0) as int) AS age#9]
               +- Project [id#4L, date_sub(2024-02-21, cast((rand(-50651843380591

In [18]:
# Print the physical plan of the DataFrame range-partitioned by age
df_repartitioned_by_range.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange rangepartitioning(age#9 ASC NULLS FIRST, 5), REPARTITION_BY_NUM, [plan_id=391]
   +- Project [id#4L, date#6, age#9, salary#13, gender#18, CASE WHEN (cast((rand(-5258595656362598529) * 5.0) as int) = 0) THEN IC WHEN (cast((rand(6677297146942895454) * 5.0) as int) = 1) THEN IC-2 WHEN (cast((rand(6084707916817199194) * 5.0) as int) = 2) THEN M1 WHEN (cast((rand(7628756694394173931) * 5.0) as int) = 3) THEN M2 WHEN (cast((rand(-1822131519618029291) * 5.0) as int) = 4) THEN IC-3 ELSE M3 END AS grade#24]
      +- Project [id#4L, date#6, age#9, salary#13, CASE WHEN (cast((rand(-1512255023260467776) * 2.0) as int) = 0) THEN M ELSE F END AS gender#18]
         +- Project [id#4L, date#6, age#9, (cast((rand(900612033348343497) * 100.0) as int) * 100) AS salary#13]
            +- Project [id#4L, date#6, cast((rand(3280734957678084291) * 100.0) as int) AS age#9]
               +- Project [id#4L, date_sub(2024-02-21, cast((rand(-506

In [19]:
spark.stop()  # Stop the Spark session and release its resources