In [19]:
from pyspark.sql import SparkSession  # Spark SQL 작업을 위한 SparkSession 임포트 
from pyspark.sql.functions import  # Spark SQL 함수들 임포트 rand, col, when, broadcast, concat, lit

In [2]:
# 생성 a new SparkSession
spark = (SparkSession
         .builder
         .appName("avoid-data-skew")  # 애플리케이션 이름 설정
         .master("spark://spark-master:7077")  # Spark 마스터 URL 설정
         .config("spark.executor.memory", "512m")  # Spark 설정 옵션
         .getOrCreate()  # SparkSession 생성 또는 기존 세션 반환)

# Set log level to ERROR
spark.sparkContext.setLogLevel("ERROR")  # 로그 레벨을 ERROR로 설정

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/21 13:50:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Define a function to measure the execution time of a query
import time

def measure_time(query):
    start = time.time()
    query.collect() # Force the query execution by calling an action
    end = time.time()
    print(f"Execution time: {end - start} seconds")

In [4]:
# 생성 some sample data frames
# A large data frame with 10 million rows and two columns: id and value
large_df = spark.range(0, 10000000).withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"value", rand(seed=42))

# A skewed data frame with 1 million rows and two columns: id and value
skewed_df = spark.range(0, 1000000).withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"value", rand(seed=42)).withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"id", when(col("id")%4 == 0, 0).otherwise(col("id")))

In [5]:
large_df_repartitioned = large_df.repartition(  # 파티션 재분배5, "id")
num_partitions = large_df_repartitioned.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

partition_sizes = large_df_repartitioned.rdd.glom().map(len).collect()
print(f"Partition sizes: {partition_sizes}")

skewed_df_repartitioned = skewed_df.repartition(  # 파티션 재분배5, "id")
num_partitions = skewed_df_repartitioned.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

partition_sizes = skewed_df_repartitioned.rdd.glom().map(len).collect()
print(f"Partition sizes: {partition_sizes}")

[Stage 0:>                                                          (0 + 2) / 2]

Number of partitions: 5


                                                                                

Partition sizes: [1998962, 2000902, 1999898, 2000588, 1999650]
Number of partitions: 5




Partition sizes: [400054, 150144, 149846, 149903, 150053]


                                                                                

In [6]:
spark.conf.set(  # Spark 설정 변경"spark.sql.adaptive.enabled", "false")
spark.conf.set(  # Spark 설정 변경"spark.sql.autoBroadcastJoinThreshold", -1)

In [7]:
# Join the non-skewed DataFrames using the default join strategy (sort-merge join)
inner_join_df = large_df_repartitioned.join(  # 데이터프레임 조인skewed_df_repartitioned, "id")
measure_time(inner  # 내부 조인_join_df)

                                                                                

Execution time: 30.910954236984253 seconds


In [8]:
inner  # 내부 조인_join_df.explain()

== Physical Plan ==
*(5) Project [id#0L, value#2, value#7]
+- *(5) SortMergeJoin [id#0L], [id#10L], Inner
   :- *(2) Sort [id#0L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(id#0L, 200), REPARTITION_BY_NUM, [plan_id=97]
   :     +- *(1) Project [id#0L, rand(42) AS value#2]
   :        +- *(1) Range (0, 10000000, step=1, splits=2)
   +- *(4) Sort [id#10L ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(id#10L, 200), REPARTITION_BY_NUM, [plan_id=103]
         +- *(3) Project [CASE WHEN ((id#5L % 4) = 0) THEN 0 ELSE id#5L END AS id#10L, value#7]
            +- *(3) Project [id#5L, rand(42) AS value#7]
               +- *(3) Range (0, 1000000, step=1, splits=2)




### Isolate skewed data

In [11]:
# Identify the skewed value in the invoice_id column
skewed_value = 0

# 필터링 out the rows with the skewed value from both DataFrames
large_skewed_df = large_df_repartitioned.filter(  # 데이터 필터링large_df_repartitioned.id == skewed_value)
small_skewed_df = skewed_df_repartitioned.filter(  # 데이터 필터링skewed_df_repartitioned.id == skewed_value)

# 필터링 out the rows without the skewed value from both DataFrames
large_non_skewed_df = large_df_repartitioned.filter(  # 데이터 필터링large_df_repartitioned.id != skewed_value)
small_non_skewed_df = skewed_df_repartitioned.filter(  # 데이터 필터링skewed_df_repartitioned.id != skewed_value)

# Join the non-skewed DataFrames using the default join strategy (sort-merge join)
non_skewed_join_df = large_non_skewed_df.join(  # 데이터프레임 조인small_non_skewed_df, "id")

# Join the skewed DataFrames using a broadcast hash join
skewed_join_df = large_skewed_df.join(  # 데이터프레임 조인broadcast(small_skewed_df), "id")

# Union the results from both joins
final_join_df = non_skewed_join_df.union(  # 데이터프레임 합치기skewed_join_df)

measure_time(final_join_df)


                                                                                

Execution time: 14.912306070327759 seconds


In [12]:
final_join_df.explain()

== Physical Plan ==
Union
:- *(5) Project [id#0L, value#2, value#7]
:  +- *(5) SortMergeJoin [id#0L], [id#10L], Inner
:     :- *(2) Sort [id#0L ASC NULLS FIRST], false, 0
:     :  +- Exchange hashpartitioning(id#0L, 200), REPARTITION_BY_NUM, [plan_id=228]
:     :     +- *(1) Filter NOT (id#0L = 0)
:     :        +- *(1) Project [id#0L, rand(42) AS value#2]
:     :           +- *(1) Range (0, 10000000, step=1, splits=2)
:     +- *(4) Sort [id#10L ASC NULLS FIRST], false, 0
:        +- Exchange hashpartitioning(id#10L, 200), REPARTITION_BY_NUM, [plan_id=234]
:           +- *(3) Project [CASE WHEN ((id#5L % 4) = 0) THEN 0 ELSE id#5L END AS id#10L, value#7]
:              +- *(3) Filter NOT CASE WHEN ((id#5L % 4) = 0) THEN true ELSE (id#5L = 0) END
:                 +- *(3) Project [id#5L, rand(42) AS value#7]
:                    +- *(3) Range (0, 1000000, step=1, splits=2)
+- *(8) Project [id#25L, value#2, value#7]
   +- *(8) BroadcastHashJoin [id#25L], [id#10L], Inner, BuildRight, false

### Broadcast hash join

In [13]:
smaller_df = skewed_df_repartitioned

# Use the broadcast function to mark the smaller DataFrame for broadcasting
from pyspark.sql.functions import  # Spark SQL 함수들 임포트 broadcast
broadcast_df = broadcast(smaller_df)

# Join the two DataFrames using the broadcast function as an argument

broadcast_join_df = large_df_repartitioned.join(  # 데이터프레임 조인broadcast_df, "id")

measure_time(broadcast_join_df)

                                                                                

Execution time: 8.040036916732788 seconds


In [14]:
broadcast_join_df.explain()

== Physical Plan ==
*(3) Project [id#0L, value#2, value#7]
+- *(3) BroadcastHashJoin [id#0L], [id#10L], Inner, BuildRight, false
   :- Exchange hashpartitioning(id#0L, 5), REPARTITION_BY_NUM, [plan_id=329]
   :  +- *(1) Project [id#0L, rand(42) AS value#2]
   :     +- *(1) Range (0, 10000000, step=1, splits=2)
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=333]
      +- Exchange hashpartitioning(id#10L, 5), REPARTITION_BY_NUM, [plan_id=332]
         +- *(2) Project [CASE WHEN ((id#5L % 4) = 0) THEN 0 ELSE id#5L END AS id#10L, value#7]
            +- *(2) Project [id#5L, rand(42) AS value#7]
               +- *(2) Range (0, 1000000, step=1, splits=2)




In [15]:
# 출력 some rows of the result
broadcast_join_df.show(10)

[Stage 22:>                                                         (0 + 1) / 1]

+---+-----------------+-------------------+
| id|            value|              value|
+---+-----------------+-------------------+
|  0|0.619189370225301|0.14968092420202395|
|  0|0.619189370225301| 0.8421636914011397|
|  0|0.619189370225301| 0.5516706309356983|
|  0|0.619189370225301| 0.5908099559659594|
|  0|0.619189370225301|0.08615681416391996|
|  0|0.619189370225301|0.15134594584450656|
|  0|0.619189370225301|  0.657124398921156|
|  0|0.619189370225301| 0.3063672834878989|
|  0|0.619189370225301|0.11190167037821486|
|  0|0.619189370225301|0.07747379719322578|
+---+-----------------+-------------------+
only showing top 10 rows



                                                                                

### Key salting

In [20]:
# Import random module
import random

# Identify the skewed value in the id column
skewed_value = 0

# 생성 a list of salt values to append to the skewed value
salt_list = ["_A", "_B", "_C", "_D", "_E"]

# 생성 a new column in both DataFrames that contains the original invoice_id value plus a salt value if it is skewed, or just the original invoice_id value otherwise
large_df = (large_df_repartitioned
              .withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"salted_id", when(large_df_repartitioned.id == skewed_value, concat(large_df_repartitioned.id, lit(random.choice(salt_list))))
                          .otherwise(  # 기본값 설정large_df_repartitioned.id)))
skewed_df = (skewed_df_repartitioned
             .withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"salted_id", when(skewed_df_repartitioned.id == skewed_value, concat(skewed_df_repartitioned.id, lit(random.choice(salt_list))))
                         .otherwise(  # 기본값 설정skewed_df_repartitioned.id)))

# Join the two DataFrames on the new column using the default join strategy (sort-merge join)
salted_join_df = large_df.join(  # 데이터프레임 조인skewed_df, "salted_id")

# Drop the new column and keep only the original invoice_id column
final_join_df = salted_join_df.drop(  # 컬럼 삭제"salted_id")

measure_time(final_join_df)

                                                                                

Execution time: 19.14427161216736 seconds


In [21]:
# 출력 some rows of the result
final_join_df.show(10)



+------+-------------------+------+-------------------+
|    id|              value|    id|              value|
+------+-------------------+------+-------------------+
|100010|0.35369292037242683|100010|0.35369292037242683|
|100227| 0.7173545475305475|100227| 0.7173545475305475|
|100263| 0.6839437246645035|100263| 0.6839437246645035|
|100553| 0.9213033942226746|100553| 0.9213033942226746|
|100735| 0.5717367801064485|100735| 0.5717367801064485|
|101021|0.36029429102236565|101021|0.36029429102236565|
|101122| 0.7321914376505848|101122| 0.7321914376505848|
|101205| 0.5124474456736382|101205| 0.5124474456736382|
|101261| 0.6677580792714339|101261| 0.6677580792714339|
|102113| 0.7728354101123371|102113| 0.7728354101123371|
+------+-------------------+------+-------------------+
only showing top 10 rows



                                                                                

In [22]:
spark.stop()  # Spark 세션 종료 - 리소스 정리