In [1]:
# Import modules
from pyspark.sql import SparkSession  # Spark SQL 작업을 위한 SparkSession 임포트
from pyspark.sql.functions import  # Spark SQL 함수들 임포트 broadcast, col, rand, skewness,lit

In [3]:
spark = (SparkSession.builder  # SparkSession 빌더 패턴 시작
         .appName("optimize-join-strategies")  # 애플리케이션 이름 설정
         .master("spark://spark-master:7077")  # Spark 마스터 URL 설정
         .config("spark.executor.memory", "512m")  # Spark 설정 옵션
         .getOrCreate()  # SparkSession 생성 또는 기존 세션 반환)

spark.sparkContext.setLogLevel("ERROR")  # 로그 레벨을 ERROR로 설정

In [4]:
# 생성 some sample data frames
# A large data frame with 10 million rows and two columns: id and value
large_df = spark.range(0, 1000000).withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"value", rand(seed=42))

# A small data frame with 10000 rows and two columns: id and name
small_df = spark.range(0, 10000).withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"name", col("id").cast("string"))

# A skewed data frame with 10 million rows and two columns: id and value
# The id column has a Zipf distribution with a skewness of 4.7
skewed_df = spark.range(0, 1000000).withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"value", rand(seed=42)).withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"id", col("id") ** 4)

In [5]:
# Define a function to measure the execution time of a query
import time

def measure_time(query):
    start = time.time()
    query.collect() # Force the query execution by calling an action
    end = time.time()
    print(f"Execution time: {end - start} seconds")

## Choosing the right join type

In [10]:
# Join large_df and small_df using an inner  # 내부 조인 join on id column
measure_time(large_df.join(  # 데이터프레임 조인small_df, "id"))

# Join large_df and small_df using a left  # 왼쪽 조인 outer join on id column
measure_time(large_df.join(  # 데이터프레임 조인small_df, "id", "left"))

# Join large_df and small_df using a right  # 오른쪽 조인 outer join on id column
measure_time(large_df.join(  # 데이터프레임 조인small_df, "id", "right"))

# Join large_df and small_df using a full outer  # 외부 조인 join on id column
measure_time(large_df.join(  # 데이터프레임 조인small_df, "id", "full"))

# Join large_df and small_df using a left  # 왼쪽 조인 semi join on id column
measure_time(large_df.join(  # 데이터프레임 조인small_df, "id", "left_semi"))

# Join large_df and small_df using a left  # 왼쪽 조인 anti join on id column
measure_time(large_df.join(  # 데이터프레임 조인small_df, "id", "left_anti"))

Execution time: 1.0291528701782227 seconds


                                                                                

Execution time: 25.628353357315063 seconds


                                                                                

Execution time: 5.767467021942139 seconds


                                                                                

Execution time: 18.269603490829468 seconds
Execution time: 0.5185227394104004 seconds
Execution time: 6.6788036823272705 seconds


## Broadcasting small tables

In [16]:
# Join large_df and small_df using an inner  # 내부 조인 join with broadcast hash join hint
spark.conf.set(  # Spark 설정 변경"spark.sql.adaptive.enabled", "false")
spark.conf.set(  # Spark 설정 변경"spark.sql.autoBroadcastJoinThreshold", -1)

# Join large_df and small_df using an inner  # 내부 조인 join without broadcasting
measure_time(large_df.join(  # 데이터프레임 조인small_df, "id"))

# Join large_df and small_df using an inner  # 내부 조인 join with broadcasting
measure_time(large_df.join(  # 데이터프레임 조인broadcast(small_df), "id"))

                                                                                

Execution time: 3.1970551013946533 seconds
Execution time: 0.20557928085327148 seconds


## Using Join Hints

In [17]:
# Join large_df and small_df using an inner  # 내부 조인 join with broadcast hash join hint
inner_join_broadcast_hint = large_df.hint("broadcast").join(  # 데이터프레임 조인small_df, "id")
measure_time(inner  # 내부 조인_join_broadcast_hint)

# Join large_df and small_df using an inner  # 내부 조인 join with shuffle hash join hint
inner_join_shuffle_hash_hint = large_df.hint("shuffle_hash").join(  # 데이터프레임 조인small_df, "id")
measure_time(inner  # 내부 조인_join_shuffle_hash_hint)

# Join large_df and small_df using an inner  # 내부 조인 join with shuffle replicate nested loop join hint
inner_join_shuffle_replicate_nl_hint = large_df.hint("shuffle_replicate_nl").join(  # 데이터프레임 조인small_df, "id")
measure_time(inner  # 내부 조인_join_shuffle_replicate_nl_hint)

# Join large_df and small_df using an inner  # 내부 조인 join with sort merge join hint
inner_join_merge_hint = large_df.hint("merge").join(  # 데이터프레임 조인small_df, "id")
measure_time(inner  # 내부 조인_join_merge_hint)

Execution time: 1.8980967998504639 seconds


                                                                                

Execution time: 2.253967046737671 seconds


                                                                                

Execution time: 761.0224421024323 seconds




Execution time: 2.357747793197632 seconds


                                                                                

## Enable Adaptive Query Execution

In [19]:
# Join large_df and skewed_df using an inner  # 내부 조인 join without AQE
spark.conf.set(  # Spark 설정 변경"spark.sql.adaptive.enabled", "false")
inner_join_no_aqe = large_df.join(  # 데이터프레임 조인skewed_df, "id")
measure_time(inner  # 내부 조인_join_no_aqe)

# Join large_df and skewed_df using an inner  # 내부 조인 join with AQE
spark.conf.set(  # Spark 설정 변경"spark.sql.adaptive.enabled", "true")
inner_join_aqe = large_df.join(  # 데이터프레임 조인skewed_df, "id")
measure_time(inner  # 내부 조인_join_aqe)

                                                                                

Execution time: 8.188302278518677 seconds




Execution time: 2.7499380111694336 seconds


                                                                                

## Handling skewed data

### Salting

In [20]:
# Join large_df and skewed_df using an inner  # 내부 조인 join with salting
# Add a salt column to the skewed_df with 10 random values
skewed_df_with_salt = skewed_df.withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"salt", (rand(seed=42) * 10).cast("int"))

# Join large_df and skewed_df_with_salt on id and salt columns
salted_join = large_df.withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"salt", lit(0)).join(skewed_df_with_salt, ["id", "salt"])

# Remove the salt column and self-join on id column
salted_join_no_salt = salted_join.drop("salt").join(skewed_df.select(  # 컬럼 선택"id"), "id")
measure_time(salted_join_no_salt)



Execution time: 6.502509117126465 seconds


                                                                                

### Repartitioning

In [21]:
# Join large_df and skewed_df using an inner  # 내부 조인 join with repartitioning
# Repartition the skewed_df into 1000 partitions
skewed_df_repartitioned = skewed_df.repartition(  # 파티션 재분배1000, "id")

# Join large_df and skewed_df_repartitioned on id column
repartitioned_join = large_df.join(  # 데이터프레임 조인skewed_df_repartitioned, "id")
measure_time(repartitioned_join)



Execution time: 10.696999549865723 seconds


                                                                                

In [25]:
spark.stop()  # Spark 세션 종료 - 리소스 정리