In [54]:
import timeit  # standard library: used to time the example queries

# Wildcard imports are kept so that every name the notebook cells rely on
# (e.g. configure_spark_with_delta_pip, rand, when) stays in scope.
# NOTE: `from pyspark.sql.functions import *` shadows Python builtins such as
# sum/min/max with their Spark column-function counterparts.
from delta import *  # Delta Lake helpers
from pyspark.sql import SparkSession  # entry point for Spark SQL work
from pyspark.sql.functions import *  # Spark SQL column functions
from pyspark.sql.types import *  # Spark SQL data types

In [3]:
# Build a SparkSession configured for Delta Lake.
builder = (
    SparkSession.builder
    .appName("optimize-table-partitions-delta")  # application name
    .master("spark://spark-master:7077")  # Spark master URL
    .config("spark.executor.memory", "512m")  # memory setting for executors
    # Register Delta Lake's SQL extension and use DeltaCatalog as the session catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip wraps the builder before the session is created
# (the recorded Ivy log shows io.delta#delta-core_2.12 being added as a dependency).
spark = configure_spark_with_delta_pip(builder).getOrCreate()  # create or reuse the session
spark.sparkContext.setLogLevel("ERROR")  # only log errors

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a4ea6b6e-1365-4798-a1ed-32d6667a2ddd;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 310ms :: artifacts dl 22ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0 

In [12]:
# Load the sparksql_magic IPython extension.
%load_ext sparksql_magic
# Set the extension's SparkSql.limit option to 20
# (presumably the maximum number of rows the magic displays — confirm).
%config SparkSql.limit=20

In [45]:
# Create a sample DataFrame with 1 million rows (id 0..999999) and three
# randomly generated columns:
#   salary       - 100 * int(rand() * 100), i.e. a multiple of 100
#   gender       - "M" or "F"
#   country_code - one of US / CN / IN / BR, falling back to RU
# NOTE: rand() is called without a seed, so the generated values differ between runs.
# NOTE(review): every when() branch below calls rand() separately, so each
# condition is tested against its own draw; the country codes are therefore not
# uniformly distributed and the "RU" fallback is reachable. Confirm this is intended.
large_df = (
    spark.range(0, 1000000)
    .withColumn("salary", 100 * (rand() * 100).cast("int"))
    .withColumn("gender", when((rand() * 2).cast("int") == 0, "M").otherwise("F"))
    .withColumn(
        "country_code",
        when((rand() * 4).cast("int") == 0, "US")
        .when((rand() * 4).cast("int") == 1, "CN")
        .when((rand() * 4).cast("int") == 2, "IN")
        .when((rand() * 4).cast("int") == 3, "BR")
        .otherwise("RU"),  # default when no branch matched
    )
)
large_df.show(5)

+---+------+------+------------+
| id|salary|gender|country_code|
+---+------+------+------------+
|  0|  3100|     M|          US|
|  1|  4300|     M|          CN|
|  2|  3000|     M|          IN|
|  3|  4500|     F|          US|
|  4|  5900|     F|          RU|
+---+------+------+------------+
only showing top 5 rows



In [47]:
# Persist the sample data as an unpartitioned Delta table, overwriting
# whatever already exists at the target path.
large_df.write.save(
    "../data/tmp/large_delta",
    format="delta",
    mode="overwrite",
)

                                                                                

In [48]:
# Persist the same data as a Delta table partitioned by country_code,
# overwriting existing data and allowing the stored schema to be replaced.
large_df.write.save(
    "../data/tmp/large_delta_partitioned",
    format="delta",
    mode="overwrite",
    partitionBy="country_code",
    overwriteSchema="true",
)

                                                                                

In [55]:
# Time one run of the aggregation over the unpartitioned Delta table.
# The statement is handed to timeit as source text and resolves `spark` through
# globals=globals(); explanatory comments belong here, not inside the timed string.
# NOTE(review): the table is read via an absolute path while the write cell uses
# the relative path "../data/tmp/large_delta" — confirm both resolve to the same location.
non_partitioned_query = (
    'spark.sql("SELECT country_code,gender, COUNT(*) AS employees '
    "FROM delta.`/opt/workspace/data/tmp/large_delta` "
    'GROUP BY ALL ORDER BY employees DESC").show()'
)
# number=1 takes a single sample that also includes printing via .show(),
# so the figure is indicative only.
non_partitioned_time = timeit.timeit(non_partitioned_query, number=1, globals=globals())
print(f"Non-partitioned query time: {non_partitioned_time} seconds")

                                                                                

+------------+------+---------+
|country_code|gender|employees|
+------------+------+---------+
|          RU|     F|   158374|
|          RU|     M|   158085|
|          US|     M|   124924|
|          US|     F|   124921|
|          CN|     F|    94448|
|          CN|     M|    93464|
|          IN|     F|    70416|
|          IN|     M|    70189|
|          BR|     F|    52726|
|          BR|     M|    52453|
+------------+------+---------+

Non-partitioned query time: 1.8541588850002881 seconds


In [56]:
# Time one run of the same aggregation over the Delta table partitioned by
# country_code. The statement is handed to timeit as source text and resolves
# `spark` through globals=globals(); explanatory comments belong here, not
# inside the timed string.
# NOTE(review): the table is read via an absolute path while the write cell uses
# the relative path "../data/tmp/large_delta_partitioned" — confirm both resolve
# to the same location.
partitioned_query = (
    'spark.sql("SELECT country_code,gender, COUNT(*) AS employees '
    "FROM delta.`/opt/workspace/data/tmp/large_delta_partitioned` "
    'GROUP BY ALL ORDER BY employees DESC").show()'
)
# number=1 takes a single sample that also includes printing via .show(),
# so the figure is indicative only.
partitioned_time = timeit.timeit(partitioned_query, number=1, globals=globals())
print(f"Partitioned query time: {partitioned_time} seconds")


+------------+------+---------+
|country_code|gender|employees|
+------------+------+---------+
|          RU|     F|   158374|
|          RU|     M|   158085|
|          US|     M|   124924|
|          US|     F|   124921|
|          CN|     F|    94448|
|          CN|     M|    93464|
|          IN|     F|    70416|
|          IN|     M|    70189|
|          BR|     F|    52726|
|          BR|     M|    52453|
+------------+------+---------+

Partitioned query time: 0.9437477390001732 seconds


In [57]:
spark.stop()  # Stop the Spark session to release its resources