In [1]:
import timeit  # used to time the read queries

from delta import *  # Delta Lake helpers (configure_spark_with_delta_pip is used below)
from pyspark.sql import SparkSession  # entry point for Spark SQL work
from pyspark.sql.functions import *  # Spark SQL column functions (rand, when are used below)
from pyspark.sql.types import *  # Spark SQL data types

In [2]:
# Build a SparkSession configured for Delta Lake.
builder = (
    SparkSession.builder
    .appName("compression-delta-table")  # application name
    .master("spark://spark-master:7077")  # Spark master URL
    .config("spark.executor.memory", "512m")  # executor memory setting
    # Register Delta Lake's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip wraps the builder before the session is created;
# getOrCreate() then creates a new session or returns an existing one.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")  # set the log level to ERROR

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-039c90a5-6b76-416c-a380-08c58b2a6e42;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 399ms :: artifacts dl 14ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0 

In [3]:
# Load the sparksql_magic IPython extension.
%load_ext sparksql_magic
# Set the SparkSql magic's `limit` option to 20.
%config SparkSql.limit=20

In [4]:
# Create a sample DataFrame with 1 million rows: an `id` column from spark.range
# plus randomly generated salary, gender and country_code columns.
#
# NOTE: each rand() call below is its own random column, so the chained
# country_code conditions are tested against separate draws. That is why the
# otherwise('RU') fallback can be reached (RU rows appear in the output) and why
# the country codes are not uniformly distributed.
df = (
    spark.range(0, 1000000)
    # salary: a multiple of 100
    .withColumn("salary", 100 * (rand() * 100).cast("int"))
    # gender: "M" when the truncated draw is 0, otherwise "F"
    .withColumn("gender", when((rand() * 2).cast("int") == 0, "M").otherwise("F"))
    # country_code: first matching condition wins, 'RU' when none match
    .withColumn(
        "country_code",
        when((rand() * 4).cast("int") == 0, "US")
        .when((rand() * 4).cast("int") == 1, "CN")
        .when((rand() * 4).cast("int") == 2, "IN")
        .when((rand() * 4).cast("int") == 3, "BR")
        .otherwise('RU'),
    )
)
df.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+---+------+------+------------+
| id|salary|gender|country_code|
+---+------+------+------------+
|  0|  2000|     M|          BR|
|  1|  4000|     M|          RU|
|  2|  2400|     F|          IN|
|  3|  6200|     M|          US|
|  4|  7900|     M|          RU|
+---+------+------+------------+
only showing top 5 rows



                                                                                

In [5]:
# Persist the DataFrame as a Delta Lake table, overwriting any existing data.
# No compression option is given, so the default codec (snappy) applies.
df.write.save(
    "../data/tmp/employee_salary_snappy",
    format="delta",
    mode="overwrite",
)

                                                                                

In [6]:
# Time one full read of the snappy-compressed Delta table. The result is sent to
# the "noop" format, so the statement reads the table without producing output files.
query = (
    '(spark.read.format("delta")'
    '.load("../data/tmp/employee_salary_snappy")'
    '.write.mode("overwrite").format("noop").save())'
)
# timeit compiles the statement string; globals=globals() lets it resolve `spark`.
snappy_time = timeit.timeit(query, number=1, globals=globals())
print(f"Snappy Compression query time: {snappy_time} seconds")

[Stage 20:>                                                         (0 + 2) / 2]

Snappy Compression query time: 2.3627631140006997 seconds


                                                                                

In [7]:
# Persist the same DataFrame as a Delta Lake table, overwriting any existing data,
# this time requesting the zstd compression codec through the writer option.
df.write.save(
    "../data/tmp/employee_salary_zstd",
    format="delta",
    mode="overwrite",
    compression="zstd",
)

                                                                                

In [8]:
# Time one full read of the zstd-compressed Delta table. The result is sent to
# the "noop" format, so the statement reads the table without producing output files.
query = (
    '(spark.read.format("delta")'
    '.load("../data/tmp/employee_salary_zstd")'
    '.write.mode("overwrite").format("noop").save())'
)
# timeit compiles the statement string; globals=globals() lets it resolve `spark`.
zstd_time = timeit.timeit(query, number=1, globals=globals())
print(f"zstd Compression query time: {zstd_time} seconds")

                                                                                

zstd Compression query time: 1.8800181449987576 seconds


                                                                                

In [10]:
spark.stop()  # Stop the Spark session to clean up resources