In [6]:
from pyspark import StorageLevel
from pyspark.sql import SparkSession  # entry point for Spark SQL work
from pyspark.sql.functions import current_date, date_sub, rand  # column functions used to build the sample data

In [2]:
# Build a SparkSession, or reuse an existing one, against the configured master.
spark = (
    SparkSession.builder
    .appName("cache-and-persist")  # application name
    .master("spark://spark-master:7077")  # Spark master URL
    .config("spark.executor.memory", "512m")  # executor memory setting
    .getOrCreate()  # create a new session or return the existing one
)

# Restrict Spark log output to ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/21 13:44:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
# Define a function to measure the execution time of a query
import time

def measure_time(query):
    """Execute ``query`` and print how long the execution took.

    Args:
        query: Any object exposing a ``collect()`` method (for example a
            Spark DataFrame). ``collect()`` is called exactly once to force
            the query to run; its result is discarded.

    Returns:
        None. The elapsed time is printed as
        ``Execution time: <elapsed> seconds``.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    query.collect()  # action that forces the query to execute
    elapsed = time.perf_counter() - start
    print(f"Execution time: {elapsed} seconds")

In [3]:
# Create a sample DataFrame with 10 million rows and three columns:
#   id        - sequential values produced by spark.range
#   date      - the current date minus a random offset of 0-364 days
#   ProductId - a random integer in the range 0-99
large_df = (
    spark.range(0, 10000000)
    .withColumn("date", date_sub(current_date(), (rand() * 365).cast("int")))
    .withColumn("ProductId", (rand() * 100).cast("int"))
)
# Preview the first five rows.
large_df.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+---+----------+---------+
| id|      date|ProductId|
+---+----------+---------+
|  0|2024-02-10|       67|
|  1|2023-07-12|       39|
|  2|2023-08-10|        8|
|  3|2023-05-22|       29|
|  4|2023-06-22|       63|
+---+----------+---------+
only showing top 5 rows



                                                                                

In [4]:
# Mark the DataFrame for caching with cache(). cache() hands back the same
# DataFrame, so its storage level can be read and printed in one chained
# expression.
print(large_df.cache().storageLevel)

Disk Memory Deserialized 1x Replicated


In [7]:
# Persist the DataFrame with an explicitly chosen storage level.
# NOTE: MEMORY_AND_DISK_DESER is the level the earlier cache() call already
# applied (the printed storage level is identical in both cells), so this call
# does not change the effective level. To switch to a genuinely different
# level, call large_df.unpersist() first and then persist() with that level.
large_df.persist(StorageLevel.MEMORY_AND_DISK_DESER)
# Check the storage level of the persisted DataFrame.
print(large_df.storageLevel)

Disk Memory Deserialized 1x Replicated


In [10]:
# First run: count rows per ProductId and time the aggregation. This is the
# first timed action on the cached DataFrame, so the measurement presumably
# includes the cost of computing the data and filling the cache.
results_df = large_df.groupBy("ProductId").agg({"Id": "count"})
measure_time(results_df)
# Show the first five rows of the result.
results_df.show(5)

                                                                                

Execution time: 8.600075006484985 seconds
+---------+---------+
|ProductId|count(Id)|
+---------+---------+
|       31|    99961|
|       85|    99746|
|       65|   100023|
|       53|   100615|
|       78|    99985|
+---------+---------+
only showing top 5 rows



In [11]:
# Second run of the same per-ProductId count. Re-running the identical
# aggregation shows the timing once the DataFrame has already been computed
# and cached by an earlier action.
results_df = large_df.groupBy("ProductId").agg({"Id": "count"})
measure_time(results_df)
# Show the first five rows of the result.
results_df.show(5)

Execution time: 0.984121561050415 seconds




+---------+---------+
|ProductId|count(Id)|
+---------+---------+
|       31|    99961|
|       85|    99746|
|       65|   100023|
|       53|   100615|
|       78|    99985|
+---------+---------+
only showing top 5 rows



                                                                                

In [12]:
# Release the cached data with unpersist(). unpersist() hands back the same
# DataFrame, so its storage level can be read and printed in one chained
# expression.
print(large_df.unpersist().storageLevel)

Serialized 1x Replicated


In [13]:
spark.stop()  # Stop the Spark session to clean up its resources