In [2]:
from pyspark.sql import SparkSession  # Spark SQL 작업을 위한 SparkSession 임포트 
from pyspark.sql.functions import  # Spark SQL 함수들 임포트 rand, when

In [3]:
spark = (SparkSession.builder  # SparkSession 빌더 패턴 시작
         .appName("partitioning-and-repartitioning")  # 애플리케이션 이름 설정
         .master("spark://spark-master:7077")  # Spark 마스터 URL 설정
         .config("spark.executor.memory", "512m")  # Spark 설정 옵션
         .getOrCreate()  # SparkSession 생성 또는 기존 세션 반환)

spark.sparkContext.setLogLevel("ERROR")  # 로그 레벨을 ERROR로 설정

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/21 13:36:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# 생성 some sample data frames
# A large data frame with 1 million rows
large_df = (spark.range(0, 1000000)
            .withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"salary", 100*(rand() * 100).cast("int"))
            .withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"gender", when((rand() * 2).cast("int") == 0, "M").otherwise("F"))
            .withColumn(  # 새 컬럼 추가 또는 기존 컬럼 수정"country_code", 
                        when(  # 조건문 시작(rand() * 4).cast("int") == 0, "US")
                        .when(  # 조건문 시작(rand() * 4).cast("int") == 1, "CN")
                        .when(  # 조건문 시작(rand() * 4).cast("int") == 2, "IN")
                        .when(  # 조건문 시작(rand() * 4).cast("int") == 3, "BR")))
large_df.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+---+------+------+------------+
| id|salary|gender|country_code|
+---+------+------+------------+
|  0|  2700|     M|          BR|
|  1|   500|     M|          IN|
|  2|  4200|     M|          BR|
|  3|  6700|     M|          US|
|  4|   400|     M|          BR|
+---+------+------+------------+
only showing top 5 rows



                                                                                

In [5]:
num_partitions = large_df.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

partition_sizes = large_df.rdd.glom().map(len).collect()
print(f"Partition sizes: {partition_sizes}")

Number of partitions: 2




Partition sizes: [500000, 500000]


                                                                                

In [6]:
df_hash = large_df.repartition(  # 파티션 재분배10, "id")

In [7]:
num_partitions = df_hash.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

partition_sizes = df_hash.rdd.glom().map(len).collect()
print(f"Partition sizes: {partition_sizes}")

[Stage 2:>                                                          (0 + 2) / 2]

Number of partitions: 10




Partition sizes: [99990, 99781, 99533, 99938, 100111, 100200, 100448, 100094, 100048, 99857]


                                                                                

In [8]:
df_range = large_df.repartitionByRange(10, "id")

In [9]:
num_partitions = df_range.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

partition_sizes = df_range.rdd.glom().map(len).collect()
print(f"Partition sizes: {partition_sizes}")



Number of partitions: 10




Partition sizes: [93970, 98146, 108664, 99680, 99522, 106143, 92137, 97096, 112388, 92254]


                                                                                

In [10]:
df_coalesce = df_range.coalesce(  # 파티션 수 줄이기4)

In [11]:
num_partitions = df_coalesce.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

partition_sizes = df_coalesce.rdd.glom().map(len).collect()
print(f"Partition sizes: {partition_sizes}")

Number of partitions: 4




Partition sizes: [205487, 298575, 299793, 196145]


                                                                                

In [None]:
(large_df.write 
    .format("parquet")
    .partitionBy("id")
    .mode("overwrite")  # 기존 데이터 덮어쓰기
    .save("../data/tmp/partitioned_output"))

[Stage 13:>                                                         (0 + 2) / 2]

In [21]:
df_read = (spark.read
           .format("parquet")
           .load(  # 파일 로드"../data/tmp/partitioned_output"))

df_read.show(5)

                                                                                

+----------+---+
|     value| id|
+----------+---+
|0.15517375|505|
|0.95623612|505|
|0.09070664|505|
|0.85489201|505|
|0.09197253|505|
+----------+---+
only showing top 5 rows



In [22]:
spark.stop()  # Spark 세션 종료 - 리소스 정리