# Today's topic: How Spark partitions are influenced when loading the data with parquet

# Why do we need this?
- Understanding how Spark creates partitions, and what influences them, makes performance tuning and debugging easier
- You learned how the number of partitions, empty partitions and the distribution of data within partitions influence performance (the previous comments are added below)
- Coalesce and especially repartition are expensive operations. If we can already influence the Spark partitions during loading, that is a big win

# 0. Set-Ups

General hints for this notebook:
- The Spark UI is usually accessible at http://localhost:4040/ or http://localhost:4041/
- Deep dive Spark UI happens in later episodes
- sc.setJobDescription("Description") replaces the Job Description of an action in the Spark UI with your own
- sdf.rdd.getNumPartitions() returns the number of partitions of the current Spark DataFrame
- sdf.write.format("noop").mode("overwrite").save() is a good way to analyze and initiate actions for transformations without side effects during an actual write

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import gresearch.spark.parquet
import math
import time


In [2]:
# Reference gresearch:
# - Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/
# - GitHub Spark extension: https://github.com/G-Research/spark-extension
# - Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet

# Local session with 4 worker threads; the spark-extension package is pulled in
# through "spark.jars.packages".
spark = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[4]")
    .enableHiveSupport()
    .config("spark.jars.packages", "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5")
    .getOrCreate()
)

# Shortcut to the SparkContext, used below for setJobDescription().
sc = spark.sparkContext

In [3]:
# Turn off Adaptive Query Execution (AQE): it generates additional jobs, which
# would be confusing for the scenarios in this notebook.
spark.conf.set("spark.sql.adaptive.enabled", "false")
# Disable DataFrame caching; with caching enabled the results may not be repeatable.
# NOTE(review): this is a Databricks-prefixed setting - confirm it has any effect
# on a local Spark session.
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [4]:
def sdf_generator(num_rows: int, num_partitions: int = None) -> "DataFrame":
    """Generate a synthetic DataFrame with ``num_rows`` rows.

    The frame starts from ``spark.range`` (column ``id``) and gets these
    derived columns: ``date``, ``timestamp``, ``idstring`` (``id`` cast to
    string), ``idfirst`` and ``idlast`` (substrings of ``idstring``).

    ``num_partitions`` is forwarded to ``spark.range`` as ``numPartitions``.
    """
    base = spark.range(num_rows, numPartitions=num_partitions)
    with_time = base.withColumn("date", f.current_date())
    with_time = with_time.withColumn("timestamp", f.current_timestamp())
    with_string = with_time.withColumn("idstring", f.col("id").cast("string"))
    id_string = f.col("idstring")
    with_first = with_string.withColumn("idfirst", id_string.substr(0, 1))
    return with_first.withColumn("idlast", id_string.substr(-1, 1))

In [5]:
def rows_per_partition(sdf: "DataFrame") -> None:
    """Show the row count and percentage share of every Spark partition of ``sdf``.

    Triggers a ``count()`` on ``sdf`` to get the total, then prints one row per
    ``partition_id`` (ordered) with ``count`` and ``count_perc``.
    """
    total_rows = sdf.count()
    (
        sdf.withColumn("partition_id", f.spark_partition_id())
        .groupBy("partition_id")
        .count()
        .withColumn("count_perc", 100*f.col("count")/total_rows)
        .orderBy("partition_id")
        .show()
    )

In [6]:
def rows_per_partition_col(sdf: "DataFrame", num_rows: int, col: str) -> None:
    """Show row counts per (Spark partition, ``col`` value) combination.

    ``num_rows`` is the caller-supplied total used as the denominator for the
    ``count_perc`` column; this function does not count ``sdf`` itself.
    """
    group_keys = ("partition_id", col)
    (
        sdf.withColumn("partition_id", f.spark_partition_id())
        .groupBy(*group_keys)
        .count()
        .withColumn("count_perc", 100*f.col("count")/num_rows)
        .orderBy(*group_keys)
        .show()
    )


In [7]:
# Root directory under which the generated parquet data sets are written.
BASE_DIR = "D:/Spark/Data"
# Shared containers the analysis helpers store their measurements in.
results_dict, results_list = {}, []

In [8]:
def write_generator(num_rows, num_files):
    """Generate a data set and write it as parquet below ``BASE_DIR``.

    The DataFrame is created by ``sdf_generator`` with ``num_files`` partitions
    and written (mode ``overwrite``) to
    ``{BASE_DIR}/{num_files}_files_{num_rows}_rows.parquet``.

    Args:
        num_rows: Number of rows to generate.
        num_files: Number of partitions of the generated DataFrame.

    Returns:
        The path the data set was written to.
    """
    sdf = sdf_generator(num_rows, num_files)
    path = f"{BASE_DIR}/{num_files}_files_{num_rows}_rows.parquet"
    sc.setJobDescription(f"Write {num_files} files, {num_rows} rows")
    try:
        sdf.write.format("parquet").mode("overwrite").save(path)
    finally:
        # Clear the description (None, not the text "None") so later jobs show
        # their default description again, even if the write failed.
        sc.setJobDescription(None)
    print(f"Num partitions written: {sdf.rdd.getNumPartitions()}")
    print(f"Saved Path: {path}")
    return path

In [9]:
def set_configs(maxPartitionsMB = 128, openCostInMB = 4, minPartitions = 4):
    """Set the file-source partitioning configs on the session and print them.

    ``maxPartitionsMB`` and ``openCostInMB`` are converted to bytes (rounded up)
    and written to ``spark.sql.files.maxPartitionBytes`` and
    ``spark.sql.files.openCostInBytes`` with a ``b`` suffix; ``minPartitions``
    is written to ``spark.sql.files.minPartitionNum``. The MB value of the max
    partition size is also stored in ``results_dict["maxPartitionsBytes"]``.
    """
    maxPartitionsBytes, openCostInBytes = (
        math.ceil(size_mb*1024*1024) for size_mb in (maxPartitionsMB, openCostInMB)
    )

    settings = (
        ("spark.sql.files.maxPartitionBytes", f"{maxPartitionsBytes}b"),
        ("spark.sql.files.openCostInBytes", f"{openCostInBytes}b"),
        ("spark.sql.files.minPartitionNum", str(minPartitions)),
    )
    for conf_key, conf_value in settings:
        spark.conf.set(conf_key, conf_value)

    report = [
        " ",
        "******** SPARK CONFIGURATIONS ********",
        f"MaxPartitionSize {maxPartitionsMB} MB or {maxPartitionsBytes} bytes",
        f"OpenCostInBytes {openCostInMB} MB or {openCostInBytes} bytes",
        f"Min Partitions: {minPartitions}",
    ]
    print("\n".join(report))

    # Note: despite the key name, the value stored here is in MB.
    results_dict["maxPartitionsBytes"] = maxPartitionsMB

In [82]:
def get_parquet_meta_data(path):
    """Show and return per-file parquet metadata for ``path``.

    Reads ``spark.read.parquet_metadata`` (spark-extension), keeps one row per
    ``filename`` and adds ``compressedMB`` (rounded to one decimal) and
    ``calcNumBlocks`` (``compressedMB`` divided by 128).

    Returns:
        The resulting DataFrame (also printed, up to 20 rows, untruncated).
    """
    per_file = (
        spark.read.parquet_metadata(path)
        .select("filename", "blocks", "compressedBytes", "rows")
        .dropDuplicates(["filename"])
    )
    size_mb = f.round(f.col("compressedBytes")/1024/1024, 1)
    sdf = per_file.withColumn("compressedMB", size_mb)
    sdf = sdf.withColumn("calcNumBlocks", f.col("compressedMB")/128)
    sdf.show(20, truncate=False)
    return sdf

In [11]:
def get_parquet_blocks(path):
    """Show the parquet blocks of the files under ``path``.

    Reads ``spark.read.parquet_blocks`` (spark-extension), keeps one row per
    (``filename``, ``block``) and adds ``blockEnd``, ``blockMiddle`` and
    ``compressedMB``. Prints up to 20 rows; nothing is returned.
    """
    start = f.col("blockStart")
    size = f.col("compressedBytes")

    blocks = spark.read.parquet_blocks(path).dropDuplicates(["filename","block"])
    blocks = blocks.orderBy("filename", "block")
    blocks = blocks.withColumn("blockEnd", start + size - 1)
    blocks = blocks.withColumn("blockMiddle", start + 0.5 * size)
    blocks = blocks.withColumn("compressedMB", f.round(size/1024/1024, 1))

    shown_columns = [
        "filename", "block", "blockStart", "blockEnd", "blockMiddle",
        "compressedBytes", "compressedMB", "rows",
    ]
    blocks.select(*shown_columns).show(20, truncate=False)

In [12]:
def get_spark_partitions(path):
    """Show how the parquet files under ``path`` are split into partitions.

    Reads ``spark.read.parquet_partitions`` (spark-extension), adds
    ``compressedMB`` and prints up to 20 rows; nothing is returned.
    """
    shown_columns = [
        "partition", "start", "end", "length", "blocks",
        "compressedBytes", "compressedMB", "rows", "filename",
    ]
    partitions = spark.read.parquet_partitions(path)
    partitions = partitions.withColumn(
        "compressedMB", f.round(f.col("compressedBytes")/1024/1024, 1)
    )
    partitions.select(*shown_columns).show(20, truncate=False)

In [13]:
def get_parquet_window_length(path):
    """Print the largest ``length`` value reported by ``parquet_partitions`` for ``path``."""
    partitions = spark.read.parquet_partitions(path)
    max_length = partitions.agg(f.max(partitions["length"])).first()[0]
    size_mb = round(max_length/1024/1024, 1)
    print(f"Max Parquet window length: {size_mb} MB or {max_length} bytes")

In [14]:
def get_parquet_file_size(path):
    """Return the summed ``compressedBytes`` of all parquet files under ``path``.

    One row per ``filename`` of ``spark.read.parquet_metadata`` is used.
    """
    per_file = (
        spark.read.parquet_metadata(path)
        .select("filename", "blocks", "compressedBytes", "rows")
        .dropDuplicates(["filename"])
    )
    total_row = per_file.select(f.sum(per_file["compressedBytes"])).collect()[0]
    return total_row[0]

In [15]:
def round_half_up(n, decimals=0):
    """Round ``n`` to ``decimals`` places, with ties going towards +infinity.

    Source: https://realpython.com/python-rounding/
    """
    scale = 10**decimals
    shifted = n * scale + 0.5
    return math.floor(shifted) / scale

In [16]:
def estimate_num_partitions(file_size, num_files):
    """
    Estimate the number of Spark partitions for reading a parquet data set.

    Two estimates are printed and stored in ``results_dict``:
    - "Internet": padded total size divided by the max split size.
    - "Own": number of files divided by the number of average-sized padded
      files that fit into one max split.

    Reference to code:
    - Stackoverflow: https://stackoverflow.com/questions/70985235/what-is-opencostinbytes
    - GitHub: https://github.com/apache/spark/blob/v3.3.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala#L86-L97

    Args:
        file_size: Total size of the data set in bytes.
        num_files: Number of files of the data set.
    """
    def conf_bytes(key):
        # The stored value carries a one-character unit suffix (set_configs writes "<n>b").
        return int(spark.conf.get(key)[:-1])

    def as_mb(num_bytes):
        return round(num_bytes/1024/1024, 1)

    # Current Spark settings.
    max_partition_bytes = conf_bytes("spark.sql.files.maxPartitionBytes")
    open_cost = conf_bytes("spark.sql.files.openCostInBytes")
    min_partition_num = int(spark.conf.get("spark.sql.files.minPartitionNum"))

    # Max split size:
    # a) For big data sets the padded bytes per core exceed both the open cost and
    #    maxPartitionBytes, so maxPartitionBytes caps the split size
    #    (e.g. 1 GB data set, 4 cores, maxPartitionBytes 128 MB).
    # b) If the padded bytes per core win, the data is split fairly over all cores
    #    (e.g. 1 GB data set, 4 cores, maxPartitionBytes 300 MB).
    # c) If the bytes per core are too small, the open cost is the lower bound, which
    #    limits the number of partitions to be opened.
    padded_total = file_size + num_files * open_cost
    padded_per_core = padded_total / min_partition_num
    max_split = min(max_partition_bytes, max(open_cost, padded_per_core))

    # Estimate taken from the references above.
    partitions_internet = padded_total/max_split

    # Own estimate: how many average padded files fit into one split, then how many
    # partitions are needed for all files.
    padded_avg_file = padded_total/num_files
    per_core = file_size / min_partition_num
    files_per_partition = max(1, math.floor(max_split/padded_avg_file))
    partitions_own = num_files/files_per_partition

    report = [
        " ",
        "******** ESTIMATION OF MAX SPLIT PARTITION BYTES AND NO PARTITIONS ********",
        f"Avg file Size Padded: {as_mb(padded_avg_file)} MB or {padded_avg_file} bytes",
        f"Padded File Size: {as_mb(padded_total)} MB or {padded_total} bytes",
        f"SizePerCore: {as_mb(per_core)} MB or {per_core} bytes",
        f"SizePerCorePadded: {as_mb(padded_per_core)} MB or {padded_per_core} bytes",
        f"MaxSplitPartitionBytes: {as_mb(max_split)} MB or {max_split} bytes",
        f"MaxFilesPerPartition {files_per_partition}",
        f"EstimatedPartitions: {math.ceil(partitions_own)}, unrounded: {partitions_own}",
        f"EstimatedPartitionsInternet: {math.ceil(partitions_internet)}, unrounded: {partitions_internet}",
    ]
    print("\n".join(report))

    results_dict.update({
        "paddedFileSize": as_mb(padded_total),
        "MBPerCore": as_mb(per_core),
        "MBPerCorePadded": as_mb(padded_per_core),
        "maxSplitPartitionBytes": as_mb(max_split),
        "avg_file_size_padded": as_mb(padded_avg_file),
        "Maxfiles_per_partition": files_per_partition,
        "MyEstimationPartitions": math.ceil(partitions_own),
        "InternetEstimationPartitions": math.ceil(partitions_internet),
    })
    


In [17]:
def bytes_rows_per_partition(path):
    """Show and return compressed bytes, rows and entry count per partition.

    Groups ``spark.read.parquet_partitions(path)`` by ``partition`` and reports
    the summed ``compressedBytes`` and ``rows`` plus the number of grouped
    entries as ``numFiles``.

    Returns:
        The aggregated DataFrame ordered by ``partition`` (also printed, up to 20 rows).
    """
    aggregated = spark.read.parquet_partitions(path).groupBy("partition").agg(
        f.sum("compressedBytes").alias("compressedBytes"),
        f.sum("rows").alias("rows"),
        f.count("partition").alias("numFiles"),
    )
    with_mb = aggregated.withColumn(
        "compressedMB", f.round(f.col("compressedBytes")/1024/1024, 1)
    )
    sdf = with_mb.select(
        "partition", "numFiles", "compressedBytes", "compressedMB", "rows"
    ).orderBy("partition")
    sdf.show(20)
    return sdf

In [18]:
def avg_bytes_rows_partition(sdf):
    """Show the per-partition averages of files, compressed size and rows.

    Args:
        sdf: DataFrame with the columns ``numFiles``, ``compressedBytes`` and
            ``rows`` (as returned by ``bytes_rows_per_partition``).
    """
    sdf = (
        sdf.select(f.mean("numFiles"), f.mean("compressedBytes"), f.mean("rows"))
        # Bytes -> MB is /1024/1024; the previous divisor 1204 was a typo that
        # under-reported the average size.
        .withColumn("avg(compressedMB)", f.round(f.col("avg(compressedBytes)")/1024/1024, 1))
        .select("avg(numFiles)", "avg(compressedBytes)", "avg(compressedMB)", "avg(rows)")
    )
    sdf.show()

In [19]:
def file_analysis(path, num_files):
    """Print the total and the average file size of the data set under ``path``.

    The total comes from ``get_parquet_file_size``; the average divides it by
    the caller-supplied ``num_files``.
    """
    def as_mb(num_bytes):
        return round(num_bytes/1024/1024, 1)

    total_bytes = get_parquet_file_size(path)
    avg_bytes = total_bytes/num_files
    report = [
        " ",
        "******** FILE SIZE ANALYSIS ********",
        f"File Size: {as_mb(total_bytes)} MB or {total_bytes} bytes",
        f"Num files: {num_files}",
        f"Avg file Size: {as_mb(avg_bytes)} MB or {avg_bytes} bytes",
    ]
    print("\n".join(report))
    


In [20]:
def row_count_analysis(num_files, num_rows):
    """Print the number of files, the number of rows and the (truncated) rows per file."""
    rows_per_file = int(num_rows/num_files)
    report = (
        " ",
        "******** ROW COUNT ANALYSIS ********",
        f"Num files written: {num_files}",
        f"Num rows written: {num_rows}",
        f"Num rows per file: {rows_per_file}",
    )
    print("\n".join(report))

In [21]:
def get_actual_num_partitions(path):
    """Print and record the partition count Spark uses when reading ``path``.

    The value is stored in ``results_dict["ActualNumPartitions"]``.
    """
    actual = spark.read.parquet(path).rdd.getNumPartitions()
    print(" \n******** ACTUAL RESULTS ********")
    print(f"ActualNumPartitions: {actual}")
    results_dict["ActualNumPartitions"] = actual


In [22]:
def noop_write(path):
    """Read ``path`` and time a ``noop`` write of the whole data set.

    The ``noop`` format triggers the full read without writing anything. The
    duration in seconds (two decimals) is printed and stored in
    ``results_dict["ExecutionTime"]``.
    """
    sdf = spark.read.parquet(path)
    sc.setJobDescription("WRITE")
    try:
        # perf_counter is monotonic, so the duration cannot be distorted by
        # system clock adjustments.
        start_time = time.perf_counter()
        sdf.write.format("noop").mode("overwrite").save()
        end_time = time.perf_counter()
    finally:
        # Clear the description (None, not the text "None") so later jobs show
        # their default description again, even if the write failed.
        sc.setJobDescription(None)
    duration = round(end_time - start_time, 2)
    results_dict["ExecutionTime"] = duration
    print(f"Duration: {duration} sec")

# 1. What influences the no. of partitions when loading parquet files
- Num Cores in the cluster,
  - Strictly speaking it's the "spark.sql.files.minPartitionNum" config
  - It defaults to the default parallelism, which is our number of cores = 4
- File Size or better estimated file size
- Num parquet files
- Num of blocks/rowgroups within a parquet file
- Max Partition Size:
  - Influences the size of a partition  
  - based on the config "spark.sql.files.maxPartitionBytes"
  - defaults to 128 MB 
- Open Cost In Bytes
  - Represents the cost of opening a file, i.e. of adding another file to a partition
  - based on the config "spark.sql.files.openCostInBytes"
  - defaults to 4 MB
  - Technically it adds the cost, e.g. 4 MB, to each file, which is called padding
  - Through this, fewer but bigger partitions are created, at least around the size of the open cost value
  - Usually it has no influence, except for small files; the default of 4 MB works
  - Official description: The estimated cost to open a file, measured by the number of bytes that could be scanned in the same time. This is used when putting multiple files into a partition. It is better to over-estimate, then the partitions with small files will be faster than partitions with bigger files (which is scheduled first). This configuration is effective only when using file-based sources such as Parquet, JSON and ORC.

References:
- https://stackoverflow.com/questions/70985235/what-is-opencostinbytes
- https://stackoverflow.com/questions/69034543/number-of-tasks-while-reading-hdfs-in-spark
- https://stackoverflow.com/questions/75924368/skewed-partitions-when-setting-spark-sql-files-maxpartitionbytes
- https://spark.apache.org/docs/latest/sql-performance-tuning.html
- https://www.linkedin.com/pulse/how-initial-number-partitions-determined-pyspark-sugumar-srinivasan#:~:text=Ideally%20partitions%20will%20be%20created,resource%20will%20get%20utilised%20properly

# 2. Summary: Why do we need this?
- Understanding how Spark creates partitions, and what influences them, makes performance tuning and debugging easier
- You learned how the number of partitions, empty partitions and the distribution of data within partitions influence performance (the previous comments are added below)
- Coalesce and especially repartition are expensive operations. If we can already influence the Spark partitions during loading, that is a big win

# 3. Recap for reading: How partitions influence performance

## The most important thing: you want good parallelisation.
- This means your number of partitions should always depend on the number of cores you have available. In Spark language: spark.sparkContext.defaultParallelism. Recommendations are a factor of 2-4, but it really depends on memory and data size. Small data sizes run perfectly with a factor of 1x.
- To get good parallelisation you should also have a well-distributed dataset (ideally uniform, worst case normal). Even in narrow transformations, data skew can make your whole execution dependent on one partition or task, as we saw before
## Partition size
- If your partition size is really big (> 1 GB) you might get OOM (out of memory), garbage collection (GC) and other errors
- Recommendations on the internet say anything between 100-1000 MB. Spark, for example, sets its max partition bytes parameter to 128 MB. It really depends on your machine and the available memory, of course. Definitely don't push the limits of the available memory.
## Distribution overhead
- As we saw in previous experiments, too high a number of partitions leads to a lot of scheduling and distribution overhead.
- A warning sign is when the actual execution time makes up less than about 90 % of the total task time. Also, if your tasks take less than 100 ms, they are usually too short

See also here: https://stackoverflow.com/questions/64600212/how-to-determine-the-partition-size-in-an-apache-spark-dataframe

# 5. Simple experiments
- Experiment 1: 4 files of 64.8 MB each, 259.3 MB in total
- Experiment 2: 8 files of 64.9 MB each, 518.9 MB in total
- Experiment 3: 8 files of 47.5 MB each, 380 MB in total

In [126]:
# Experiment 1 data set: 32,000,000 rows written with 4 partitions.
num_files, num_rows = 4, 32000000
path = write_generator(num_rows, num_files)

Num partitions written: 4
Saved Path: D:/Spark/Data/4_files_32000000_rows.parquet


In [84]:
# Experiment 2 data set: 64,000,000 rows written with 8 partitions.
num_files, num_rows = 8, 64000000
path = write_generator(num_rows, num_files)

Num partitions written: 8
Saved Path: D:/Spark/Data/8_files_64000000_rows.parquet


# 4. Basic Algorithm

In [96]:
def basic_algorithm(file_size):
    """Print the basic (unpadded) estimate of the number of partitions.

    The partition size is the smaller of ``spark.sql.files.maxPartitionBytes``
    and ``file_size / spark.sql.files.minPartitionNum``; the estimate is
    ``file_size`` divided by that partition size (rounded up for display).

    Reference: https://www.linkedin.com/pulse/how-initial-number-partitions-determined-pyspark-sugumar-srinivasan#:~:text=Ideally%20partitions%20will%20be%20created,resource%20will%20get%20utilised%20properly

    Args:
        file_size: Total size of the data set in bytes.
    """
    def as_mb(num_bytes):
        return round(num_bytes/1024/1024, 1)

    # The stored value carries a one-character unit suffix (set_configs writes "<n>b").
    max_partition_bytes = int(spark.conf.get("spark.sql.files.maxPartitionBytes")[:-1])
    min_partition_num = int(spark.conf.get("spark.sql.files.minPartitionNum"))

    size_per_core = file_size/min_partition_num
    partition_size = min(max_partition_bytes, size_per_core)
    no_partitions = file_size/partition_size

    report = [
        " ",
        "******** BASIC ALGORITHM TO ESTIMATE NO PARTITIONS ********",
        f"File Size: {as_mb(file_size)} MB or {file_size} bytes",
        f"Size Per Core: {as_mb(size_per_core)} MB or {size_per_core} bytes",
        f"Partionsize: {as_mb(partition_size)} MB or {partition_size} bytes",
        f"EstimatedPartitions: {math.ceil(no_partitions)}, unrounded: {no_partitions}",
    ]
    print("\n".join(report))

In [102]:
# Experiment on the 4-file data set (32,000,000 rows).
path = "D:/Spark/Data/4_files_32000000_rows.parquet"
num_files = 4
num_rows = 32000000
# Show the per-file parquet metadata and print size / row statistics.
get_parquet_meta_data(path)
file_analysis(path, num_files)
row_count_analysis(num_files, num_rows)
# The configs must be set before the estimators run: they read them from spark.conf.
set_configs(maxPartitionsMB=30, openCostInMB=4, minPartitions=4)
size = get_parquet_file_size(path)
# Estimated number of partitions, without and with open-cost padding.
basic_algorithm(size)
basic_maxSplitBytes(size, num_files)
# Compare the estimates with what Spark actually does.
get_actual_num_partitions(path)
#noop_write(path)
bytes_rows_per_partition(path)

+---------------------------------------------------------------------------------------------------------------------+------+---------------+-------+------------+-------------+
|filename                                                                                                             |blocks|compressedBytes|rows   |compressedMB|calcNumBlocks|
+---------------------------------------------------------------------------------------------------------------------+------+---------------+-------+------------+-------------+
|file:/D:/Spark/Data/4_files_32000000_rows.parquet/part-00000-a5294f1d-0061-4db6-8982-3646992e8441-c000.snappy.parquet|1     |67726805       |8000000|64.6        |0.5046875    |
|file:/D:/Spark/Data/4_files_32000000_rows.parquet/part-00001-a5294f1d-0061-4db6-8982-3646992e8441-c000.snappy.parquet|1     |68008198       |8000000|64.9        |0.50703125   |
|file:/D:/Spark/Data/4_files_32000000_rows.parquet/part-00003-a5294f1d-0061-4db6-8982-3646992e8441-c000.snappy

DataFrame[partition: int, numFiles: bigint, compressedBytes: bigint, compressedMB: double, rows: bigint]

In [121]:
# Experiment on the 8-file data set (64,000,000 rows).
path = "D:/Spark/Data/8_files_64000000_rows.parquet"
num_files = 8
num_rows = 64000000
# Keep the per-file metadata: the file sizes feed the partition simulation below.
sdf_meta_data = get_parquet_meta_data(path)
file_analysis(path, num_files)
row_count_analysis(num_files, num_rows)
# The configs must be set before the estimators run: they read them from spark.conf.
set_configs(maxPartitionsMB=30, openCostInMB=4, minPartitions=4)
size = get_parquet_file_size(path)
# Estimated number of partitions, without and with open-cost padding.
basic_algorithm(size)
basic_maxSplitBytes(size, num_files)
# Simulate the file-to-partition assignment with the computed max split size.
max_split_bytes = maxSplitBytes(size, num_files)
file_size_list = get_files_as_list(sdf_meta_data)
res = getFilePartitions(file_size_list, max_split_bytes)
# Compare the estimates with what Spark actually does.
get_actual_num_partitions(path)
#noop_write(path)
bytes_rows_per_partition(path)

+---------------------------------------------------------------------------------------------------------------------+------+---------------+-------+------------+-------------+
|filename                                                                                                             |blocks|compressedBytes|rows   |compressedMB|calcNumBlocks|
+---------------------------------------------------------------------------------------------------------------------+------+---------------+-------+------------+-------------+
|file:/D:/Spark/Data/8_files_64000000_rows.parquet/part-00001-6cf07477-2127-47fc-a59f-461f5c3494a3-c000.snappy.parquet|1     |68008198       |8000000|64.9        |0.50703125   |
|file:/D:/Spark/Data/8_files_64000000_rows.parquet/part-00003-6cf07477-2127-47fc-a59f-461f5c3494a3-c000.snappy.parquet|1     |68063741       |8000000|64.9        |0.50703125   |
|file:/D:/Spark/Data/8_files_64000000_rows.parquet/part-00002-6cf07477-2127-47fc-a59f-461f5c3494a3-c000.snappy

DataFrame[partition: int, numFiles: bigint, compressedBytes: bigint, compressedMB: double, rows: bigint]

In [100]:
def basic_maxSplitBytes(file_size, num_files):
    """
    Print a partition estimate based on the open-cost padded data set size.

    Each file is padded with ``spark.sql.files.openCostInBytes``; the partition
    size is the smaller of ``spark.sql.files.maxPartitionBytes`` and the padded
    size per ``spark.sql.files.minPartitionNum``.

    Reference to code:
    - Stackoverflow: https://stackoverflow.com/questions/70985235/what-is-opencostinbytes
    - GitHub: https://github.com/apache/spark/blob/v3.3.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala#L86-L97
    """
    def conf_bytes(key):
        # The stored value carries a one-character unit suffix (set_configs writes "<n>b").
        return int(spark.conf.get(key)[:-1])

    def as_mb(num_bytes):
        return round(num_bytes/1024/1024, 1)

    max_partition_bytes = conf_bytes("spark.sql.files.maxPartitionBytes")
    open_cost = conf_bytes("spark.sql.files.openCostInBytes")
    min_partition_num = int(spark.conf.get("spark.sql.files.minPartitionNum"))

    padded_total = file_size + num_files * open_cost
    padded_per_core = padded_total / min_partition_num
    partition_size = min(max_partition_bytes, padded_per_core)
    partitions = padded_total/partition_size

    report = [
        " ",
        "******** Basic TO Calculate Max PartitionSize ********",
        f"Padded File Size: {as_mb(padded_total)} MB or {padded_total} bytes",
        f"SizePerCorePadded: {as_mb(padded_per_core)} MB or {padded_per_core} bytes",
        f"MaxPartionsize: {as_mb(partition_size)} MB or {partition_size} bytes",
        f"EstimatedPartitions: {math.ceil(partitions)}, unrounded: {partitions}",
    ]
    print("\n".join(report))

In [112]:
def maxSplitBytes(file_size, num_files):
    """
    Compute, print and return the max split (partition) size in bytes.

    Same as ``basic_maxSplitBytes`` but with ``spark.sql.files.openCostInBytes``
    as lower bound: ``min(maxPartitionBytes, max(openCostInBytes, paddedSize / minPartitionNum))``.

    Reference to code:
    - Stackoverflow: https://stackoverflow.com/questions/70985235/what-is-opencostinbytes
    - GitHub: https://github.com/apache/spark/blob/v3.3.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala#L86-L97

    Returns:
        The max partition size in bytes.
    """
    def conf_bytes(key):
        # The stored value carries a one-character unit suffix (set_configs writes "<n>b").
        return int(spark.conf.get(key)[:-1])

    def as_mb(num_bytes):
        return round(num_bytes/1024/1024, 1)

    max_partition_bytes = conf_bytes("spark.sql.files.maxPartitionBytes")
    open_cost = conf_bytes("spark.sql.files.openCostInBytes")
    min_partition_num = int(spark.conf.get("spark.sql.files.minPartitionNum"))

    padded_total = file_size + num_files * open_cost
    padded_per_core = padded_total / min_partition_num
    max_partition_size = min(max_partition_bytes, max(open_cost, padded_per_core))
    partitions = padded_total/max_partition_size

    report = [
        " ",
        "******** Advanced TO Calculate Max PartitionSize ********",
        f"Padded File Size: {as_mb(padded_total)} MB or {padded_total} bytes",
        f"SizePerCorePadded: {as_mb(padded_per_core)} MB or {padded_per_core} bytes",
        f"MaxPartionsize: {as_mb(max_partition_size)} MB or {max_partition_size} bytes",
        f"EstimatedPartitions: {math.ceil(partitions)}, unrounded: {partitions}",
    ]
    print("\n".join(report))
    return max_partition_size


In [123]:
file_size_list

[68008198,
 68063741,
 68064947,
 68063635,
 68063528,
 68063118,
 67726805,
 68063235]

In [124]:
max_split_bytes

31457280

In [128]:
def split_files(files_list, max_partition_size):
    """Expand a list of file sizes with placeholder entries for oversized files.

    Every file size is kept as is. A file larger than ``max_partition_size``
    is followed by ``ceil(file_size / max_partition_size) - 1`` entries of
    size 0, so the list has one entry per split such a file needs.

    Args:
        files_list: Iterable of file sizes.
        max_partition_size: Maximum size of one split; must be positive.

    Returns:
        A new list with the original sizes and the inserted 0 placeholders.
    """
    result_list = []
    for file_size in files_list:
        result_list.append(file_size)
        if file_size > max_partition_size:
            # One entry already exists for the file itself, hence the "- 1".
            num_dummy_files = math.ceil(file_size / max_partition_size) - 1
            result_list.extend([0] * num_dummy_files)
    return result_list


In [130]:
# Expand the real file sizes with 0-sized placeholders for every additional split
# an oversized file needs, then display how many entries result.
dummy_file_size_list = split_files(file_size_list, max_split_bytes)
len(dummy_file_size_list)

24

In [113]:
def get_files_as_list(parquet_meta_data_sdf):
    """Return the ``compressedBytes`` column of the metadata DataFrame as a list."""
    size_column = "compressedBytes"
    sizes = parquet_meta_data_sdf.select(size_column).toPandas()
    return list(sizes[size_column])

def getFilePartitions(file_size_list, max_split_bytes, open_cost_in_bytes=None):
    """Simulate how files are packed into partitions.

    Files are processed in the given order. A partition is closed as soon as
    adding the next file would exceed ``max_split_bytes``; every file added
    increases the running partition size by its own size plus the open cost.
    Only non-empty partitions are recorded, so an oversized first file or an
    empty input does not produce a spurious empty partition.

    Reference to code in GitHub:
    https://github.com/apache/spark/blob/v3.3.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala

    Args:
        file_size_list: File sizes in bytes, in the order they are packed.
        max_split_bytes: Maximum size of one partition in bytes.
        open_cost_in_bytes: Cost added per file. When omitted, it is read from
            ``spark.sql.files.openCostInBytes`` of the notebook's Spark session.

    Returns:
        A list with one dict per partition, holding ``files`` (the sizes),
        ``num_files`` and ``partition_size`` (sum of the sizes, without open cost).
    """
    if open_cost_in_bytes is None:
        # The stored value carries a one-character unit suffix (set_configs writes "<n>b").
        open_cost_in_bytes = int(spark.conf.get("spark.sql.files.openCostInBytes")[:-1])

    partitions = []
    current_files = []
    current_size = 0

    def close_partition():
        # Record the partition only if it actually contains files.
        if current_files:
            partitions.append({
                "files": current_files.copy(),
                "num_files": len(current_files),
                "partition_size": sum(current_files),
            })
        current_files.clear()

    for file_size in file_size_list:
        if current_size + file_size > max_split_bytes:
            close_partition()
            current_size = 0
        current_size += file_size + open_cost_in_bytes
        current_files.append(file_size)
    close_partition()
    print(f"Number calculated Partitions: {len(partitions)}")
    return partitions

In [141]:
# Hypothetical 200 MB data set with a max partition size far above the data size,
# so only the size per core limits the partition size.
file_size = 200
set_configs(200000, 4, 4)
basic_algorithm(file_size * 1024 * 1024)

 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 200000 MB or 209715200000 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** BASIC ALGORITHM TO ESTIMATE NO PARTITIONS ********
File Size: 200.0 MB or 209715200 bytes
Size Per Core: 50.0 MB or 52428800.0 bytes
Partionsize: 50.0 MB or 52428800.0 bytes
EstimatedPartitions: 4, unrounded: 4.0


In [95]:
spark.sparkContext.defaultParallelism

4

In [147]:
# Scratch calculation; the expected result is roughly 18.8.
68.8 - 50

18.799999999999997

In [150]:
50/14.8

3.378378378378378

In [155]:
68.9*2

137.8

+---------------------------------------------------------------------------------------------------------------------+------+---------------+-------+------------+-------------+
|filename                                                                                                             |blocks|compressedBytes|rows   |compressedMB|calcNumBlocks|
+---------------------------------------------------------------------------------------------------------------------+------+---------------+-------+------------+-------------+
|file:/D:/Spark/Data/8_files_64000000_rows.parquet/part-00001-6cf07477-2127-47fc-a59f-461f5c3494a3-c000.snappy.parquet|1     |68008198       |8000000|64.9        |0.50703125   |
|file:/D:/Spark/Data/8_files_64000000_rows.parquet/part-00003-6cf07477-2127-47fc-a59f-461f5c3494a3-c000.snappy.parquet|1     |68063741       |8000000|64.9        |0.50703125   |
|file:/D:/Spark/Data/8_files_64000000_rows.parquet/part-00002-6cf07477-2127-47fc-a59f-461f5c3494a3-c000.snappy

DataFrame[partition: int, numFiles: bigint, compressedBytes: bigint, compressedMB: double, rows: bigint]

In [89]:
def get_files_as_list(parquet_meta_data_sdf):
    """Return the ``compressedBytes`` column of the metadata DataFrame as a list."""
    size_column = "compressedBytes"
    sizes = parquet_meta_data_sdf.select(size_column).toPandas()
    return list(sizes[size_column])

+---------------------------------------------------------------------------------------------------------------------+------+---------------+-------+------------+-------------+
|filename                                                                                                             |blocks|compressedBytes|rows   |compressedMB|calcNumBlocks|
+---------------------------------------------------------------------------------------------------------------------+------+---------------+-------+------------+-------------+
|file:/D:/Spark/Data/4_files_32000000_rows.parquet/part-00000-a5294f1d-0061-4db6-8982-3646992e8441-c000.snappy.parquet|1     |67726805       |8000000|64.6        |0.5046875    |
|file:/D:/Spark/Data/4_files_32000000_rows.parquet/part-00001-a5294f1d-0061-4db6-8982-3646992e8441-c000.snappy.parquet|1     |68008198       |8000000|64.9        |0.50703125   |
|file:/D:/Spark/Data/4_files_32000000_rows.parquet/part-00003-a5294f1d-0061-4db6-8982-3646992e8441-c000.snappy

[67726805, 68008198, 68063741, 68064947]

In [42]:
68064947 + 68063741

136128688

In [43]:
130*1024*1024

136314880

In [116]:
# Experiment 3 data set: 47,000,000 rows written with 8 partitions.
num_files, num_rows = 8, 47000000
path = write_generator(num_rows, num_files)

Num partitions written: 8
Saved Path: D:/Spark/Data/8_files_47000000_rows.parquet


In [163]:
# Experiment on the 8-file data set with 47,000,000 rows.
num_files = 8
num_rows = 47000000
path = "D:/Spark/Data/8_files_47000000_rows.parquet"
# Print size and row statistics.
file_analysis(path, num_files)
row_count_analysis(num_files, num_rows)
# The configs must be set before the estimator runs: it reads them from spark.conf.
set_configs(maxPartitionsMB=104, openCostInMB=4, minPartitions=4)
size = get_parquet_file_size(path)
basic_algorithm(size)
#estimate_num_partitions(size, num_files)
# Compare the estimate with what Spark actually does and time a full read.
get_actual_num_partitions(path)
noop_write(path)
bytes_rows_per_partition(path)

 
******** FILE SIZE ANALYSIS ********
File Size: 380.0 MB or 398415693 bytes
Num files: 8
Avg file Size: 47.5 MB or 49801961.625 bytes
 
******** ROW COUNT ANALYSIS ********
Num files written: 8
Num rows written: 47000000
Num rows per file: 5875000
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 104 MB or 109051904 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** BASIC ALGORITHM TO ESTIMATE NO PARTITIONS ********
File Size: 380.0 MB or 398415693 bytes
Size Per Core: 95.0 MB or 99603923.25 bytes
Partionsize: 95.0 MB or 99603923.25 bytes
EstimatedPartitions: 4, unrounded: 4.0
 
******** ACTUAL RESULTS ********
ActualNumPartitions: 4
Duration: 5.36 sec
+---------+--------+---------------+------------+--------+
|partition|numFiles|compressedBytes|compressedMB|    rows|
+---------+--------+---------------+------------+--------+
|        0|       2|       99956576|        95.3|11750000|
|        1|       2|       99707363|        95.1|11750000|
|        2| 

DataFrame[partition: int, numFiles: bigint, compressedBytes: bigint, compressedMB: double, rows: bigint]

In [93]:


# Pack four equally sized files (64.8 each, presumably MB) into partitions with a
# limit of 65; the recorded result puts every file into its own partition.
# NOTE(review): the debug output prints 4194368.8, which equals 4194304 + 64.8.
# That suggests an open cost in bytes is being added to a file size given in MB
# inside getFilePartitions -- confirm that both arguments use the same unit.
getFilePartitions([64.8, 64.8, 64.8, 64.8], 65)


0
[]
4194368.8
[64.8]
partition_closed
4194368.8
[64.8]
partition_closed
4194368.8
[64.8]
partition_closed
partition_closed


[{'files': [64.8], 'num_files': 1, 'partition_size': 64.8},
 {'files': [64.8], 'num_files': 1, 'partition_size': 64.8},
 {'files': [64.8], 'num_files': 1, 'partition_size': 64.8},
 {'files': [64.8], 'num_files': 1, 'partition_size': 64.8}]

In [92]:
# NOTE(review): in the recorded run this action failed with
# `Cannot run program "python3": CreateProcess error=2` (file not found), i.e.
# the Python worker executable could not be launched. Presumably the worker
# interpreter setting (e.g. PYSPARK_PYTHON) has to point at an existing
# executable before the session is created -- confirm against the environment.
sdf.show()

Py4JJavaError: An error occurred while calling o2722.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 472.0 failed 1 times, most recent failure: Lost task 0.0 in stage 472.0 (TID 16770) (DESKTOP-PNH8CDK executor driver): java.io.IOException: Cannot run program "python3": CreateProcess error=2, Das System kann die angegebene Datei nicht finden
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:181)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.IOException: CreateProcess error=2, Das System kann die angegebene Datei nicht finden
	at java.lang.ProcessImpl.create(Native Method)
	at java.lang.ProcessImpl.<init>(ProcessImpl.java:453)
	at java.lang.ProcessImpl.start(ProcessImpl.java:139)
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1029)
	... 33 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4344)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4334)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4332)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3549)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at sun.reflect.GeneratedMethodAccessor129.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.IOException: Cannot run program "python3": CreateProcess error=2, Das System kann die angegebene Datei nicht finden
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:181)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.io.IOException: CreateProcess error=2, Das System kann die angegebene Datei nicht finden
	at java.lang.ProcessImpl.create(Native Method)
	at java.lang.ProcessImpl.<init>(ProcessImpl.java:453)
	at java.lang.ProcessImpl.start(ProcessImpl.java:139)
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1029)
	... 33 more
