# Today's topic: Discover the secrets of influencing Spark partitions during reads

# Why do we need this?
- Understanding how Spark creates partitions, and what influences them, makes performance tuning and debugging easier
- You learned how the number of partitions, empty partitions and the distribution of data within partitions influence performance (added the previous comments below)
- Coalesce and especially repartition are expensive operations. If we can already influence the Spark partitions during loading, that is a big win

# 0. Set-Ups

General hints for this notebook:
- The Spark UI is usually accessible at http://localhost:4040/ or http://localhost:4041/
- Deep dive Spark UI happens in later episodes
- sc.setJobDescription("Description") replaces the Job Description of an action in the Spark UI with your own
- sdf.rdd.getNumPartitions() returns the number of partitions of the current Spark DataFrame
- sdf.write.format("noop").mode("overwrite").save() is a good way to analyze and initiate actions for transformations without side effects during an actual write

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import gresearch.spark.parquet
import math
import time
import pandas as pd
import os
import sys


In [4]:
# Point both PySpark interpreter settings at the Python executable that is
# running this notebook (sys.executable), so they cannot diverge.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [5]:
# Build (or reuse) a Spark session on the local master with 4 threads and
# Hive support enabled. The spark.jars.packages entry requests the G-Research
# spark-extension package; presumably this is what provides the
# parquet_metadata / parquet_blocks / parquet_partitions readers used by the
# helper functions in this notebook (see the reference links below).
spark = SparkSession \
    .builder \
    .appName("Data with Nikk the Greek Spark Session") \
    .master("local[4]") \
    .enableHiveSupport() \
    .config("spark.jars.packages", "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5") \
    .getOrCreate()

"""
Reference gresearch:
- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/
- GitHub Spark extension: https://github.com/G-Research/spark-extension
- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet
"""
# Shortcut to the SparkContext; used below for sc.setJobDescription(...).
sc = spark.sparkContext

In [6]:
# Turn off Adaptive Query Execution (AQE): it generates additional jobs,
# which would be confusing for the scenarios in this notebook.
spark.conf.set("spark.sql.adaptive.enabled", "false")
# Disable DataFrame caching so that repeated runs give repeatable results.
# NOTE(review): this is a Databricks-specific key; it presumably has no effect
# on the local session created above - confirm.
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [7]:
def sdf_generator(num_rows: int, num_partitions: int = None) -> "DataFrame":
    """Generate a synthetic DataFrame with ``num_rows`` rows.

    Starts from ``spark.range`` (column ``id``) and adds:
    ``date`` (current date), ``timestamp`` (current timestamp),
    ``idstring`` (``id`` cast to string), ``idfirst`` (first character of
    ``idstring``) and ``idlast`` (last character of ``idstring``).

    ``num_partitions`` is forwarded as ``numPartitions`` to ``spark.range``;
    ``None`` leaves the choice to Spark.
    """
    id_numbers = spark.range(num_rows, numPartitions=num_partitions)

    with_time_columns = (
        id_numbers
        .withColumn("date", f.current_date())
        .withColumn("timestamp", f.current_timestamp())
    )

    with_id_text = with_time_columns.withColumn(
        "idstring", f.col("id").cast("string")
    )

    id_text = f.col("idstring")
    return (
        with_id_text
        .withColumn("idfirst", id_text.substr(0, 1))
        .withColumn("idlast", id_text.substr(-1, 1))
    )

In [8]:
def rows_per_partition(sdf: "DataFrame") -> None:
    """Show, per Spark partition, the row count and its share of all rows.

    Triggers a ``count()`` on ``sdf`` for the total, then displays one line
    per ``partition_id`` with ``count`` and ``count_perc`` (percentage of the
    total), ordered by partition id.
    """
    total_rows = sdf.count()
    share_of_total = 100*f.col("count")/total_rows

    distribution = (
        sdf.withColumn("partition_id", f.spark_partition_id())
        .groupBy("partition_id")
        .count()
        .withColumn("count_perc", share_of_total)
        .orderBy("partition_id")
    )
    distribution.show()

In [9]:
def rows_per_partition_col(sdf: "DataFrame", num_rows: int, col: str) -> None:
    """Show row counts per (Spark partition, ``col`` value) combination.

    Unlike ``rows_per_partition`` the total is not counted here: the caller
    passes ``num_rows``, which is used as the denominator for ``count_perc``.
    Output is ordered by partition id and then by ``col``.
    """
    share_of_total = 100*f.col("count")/num_rows

    distribution = (
        sdf.withColumn("partition_id", f.spark_partition_id())
        .groupBy("partition_id", col)
        .count()
        .withColumn("count_perc", share_of_total)
        .orderBy("partition_id", col)
    )
    distribution.show()


In [10]:
# Root folder under which write_generator() stores the generated Parquet data.
BASE_DIR = "D:/Spark/Data"
# Shared scratch dict that several helpers below write their measurements into.
results_dict = {}
results_list = []

In [11]:
def write_generator(num_rows, num_files):
    """Generate ``num_rows`` rows and write them as Parquet under ``BASE_DIR``.

    The DataFrame is created with ``num_files`` partitions via
    ``sdf_generator`` and written in overwrite mode to
    ``{BASE_DIR}/{num_files}_files_{num_rows}_rows.parquet``.

    The write job is labelled in the Spark UI through the job description;
    the description is cleared again afterwards, even if the write fails.

    Returns:
        The path the data was written to.
    """
    sdf = sdf_generator(num_rows, num_files)
    path = f"{BASE_DIR}/{num_files}_files_{num_rows}_rows.parquet"
    sc.setJobDescription(f"Write {num_files} files, {num_rows} rows")
    try:
        sdf.write.format("parquet").mode("overwrite").save(path)
    finally:
        # Pass None (not the string "None") so the custom description is
        # removed; the string would label every following job "None".
        sc.setJobDescription(None)
    print(f"Num partitions written: {sdf.rdd.getNumPartitions()}")
    print(f"Saved Path: {path}")
    return path

In [12]:
def set_configs(maxPartitionsMB = 128, openCostInMB = 4, minPartitions = 4):
    """Apply the three file-read partitioning settings and print them.

    The MB arguments are converted to whole bytes (rounded up) and written
    with a trailing "b" to ``spark.sql.files.maxPartitionBytes`` and
    ``spark.sql.files.openCostInBytes``; ``minPartitions`` is written to
    ``spark.sql.files.minPartitionNum``.

    Side effect: stores ``maxPartitionsMB`` in the module-level
    ``results_dict`` under the key ``"maxPartitionsBytes"``.
    """
    maxPartitionsBytes = math.ceil(maxPartitionsMB*1024*1024)
    openCostInBytes = math.ceil(openCostInMB*1024*1024)

    settings = (
        ("spark.sql.files.maxPartitionBytes", f"{maxPartitionsBytes}b"),
        ("spark.sql.files.openCostInBytes", f"{openCostInBytes}b"),
        ("spark.sql.files.minPartitionNum", str(minPartitions)),
    )
    for key, value in settings:
        spark.conf.set(key, value)

    summary = [
        " ",
        "******** SPARK CONFIGURATIONS ********",
        f"MaxPartitionSize {maxPartitionsMB} MB or {maxPartitionsBytes} bytes",
        f"OpenCostInBytes {openCostInMB} MB or {openCostInBytes} bytes",
        f"Min Partitions: {minPartitions}",
    ]
    print("\n".join(summary))

    # Note: the value stored is the MB figure, despite the key's name.
    results_dict["maxPartitionsBytes"] = maxPartitionsMB

In [13]:
def get_parquet_meta_data(path):
    """Show and return per-file Parquet metadata for ``path``.

    Reads ``spark.read.parquet_metadata(path)``, keeps one row per
    ``filename`` with ``blocks``, ``compressedBytes`` and ``rows``, and adds
    ``compressedMB`` (rounded to one decimal) and ``calcNumBlocks``
    (``compressedMB`` divided by 128). Up to 20 rows are displayed untruncated.
    """
    one_row_per_file = (
        spark.read.parquet_metadata(path)
        .select("filename", "blocks", "compressedBytes", "rows")
        .dropDuplicates(["filename"])
    )

    size_in_mb = f.round(f.col("compressedBytes")/1024/1024, 1)
    sdf = one_row_per_file.withColumn("compressedMB", size_in_mb)
    sdf = sdf.withColumn("calcNumBlocks", f.col("compressedMB")/128)

    sdf.show(20, truncate=False)
    return sdf

In [14]:
def get_parquet_blocks(path):
    """Show the Parquet blocks of the files at ``path``.

    Reads ``spark.read.parquet_blocks(path)``, keeps one row per
    (``filename``, ``block``), and derives ``blockEnd``, ``blockMiddle`` and
    ``compressedMB`` from ``blockStart`` and ``compressedBytes``.
    Displays up to 20 rows untruncated; nothing is returned.
    """
    block_start = f.col("blockStart")
    block_bytes = f.col("compressedBytes")

    unique_blocks = (
        spark.read.parquet_blocks(path)
        .dropDuplicates(["filename","block"])
        .orderBy("filename", "block")
    )

    sdf = (
        unique_blocks
        .withColumn("blockEnd", block_start + block_bytes - 1)
        .withColumn("blockMiddle", block_start + 0.5 * block_bytes)
        .withColumn("compressedMB", f.round(block_bytes/1024/1024, 1))
        .select("filename", "block", "blockStart", "blockEnd", "blockMiddle", "compressedBytes", "compressedMB", "rows")
    )

    sdf.show(20, truncate=False)

In [15]:
def get_spark_partitions(path):
    """Show the rows of ``spark.read.parquet_partitions(path)``.

    Adds ``compressedMB`` (``compressedBytes`` in MB, one decimal) and
    displays up to 20 rows untruncated; nothing is returned.
    """
    columns_to_show = [
        "partition", "start", "end", "length", "blocks",
        "compressedBytes", "compressedMB", "rows", "filename",
    ]
    size_in_mb = f.round(f.col("compressedBytes")/1024/1024, 1)

    partitions = spark.read.parquet_partitions(path)
    sdf = partitions.withColumn("compressedMB", size_in_mb).select(*columns_to_show)

    sdf.show(20, truncate=False)

In [16]:
def get_parquet_window_length(path):
    """Print the largest ``length`` value reported by ``parquet_partitions``.

    The value is printed both in MB (one decimal) and in bytes.
    """
    partitions = spark.read.parquet_partitions(path)
    largest = partitions.select(f.max(partitions["length"])).collect()
    max_length = largest[0][0]
    max_length_mb = round(max_length/1024/1024, 1)
    print(f"Max Parquet window length: {max_length_mb} MB or {max_length} bytes")

In [17]:
def get_parquet_file_size(path):
    """Return the summed ``compressedBytes`` of all Parquet files at ``path``.

    Uses ``spark.read.parquet_metadata(path)`` with one row per ``filename``.
    """
    per_file = (
        spark.read.parquet_metadata(path)
        .select("filename", "blocks", "compressedBytes", "rows")
        .dropDuplicates(["filename"])
    )
    total_row = per_file.select(f.sum(per_file["compressedBytes"])).collect()[0]
    return total_row[0]

In [18]:
def bytes_rows_per_partition(path):
    """Show and return bytes, rows and entry count per Spark partition.

    Groups ``spark.read.parquet_partitions(path)`` by ``partition`` and
    produces ``numFiles`` (number of grouped entries), ``compressedBytes``,
    ``compressedMB`` (one decimal) and ``rows``, ordered by ``partition``.
    Up to 20 rows are displayed.
    """
    per_partition = (
        spark.read.parquet_partitions(path)
        .groupBy("partition")
        .agg(
            f.sum("compressedBytes").alias("compressedBytes"),
            f.sum("rows").alias("rows"),
            f.count("partition").alias("numFiles"),
        )
    )

    size_in_mb = f.round(f.col("compressedBytes")/1024/1024, 1)
    sdf = (
        per_partition
        .withColumn("compressedMB", size_in_mb)
        .select("partition", "numFiles", "compressedBytes","compressedMB","rows")
        .orderBy("partition")
    )

    sdf.show(20)
    return sdf

In [19]:
def file_analysis(path, num_files):
    """Print total and average file size for the Parquet data at ``path``.

    The total comes from ``get_parquet_file_size``; the average is simply the
    total divided by the caller-supplied ``num_files``.
    """
    total_bytes = get_parquet_file_size(path)
    mean_bytes = total_bytes/num_files

    total_mb = round(total_bytes/1024/1024, 1)
    mean_mb = round(mean_bytes/1024/1024, 1)

    print(" ")
    print("******** FILE SIZE ANALYSIS ********")
    print(f"File Size: {total_mb} MB or {total_bytes} bytes")
    print(f"Num files: {num_files}")
    print(f"Avg file Size: {mean_mb} MB or {mean_bytes} bytes")
    


In [20]:
def row_count_analysis(num_files, num_rows):
    """Print the file count, row count and (truncated) rows per file."""
    leading_lines = (
        " ",
        "******** ROW COUNT ANALYSIS ********",
        f"Num files written: {num_files}",
        f"Num rows written: {num_rows}",
    )
    for line in leading_lines:
        print(line)

    # Computed last so that the lines above are printed before any
    # division error can occur.
    rows_per_file = int(num_rows/num_files)
    print(f"Num rows per file: {rows_per_file}")

In [21]:
def get_actual_num_partitions(path):
    """Print the partition count Spark uses when reading ``path`` as Parquet.

    Side effect: stores the count in the module-level ``results_dict`` under
    ``"ActualNumPartitions"``.
    """
    sdf = spark.read.parquet(path)
    for banner_line in (" ", "******** ACTUAL RESULTS ********"):
        print(banner_line)

    actual_partitions = sdf.rdd.getNumPartitions()
    print(f"ActualNumPartitions: {actual_partitions}")
    results_dict["ActualNumPartitions"] = actual_partitions


In [22]:
def noop_write(path):
    """Read the Parquet data at ``path`` and time a "noop" write of it.

    The noop format runs the full read without producing output, so the
    measured duration reflects the read side. The job is labelled "WRITE" in
    the Spark UI; the label is cleared afterwards, even if the job fails.

    Side effect: stores the duration (seconds, two decimals) in the
    module-level ``results_dict`` under ``"ExecutionTime"`` and prints it.
    """
    sdf = spark.read.parquet(path)
    sc.setJobDescription("WRITE")
    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        sdf.write.format("noop").mode("overwrite").save()
        end_time = time.perf_counter()
    finally:
        # Pass None (not the string "None") so the custom description is
        # removed; the string would label every following job "None".
        sc.setJobDescription(None)
    duration = round(end_time - start_time, 2)
    results_dict["ExecutionTime"] = duration
    print(f"Duration: {duration} sec")

# 1. What influences the no. of partitions when loading parquet files
- Num Cores in the cluster,
  - Correctly speaking it's the "spark.sql.files.minPartitionNum" config
  - It defaults to the default parallelism, which is our number of cores = 4
- File Size
- Num parquet files
- Num of blocks/rowgroups within a parquet file
- Max Partition Size:
  - Influences the size of a partition  
  - based on the config "spark.sql.files.maxPartitionBytes"
  - defaults to 128 MB 
- Open Cost In Bytes
  - Represents the cost of creating a new partition
  - based on the config "spark.sql.files.openCostInBytes"
  - defaults to 4 MB
  - Technically it adds the cost, e.g. 4 MB, to each file, which is called padding
  - Through this padding, fewer but bigger partitions are created around the size of the open cost value
  - Usually has no influence except for small files; the default of 4 MB works well
  - Official description: The estimated cost to open a file, measured by the number of bytes that could be scanned in the same time. This is used when putting multiple files into a partition. It is better to over-estimate, then the partitions with small files will be faster than partitions with bigger files (which is scheduled first). This configuration is effective only when using file-based sources such as Parquet, JSON and ORC.

References:
- https://stackoverflow.com/questions/70985235/what-is-opencostinbytes
- https://stackoverflow.com/questions/69034543/number-of-tasks-while-reading-hdfs-in-spark
- https://stackoverflow.com/questions/75924368/skewed-partitions-when-setting-spark-sql-files-maxpartitionbytes
- https://spark.apache.org/docs/latest/sql-performance-tuning.html
- https://www.linkedin.com/pulse/how-initial-number-partitions-determined-pyspark-sugumar-srinivasan#:~:text=Ideally%20partitions%20will%20be%20created,resource%20will%20get%20utilised%20properly

# 2. Finalising algorithm

In [118]:
# Generate the test data set: 64,000,000 rows written as 8 Parquet files.
num_files = 8
num_rows = 64000000
path = write_generator(num_rows, num_files)

Num partitions written: 20
Saved Path: D:/Spark/Data/20_files_2000_rows.parquet


In [38]:
# Analyse the previously written data set without regenerating it.
path = "D:/Spark/Data/8_files_64000000_rows.parquet"
num_files = 8
num_rows = 64000000
sdf_meta_data = get_parquet_meta_data(path)
file_analysis(path, num_files)
row_count_analysis(num_files, num_rows)
# Apply the settings that the estimation steps below read back from spark.conf.
set_configs(maxPartitionsMB=128, openCostInMB=4, minPartitions=4)
# Total compressed size in bytes; input for maxSplitBytes() further down.
size = get_parquet_file_size(path)

+---------------------------------------------------------------------------------------------------------------------+------+---------------+-------+------------+-------------+
|filename                                                                                                             |blocks|compressedBytes|rows   |compressedMB|calcNumBlocks|
+---------------------------------------------------------------------------------------------------------------------+------+---------------+-------+------------+-------------+
|file:/D:/Spark/Data/8_files_64000000_rows.parquet/part-00007-1ea32cf9-7663-4dd4-8f5f-3415a5abeb4c-c000.snappy.parquet|1     |68063635       |8000000|64.9        |0.50703125   |
|file:/D:/Spark/Data/8_files_64000000_rows.parquet/part-00004-1ea32cf9-7663-4dd4-8f5f-3415a5abeb4c-c000.snappy.parquet|1     |68063118       |8000000|64.9        |0.50703125   |
|file:/D:/Spark/Data/8_files_64000000_rows.parquet/part-00005-1ea32cf9-7663-4dd4-8f5f-3415a5abeb4c-c000.snappy

In [25]:
def maxSplitBytes(file_size, num_files):
    """
    Estimate the maximum split size Spark uses when reading files.

    Reads maxPartitionBytes, openCostInBytes and minPartitionNum from
    spark.conf, pads the total size with one open cost per file, and returns
    min(maxPartitionBytes, max(openCostInBytes, padded size / minPartitionNum))
    truncated to an int. The intermediate values and the resulting average
    partition estimate are printed.

    Args:
        file_size: total size of all files in bytes.
        num_files: number of files the total is spread over.

    Returns:
        The maximum split size in bytes (int).

    Reference to code:
    - Stackoverflow: https://stackoverflow.com/questions/70985235/what-is-opencostinbytes
    - GitHub: https://github.com/apache/spark/blob/v3.3.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala#L86-L97
    """
    def _conf_bytes(key):
        # set_configs() stores byte values with a trailing "b". Strip it only
        # when it is really there: blindly dropping the last character would
        # silently turn a plain "134217728" into 13421772.
        raw = spark.conf.get(key).strip()
        return int(raw[:-1] if raw[-1:].lower() == "b" else raw)

    maxPartitionBytes = _conf_bytes("spark.sql.files.maxPartitionBytes")
    openCostInBytes = _conf_bytes("spark.sql.files.openCostInBytes")
    minPartitionNum = int(spark.conf.get("spark.sql.files.minPartitionNum"))

    # Every file is padded with the open cost before the per-core share is computed.
    file_size_padded = file_size + num_files * openCostInBytes
    size_per_core_padded = file_size_padded / minPartitionNum
    max_partition_size = int(min(maxPartitionBytes, max(openCostInBytes, size_per_core_padded)))
    no_partitions_padded = file_size_padded/max_partition_size

    print(" ")
    print("******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********")
    print(f"Padded File Size: {round(file_size_padded/1024/1024, 1)} MB or {file_size_padded} bytes")
    print(f"SizePerCorePadded: {round(size_per_core_padded/1024/1024, 1)} MB or {size_per_core_padded} bytes")
    print(f"MaxPartionsize: {round(max_partition_size/1024/1024, 1)} MB or {max_partition_size} bytes")
    print(f"EstimatedPartitionsAvg: {math.ceil(no_partitions_padded)}, unrounded: {no_partitions_padded}")
    return max_partition_size

In [26]:
# Estimate the split size for the data set analysed above (total bytes, 8 files).
max_split_bytes = maxSplitBytes(size, num_files)

 
******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********
Padded File Size: 526.9 MB or 552505815 bytes
SizePerCorePadded: 131.7 MB or 138126453.75 bytes
MaxPartionsize: 128.0 MB or 134217728 bytes
EstimatedPartitionsAvg: 5, unrounded: 4.116489104926586


In [27]:
def get_files_as_list(parquet_meta_data_sdf):
    """Return the ``compressedBytes`` column of the metadata DataFrame as a list.

    The column is collected to the driver through pandas.
    """
    column_name = "compressedBytes"
    sizes_pdf = parquet_meta_data_sdf.select(column_name).toPandas()
    return list(sizes_pdf[column_name])

In [28]:
# Observation: the open-cost padding also absorbs slight deviations in file size.
file_size_list = get_files_as_list(sdf_meta_data)
file_size_list

[68063635,
 68063118,
 68063528,
 67726805,
 68008198,
 68063741,
 68063235,
 68064947]

In [30]:
def split_files(files_list, max_split_bytes):
    """
    Split every file into chunks of at most ``max_split_bytes`` bytes.

    Each chunk is a dict with:
    - "start": byte offset of the chunk inside its file
    - "length": chunk size in bytes
    - "file_size": the full file size on a file's first chunk, 0 on every
      continuation chunk (so summing "file_size" counts each file once)

    Args:
        files_list: iterable of file sizes in bytes.
        max_split_bytes: maximum chunk size in bytes; must be positive.

    Returns:
        All chunks of all files, sorted by "length" in descending order
        (chunks of equal length keep their creation order).

    Raises:
        ValueError: if ``max_split_bytes`` is not positive; the chunk offset
            could never advance, so the split would not terminate.

    Reference to code in GitHub: https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/PartitionedFileUtil.scala#L45
    """
    if max_split_bytes <= 0:
        raise ValueError(f"max_split_bytes must be positive, got {max_split_bytes}")

    chunks = []
    for file_size in files_list:
        # The first chunk always exists (even for an empty file) and is the
        # only one that carries the real file size.
        chunks.append({
            "start": 0,
            "length": min(max_split_bytes, file_size),
            "file_size": file_size,
        })
        # Continuation chunks: full-size chunks followed by a shorter tail.
        offset = max_split_bytes
        while offset < file_size:
            chunks.append({
                "start": offset,
                "length": min(max_split_bytes, file_size - offset),
                "file_size": 0,
            })
            offset += max_split_bytes
    return sorted(chunks, key=lambda chunk: chunk["length"], reverse=True)


In [31]:
# Split the real file sizes into chunks of at most max_split_bytes and display them.
splitted_files = split_files(file_size_list, max_split_bytes)
splitted_files


[{'start': 0, 'length': 68064947, 'file_size': 68064947},
 {'start': 0, 'length': 68063741, 'file_size': 68063741},
 {'start': 0, 'length': 68063635, 'file_size': 68063635},
 {'start': 0, 'length': 68063528, 'file_size': 68063528},
 {'start': 0, 'length': 68063235, 'file_size': 68063235},
 {'start': 0, 'length': 68063118, 'file_size': 68063118},
 {'start': 0, 'length': 68008198, 'file_size': 68008198},
 {'start': 0, 'length': 67726805, 'file_size': 67726805}]

In [32]:
# Number of chunks produced by the split.
len(splitted_files)

8

In [33]:
def getFilePartitions(splitted_files_list, max_split_bytes):
    """
    Pack file chunks into partitions, in the order given.

    A chunk is added to the current partition unless the partition's
    accumulated size plus the chunk length would exceed ``max_split_bytes``;
    in that case the current partition is closed and a new one is started.
    Every chunk added also counts openCostInBytes (read from spark.conf)
    towards the accumulated size.

    Args:
        splitted_files_list: chunk dicts with at least a "length" key, e.g.
            the output of split_files().
        max_split_bytes: size limit per partition in bytes.

    Returns:
        A list of partitions, each a dict with "files" (list of chunks) and
        "num_files". Empty input yields an empty list: a partition is only
        emitted when it holds at least one chunk.

    Reference to code in GitHub: https://github.com/apache/spark/blob/v3.3.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala
    """
    # set_configs() stores the value with a trailing "b"; strip it only when
    # present so that a plain number does not silently lose its last digit.
    raw_open_cost = spark.conf.get("spark.sql.files.openCostInBytes").strip()
    openCostInBytes = int(raw_open_cost[:-1] if raw_open_cost[-1:].lower() == "b" else raw_open_cost)
    partitions = []
    current_files = []
    current_size = 0

    def close_partition():
        # Only a non-empty buffer becomes a partition; emitting a placeholder
        # for an empty buffer would inflate the partition count.
        if current_files:
            partitions.append({
                "files": current_files.copy(),
                "num_files": len(current_files),
            })
        current_files.clear()

    for file in splitted_files_list:
        if current_size + file["length"] > max_split_bytes:
            close_partition()
            current_size = 0
        current_size += file["length"] + openCostInBytes
        current_files.append(file)
    close_partition()
    print(f"Number calculated Partitions: {len(partitions)}")
    return partitions


In [34]:
# Pack the chunks into partitions and display the calculated layout.
file_partitions = getFilePartitions(splitted_files, max_split_bytes)
file_partitions

Number calculated Partitions: 8


[{'files': [{'start': 0, 'length': 68064947, 'file_size': 68064947}],
  'num_files': 1},
 {'files': [{'start': 0, 'length': 68063741, 'file_size': 68063741}],
  'num_files': 1},
 {'files': [{'start': 0, 'length': 68063635, 'file_size': 68063635}],
  'num_files': 1},
 {'files': [{'start': 0, 'length': 68063528, 'file_size': 68063528}],
  'num_files': 1},
 {'files': [{'start': 0, 'length': 68063235, 'file_size': 68063235}],
  'num_files': 1},
 {'files': [{'start': 0, 'length': 68063118, 'file_size': 68063118}],
  'num_files': 1},
 {'files': [{'start': 0, 'length': 68008198, 'file_size': 68008198}],
  'num_files': 1},
 {'files': [{'start': 0, 'length': 67726805, 'file_size': 67726805}],
  'num_files': 1}]

In [39]:
# Compare with what Spark actually does when reading the same path.
get_actual_num_partitions(path)
bytes_rows_per_partition(path)

 
******** ACTUAL RESULTS ********
ActualNumPartitions: 8
+---------+--------+---------------+------------+-------+
|partition|numFiles|compressedBytes|compressedMB|   rows|
+---------+--------+---------------+------------+-------+
|        0|       1|       68064947|        64.9|8000000|
|        1|       1|       68063741|        64.9|8000000|
|        2|       1|       68063635|        64.9|8000000|
|        3|       1|       68063528|        64.9|8000000|
|        4|       1|       68063235|        64.9|8000000|
|        5|       1|       68063118|        64.9|8000000|
|        6|       1|       68008198|        64.9|8000000|
|        7|       1|       67726805|        64.6|8000000|
+---------+--------+---------------+------------+-------+



DataFrame[partition: int, numFiles: bigint, compressedBytes: bigint, compressedMB: double, rows: bigint]

In [36]:
def file_partitions_analysis(file_partitions):
    """Summarise the size distribution of calculated partitions.

    ``file_partitions`` is the list returned by ``getFilePartitions``. It is
    converted (via pandas) to a Spark DataFrame with one row per chunk, which
    is displayed, then aggregated to the summed ``file_size`` per partition,
    which is displayed as well.

    Because only the first chunk of a file carries a non-zero ``file_size``,
    a partition made up solely of continuation chunks sums to 0 and is
    counted in ``empty_partition``.

    Returns:
        dict with ``max_size``, ``min_size``, ``median_size`` (of the summed
        ``file_size`` per partition), ``num_partitions`` and
        ``empty_partition``.
    """
    partitions_pdf = pd.DataFrame(file_partitions)
    partitions_pdf["partition"] = partitions_pdf.index

    one_row_per_partition = spark.createDataFrame(partitions_pdf)
    one_row_per_chunk = one_row_per_partition.withColumn(
        "files", f.explode(one_row_per_partition["files"])
    )

    chunk = f.col("files")
    is_first_chunk = f.col("file_size") > 0
    sdf_partitions = (
        one_row_per_chunk
        .withColumn("start", chunk["start"])
        .withColumn("length", chunk["length"])
        .withColumn("file_size", chunk["file_size"])
        .drop("files", "num_files")
        .withColumn("virt_num_files", f.lit(1))
        .withColumn("real_num_files", f.when(is_first_chunk, f.lit(1)).otherwise(f.lit(0)))
        .select("partition", "start", "length", "file_size", "real_num_files", "virt_num_files")
    )
    sdf_partitions.show()

    sdf_agg = (
        sdf_partitions
        .groupBy("partition")
        .agg(f.sum("file_size").alias("file_size"))
        .orderBy("partition")
        .withColumn("file_size_mb", f.col("file_size")/1024/1024)
    )
    sdf_agg.show()

    def single_value(aggregate):
        # Evaluate one aggregate over the per-partition sizes.
        return sdf_agg.select(aggregate(sdf_agg["file_size"])).collect()[0][0]

    summary = {
        "max_size": single_value(f.max),
        "min_size": single_value(f.min),
        "median_size": single_value(f.median),
        "num_partitions": len(file_partitions),
    }
    summary["empty_partition"] = sdf_agg.filter(f.col("file_size") == 0).count()
    return summary
    

In [37]:
# Summarise the calculated partitions (sizes and number of empty partitions).
file_partitions_analysis(file_partitions)

+---------+-----+--------+---------+--------------+--------------+
|partition|start|  length|file_size|real_num_files|virt_num_files|
+---------+-----+--------+---------+--------------+--------------+
|        0|    0|68064947| 68064947|             1|             1|
|        1|    0|68063741| 68063741|             1|             1|
|        2|    0|68063635| 68063635|             1|             1|
|        3|    0|68063528| 68063528|             1|             1|
|        4|    0|68063235| 68063235|             1|             1|
|        5|    0|68063118| 68063118|             1|             1|
|        6|    0|68008198| 68008198|             1|             1|
|        7|    0|67726805| 67726805|             1|             1|
+---------+-----+--------+---------+--------------+--------------+

+---------+---------+-----------------+
|partition|file_size|     file_size_mb|
+---------+---------+-----------------+
|        0| 68064947|64.91179180145264|
|        1| 68063741|64.91064167022

{'max_size': 68064947,
 'min_size': 67726805,
 'median_size': 68063381.5,
 'num_partitions': 8,
 'empty_partition': 0}

# 3. Simulation Max Partition Bytes

In [106]:
# One result dict per simulated setting is collected here.
results = []
results_dict = {}

# Sweep the max partition size (in MB) while all other inputs stay fixed.
for maxMB in [5,10,15,20,25,30,40,50,60,70,80,90,100,110,128, 150,200, 256,512,1024]:
    openCostMB = 4
    minPartitions = 4
    num_files = 20
    file_size_mb = 1024
    # Total size in bytes, spread evenly over num_files synthetic files.
    file_size = round(file_size_mb*1024*1024,0)
    avg_size = int(round(file_size/num_files, 0))
    file_size_list = [avg_size] * num_files
    # The configs must be set first: maxSplitBytes() and getFilePartitions()
    # read them back from spark.conf.
    set_configs(maxPartitionsMB=maxMB, openCostInMB=openCostMB, minPartitions=minPartitions)
    max_split_bytes = maxSplitBytes(file_size, num_files)
    splitted_files = split_files(file_size_list, max_split_bytes)
    file_partitions = getFilePartitions(splitted_files, max_split_bytes)
    # A fresh dict per iteration; the simulation inputs are added to it below.
    results_dict = file_partitions_analysis(file_partitions)
    results_dict["file_size_mb"] = file_size_mb
    results_dict["num_files"] = num_files
    results_dict["avg_size"] = file_size_mb/num_files
    results_dict["numCores"] = minPartitions
    results_dict["maxPartitionMB"] = maxMB
    results_dict["openCosts"] = openCostMB
    results_dict["maxSplitBytes"] = round(max_split_bytes/1024/1024,1)
    results.append(results_dict)


 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 5 MB or 5242880 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********
Padded File Size: 1104.0 MB or 1157627904 bytes
SizePerCorePadded: 276.0 MB or 289406976.0 bytes
MaxPartionsize: 5.0 MB or 5242880 bytes
EstimatedPartitionsAvg: 221, unrounded: 220.8
Number calculated Partitions: 220
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 10 MB or 10485760 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********
Padded File Size: 1104.0 MB or 1157627904 bytes
SizePerCorePadded: 276.0 MB or 289406976.0 bytes
MaxPartionsize: 10.0 MB or 10485760 bytes
EstimatedPartitionsAvg: 111, unrounded: 110.4
Number calculated Partitions: 110
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 15 MB or 15728640 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Mi

In [107]:
# Turn the collected result dicts into a Spark DataFrame (via pandas).
pdf = pd.DataFrame(results)
sdf_results = spark.createDataFrame(pdf)
# Convert the size statistics from bytes to MB with one decimal.
sdf_results = sdf_results.withColumn("min_size", f.round(f.col("min_size")/1024/1024,1))
sdf_results = sdf_results.withColumn("median_size", f.round(f.col("median_size")/1024/1024,1))
sdf_results = sdf_results.withColumn("max_size", f.round(f.col("max_size")/1024/1024,1))
# Note: the dicts use the key "openCosts"; it is selected here as "openCOsts",
# which is also how the header appears in the output table.
sdf_results = sdf_results.select("file_size_mb", "num_files", "avg_size", "numCores", "maxPartitionMB", "openCOsts", "maxSplitBytes", "num_partitions", "empty_partition", "min_size", "median_size", "max_size")
sdf_results.show()

+------------+---------+--------+--------+--------------+---------+-------------+--------------+---------------+--------+-----------+--------+
|file_size_mb|num_files|avg_size|numCores|maxPartitionMB|openCOsts|maxSplitBytes|num_partitions|empty_partition|min_size|median_size|max_size|
+------------+---------+--------+--------+--------------+---------+-------------+--------------+---------------+--------+-----------+--------+
|        1024|       20|    51.2|       4|             5|        4|          5.0|           220|            200|     0.0|        0.0|    51.2|
|        1024|       20|    51.2|       4|            10|        4|         10.0|           110|             90|     0.0|        0.0|    51.2|
|        1024|       20|    51.2|       4|            15|        4|         15.0|            80|             60|     0.0|        0.0|    51.2|
|        1024|       20|    51.2|       4|            20|        4|         20.0|            60|             40|     0.0|        0.0|    51.2|

# 4. Simulation Num Files

In [108]:
# One result dict per simulated setting is collected here.
results = []
results_dict = {}

# Sweep the number of files while total size and configs stay fixed.
for num_files in [1,4,5,8,10,12,16,15,20,24, 25,30,40,50,60,70,80,90,100,110,128, 150,200, 256,512,1024]:
    openCostMB = 4
    minPartitions = 4
    maxMB = 128
    file_size_mb = 1024
    # Total size in bytes, spread evenly over num_files synthetic files.
    file_size = round(file_size_mb*1024*1024,0)
    avg_size = int(round(file_size/num_files, 0))
    file_size_list = [avg_size] * num_files
    # The configs must be set first: maxSplitBytes() and getFilePartitions()
    # read them back from spark.conf.
    set_configs(maxPartitionsMB=maxMB, openCostInMB=openCostMB, minPartitions=minPartitions)
    max_split_bytes = maxSplitBytes(file_size, num_files)
    splitted_files = split_files(file_size_list, max_split_bytes)
    file_partitions = getFilePartitions(splitted_files, max_split_bytes)
    # A fresh dict per iteration; the simulation inputs are added to it below.
    results_dict = file_partitions_analysis(file_partitions)
    results_dict["file_size_mb"] = file_size_mb
    results_dict["num_files"] = num_files
    results_dict["avg_size"] = file_size_mb/num_files
    results_dict["numCores"] = minPartitions
    results_dict["maxPartitionMB"] = maxMB
    results_dict["openCosts"] = openCostMB
    results_dict["maxSplitBytes"] = round(max_split_bytes/1024/1024,1)
    results.append(results_dict)


 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********
Padded File Size: 1028.0 MB or 1077936128 bytes
SizePerCorePadded: 257.0 MB or 269484032.0 bytes
MaxPartionsize: 128.0 MB or 134217728 bytes
EstimatedPartitionsAvg: 9, unrounded: 8.03125
Number calculated Partitions: 8
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********
Padded File Size: 1040.0 MB or 1090519040 bytes
SizePerCorePadded: 260.0 MB or 272629760.0 bytes
MaxPartionsize: 128.0 MB or 134217728 bytes
EstimatedPartitionsAvg: 9, unrounded: 8.125
Number calculated Partitions: 8
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB or 4194304 

In [109]:
# Build the result table directly from the list of dicts (the other
# simulation sections go through pandas first).
sdf_results = spark.createDataFrame(results)
# Convert the size statistics from bytes to MB with one decimal.
sdf_results = sdf_results.withColumn("min_size", f.round(f.col("min_size")/1024/1024,1))
sdf_results = sdf_results.withColumn("median_size", f.round(f.col("median_size")/1024/1024,1))
sdf_results = sdf_results.withColumn("max_size", f.round(f.col("max_size")/1024/1024,1))
# Note: the dicts use the key "openCosts"; it is selected here as "openCOsts",
# which is also how the header appears in the output table.
sdf_results = sdf_results.select("file_size_mb", "num_files", "avg_size", "numCores", "maxPartitionMB", "openCOsts", "maxSplitBytes", "num_partitions", "empty_partition", "min_size", "median_size", "max_size")
sdf_results.show()

+------------+---------+------------------+--------+--------------+---------+-------------+--------------+---------------+--------+-----------+--------+
|file_size_mb|num_files|          avg_size|numCores|maxPartitionMB|openCOsts|maxSplitBytes|num_partitions|empty_partition|min_size|median_size|max_size|
+------------+---------+------------------+--------+--------------+---------+-------------+--------------+---------------+--------+-----------+--------+
|        1024|        1|            1024.0|       4|           128|        4|        128.0|             8|              7|     0.0|        0.0|  1024.0|
|        1024|        4|             256.0|       4|           128|        4|        128.0|             8|              4|     0.0|      128.0|   256.0|
|        1024|        5|             204.8|       4|           128|        4|        128.0|            10|              5|     0.0|      102.4|   204.8|
|        1024|        8|             128.0|       4|           128|        4|     

# 5. Simulation File Size

In [110]:
# One result dict per simulated setting is collected here.
results = []
results_dict = {}

# Sweep the total data size (in MB) while file count and configs stay fixed.
for file_size_mb in [0.0001,0.001,0.01, 0.1, 1, 4, 8, 10, 20, 40, 50, 100, 128, 256, 512, 1024, 2048, 4096]: 
    openCostMB = 4
    minPartitions = 4
    maxMB = 128
    num_files = 20
    # Total size in bytes, spread evenly over num_files synthetic files.
    file_size = round(file_size_mb*1024*1024,0)
    avg_size = int(round(file_size/num_files, 0))
    file_size_list = [avg_size] * num_files
    # The configs must be set first: maxSplitBytes() and getFilePartitions()
    # read them back from spark.conf.
    set_configs(maxPartitionsMB=maxMB, openCostInMB=openCostMB, minPartitions=minPartitions)
    max_split_bytes = maxSplitBytes(file_size, num_files)
    splitted_files = split_files(file_size_list, max_split_bytes)
    file_partitions = getFilePartitions(splitted_files, max_split_bytes)
    # A fresh dict per iteration; the simulation inputs are added to it below.
    results_dict = file_partitions_analysis(file_partitions)
    results_dict["file_size_mb"] = file_size_mb
    results_dict["num_files"] = num_files
    results_dict["avg_size"] = file_size_mb/num_files
    results_dict["numCores"] = minPartitions
    results_dict["maxPartitionMB"] = maxMB
    results_dict["openCosts"] = openCostMB
    results_dict["maxSplitBytes"] = round(max_split_bytes/1024/1024,1)
    results.append(results_dict)


 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********
Padded File Size: 80.0 MB or 83886185.0 bytes
SizePerCorePadded: 20.0 MB or 20971546.25 bytes
MaxPartionsize: 20.0 MB or 20971546 bytes
EstimatedPartitionsAvg: 5, unrounded: 4.000000047683657
Number calculated Partitions: 4
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 4
 
******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********
Padded File Size: 80.0 MB or 83887129.0 bytes
SizePerCorePadded: 20.0 MB or 20971782.25 bytes
MaxPartionsize: 20.0 MB or 20971782 bytes
EstimatedPartitionsAvg: 5, unrounded: 4.00000004768312
Number calculated Partitions: 4
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB 

In [111]:
# Turn the collected result dicts into a Spark DataFrame (via pandas).
pdf = pd.DataFrame(results)
sdf_results = spark.createDataFrame(pdf)
# Convert the size statistics from bytes to MB with one decimal.
sdf_results = sdf_results.withColumn("min_size", f.round(f.col("min_size")/1024/1024,1))
sdf_results = sdf_results.withColumn("median_size", f.round(f.col("median_size")/1024/1024,1))
sdf_results = sdf_results.withColumn("max_size", f.round(f.col("max_size")/1024/1024,1))
# Note: the dicts use the key "openCosts"; it is selected here as "openCOsts",
# which is also how the header appears in the output table.
sdf_results = sdf_results.select("file_size_mb", "num_files", "avg_size", "numCores", "maxPartitionMB", "openCOsts", "maxSplitBytes", "num_partitions", "empty_partition", "min_size", "median_size", "max_size")
sdf_results.show()

+------------+---------+--------+--------+--------------+---------+-------------+--------------+---------------+--------+-----------+--------+
|file_size_mb|num_files|avg_size|numCores|maxPartitionMB|openCOsts|maxSplitBytes|num_partitions|empty_partition|min_size|median_size|max_size|
+------------+---------+--------+--------+--------------+---------+-------------+--------------+---------------+--------+-----------+--------+
|      1.0E-4|       20|  5.0E-6|       4|           128|        4|         20.0|             4|              0|     0.0|        0.0|     0.0|
|       0.001|       20|  5.0E-5|       4|           128|        4|         20.0|             4|              0|     0.0|        0.0|     0.0|
|        0.01|       20|  5.0E-4|       4|           128|        4|         20.0|             4|              0|     0.0|        0.0|     0.0|
|         0.1|       20|   0.005|       4|           128|        4|         20.0|             4|              0|     0.0|        0.0|     0.0|

# 6. Simulation Open Costs

In [112]:
# Simulation: vary the file open cost (in MB) while the file layout, the max
# partition size and the min partition count stay fixed. Each run appends one
# dict (analysis output plus the run's input settings) to `results`.
results = []
results_dict = {}

# Settings shared by every run of this simulation.
minPartitions = 4
maxMB = 128
num_files = 20
file_size_mb = 1024
file_size = round(file_size_mb * 1024 * 1024, 0)
avg_size = int(round(file_size / num_files, 0))

for openCostMB in [0, 0.125, 0.25, 0.5, 0.75, 1, 2, 3, 4, 6, 8, 10, 12, 16, 20]:
    # Built inside the loop so every run hands a fresh list to the helpers.
    file_size_list = [avg_size] * num_files
    set_configs(
        maxPartitionsMB=maxMB,
        openCostInMB=openCostMB,
        minPartitions=minPartitions,
    )
    max_split_bytes = maxSplitBytes(file_size, num_files)
    splitted_files = split_files(file_size_list, max_split_bytes)
    file_partitions = getFilePartitions(splitted_files, max_split_bytes)
    results_dict = file_partitions_analysis(file_partitions)
    # Tag the analysis output with the inputs that produced it.
    results_dict.update(
        {
            "file_size_mb": file_size_mb,
            "num_files": num_files,
            "avg_size": file_size_mb / num_files,
            "numCores": minPartitions,
            "maxPartitionMB": maxMB,
            "openCosts": openCostMB,
            "maxSplitBytes": round(max_split_bytes / 1024 / 1024, 1),
        }
    )
    results.append(results_dict)


 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 0 MB or 0 bytes
Min Partitions: 4
 
******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********
Padded File Size: 1024.0 MB or 1073741824 bytes
SizePerCorePadded: 256.0 MB or 268435456.0 bytes
MaxPartionsize: 128.0 MB or 134217728 bytes
EstimatedPartitionsAvg: 8, unrounded: 8.0
Number calculated Partitions: 10
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 0.125 MB or 131072 bytes
Min Partitions: 4
 
******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********
Padded File Size: 1026.5 MB or 1076363264 bytes
SizePerCorePadded: 256.6 MB or 269090816.0 bytes
MaxPartionsize: 128.0 MB or 134217728 bytes
EstimatedPartitionsAvg: 9, unrounded: 8.01953125
Number calculated Partitions: 10
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 0.25 MB or 26214

In [114]:
# Show the open-cost simulation results as a Spark DataFrame.
# The three partition-size statistics are divided by 1024 twice (presumably
# bytes -> MB; confirm against file_partitions_analysis) and rounded to one
# decimal, then the columns are put into a fixed reading order.
# NOTE(review): "openCOsts" differs in case from the "openCosts" key written by
# the simulation loop; it presumably resolves only because Spark column lookup
# is case-insensitive by default (spark.sql.caseSensitive).
pdf = pd.DataFrame(results)
sdf_results = (
    spark.createDataFrame(pdf)
    .withColumn("min_size", f.round(f.col("min_size") / 1024 / 1024, 1))
    .withColumn("median_size", f.round(f.col("median_size") / 1024 / 1024, 1))
    .withColumn("max_size", f.round(f.col("max_size") / 1024 / 1024, 1))
    .select(
        "file_size_mb",
        "num_files",
        "avg_size",
        "numCores",
        "maxPartitionMB",
        "openCOsts",
        "maxSplitBytes",
        "num_partitions",
        "empty_partition",
        "min_size",
        "median_size",
        "max_size",
    )
)
sdf_results.show()

+------------+---------+--------+--------+--------------+---------+-------------+--------------+---------------+--------+-----------+--------+
|file_size_mb|num_files|avg_size|numCores|maxPartitionMB|openCOsts|maxSplitBytes|num_partitions|empty_partition|min_size|median_size|max_size|
+------------+---------+--------+--------+--------------+---------+-------------+--------------+---------------+--------+-----------+--------+
|        1024|       20|    51.2|       4|           128|      0.0|        128.0|            10|              0|   102.4|      102.4|   102.4|
|        1024|       20|    51.2|       4|           128|    0.125|        128.0|            10|              0|   102.4|      102.4|   102.4|
|        1024|       20|    51.2|       4|           128|     0.25|        128.0|            10|              0|   102.4|      102.4|   102.4|
|        1024|       20|    51.2|       4|           128|      0.5|        128.0|            10|              0|   102.4|      102.4|   102.4|

# 7. Simulation Min partitions

In [115]:
# Simulation: vary the minimum partition count while the open cost, the max
# partition size and the file layout stay fixed. Each run appends one dict
# (analysis output plus the run's input settings) to `results`.
results = []
results_dict = {}

# Settings shared by every run of this simulation.
openCostMB = 4
maxMB = 128
num_files = 20
file_size_mb = 1024
file_size = round(file_size_mb * 1024 * 1024, 0)
avg_size = int(round(file_size / num_files, 0))

for minPartitions in [1, 2, 4, 8, 10, 12, 16, 20, 24, 28, 32]:
    # Built inside the loop so every run hands a fresh list to the helpers.
    file_size_list = [avg_size] * num_files
    set_configs(
        maxPartitionsMB=maxMB,
        openCostInMB=openCostMB,
        minPartitions=minPartitions,
    )
    max_split_bytes = maxSplitBytes(file_size, num_files)
    splitted_files = split_files(file_size_list, max_split_bytes)
    file_partitions = getFilePartitions(splitted_files, max_split_bytes)
    results_dict = file_partitions_analysis(file_partitions)
    # Tag the analysis output with the inputs that produced it; the varied
    # min-partition value is stored under the "numCores" key.
    results_dict.update(
        {
            "file_size_mb": file_size_mb,
            "num_files": num_files,
            "avg_size": file_size_mb / num_files,
            "numCores": minPartitions,
            "maxPartitionMB": maxMB,
            "openCosts": openCostMB,
            "maxSplitBytes": round(max_split_bytes / 1024 / 1024, 1),
        }
    )
    results.append(results_dict)


 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 1
 
******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********
Padded File Size: 1104.0 MB or 1157627904 bytes
SizePerCorePadded: 1104.0 MB or 1157627904.0 bytes
MaxPartionsize: 128.0 MB or 134217728 bytes
EstimatedPartitionsAvg: 9, unrounded: 8.625
Number calculated Partitions: 10
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB or 4194304 bytes
Min Partitions: 2
 
******** ADVANCED ALGORITHM TO ESTIMATE Partition SIZE AND NO PARTITIONS ********
Padded File Size: 1104.0 MB or 1157627904 bytes
SizePerCorePadded: 552.0 MB or 578813952.0 bytes
MaxPartionsize: 128.0 MB or 134217728 bytes
EstimatedPartitionsAvg: 9, unrounded: 8.625
Number calculated Partitions: 10
 
******** SPARK CONFIGURATIONS ********
MaxPartitionSize 128 MB or 134217728 bytes
OpenCostInBytes 4 MB or 419430

In [117]:
# Show the min-partitions simulation results as a Spark DataFrame, built
# directly from the list of result dicts.
# The three partition-size statistics are divided by 1024 twice (presumably
# bytes -> MB; confirm against file_partitions_analysis) and rounded to one
# decimal, then the columns are put into a fixed reading order.
# NOTE(review): "openCOsts" differs in case from the "openCosts" key written by
# the simulation loop; it presumably resolves only because Spark column lookup
# is case-insensitive by default (spark.sql.caseSensitive).
sdf_results = (
    spark.createDataFrame(results)
    .withColumn("min_size", f.round(f.col("min_size") / 1024 / 1024, 1))
    .withColumn("median_size", f.round(f.col("median_size") / 1024 / 1024, 1))
    .withColumn("max_size", f.round(f.col("max_size") / 1024 / 1024, 1))
    .select(
        "file_size_mb",
        "num_files",
        "avg_size",
        "numCores",
        "maxPartitionMB",
        "openCOsts",
        "maxSplitBytes",
        "num_partitions",
        "empty_partition",
        "min_size",
        "median_size",
        "max_size",
    )
)
sdf_results.show()

+------------+---------+--------+--------+--------------+---------+-------------+--------------+---------------+--------+-----------+--------+
|file_size_mb|num_files|avg_size|numCores|maxPartitionMB|openCOsts|maxSplitBytes|num_partitions|empty_partition|min_size|median_size|max_size|
+------------+---------+--------+--------+--------------+---------+-------------+--------------+---------------+--------+-----------+--------+
|        1024|       20|    51.2|       1|           128|        4|        128.0|            10|              0|   102.4|      102.4|   102.4|
|        1024|       20|    51.2|       2|           128|        4|        128.0|            10|              0|   102.4|      102.4|   102.4|
|        1024|       20|    51.2|       4|           128|        4|        128.0|            10|              0|   102.4|      102.4|   102.4|
|        1024|       20|    51.2|       8|           128|        4|        128.0|            10|              0|   102.4|      102.4|   102.4|