In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
import math

In [2]:
# Build (or reuse) the local Spark session used by every cell below.
# NOTE: "spark.jars.packages" expects ONE comma-separated list of Maven
# coordinates. Setting the same key in two separate .config() calls keeps only
# the last value, which silently dropped the spark-extension package.
spark = SparkSession \
    .builder \
    .appName("Data with Nikk the Greek Spark Session") \
    .master("local[4]") \
    .enableHiveSupport() \
    .config(
        "spark.jars.packages",
        ",".join([
            "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5",
            "org.apache.spark:spark-avro_2.12:3.5.0",
        ]),
    ) \
    .getOrCreate()

"""
Reference gresearch:
- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/
- GitHub Spark extension: https://github.com/G-Research/spark-extension
- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet
"""

'\nReference gresearch:\n- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/\n- GitHub Spark extension: https://github.com/G-Research/spark-extension\n- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet\n'

In [3]:
# Handle to the SparkContext; used below for sc.setJobDescription(...).
sc = spark.sparkContext

In [4]:
def sdf_generator(num_rows: int, num_partitions: int = None) -> "DataFrame":
    """Build a synthetic DataFrame with ``num_rows`` sequential ids.

    Columns produced: ``id`` (from ``spark.range``), ``date`` (current date),
    ``timestamp`` (current timestamp), ``idstring`` (``id`` cast to string),
    ``idfirst`` (first character of ``idstring``) and ``idlast`` (last
    character of ``idstring``).

    Args:
        num_rows: Number of rows, passed to ``spark.range``.
        num_partitions: Forwarded as ``numPartitions`` to ``spark.range``;
            ``None`` leaves the choice to Spark.

    Returns:
        The generated Spark DataFrame.
    """
    ids = spark.range(num_rows, numPartitions=num_partitions)

    # Stamp every row with the current date and timestamp.
    stamped = ids.withColumn("date", f.current_date())
    stamped = stamped.withColumn("timestamp", f.current_timestamp())

    # String view of the id, plus its first and last character.
    with_text = stamped.withColumn("idstring", f.col("id").cast("string"))
    id_text = f.col("idstring")
    with_first = with_text.withColumn("idfirst", id_text.substr(0, 1))
    return with_first.withColumn("idlast", id_text.substr(-1, 1))

In [5]:
# Small sample (100 rows, 8 partitions) to eyeball the generated columns.
sdf = sdf_generator(100, 8)
sdf.show()

+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-24|2024-03-24 12:46:...|       0|      0|     0|
|  1|2024-03-24|2024-03-24 12:46:...|       1|      1|     1|
|  2|2024-03-24|2024-03-24 12:46:...|       2|      2|     2|
|  3|2024-03-24|2024-03-24 12:46:...|       3|      3|     3|
|  4|2024-03-24|2024-03-24 12:46:...|       4|      4|     4|
|  5|2024-03-24|2024-03-24 12:46:...|       5|      5|     5|
|  6|2024-03-24|2024-03-24 12:46:...|       6|      6|     6|
|  7|2024-03-24|2024-03-24 12:46:...|       7|      7|     7|
|  8|2024-03-24|2024-03-24 12:46:...|       8|      8|     8|
|  9|2024-03-24|2024-03-24 12:46:...|       9|      9|     9|
| 10|2024-03-24|2024-03-24 12:46:...|      10|      1|     0|
| 11|2024-03-24|2024-03-24 12:46:...|      11|      1|     1|
| 12|2024-03-24|2024-03-24 12:46:...|      12|      1|     2|
| 13|202

In [6]:
def set_max_partitions_bytes(maxPartitionsMB):
    """Set ``spark.sql.files.maxPartitionBytes`` from a size given in MB.

    The size is converted to bytes (1 MB = 1024 * 1024 bytes), rounded up to a
    whole number, and written to the session conf as a string with a ``b``
    suffix.

    Args:
        maxPartitionsMB: Desired maximum partition size in MB.
    """
    size_in_bytes = math.ceil(maxPartitionsMB * 1024 * 1024)
    conf_value = f"{size_in_bytes}b"
    spark.conf.set("spark.sql.files.maxPartitionBytes", conf_value)

In [7]:
# Explicit DDL schema passed to every Parquet read below; it lists the same
# columns that sdf_generator produces.
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"

# Experiment 1: 8 files, 80.000.000 rows, ~81 MB each
- A file count that is a multiple of the number of cores (4), combined with equal file sizes, creates well-distributed partitions
- Reducing maxPartitionBytes does not change anything as long as it stays above the file size
- Raising maxPartitionBytes to about twice the file size (180 MB) reduces the number of partitions to 4

In [8]:
# Experiment 1 setup: label the jobs, fix the dataset path, generate the data.
sc.setJobDescription("Save Parquet Exp 1: 8 files, 80.000.000 rows, 81 MB")
# The load cells of this experiment read from this path.
path_parquet = "D:/Spark/Data/parquet_1.parquet"
sdf = sdf_generator(80000000, 8)
print(sdf.rdd.getNumPartitions())
# NOTE: the write is commented out, so this cell does not (re)create the files;
# the load cells assume the dataset already exists at path_parquet.
#sdf.write.format("parquet").mode("overwrite").save(path_parquet)

8


In [9]:
# Set maxPartitionBytes to 128 MB first, then re-read so the setting applies to this scan.
set_max_partitions_bytes(128)
sc.setJobDescription("Load Parquet with 128 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

8


In [10]:
# Same read with maxPartitionBytes lowered to 90 MB.
set_max_partitions_bytes(90)
sc.setJobDescription("Load Parquet with 90 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

8


In [11]:
# Same read with maxPartitionBytes raised to 180 MB.
set_max_partitions_bytes(180)
sc.setJobDescription("Load Parquet with 180 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

4


# Experiment 2: 16 files, 160.000.000 rows, ~81 MB each
- Again we have a well-distributed dataset and partitions, but the file sizes are not 100 % equal
- Choosing an unsuitable maxPartitionBytes value leads to a partition count that does not spread evenly across the cores (e.g. 11 partitions with 170 MB)
- Interestingly, 4 partitions cannot be reached (320 MB gives 6, 900 MB gives 5)

In [12]:
# Experiment 2 setup: label the jobs, fix the dataset path, generate the data.
sc.setJobDescription("Save Parquet Exp 2: 16 files, 160.000.000 rows, 81 MB")
# The load cells of this experiment read from this path.
path_parquet = "D:/Spark/Data/parquet_2.parquet"
sdf = sdf_generator(160000000, 16)
print(sdf.rdd.getNumPartitions())
# NOTE: the write is commented out, so this cell does not (re)create the files;
# the load cells assume the dataset already exists at path_parquet.
#sdf.write.format("parquet").mode("overwrite").save(path_parquet)

16


In [13]:
# Set maxPartitionBytes to 128 MB first, then re-read so the setting applies to this scan.
set_max_partitions_bytes(128)
sc.setJobDescription("Load Parquet with 128 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

16


In [14]:
# Same read with maxPartitionBytes lowered to 90 MB.
set_max_partitions_bytes(90)
sc.setJobDescription("Load Parquet with 90 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

16


In [15]:
# Same read with maxPartitionBytes raised to 180 MB.
set_max_partitions_bytes(180)
sc.setJobDescription("Load Parquet with 180 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

8


In [16]:
# Same read with maxPartitionBytes set to 170 MB (slightly below the 180 MB run).
set_max_partitions_bytes(170)
sc.setJobDescription("Load Parquet with 170 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

11


In [17]:
# Same read with maxPartitionBytes raised to 320 MB.
set_max_partitions_bytes(320)
sc.setJobDescription("Load Parquet with 320 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

6


In [18]:
# Same read with maxPartitionBytes raised to 900 MB.
set_max_partitions_bytes(900)
sc.setJobDescription("Load Parquet with 900 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

5


# Experiment 3: 30 files, 110.000.000 rows, ~30 MB each
- The default value (128 MB) creates 10 partitions
- Slightly increasing it (140 MB) gives 8 partitions and reduces the execution time from 5 s to 4 s
- Reducing it (97 MB) to get 12 partitions had a similar effect

In [19]:
# Experiment 3 setup: label the jobs, fix the dataset path, generate the data.
sc.setJobDescription("Save Parquet Exp 3: 30 files, 110.000.000 rows, 30 MB")
# The load cells of this experiment read from this path.
path_parquet = "D:/Spark/Data/parquet_3.parquet"
sdf = sdf_generator(110000000, 30)
print(sdf.rdd.getNumPartitions())
# NOTE: the write is commented out, so this cell does not (re)create the files;
# the load cells assume the dataset already exists at path_parquet.
#sdf.write.format("parquet").mode("overwrite").save(path_parquet)

30


In [20]:
# Set maxPartitionBytes to 128 MB first, then re-read so the setting applies to this scan.
set_max_partitions_bytes(128)
sc.setJobDescription("Load Parquet with 128 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

10


In [21]:
# Same read with maxPartitionBytes slightly raised to 140 MB.
set_max_partitions_bytes(140)
sc.setJobDescription("Load Parquet with 140 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

8


In [22]:
# Same read with maxPartitionBytes lowered to 97 MB.
set_max_partitions_bytes(97)
sc.setJobDescription("Load Parquet with 97 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

12


# Experiment 4: 10 files, 100.000.000 rows, ~81 MB each
- Reducing maxPartitionBytes below the file size can have the negative effect of producing empty partitions

In [23]:
# Experiment 4 setup: label the jobs, fix the dataset path, generate the data.
sc.setJobDescription("Save Parquet Exp 4: 10 files, 100.000.000 rows, 80 MB")
# The load cells of this experiment read from this path.
path_parquet = "D:/Spark/Data/parquet_4.parquet"
sdf = sdf_generator(100000000, 10)
print(sdf.rdd.getNumPartitions())
# NOTE: the write is commented out, so this cell does not (re)create the files;
# the load cells assume the dataset already exists at path_parquet.
#sdf.write.format("parquet").mode("overwrite").save(path_parquet)

10


In [24]:
# Set maxPartitionBytes to 128 MB first, then re-read so the setting applies to this scan.
set_max_partitions_bytes(128)
sc.setJobDescription("Load Parquet with 128 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

10


In [25]:
# Same read with maxPartitionBytes lowered to 60 MB.
set_max_partitions_bytes(60)
sc.setJobDescription("Load Parquet with 60 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

15


In [26]:
# Same read with maxPartitionBytes lowered to 10 MB.
set_max_partitions_bytes(10)
sc.setJobDescription("Load Parquet with 10 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

85


# Experiment 5: 1 file, 100.000.000 rows, 809 MB
- Here the partitions are skewed because the last row group is smaller; the file has 7 row groups
- We can still improve performance by increasing maxPartitionBytes

In [27]:
# Experiment 5 setup: label the jobs, fix the dataset path, generate the data.
sc.setJobDescription("Save Parquet Exp 5: 1 file, 100.000.000 rows, 809 MB")
# The load cells of this experiment read from this path.
path_parquet = "D:/Spark/Data/parquet_5.parquet"
# A single partition is requested here.
sdf = sdf_generator(100000000, 1)
print(sdf.rdd.getNumPartitions())
# NOTE: the write is commented out, so this cell does not (re)create the file;
# the load cells assume the dataset already exists at path_parquet.
#sdf.write.format("parquet").mode("overwrite").save(path_parquet)

1


In [28]:
# Set maxPartitionBytes to 128 MB first, then re-read so the setting applies to this scan.
set_max_partitions_bytes(128)
sc.setJobDescription("Load Parquet with 128 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

7


In [29]:
# Same read with maxPartitionBytes raised to 280 MB.
set_max_partitions_bytes(280)
sc.setJobDescription("Load Parquet with 280 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

4


# Experiment 6: 10 files, 1.000.000.000 rows, ~850 MB each
- Here each file has 7 row groups, and the last one is smaller
- In this case it didn't speed up performance, but we could reduce the number of partitions for a better distribution
- Risk of empty partitions after a filter


In [30]:
# Experiment 6 setup: label the jobs, fix the dataset path, generate the data.
sc.setJobDescription("Save Parquet Exp 6: 10 files, 1.000.000.000 rows, 850 MB")
# The load cells of this experiment read from this path.
path_parquet = "D:/Spark/Data/parquet_6.parquet"
sdf = sdf_generator(1000000000, 10)
print(sdf.rdd.getNumPartitions())
# NOTE: the write is commented out, so this cell does not (re)create the files;
# the load cells assume the dataset already exists at path_parquet.
#sdf.write.format("parquet").mode("overwrite").save(path_parquet)

10


In [31]:
# Set maxPartitionBytes to 128 MB first, then re-read so the setting applies to this scan.
set_max_partitions_bytes(128)
sc.setJobDescription("Load Parquet with 128 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

70


In [32]:
# Same read with maxPartitionBytes raised to 150 MB.
set_max_partitions_bytes(150)
sc.setJobDescription("Load Parquet with 150 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

60


In [33]:
# Same read with maxPartitionBytes raised to 270 MB.
set_max_partitions_bytes(270)
sc.setJobDescription("Load Parquet with 270 MB")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

32


# Experiment 7: 10 files, 1.000.000.000 rows, ~850 MB each, with filter id < 100.000.000
- Filters can create empty partitions

In [34]:
# Experiment 7 setup: same generator arguments as Experiment 6, separate path.
sc.setJobDescription("Save Parquet Exp 7: 10 files, 1.000.000.000 rows, 850 MB")
# The load cells of this experiment read from this path.
path_parquet = "D:/Spark/Data/parquet_7.parquet"
sdf = sdf_generator(1000000000, 10)
print(sdf.rdd.getNumPartitions())
# NOTE: the write is commented out, so this cell does not (re)create the files;
# the load cells assume the dataset already exists at path_parquet.
#sdf.write.format("parquet").mode("overwrite").save(path_parquet)

10


In [35]:
# Set maxPartitionBytes to 128 MB first, then re-read so the setting applies to this scan.
set_max_partitions_bytes(128)
sc.setJobDescription("Load Parquet with 128 MB id < 100.000.000")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
# Keep only ids below 100,000,000; the partition count below is taken after the filter.
sdf_parquet = sdf_parquet.filter(f.col("id") < 100000000)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read + filter without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

70


In [36]:
# Same filtered read with maxPartitionBytes raised to 260 MB.
set_max_partitions_bytes(260)
sc.setJobDescription("Load Parquet with 260 MB id < 100.000.000")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
# Keep only ids below 100,000,000; the partition count below is taken after the filter.
sdf_parquet = sdf_parquet.filter(f.col("id") < 100000000)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read + filter without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

34


In [37]:
# Same filtered read with maxPartitionBytes raised to 270 MB.
set_max_partitions_bytes(270)
sc.setJobDescription("Load Parquet with 270 MB id < 100.000.000")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
# Keep only ids below 100,000,000; the partition count below is taken after the filter.
sdf_parquet = sdf_parquet.filter(f.col("id") < 100000000)
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read + filter without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

32


# Experiment 8: 10 files, 1.000.000.000 rows, ~850 MB each, with filter idlast = 1
- With a filter on a key that occurs throughout all files, empty partitions can be avoided, but predicate pushdown cannot be used

In [38]:
# Experiment 8 setup: same generator arguments as Experiments 6 and 7, separate path.
sc.setJobDescription("Save Parquet Exp 8: 10 files, 1.000.000.000 rows, 850 MB")
# The load cells of this experiment read from this path.
path_parquet = "D:/Spark/Data/parquet_8.parquet"
sdf = sdf_generator(1000000000, 10)
print(sdf.rdd.getNumPartitions())
# NOTE: unlike the earlier save cells, this write is active: running the cell
# generates 1,000,000,000 rows and overwrites whatever exists at path_parquet.
sdf.write.format("parquet").mode("overwrite").save(path_parquet)

10


In [39]:
# Set maxPartitionBytes to 128 MB first, then re-read so the setting applies to this scan.
set_max_partitions_bytes(128)
sc.setJobDescription("Load Parquet with 128 MB idlast 1")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
# Keep only rows whose last id digit is "1" (idlast is a string column).
sdf_parquet = sdf_parquet.filter(f.col("idlast") == "1")
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read + filter without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

70


In [40]:
# Same filtered read with maxPartitionBytes raised to 270 MB.
set_max_partitions_bytes(270)
sc.setJobDescription("Load Parquet with 270 MB idlast 1")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
# Keep only rows whose last id digit is "1" (idlast is a string column).
sdf_parquet = sdf_parquet.filter(f.col("idlast") == "1")
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read + filter without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

32


In [41]:
# Same filtered read with maxPartitionBytes raised to 370 MB.
set_max_partitions_bytes(370)
sc.setJobDescription("Load Parquet with 370 MB idlast 1")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
# Keep only rows whose last id digit is "1" (idlast is a string column).
sdf_parquet = sdf_parquet.filter(f.col("idlast") == "1")
print(sdf_parquet.rdd.getNumPartitions())
# "noop" sink: executes the full read + filter without writing any output.
sdf_parquet.write.format("noop").mode("overwrite").save()

24
