In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
import math

In [2]:
# Build the local Spark session used throughout this notebook.
# "spark.jars.packages" holds ONE comma-separated list of Maven coordinates.
# Calling .config() twice with the same key keeps only the last value, which
# silently dropped the spark-extension package; both coordinates therefore go
# into a single entry.
spark = (
    SparkSession
    .builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[4]")
    .enableHiveSupport()
    .config(
        "spark.jars.packages",
        "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5,"
        "org.apache.spark:spark-avro_2.12:3.5.0",
    )
    .getOrCreate()
)

"""
Reference gresearch:
- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/
- GitHub Spark extension: https://github.com/G-Research/spark-extension
- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet
"""

'\nReference gresearch:\n- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/\n- GitHub Spark extension: https://github.com/G-Research/spark-extension\n- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet\n'

In [3]:
# Handle to the underlying SparkContext; the cells below use it to label jobs
# via sc.setJobDescription(...).
sc = spark.sparkContext

In [4]:
# Turn off Adaptive Query Execution: it generates additional jobs, which would
# make the job list confusing for this scenario.
spark.conf.set("spark.sql.adaptive.enabled", "false")
# Disable the IO cache so data is not served from a cache (cached reads may not
# give repeatable measurements).
# NOTE(review): the "spark.databricks.*" key looks Databricks-specific and
# presumably has no effect on this local[4] session - confirm.
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [74]:
# Show which data sources are currently listed in spark.sql.sources.useV1SourceList.
# NOTE(review): the recorded '' output (execution count 74) suggests this ran
# after a later cell had already set the list to "" - on a fresh kernel the
# value will presumably differ; confirm.
spark.conf.get("spark.sql.sources.useV1SourceList")

''

In [5]:
def sdf_generator(num_rows: int, num_partitions: int = None) -> "DataFrame":
    """Generate a synthetic test DataFrame with ``num_rows`` rows.

    Columns produced:
        id        -- running number from ``spark.range`` (0 .. num_rows - 1)
        date      -- ``current_date()``
        timestamp -- ``current_timestamp()``
        idstring  -- ``id`` cast to string
        idfirst   -- first character of ``idstring``
        idlast    -- last character of ``idstring``

    Args:
        num_rows: Number of rows to generate.
        num_partitions: Passed to ``spark.range`` as ``numPartitions``;
            ``None`` leaves the partition count to Spark.

    Returns:
        The generated Spark DataFrame (lazy; nothing runs until an action).
    """
    return (
        spark.range(num_rows, numPartitions=num_partitions)
        .withColumn("date", f.current_date())
        .withColumn("timestamp", f.current_timestamp())
        .withColumn("idstring", f.col("id").cast("string"))
        # Column.substr positions are 1-based; use 1 explicitly instead of
        # relying on Spark treating position 0 like position 1.
        .withColumn("idfirst", f.col("idstring").substr(1, 1))
        # A negative start counts from the end of the string: -1 is the last character.
        .withColumn("idlast", f.col("idstring").substr(-1, 1))
    )

In [6]:
# Base dataset for every experiment below: 10 million rows in 8 partitions.
sdf = sdf_generator(num_rows=10_000_000, num_partitions=8)
sdf.show()

+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-14|2024-03-14 20:55:...|       0|      0|     0|
|  1|2024-03-14|2024-03-14 20:55:...|       1|      1|     1|
|  2|2024-03-14|2024-03-14 20:55:...|       2|      2|     2|
|  3|2024-03-14|2024-03-14 20:55:...|       3|      3|     3|
|  4|2024-03-14|2024-03-14 20:55:...|       4|      4|     4|
|  5|2024-03-14|2024-03-14 20:55:...|       5|      5|     5|
|  6|2024-03-14|2024-03-14 20:55:...|       6|      6|     6|
|  7|2024-03-14|2024-03-14 20:55:...|       7|      7|     7|
|  8|2024-03-14|2024-03-14 20:55:...|       8|      8|     8|
|  9|2024-03-14|2024-03-14 20:55:...|       9|      9|     9|
| 10|2024-03-14|2024-03-14 20:55:...|      10|      1|     0|
| 11|2024-03-14|2024-03-14 20:55:...|      11|      1|     1|
| 12|2024-03-14|2024-03-14 20:55:...|      12|      1|     2|
| 13|202

In [7]:
# DDL-style schema string matching the columns of `sdf`; passed to every reader
# below via .schema(sdf_schema).
sdf_schema = ", ".join(
    f"{column} {dtype}"
    for column, dtype in (
        ("id", "bigint"),
        ("date", "date"),
        ("timestamp", "timestamp"),
        ("idstring", "string"),
        ("idfirst", "string"),
        ("idlast", "string"),
    )
)

# 3 - Parquet

## 3-1 Initial state Parquet

How it works: https://spark.apache.org/docs/latest/sql-data-sources-parquet.html

In [None]:
# Limit the size of a file-scan partition (in MB) so that reading the files
# back is a fair comparison with the 8 partitions used when writing.
maxPartitionsMB = 15
maxPartitionsBytes = math.ceil(maxPartitionsMB * 1024 ** 2)
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{maxPartitionsBytes}b")

In [18]:
# Write the generated data as Parquet; this is the "initial" (unsorted) copy
# that the read experiments below load.
sc.setJobDescription("Save Parquet")
path_parquet = "D:/Spark/Data/format_parquet_large.parquet"
(
    sdf.write
    .mode("overwrite")
    .format("parquet")
    .save(path_parquet)
)

In [69]:
# List Parquet in useV1SourceList so the next load uses the DataSource V1 path.
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")

In [70]:
# Read only the columns id and idstring and push the result into the "noop"
# sink, so the read is executed without writing any output.
sc.setJobDescription("Load Parquet all data V1")
sdf_parquet = (
    spark.read.format("parquet")
    .schema(sdf_schema)
    .load(path_parquet)
    .select("id", "idstring")
)
sdf_parquet.write.mode("overwrite").format("noop").save()

In [71]:
# Empty the V1 source list again so the next Parquet load is no longer forced
# onto the DataSource V1 path.
spark.conf.set("spark.sql.sources.useV1SourceList", "")

In [72]:
# Same column-pruned read as the V1 cell, now with the V1 source list emptied.
sc.setJobDescription("Load Parquet all data V2")
sdf_parquet = (
    spark.read.format("parquet")
    .schema(sdf_schema)
    .load(path_parquet)
    .select("id", "idstring")
)
sdf_parquet.write.mode("overwrite").format("noop").save()

## 3-2 Count and Max (aggregate pushdown)

In [33]:
# Enable aggregate push-down for the Parquet source.
# Reference: https://spark.apache.org/docs/latest/sql-data-sources-parquet.html
spark.conf.set("spark.sql.parquet.aggregatePushdown", "true")

In [51]:
# Verify that the aggregate push-down setting is active.
spark.conf.get("spark.sql.parquet.aggregatePushdown")

'true'

In [58]:
# Make sure Parquet is not in the V1 source list for the aggregate cells below
# (their recorded plan shows a RelationV2 scan).
spark.conf.set("spark.sql.sources.useV1SourceList", "")

In [73]:
# Row count over the Parquet data; the count is the cell's displayed result.
sc.setJobDescription("Count parquet V2")
sdf_parquet = (
    spark.read.format("parquet")
    .schema(sdf_schema)
    .load(path_parquet)
)
sdf_parquet.count()


10000000

In [59]:
# Global max(id) over the Parquet data; `sdf_max` is kept so its plan can be
# inspected in the next cell.
sc.setJobDescription("Max parquet V2")
sdf_parquet = (
    spark.read.format("parquet")
    .schema(sdf_schema)
    .load(path_parquet)
)
sdf_max = sdf_parquet.groupBy().max("id")
sdf_max.show()

+-------+
|max(id)|
+-------+
|9999999|
+-------+



In [60]:
# Print the parsed, analyzed, optimized and physical plans of the max query.
sdf_max.explain(extended=True)

== Parsed Logical Plan ==
Aggregate [max(id#617L) AS max(id)#636L]
+- RelationV2[id#617L, date#618, timestamp#619, idstring#620, idfirst#621, idlast#622]  parquet file:/D:/Spark/Data/format_parquet_large.parquet

== Analyzed Logical Plan ==
max(id): bigint
Aggregate [max(id#617L) AS max(id)#636L]
+- RelationV2[id#617L, date#618, timestamp#619, idstring#620, idfirst#621, idlast#622]  parquet file:/D:/Spark/Data/format_parquet_large.parquet

== Optimized Logical Plan ==
Aggregate [max(agg_func_0#648L) AS max(id)#636L]
+- Project [max(id)#649L AS agg_func_0#648L]
   +- RelationV2[max(id)#649L] parquet file:/D:/Spark/Data/format_parquet_large.parquet

== Physical Plan ==
*(2) HashAggregate(keys=[], functions=[max(agg_func_0#648L)], output=[max(id)#636L])
+- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=898]
   +- *(1) HashAggregate(keys=[], functions=[partial_max(agg_func_0#648L)], output=[max#651L])
      +- *(1) Project [max(id)#649L AS agg_func_0#648L]
         +- *(1) Columna

In [31]:
# `path_avro` is otherwise only assigned in section 4-1 further down, so on a
# fresh kernel this cell raised a NameError. Define it here with the same
# location. The Avro files themselves must already exist at this path (they are
# written by the "Save Avro" cell in section 4-1).
path_avro = "D:/Spark/Data/format_avro_large.avro"
sc.setJobDescription("Count avro")
sdf_avro = spark.read.format("avro").schema(sdf_schema).load(path_avro)
sdf_avro.count()

10000000

In [32]:
# Count on the generated (not file-backed) DataFrame, for comparison with the
# file-based counts.
sc.setJobDescription("Count SDF")
sdf.count()

10000000

In [38]:
# Label this action explicitly; without it the job keeps the description set by
# the previous cell ("Count SDF") and is mislabelled in the Spark UI.
sc.setJobDescription("Max SDF")
sdf.select(f.max(sdf.id)).show()

+-------+
|max(id)|
+-------+
|9999999|
+-------+



## 3-3 Row Filter id

In [28]:
# Read Parquet with a row filter on id (< 300) and run it against the noop sink.
sc.setJobDescription("Parquet Row Filter id")
sdf_parquet = (
    spark.read.format("parquet")
    .schema(sdf_schema)
    .load(path_parquet)
    .where(f.col("id") < 300)
)
sdf_parquet.write.mode("overwrite").format("noop").save()

## 3-4 Bad Ordering Row Filter ID

In [29]:
# Write a second Parquet copy ordered by idlast.
# Note: this rebinds `path_parquet`, so cells that follow read this copy until
# the variable is reassigned.
sc.setJobDescription("Save Parquet Bad")
path_parquet = "D:/Spark/Data/format_parquet_large_rep_badorder.parquet"
(
    sdf.orderBy("idlast")
    .write
    .mode("overwrite")
    .format("parquet")
    .save(path_parquet)
)

In [30]:
# Run the same ordering against the noop sink (no files written). Give it its
# own label; otherwise the job inherits "Save Parquet Bad" from the previous
# cell and cannot be told apart from the real save in the Spark UI.
sc.setJobDescription("Order by idlast (noop)")
sdf.orderBy("idlast").write.format("noop").mode("overwrite").save()

In [31]:
# Same id < 300 filter as before, now reading whatever `path_parquet` currently
# points to (the idlast-ordered copy when cells are run top to bottom).
sc.setJobDescription("Parquet Row Filter id bad ordering")
sdf_parquet = (
    spark.read.format("parquet")
    .schema(sdf_schema)
    .load(path_parquet)
    .where(f.col("id") < 300)
)
sdf_parquet.write.mode("overwrite").format("noop").save()

## 3-5 Row Filter idlast initial data

In [32]:
# Point back at the initial (unsorted) Parquet copy and filter on idlast == "1".
path_parquet = "D:/Spark/Data/format_parquet_large.parquet"
sc.setJobDescription("Parquet idlast initial data")
sdf_parquet = (
    spark.read.format("parquet")
    .schema(sdf_schema)
    .load(path_parquet)
    .where(f.col("idlast") == "1")
)
sdf_parquet.write.mode("overwrite").format("noop").save()

## 3-6 Row Filter idlast on data sorted by idlast

In [33]:
# Same idlast == "1" filter, this time on the copy that was written ordered by idlast.
path_parquet = "D:/Spark/Data/format_parquet_large_rep_badorder.parquet"
sc.setJobDescription("Parquet idlast ordered data")
sdf_parquet = (
    spark.read.format("parquet")
    .schema(sdf_schema)
    .load(path_parquet)
    .where(f.col("idlast") == "1")
)
sdf_parquet.write.mode("overwrite").format("noop").save()

## 3-7 Row filter idlast good


See also partition write video: [Writing Partitions Part 5](https://youtu.be/zfEuMNh01Uk)

In [34]:
# Write a third Parquet copy, repartitioned into 10 partitions by idlast.
sc.setJobDescription("Save Parquet Repartitioned idlast")
path_parquet = "D:/Spark/Data/format_parquet_large_rep.parquet"
(
    sdf.repartition(10, "idlast")
    .write
    .mode("overwrite")
    .format("parquet")
    .save(path_parquet)
)

In [35]:
# idlast == "1" filter on the copy currently referenced by `path_parquet`
# (the idlast-repartitioned one when cells are run in order).
sc.setJobDescription("Idlast repartitioned data")
sdf_parquet = (
    spark.read.format("parquet")
    .schema(sdf_schema)
    .load(path_parquet)
    .where(f.col("idlast") == "1")
)
sdf_parquet.write.mode("overwrite").format("noop").save()

# 4 - Avro

## 4-1 Initial state AVRO

How it works: https://spark.apache.org/docs/latest/sql-data-sources-avro.html

In [22]:
# Write the generated data as Avro; the Avro read experiments below load it.
sc.setJobDescription("Save Avro")
path_avro = "D:/Spark/Data/format_avro_large.avro"
(
    sdf.write
    .mode("overwrite")
    .format("avro")
    .save(path_avro)
)

In [25]:
# Limit the size of a file-scan partition (in MB) so that reading the files
# back is a fair comparison with the 8 partitions used when writing.
maxPartitionsMB = 15
maxPartitionsBytes = math.ceil(maxPartitionsMB * 1024 ** 2)
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{maxPartitionsBytes}b")

In [26]:
# Read the full Avro data set (all columns, no filter) against the noop sink.
sc.setJobDescription("Load Avro All data")
sdf_avro = (
    spark.read.format("avro")
    .schema(sdf_schema)
    .load(path_avro)
)
sdf_avro.write.mode("overwrite").format("noop").save()

In [27]:
# Number of partitions Spark plans for the Avro read (recorded output: 8).
sdf_avro.rdd.getNumPartitions()

8

## 4-2 Column Filter

Results:
- Load data: 1.6 s

In [21]:
# Read Avro but keep only the columns id and idstring, then run against the noop sink.
sc.setJobDescription("Avro Column Filter")
sdf_avro = (
    spark.read.format("avro")
    .schema(sdf_schema)
    .load(path_avro)
    .select("id", "idstring")
)
sdf_avro.write.mode("overwrite").format("noop").save()

## 4-3 Row Filter

In [22]:
# Read Avro with a row filter on id (< 300) and run it against the noop sink.
sc.setJobDescription("Avro ID Filter")
sdf_avro = (
    spark.read.format("avro")
    .schema(sdf_schema)
    .load(path_avro)
    .where(f.col("id") < 300)
)
sdf_avro.write.mode("overwrite").format("noop").save()