In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
import math

In [2]:
# "spark.jars.packages" expects ONE comma-separated list of Maven coordinates.
# Calling .config() twice with the same key keeps only the last value, which
# silently dropped the spark-extension package; pass both coordinates together.
spark_packages = ",".join([
    "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5",
    "org.apache.spark:spark-avro_2.12:3.5.0",
])

# Build (or reuse) the local 4-core session used by every benchmark below.
spark = SparkSession \
    .builder \
    .appName("Data with Nikk the Greek Spark Session") \
    .master("local[4]") \
    .enableHiveSupport() \
    .config("spark.jars.packages", spark_packages) \
    .getOrCreate()

"""
Reference gresearch:
- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/
- GitHub Spark extension: https://github.com/G-Research/spark-extension
- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet
"""

'\nReference gresearch:\n- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/\n- GitHub Spark extension: https://github.com/G-Research/spark-extension\n- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet\n'

In [3]:
# SparkContext of the session; used below to label jobs via setJobDescription.
sc = spark.sparkContext

In [4]:
# Turn off Adaptive Query Execution (AQE): it generates more jobs, which might be confusing for this scenario.
spark.conf.set("spark.sql.adaptive.enabled", "false")
# Disable the IO cache so DataFrames are not cached; caching may prevent repeatable results.
# NOTE(review): this is a spark.databricks.* key -- presumably only effective on Databricks
# runtimes; confirm whether it has any effect on this local session.
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [5]:
def sdf_generator(num_rows: int, num_partitions: "int | None" = None) -> "DataFrame":
    """Create a synthetic DataFrame for the file-format benchmarks.

    Resulting columns:
      - id:        value produced by ``spark.range``
      - date:      current date
      - timestamp: current timestamp
      - idstring:  ``id`` cast to string
      - idfirst:   first character of ``idstring``
      - idlast:    last character of ``idstring``

    Args:
        num_rows: Number of rows to generate (passed to ``spark.range``).
        num_partitions: Number of partitions passed to ``spark.range`` as
            ``numPartitions``; ``None`` leaves the choice to Spark.

    Returns:
        The generated Spark DataFrame.
    """
    return (
        spark.range(num_rows, numPartitions=num_partitions)
        .withColumn("date", f.current_date())
        .withColumn("timestamp",f.current_timestamp())
        .withColumn("idstring", f.col("id").cast("string"))
        # substr positions are 1-based, so the first character sits at position 1
        # (a start of 0 only works because Spark treats it like 1).
        .withColumn("idfirst", f.col("idstring").substr(1, 1))
        # A negative start position counts from the end of the string.
        .withColumn("idlast", f.col("idstring").substr(-1, 1))
    )

In [6]:
# Generate 10 million rows spread over 8 partitions and preview the first rows.
sdf = sdf_generator(num_rows=10_000_000, num_partitions=8)
sdf.show()

+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-08|2024-03-08 21:38:...|       0|      0|     0|
|  1|2024-03-08|2024-03-08 21:38:...|       1|      1|     1|
|  2|2024-03-08|2024-03-08 21:38:...|       2|      2|     2|
|  3|2024-03-08|2024-03-08 21:38:...|       3|      3|     3|
|  4|2024-03-08|2024-03-08 21:38:...|       4|      4|     4|
|  5|2024-03-08|2024-03-08 21:38:...|       5|      5|     5|
|  6|2024-03-08|2024-03-08 21:38:...|       6|      6|     6|
|  7|2024-03-08|2024-03-08 21:38:...|       7|      7|     7|
|  8|2024-03-08|2024-03-08 21:38:...|       8|      8|     8|
|  9|2024-03-08|2024-03-08 21:38:...|       9|      9|     9|
| 10|2024-03-08|2024-03-08 21:38:...|      10|      1|     0|
| 11|2024-03-08|2024-03-08 21:38:...|      11|      1|     1|
| 12|2024-03-08|2024-03-08 21:38:...|      12|      1|     2|
| 13|202

In [7]:

# Schema of the generated DataFrame as a DDL-formatted string, see:
# https://vincent.doba.fr/posts/20211004_spark_data_description_language_for_defining_spark_schema/
ddl_schema = ", ".join(
    [
        "id bigint",
        "date date",
        "timestamp timestamp",
        "idstring string",
        "idfirst string",
        "idlast string",
    ]
)

# 1 - JSON

## 1-1 Initial state JSON
- Writing and reading JSON with default settings

Results:
- Write time: 6.6 s 
- Load time: 11.7 s
- Data size: 1208 MB

How it works: https://github.com/jerryshao/apache-spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala#L323
https://spark.apache.org/docs/latest/sql-data-sources-json.html

In [8]:
# Write the generated DataFrame as JSON, replacing any previous output at the target path.
path_json = "D:/Spark/Data/format_json.json"
sc.setJobDescription("Save Json")
(
    sdf.write
    .format("json")
    .mode("overwrite")
    .save(path_json)
)

In [9]:
# Cap the size of a read partition at 160 MB so that the read also works with
# 8 partitions, for a fair comparison with the write.
maxPartitionsMB = 160
maxPartitionsBytes = math.ceil(maxPartitionsMB * 1024 ** 2)
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{maxPartitionsBytes}b")

In [10]:
# Load the JSON data without providing a schema, then push every row through
# the "noop" sink so the complete read is executed.
sc.setJobDescription("Load Json without schema")
sdf_json = (
    spark.read
    .format("json")
    .load(path_json)
)
(
    sdf_json.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [11]:
# Display the column names and types of the JSON data loaded without a schema.
sdf_json

DataFrame[date: string, id: bigint, idfirst: string, idlast: string, idstring: string, timestamp: string]

## 1-2 Load with Auto Schema
- Reuse the schema of the loaded JSON DataFrame when loading the data, with the columns moved back into the original order

Results:
- Load time: 6.8 s

In [12]:
# Column types as Spark reported them for the JSON data (date/timestamp as strings),
# listed in the column order of the original DataFrame.
schema = "id bigint, date string, timestamp string, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json with auto schema")
sdf_json = (
    spark.read
    .format("json")
    .schema(schema)
    .load(path_json)
)
(
    sdf_json.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [13]:
# Display the column names and types after loading with the schema string.
sdf_json

DataFrame[id: bigint, date: string, timestamp: string, idstring: string, idfirst: string, idlast: string]

## 1-3 Cast output dataframe
- When loading JSON, the resulting schema does not match the schema of the saved DataFrame (`date` and `timestamp` come back as strings)
- Let's additionally cast our dataframes with and without schema

Results:
- With manual casting without schema: 16.5 s
- With manual casting with schema: 12.4 s


In [14]:
# Load without a schema, then cast date/timestamp back to their original types
# and restore the original column order before running the read.
sc.setJobDescription("Load Json without schema and manual casting")
sdf_json = (
    spark.read.format("json").load(path_json)
    .withColumn("date", f.col("date").cast("date"))
    .withColumn("timestamp", f.col("timestamp").cast("timestamp"))
    .select("id", "date", "timestamp", "idstring", "idfirst", "idlast")
)
(
    sdf_json.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [15]:
# Display the column names and types after the manual casts.
sdf_json

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

In [16]:
# Load with the string-typed schema, then cast date/timestamp to their
# original types and select the columns in the original order.
schema = "id bigint, date string, timestamp string, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json with auto schema and manual casting")
sdf_json = (
    spark.read.format("json").schema(schema).load(path_json)
    .withColumn("date", f.col("date").cast("date"))
    .withColumn("timestamp", f.col("timestamp").cast("timestamp"))
    .select("id", "date", "timestamp", "idstring", "idfirst", "idlast")
)
(
    sdf_json.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [17]:
# Display the column names and types after loading with a schema and casting.
sdf_json

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

## 1-4 Define expected Schema

Results:
- Load data: 13.4 s

In [20]:
# Load the JSON data directly with the column types of the original DataFrame
# (date and timestamp typed), so no casting is needed afterwards.
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json with DataFrame schema")
sdf_json = (
    spark.read
    .format("json")
    .schema(sdf_schema)
    .load(path_json)
)
(
    sdf_json.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [21]:
# Display the column names and types after loading with the DataFrame schema.
sdf_json

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

## 1-5 Save and load with Snappy compression

Results:
- Write time: 6.3 s 
- Load time default: 12.1 s
- Load time with auto schema and casting: 12.3 s
- Load time with sdf schema: 11.8 s
- Data size: 130 MB

In [23]:
# Write the DataFrame as Snappy-compressed JSON.
# path_json is re-pointed to the Snappy output here, so the following cells
# that read path_json load the compressed data.
path_json = "D:/Spark/Data/format_json_snappy.json"
sc.setJobDescription("Save Json Snappy")
(
    sdf.write
    .format("json")
    .option("compression", "snappy")
    .mode("overwrite")
    .save(path_json)
)

In [24]:
# Cap the size of a read partition at 21 MB so that the read also works with
# 8 partitions, for a fair comparison with the write.
maxPartitionsMB = 21
maxPartitionsBytes = math.ceil(maxPartitionsMB * 1024 ** 2)
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{maxPartitionsBytes}b")

In [25]:
# Load the Snappy-compressed JSON without a schema and run the full read.
# The job description names the Snappy variant (matching "Save Json Snappy") so
# this run can be told apart from the uncompressed JSON load in the Spark UI.
sc.setJobDescription("Load Json Snappy without schema")
sdf_json = spark.read.format("json").load(path_json)
sdf_json.write.format("noop").mode("overwrite").save()

In [None]:
# Load the Snappy-compressed JSON with the string-typed schema, cast
# date/timestamp to their original types and restore the column order.
# The job description names the Snappy variant (matching "Save Json Snappy") so
# this run can be told apart from the uncompressed JSON load in the Spark UI.
schema = "id bigint, date string, timestamp string, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json Snappy with auto schema and manual casting")
sdf_json = (
    spark.read.format("json").schema(schema).load(path_json)
    .withColumn("date", f.col("date").cast("date"))
    .withColumn("timestamp", f.col("timestamp").cast("timestamp"))
    .select("id", "date", "timestamp", "idstring", "idfirst", "idlast")
)
sdf_json.write.format("noop").mode("overwrite").save()

In [26]:
# Load the Snappy-compressed JSON with the column types of the original DataFrame.
# The job description names the Snappy variant (matching "Save Json Snappy") so
# this run can be told apart from the uncompressed JSON load in the Spark UI.
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json Snappy with DataFrame schema")
sdf_json = spark.read.format("json").schema(sdf_schema).load(path_json)
sdf_json.write.format("noop").mode("overwrite").save()

## 1-2 - CSV
- Write time: 6.2 s
- Load time: 5.3 s
- Data size: 593 MB

How it works: https://github.com/jerryshao/apache-spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala#L470
https://spark.apache.org/docs/latest/sql-data-sources-csv.html

In [7]:
# Write the generated DataFrame as CSV with a header row, replacing previous output.
path_csv = "D:/Spark/Data/format_csv.csv"
sc.setJobDescription("Save CSV")
(
    sdf.write
    .format("csv")
    .mode("overwrite")
    .option("header", "True")
    .save(path_csv)
)

NameError: name 'sdf' is not defined

In [5]:
# Cap the size of a read partition at 80 MB so that the read also works with
# 8 partitions, for a fair comparison with the write.
maxPartitionsMB = 80
maxPartitionsBytes = math.ceil(maxPartitionsMB * 1024 ** 2)
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{maxPartitionsBytes}b")

In [10]:
# Load the CSV data (first line is the header) without providing a schema,
# then run the full read through the "noop" sink.
sc.setJobDescription("Load CSV without schema")
sdf_csv = (
    spark.read
    .format("csv")
    .option("header", "True")
    .load(path_csv)
)
(
    sdf_csv.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [11]:
# Display the column names and types of the CSV data loaded without a schema.
sdf_csv

DataFrame[id: string, date: string, timestamp: string, idstring: string, idfirst: string, idlast: string]

In [14]:
# Load the CSV data with an explicit schema in which every column is a string.
ddl_schema_csv = "id string, date string, timestamp string, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load CSV with schema")
sdf_csv = (
    spark.read
    .format("csv")
    .option("header", "True")
    .schema(ddl_schema_csv)
    .load(path_csv)
)
(
    sdf_csv.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [15]:
# Load the CSV data with the typed DDL schema of the original DataFrame (ddl_schema).
# The job description says "typed schema" so this run is distinguishable in the
# Spark UI from the other CSV loads that use a schema.
sc.setJobDescription("Load CSV with typed schema")
sdf_csv = spark.read.format("csv").option("header", "True").schema(ddl_schema).load(path_csv)
sdf_csv.write.format("noop").mode("overwrite").save()

## 1-2 - CSV infer schema load
- Write time: 6.2 s
- Load time: 23.5 s
- Data size: 593 MB

In [16]:
sc.setJobDescription("Load CSV with infer schema")
sdf_csv2 = spark.read.format("csv").options(inferSchema=True, header=True).load(path_csv)
sdf_csv2.write.format("noop").mode("overwrite").save()

In [17]:
# Display the column names and types produced by inferSchema.
sdf_csv2

DataFrame[id: int, date: date, timestamp: timestamp, idstring: int, idfirst: int, idlast: int]

In [18]:
# Load the CSV data with an explicit schema that uses the same column types
# inferSchema produced (ints for the id columns, date and timestamp typed).
# The job description names this variant so the run is distinguishable in the
# Spark UI from the other CSV loads that use a schema.
ddl_schema_csv2 = "id int, date date, timestamp timestamp, idstring int, idfirst int, idlast int"
sc.setJobDescription("Load CSV with inferred-types schema")
sdf_csv = spark.read.format("csv").option("header", "True").schema(ddl_schema_csv2).load(path_csv)
sdf_csv.write.format("noop").mode("overwrite").save()

## 1-3 - PARQUET
- Write time: 5.4 s
- Load time: 1.5 s
- Data size: 81.5 MB

How it works: https://spark.apache.org/docs/latest/sql-data-sources-parquet.html

In [19]:
# Write the generated DataFrame as Parquet, replacing any previous output.
path_parquet = "D:/Spark/Data/format_parquet.parquet"
sc.setJobDescription("Save Parquet")
(
    sdf.write
    .format("parquet")
    .mode("overwrite")
    .save(path_parquet)
)

NameError: name 'sdf' is not defined

In [20]:
# Cap the size of a read partition at 15 MB so that the read also works with
# 8 partitions, for a fair comparison with the write.
maxPartitionsMB = 15
maxPartitionsBytes = math.ceil(maxPartitionsMB * 1024 ** 2)
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{maxPartitionsBytes}b")

In [21]:
sc.setJobDescription("Load Parquet without schema")
sdf_parquet = spark.read.format("parquet").load(path_parquet)
sdf_parquet.write.format("noop").mode("overwrite").save()

In [22]:
# Load the Parquet data with the explicit DDL schema and run the full read.
sc.setJobDescription("Load Parquet with schema")
sdf_parquet = (
    spark.read
    .format("parquet")
    .schema(ddl_schema)
    .load(path_parquet)
)
(
    sdf_parquet.write
    .format("noop")
    .mode("overwrite")
    .save()
)

## 1-4 - AVRO
- Write time: 2.5 s
- Load time: 2.3 s
- Data size: 69.2 MB

How it works: https://spark.apache.org/docs/latest/sql-data-sources-avro.html

In [23]:
# Write the generated DataFrame as Avro, replacing any previous output.
path_avro = "D:/Spark/Data/format_avro.avro"
sc.setJobDescription("Save Avro")
(
    sdf.write
    .format("avro")
    .mode("overwrite")
    .save(path_avro)
)

NameError: name 'sdf' is not defined

In [18]:
# Cap the size of a read partition at 10 MB so that the read also works with
# 8 partitions, for a fair comparison with the write.
maxPartitionsMB = 10
maxPartitionsBytes = math.ceil(maxPartitionsMB * 1024 ** 2)
spark.conf.set("spark.sql.files.maxPartitionBytes", f"{maxPartitionsBytes}b")

In [19]:
# Load the Avro data without providing a schema and run the full read.
sc.setJobDescription("Load Avro")
sdf_avro = (
    spark.read
    .format("avro")
    .load(path_avro)
)
(
    sdf_avro.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [None]:
# Load the Avro data with the explicit DDL schema and run the full read.
# The description previously read "Load Avro", identical to the schema-less
# load; it now follows the "Load Parquet with schema" naming so both Avro runs
# can be told apart in the Spark UI.
sc.setJobDescription("Load Avro with schema")
sdf_avro = spark.read.format("avro").schema(ddl_schema).load(path_avro)
sdf_avro.write.format("noop").mode("overwrite").save()

# Schema check

ignoring columns, add empty columns with Null values

In [23]:
# Display the column names and types of the generated source DataFrame.
sdf

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

In [24]:
# Display the column names and types of the most recently loaded CSV DataFrame.
sdf_csv

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

In [25]:
# Display the CSV DataFrame loaded with inferSchema (requires that cell to have run in this kernel).
sdf_csv2

NameError: name 'sdf_csv2' is not defined

In [26]:
# Display the column names and types of the most recently loaded JSON DataFrame.
sdf_json

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

In [27]:
# Display the column names and types of the most recently loaded Parquet DataFrame.
sdf_parquet

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

In [28]:
# Display the Avro DataFrame (requires one of the Avro load cells to have run in this kernel).
sdf_avro

NameError: name 'sdf_avro' is not defined