In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
from delta import configure_spark_with_delta_pip

In [3]:
builder = SparkSession \
    .builder \
    .appName("Data with Nikk the Greek Spark Session") \
    .master("local[4]") \
    .config("spark.jars.packages", "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5") \
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.5.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \

spark = configure_spark_with_delta_pip(builder).getOrCreate()


#https://medium.com/@hareesha1906/apache-spark-delta-lake-3c2b7c56879c
#What is the catalog

In [4]:
sc = spark.sparkContext

In [None]:
https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.mode.html#pyspark.sql.DataFrameWriter.mode

In [5]:
data = spark.range(start=1, end=6, numSlices=1)
data.write.format("delta").mode("overwrite").save("/tmp/delta-table")

In [6]:
df = spark.read.format("delta").load("/tmp/delta-table")
df.show()

+---+
| id|
+---+
|  3|
|  4|
|  2|
|  0|
|  1|
+---+



In [7]:
data = spark.range(start=6, end=11, numSlices=1)
data.write.format("delta").mode("append").save("/tmp/delta-table")

In [8]:
df = spark.read.format("delta").load("/tmp/delta-table")
df.show()

+---+
| id|
+---+
|  8|
|  9|
|  7|
|  5|
|  6|
+---+



In [None]:
data = spark.range(start=0, end=20, numSlices=1)
data.write.format("delta").mode("overwrite").save("/tmp/delta-table")

In [None]:
df = spark.read.format("delta").load("/tmp/delta-table")
df.show()

In [4]:
#Turning off AQE as it generates more jobs which might be confusing for this scenario here. 
spark.conf.set("spark.sql.adaptive.enabled", "false")
#to not cache dataframes... this may not create repeatable results
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [5]:
def sdf_generator(num_rows: int, num_partitions: int = None) -> "DataFrame":
    return (
        spark.range(num_rows, numPartitions=num_partitions)
        .withColumn("date", f.current_date())
        .withColumn("timestamp",f.current_timestamp())
        .withColumn("idstring", f.col("id").cast("string"))
        .withColumn("idfirst", f.col("idstring").substr(0,1))
        .withColumn("idlast", f.col("idstring").substr(-1,1))
        )

In [6]:
sdf_large = sdf_generator(10000000, 8)
sdf_small = sdf_generator(10000000, 100000)
sdf_small.show()

+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-22|2024-03-22 07:49:...|       0|      0|     0|
|  1|2024-03-22|2024-03-22 07:49:...|       1|      1|     1|
|  2|2024-03-22|2024-03-22 07:49:...|       2|      2|     2|
|  3|2024-03-22|2024-03-22 07:49:...|       3|      3|     3|
|  4|2024-03-22|2024-03-22 07:49:...|       4|      4|     4|
|  5|2024-03-22|2024-03-22 07:49:...|       5|      5|     5|
|  6|2024-03-22|2024-03-22 07:49:...|       6|      6|     6|
|  7|2024-03-22|2024-03-22 07:49:...|       7|      7|     7|
|  8|2024-03-22|2024-03-22 07:49:...|       8|      8|     8|
|  9|2024-03-22|2024-03-22 07:49:...|       9|      9|     9|
| 10|2024-03-22|2024-03-22 07:49:...|      10|      1|     0|
| 11|2024-03-22|2024-03-22 07:49:...|      11|      1|     1|
| 12|2024-03-22|2024-03-22 07:49:...|      12|      1|     2|
| 13|202

In [7]:
sdf_small

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

In [8]:

#https://vincent.doba.fr/posts/20211004_spark_data_description_language_for_defining_spark_schema/
ddl_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"

spark_schema = t.StructType(
    [
        t.StructField('id', t.LongType(), True), 
        t.StructField('date', t.DateType(), True), 
        t.StructField('timestamp', t.TimestampType(), True), 
        t.StructField('idstring', t.StringType(), True), 
        t.StructField('idfirst', t.StringType(), True), 
        t.StructField('idlast', t.StringType(), True)
    ]
)

In [9]:
def set_max_partitions_bytes(maxPartitionsMB):
    maxPartitionsBytes = math.ceil(maxPartitionsMB*1024*1024)
    spark.conf.set("spark.sql.files.maxPartitionBytes", str(maxPartitionsBytes)+"b")

def set_cost_in_bytes(openCostInMB):
    openCostInBytes = math.ceil(openCostInMB*1024*1024)
    spark.conf.set("spark.sql.files.openCostInBytes", str(openCostInBytes)+"b")

# 1 - JSON

## 1-1 Save JSON

Results:
- Write time: small 11 min; big 3 s
- Data size: 1218 MB

How it works: https://github.com/jerryshao/apache-spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala#L323
https://spark.apache.org/docs/latest/sql-data-sources-json.html

In [10]:
sc.setJobDescription("Save Json Small")
path_json_small = "D:/Spark/Data/format_json_small.json"
#sdf_small.write.format("json").mode("overwrite").save(path_json_small)

In [11]:
sc.setJobDescription("Save Json Large")
path_json_large = "D:/Spark/Data/format_json_large.json"
#sdf_large.write.format("json").mode("overwrite").save(path_json_large)

## 1-2 Load without schema

Results:
- Small:
    - Listing leaf files and directories: 21 s
    - Schema interference: 6,5 min
    - Load data 18 s
    - Total: 7,7 min
- Big: 
    - Listing leaf files and directories: 0 s
    - Schema interference: 2 s
    - Load data 4 s
    - Total: 6 s

In [12]:
set_max_partitions_bytes(128)
sc.setJobDescription("Load Json Small without schema")
sdf_json = spark.read.format("json").load(path_json_small)
sdf_json.write.format("noop").mode("overwrite").save()

In [13]:
set_max_partitions_bytes(160)
sc.setJobDescription("Load Json Large without schema")
sdf_json = spark.read.format("json").load(path_json_large)
sdf_json.write.format("noop").mode("overwrite").save()

## 1-3 Load with Auto Schema

Results:
- Small:
    - Load data 18 s
- Big: 
    - Load data 3 s


In [14]:
set_max_partitions_bytes(128)
schema = "id bigint, date string, timestamp string, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json Small with auto schema")
sdf_json = spark.read.format("json").schema(schema).load(path_json_small)
sdf_json.write.format("noop").mode("overwrite").save()

In [15]:
set_max_partitions_bytes(160)
schema = "id bigint, date string, timestamp string, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json Large with auto schema")
sdf_json = spark.read.format("json").schema(schema).load(path_json_large)
sdf_json.write.format("noop").mode("overwrite").save()

## 1-4 Load with Target Schema

Results:
- Small:
    - Load data 21 s
- Big: 
    - Load data 6 s

In [16]:
set_max_partitions_bytes(128)
schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json Small with target schema")
sdf_json = spark.read.format("json").schema(schema).load(path_json_small)
sdf_json.write.format("noop").mode("overwrite").save()

In [17]:
set_max_partitions_bytes(160)
schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json Large with target schema")
sdf_json = spark.read.format("json").schema(schema).load(path_json_large)
sdf_json.write.format("noop").mode("overwrite").save()

# 2 - CSV with schema interference

## 2-1 Save CSV

Results:
- Write time: small 8,3 min; big 3 s
- Data size: small: 593 MB

How it works: https://github.com/jerryshao/apache-spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala#L470
https://spark.apache.org/docs/latest/sql-data-sources-csv.html

In [10]:
sc.setJobDescription("Save CSV Small")
path_csv_small = "D:/Spark/Data/format_csv_small.json"
#sdf_small.write.format("csv").mode("overwrite").save(path_csv_small)

In [11]:
sc.setJobDescription("Save CSV Large")
path_csv_large = "D:/Spark/Data/format_csv_large.json"
#sdf_large.write.format("csv").mode("overwrite").save(path_csv_large)

## 2-2 Load without schema

- Small:
    - Listing leaf files and directories: 19 s
    - Schema interference: 86 ms + 6,9 min
    - Load data 30 s
    - Total: 7,7 min
- Big: 
    - Listing leaf files and directories: 0 s
    - Schema interference: 25 ms + 6 s
    - Load data 6 s
    - Total: 12 s

In [12]:
set_max_partitions_bytes(128)
sc.setJobDescription("Load CSV Small without schema")
sdf_csv = spark.read.format("csv").options(inferSchema=True, header=True).load(path_csv_small)
sdf_csv.write.format("noop").mode("overwrite").save()

In [13]:
set_max_partitions_bytes(80)
sc.setJobDescription("Load CSV Large without schema")
sdf_csv = spark.read.format("csv").options(inferSchema=True, header=True).load(path_csv_large)
sdf_csv.write.format("noop").mode("overwrite").save()

## 2-3 Load with Auto Schema

- Small:
    - Load data 32 s
- Big: 
    - Load data 6 s


In [14]:
set_max_partitions_bytes(128)
schema = "id int, date date, timestamp timestamp, idstring int, idfirst int, idlast int"
sc.setJobDescription("Load CSV Small with auto schema")
sdf_csv = spark.read.format("csv").options(header=True).schema(schema).load(path_csv_small)
sdf_csv.write.format("noop").mode("overwrite").save()

In [15]:
set_max_partitions_bytes(80)
schema = "id int, date date, timestamp timestamp, idstring int, idfirst int, idlast int"
sc.setJobDescription("Load CSV Large with auto schema")
sdf_csv = spark.read.format("csv").options(header=True).schema(schema).load(path_csv_large)
sdf_csv.write.format("noop").mode("overwrite").save()

## 2-4 Load with Target schema

- Small:
    - Load data 31 s
- Big: 
    - Load data 6 s


In [16]:
set_max_partitions_bytes(128)
schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load CSV Small with target schema")
sdf_csv = spark.read.format("csv").options(header=True).schema(schema).load(path_csv_small)
sdf_csv.write.format("noop").mode("overwrite").save()

In [17]:
set_max_partitions_bytes(80)
schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load CSV Large with target schema")
sdf_csv = spark.read.format("csv").options(header=True).schema(schema).load(path_csv_large)
sdf_csv.write.format("noop").mode("overwrite").save()

# 3 - Parquet

## 3-1 Save Parquet

Results:
- Write time: small 9,3 min; big 2 s
- Data size: 81.5 MB

How it works: https://spark.apache.org/docs/latest/sql-data-sources-parquet.html

In [18]:
sc.setJobDescription("Save Parquet Small")
path_parquet_small = "D:/Spark/Data/format_parquet_small.parquet"
#sdf_small.write.format("parquet").mode("overwrite").save(path_parquet_small)

In [19]:
sc.setJobDescription("Save Parquet Large")
path_parquet_large = "D:/Spark/Data/format_parquet_large.parquet"
#sdf_large.write.format("parquet").mode("overwrite").save(path_parquet_large)

## 3-2 Load without schema

- Small:
    - Schema interference: 0,3 s
    - Load data 5,8 min
    - Total: 5,8 min
- Big: 
    - Schema interference: 30 ms
    - Load data 0,6 s
    - Total: 0,6 s

In [20]:
set_max_partitions_bytes(128)
sc.setJobDescription("Load Parquet Small without schema")
sdf_parquet = spark.read.format("parquet").load(path_parquet_small)
sdf_parquet.write.format("noop").mode("overwrite").save()

In [21]:
set_max_partitions_bytes(15)
sc.setJobDescription("Load Parquet Large without schema")
sdf_parquet = spark.read.format("parquet").load(path_parquet_large)
sdf_parquet.write.format("noop").mode("overwrite").save()

## 3-3 Load with Schema

- Small:
    - Load data 33 s
- Big: 
    - Load data 0,4 s


In [22]:
set_max_partitions_bytes(128)
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Parquet Small with schema")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet_small)
sdf_parquet.write.format("noop").mode("overwrite").save()

In [23]:
set_max_partitions_bytes(15)
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Parquet Large with schema")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet_large)
sdf_parquet.write.format("noop").mode("overwrite").save()

# 4 - Avro

## 4-1 Save AVRO

Results:
- Write time: small 27 min; big 1 s
- Data size: 69.2 MB

How it works: https://spark.apache.org/docs/latest/sql-data-sources-avro.html

In [20]:
sc.setJobDescription("Save Avro Small")
path_avro_small = "D:/Spark/Data/format_avro_small.avro"
#sdf_small.write.format("avro").mode("overwrite").save(path_avro_small)

In [21]:
sc.setJobDescription("Save Avro Large")
path_avro_large = "D:/Spark/Data/format_avro_large.avro"
#sdf_large.write.format("avro").mode("overwrite").save(path_avro_large)

## 4-2 Load without schema

- Small:
    - Load data 5,1 min
- Big: 
    - Load data 0,9 s


In [26]:
set_max_partitions_bytes(128)
sc.setJobDescription("Load Avro Small without schema")
sdf_avro = spark.read.format("avro").load(path_avro_small)
sdf_avro.write.format("noop").mode("overwrite").save()

In [27]:
set_max_partitions_bytes(10)
sc.setJobDescription("Load Avro Large without schema")
sdf_avro = spark.read.format("avro").load(path_avro_large)
sdf_avro.write.format("noop").mode("overwrite").save()

## 4-3 Load with schema

- Small:
    - Load data 16 s
- Big: 
    - Load data 1 s

In [28]:
set_max_partitions_bytes(128)
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Avro Small with Schema")
sdf_avro = spark.read.format("avro").schema(sdf_schema).load(path_avro_small)
sdf_avro.write.format("noop").mode("overwrite").save()

In [29]:
set_max_partitions_bytes(10)
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Avro Large with Schema")
sdf_avro = spark.read.format("avro").schema(sdf_schema).load(path_avro_large)
sdf_avro.write.format("noop").mode("overwrite").save()

## Increase number of Partitions

In [18]:
for openCostInMB in [1, 2, 4, 6, 8, 10]:
    set_cost_in_bytes(openCostInMB)
    sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
    sc.setJobDescription(f"Load JSON Small Open Cost: {openCostInMB} MB")
    sdf_json = spark.read.format("json").schema(sdf_schema).load(path_json_small)
    sdf_json.write.format("noop").mode("overwrite").save()

In [22]:
for openCostInMB in [1, 2, 4, 6, 8, 10]:
    set_cost_in_bytes(openCostInMB)
    sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
    sc.setJobDescription(f"Load Avro Small Open Cost: {openCostInMB} MB")
    sdf_avro = spark.read.format("avro").schema(sdf_schema).load(path_avro_small)
    sdf_avro.write.format("noop").mode("overwrite").save()
    