In [1]:
from typing import Optional

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t

In [2]:
# Both Maven coordinates must go into ONE comma-separated "spark.jars.packages"
# value: calling .config() twice with the same key keeps only the last value,
# which would silently drop the spark-extension package.
spark_packages = ",".join(
    [
        "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5",
        "org.apache.spark:spark-avro_2.12:3.5.0",
    ]
)

# Local session with 4 worker threads and Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[4]")
    .enableHiveSupport()
    .config("spark.jars.packages", spark_packages)
    .getOrCreate()
)

"""
Reference gresearch:
- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/
- GitHub Spark extension: https://github.com/G-Research/spark-extension
- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet
"""

'\nReference gresearch:\n- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/\n- GitHub Spark extension: https://github.com/G-Research/spark-extension\n- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet\n'

In [53]:
# SparkContext of the session; used further down to label jobs via setJobDescription.
sc = spark.sparkContext

In [3]:
# Turn off Adaptive Query Execution (AQE): it generates more jobs, which might be confusing for this scenario.
spark.conf.set("spark.sql.adaptive.enabled", "false")
# Do not cache DataFrames; with caching enabled the results may not be repeatable.
# NOTE(review): this is a Databricks-named key; presumably it has no effect on a local[4] session - confirm.
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [4]:
def sdf_generator(num_rows: int, num_partitions: Optional[int] = None) -> "DataFrame":
    """Build a synthetic DataFrame with ``num_rows`` rows.

    Columns, in order:
      - ``id``: the row id produced by ``spark.range``
      - ``date``: the current date
      - ``timestamp``: the current timestamp
      - ``idstring``: ``id`` cast to string
      - ``idfirst``: first character of ``idstring``
      - ``idlast``: last character of ``idstring``

    Args:
        num_rows: Number of rows to generate (passed to ``spark.range``).
        num_partitions: Number of partitions passed to ``spark.range`` as
            ``numPartitions``; ``None`` leaves the choice to Spark.

    Returns:
        The generated DataFrame.
    """
    return (
        spark.range(num_rows, numPartitions=num_partitions)
        .withColumn("date", f.current_date())
        .withColumn("timestamp", f.current_timestamp())
        .withColumn("idstring", f.col("id").cast("string"))
        # Substring positions are 1-based, so the first character is at position 1
        # (position 0 only worked because Spark treats it like 1).
        .withColumn("idfirst", f.col("idstring").substr(1, 1))
        # A negative start position counts from the end: -1 is the last character.
        .withColumn("idlast", f.col("idstring").substr(-1, 1))
    )

In [43]:
# Generate ten million rows spread over eight partitions.
sdf = sdf_generator(num_rows=10_000_000, num_partitions=8)

In [65]:
# Display the schema of the generated DataFrame.
sdf.schema

StructType([StructField('id', LongType(), False), StructField('date', DateType(), False), StructField('timestamp', TimestampType(), False), StructField('idstring', StringType(), False), StructField('idfirst', StringType(), False), StructField('idlast', StringType(), False)])

In [44]:
# Write the generated DataFrame as CSV, replacing any previous output.
# Size noted by the author for this output: 593 MB
path_csv = "D:/Spark/Data/format_csv.csv"
sdf.write.csv(path_csv, mode="overwrite")

In [45]:
# Write the generated DataFrame as JSON, replacing any previous output.
# Size noted by the author for this output: 1,18 GB
path_json = "D:/Spark/Data/format_json.json"
sdf.write.json(path_json, mode="overwrite")

In [46]:
# Write the generated DataFrame as Parquet, replacing any previous output.
# Size noted by the author for this output: 81,4 MB
path_parquet = "D:/Spark/Data/format_parquet.parquet"
sdf.write.parquet(path_parquet, mode="overwrite")

In [47]:
# Write the generated DataFrame as Avro, replacing any previous output.
# Size noted by the author for this output: 69,2 MB
path_avro = "D:/Spark/Data/format_avro.avro"
sdf.write.save(path_avro, format="avro", mode="overwrite")

In [63]:
# Label the jobs started by this cell.
sc.setJobDescription("Load CSV")
# Read the CSV output back and let Spark infer the column types.
sdf_csv = spark.read.csv(path_csv, inferSchema=True)
# Saving to the "noop" format runs the read as a job without writing to a path.
sdf_csv.write.save(format="noop", mode="overwrite")

In [64]:
# Display the schema Spark inferred for the CSV data (read with inferSchema=True).
sdf_csv.schema

StructType([StructField('_c0', IntegerType(), True), StructField('_c1', DateType(), True), StructField('_c2', TimestampType(), True), StructField('_c3', IntegerType(), True), StructField('_c4', IntegerType(), True), StructField('_c5', IntegerType(), True)])

In [55]:
# Label the jobs started by this cell.
sc.setJobDescription("Load Json")
# Read the JSON output back.
sdf_json = spark.read.json(path_json)
# Saving to the "noop" format runs the read as a job without writing to a path.
sdf_json.write.save(format="noop", mode="overwrite")

In [60]:
# Display the schema Spark derived for the JSON data.
sdf_json.schema

StructType([StructField('date', StringType(), True), StructField('id', LongType(), True), StructField('idfirst', StringType(), True), StructField('idlast', StringType(), True), StructField('idstring', StringType(), True), StructField('timestamp', StringType(), True)])

In [56]:
# Label the jobs started by this cell.
sc.setJobDescription("Load Parquet")
# Read the Parquet output back.
sdf_parquet = spark.read.parquet(path_parquet)
# Saving to the "noop" format runs the read as a job without writing to a path.
sdf_parquet.write.save(format="noop", mode="overwrite")

In [57]:
# Label the jobs started by this cell.
sc.setJobDescription("Load Avro")
# Read the Avro output back.
sdf_avro = spark.read.load(path_avro, format="avro")
# Saving to the "noop" format runs the read as a job without writing to a path.
sdf_avro.write.save(format="noop", mode="overwrite")