In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t

In [3]:
# Session used by every cell below: local master "local[4]", Hive support enabled,
# and the G-Research spark-extension package requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[4]")
    .enableHiveSupport()
    .config("spark.jars.packages", "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5")
    .getOrCreate()
)

"""
Reference gresearch:
- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/
- GitHub Spark extension: https://github.com/G-Research/spark-extension
- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet
"""
# SparkContext handle; the cells below use it to label jobs via setJobDescription.
sc = spark.sparkContext

In [4]:
# Root directory under which write_generator() saves the generated datasets.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
BASE_DIR = "D:/Spark/Data"


In [5]:
def sdf_generator(num_rows: int, num_partitions: int = None) -> "DataFrame":
    """Build a synthetic DataFrame from ``spark.range`` plus derived columns.

    Columns added on top of the ``id`` column produced by ``spark.range``:
      - ``date``:      ``current_date()``
      - ``timestamp``: ``current_timestamp()``
      - ``idstring``:  ``id`` cast to string
      - ``idfirst``:   ``idstring.substr(0, 1)``
      - ``idlast``:    ``idstring.substr(-1, 1)``

    Args:
        num_rows: Value handed to ``spark.range`` as the range size.
        num_partitions: Forwarded as ``numPartitions`` to ``spark.range``;
            ``None`` is passed through unchanged.

    Returns:
        The resulting Spark DataFrame.
    """
    id_text = f.col("idstring")
    # Order matters: "idstring" must be added before the columns derived from it.
    derived_columns = [
        ("date", f.current_date()),
        ("timestamp", f.current_timestamp()),
        ("idstring", f.col("id").cast("string")),
        ("idfirst", id_text.substr(0, 1)),
        ("idlast", id_text.substr(-1, 1)),
    ]

    frame = spark.range(num_rows, numPartitions=num_partitions)
    for column_name, expression in derived_columns:
        frame = frame.withColumn(column_name, expression)
    return frame

In [6]:
def write_generator(num_rows: int, num_files: int, format: str = "parquet") -> str:
    """Generate a synthetic dataset and write it below ``BASE_DIR``.

    The data comes from ``sdf_generator(num_rows, num_files)`` and is saved in
    overwrite mode to ``{BASE_DIR}/{num_files}_files_{num_rows}_rows.{format}``.

    Args:
        num_rows: Number of rows requested from ``sdf_generator``.
        num_files: Number of partitions requested from ``sdf_generator``; also
            used in the output path and the job description.
        format: Data source name handed to ``DataFrameWriter.format``.

    Returns:
        The path the dataset was saved to.
    """
    sdf = sdf_generator(num_rows, num_files)
    path = f"{BASE_DIR}/{num_files}_files_{num_rows}_rows.{format}"
    sc.setJobDescription(f"Write {num_files} files, {num_rows} rows")
    try:
        sdf.write.format(format).mode("overwrite").save(path)
    finally:
        # Clear the description with a real None (the string "None" would be shown
        # as the description of later jobs), and do so even when the write fails.
        sc.setJobDescription(None)
    print(f"Num partitions written: {sdf.rdd.getNumPartitions()}")
    print(f"Saved Path: {path}")
    return path

In [19]:
# Write the same dataset once per format: the cells below read both the Parquet
# and the CSV copy, so both must exist after a fresh "Restart & Run All".
write_generator(1000, 4, "parquet")
write_generator(1000, 4, "csv")

Num partitions written: 4
Saved Path: D:/Spark/Data/4_files_1000_rows.csv


'D:/Spark/Data/4_files_1000_rows.csv'

In [8]:
# Locations of the datasets written by write_generator(1000, 4, <format>).
# Built from BASE_DIR so they stay in sync with where the files are saved.
path = f"{BASE_DIR}/4_files_1000_rows.parquet"
path2 = f"{BASE_DIR}/4_files_1000_rows.csv"

In [9]:
# Baseline: read the whole Parquet dataset (no projection, no filter) and print
# its physical plan. The "noop" sink is used so the read is executed without
# saving any output.
sc.setJobDescription("Parquet all")
sdf = spark.read.parquet(path)
(
    sdf.write
    .mode("overwrite")
    .format("noop")
    .save()
)
sdf.explain()

== Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet [id#0L,date#1,timestamp#2,idstring#3,idfirst#4,idlast#5] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/D:/Spark/Data/4_files_1000_rows.parquet], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint,date:date,timestamp:timestamp,idstring:string,idfirst:string,idlast:string>




In [10]:
# Projection: select only "id" from the Parquet dataset; in the recorded plan
# the ReadSchema shrinks to struct<id:bigint>.
sc.setJobDescription("Parquet one column")
sdf = (
    spark.read
    .parquet(path)
    .select("id")
)
(
    sdf.write
    .mode("overwrite")
    .format("noop")
    .save()
)
sdf.explain()

== Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet [id#18L] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/D:/Spark/Data/4_files_1000_rows.parquet], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint>




In [11]:
# Filter on the typed "id" column of the Parquet dataset; the recorded plan
# lists IsNotNull(id) and LessThan(id,100) under PushedFilters.
sc.setJobDescription("Parquet filter")
sdf = (
    spark.read
    .parquet(path)
    .filter(f.col("id") < 100)
)
(
    sdf.write
    .mode("overwrite")
    .format("noop")
    .save()
)
sdf.explain()

== Physical Plan ==
*(1) Filter (isnotnull(id#33L) AND (id#33L < 100))
+- *(1) ColumnarToRow
   +- FileScan parquet [id#33L,date#34,timestamp#35,idstring#36,idfirst#37,idlast#38] Batched: true, DataFilters: [isnotnull(id#33L), (id#33L < 100)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/D:/Spark/Data/4_files_1000_rows.parquet], PartitionFilters: [], PushedFilters: [IsNotNull(id), LessThan(id,100)], ReadSchema: struct<id:bigint,date:date,timestamp:timestamp,idstring:string,idfirst:string,idlast:string>




In [13]:
# Same filter against the CSV copy, read without a schema: "_c0" is a string
# column, so the recorded plan casts it to int and only IsNotNull(_c0) appears
# under PushedFilters.
sc.setJobDescription("CSV filter")
sdf = (
    spark.read
    .csv(path2)
    .filter(f.col("_c0") < 100)
)
(
    sdf.write
    .mode("overwrite")
    .format("noop")
    .save()
)
sdf.explain()

== Physical Plan ==
*(1) Filter (isnotnull(_c0#111) AND (cast(_c0#111 as int) < 100))
+- FileScan csv [_c0#111,_c1#112,_c2#113,_c3#114,_c4#115,_c5#116] Batched: false, DataFilters: [isnotnull(_c0#111), (cast(_c0#111 as int) < 100)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/D:/Spark/Data/4_files_1000_rows.csv], PartitionFilters: [], PushedFilters: [IsNotNull(_c0)], ReadSchema: struct<_c0:string,_c1:string,_c2:string,_c3:string,_c4:string,_c5:string>




In [16]:
# Read the CSV copy with an explicit DDL schema so the columns get the same
# names and types as the Parquet dataset (see ReadSchema in the recorded plan).
sc.setJobDescription("CSV Schema")
ddl_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sdf = (
    spark.read
    .schema(ddl_schema)
    .csv(path2)
)
(
    sdf.write
    .mode("overwrite")
    .format("noop")
    .save()
)
sdf.explain()

== Physical Plan ==
FileScan csv [id#194L,date#195,timestamp#196,idstring#197,idfirst#198,idlast#199] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/D:/Spark/Data/4_files_1000_rows.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint,date:date,timestamp:timestamp,idstring:string,idfirst:string,idlast:string>




In [17]:
# Read the CSV copy without a schema: the recorded plan shows six string
# columns named _c0 .. _c5.
sc.setJobDescription("CSV without Schema")
sdf = spark.read.csv(path2)
(
    sdf.write
    .mode("overwrite")
    .format("noop")
    .save()
)
sdf.explain()

== Physical Plan ==
FileScan csv [_c0#235,_c1#236,_c2#237,_c3#238,_c4#239,_c5#240] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/D:/Spark/Data/4_files_1000_rows.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string,_c2:string,_c3:string,_c4:string,_c5:string>




In [17]:
# Cache the full Parquet scan and filter on top of the cached data; the recorded
# plan then filters an InMemoryTableScan and the FileScan has no PushedFilters.
# Label the job explicitly so it does not inherit the previous cell's description.
sc.setJobDescription("Parquet cache filter")
# Keep a separate handle on the cached DataFrame so it can be released afterwards.
sdf_cached = spark.read.parquet(path).cache()
sdf = sdf_cached.filter(f.col("id") < 100)
sdf.write.format("noop").mode("overwrite").save()
sdf.explain()
# Release the cache once the plan has been shown so it does not linger in the
# session and influence how later reads of the same path are planned.
sdf_cached.unpersist();

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (isnotnull(id#137L) AND (id#137L < 100))
   +- InMemoryTableScan [id#137L, date#138, timestamp#139, idstring#140, idfirst#141, idlast#142], [isnotnull(id#137L), (id#137L < 100)]
         +- InMemoryRelation [id#137L, date#138, timestamp#139, idstring#140, idfirst#141, idlast#142], StorageLevel(disk, memory, deserialized, 1 replicas)
               +- *(1) ColumnarToRow
                  +- FileScan parquet [id#137L,date#138,timestamp#139,idstring#140,idfirst#141,idlast#142] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/D:/Spark/Data/4_files_1000_rows.parquet], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint,date:date,timestamp:timestamp,idstring:string,idfirst:string,idlast:string>




In [13]:
# Print the physical plan of whatever `sdf` the previously executed cell left behind.
# NOTE(review): the recorded output below was produced at In [13], i.e. before the
# caching cell above ran; re-running top to bottom may print a different plan — confirm
# this cell is still needed.
sdf.explain()

== Physical Plan ==
*(1) Filter (isnotnull(id#66L) AND (id#66L < 100))
+- *(1) ColumnarToRow
   +- FileScan parquet [id#66L,date#67,timestamp#68,idstring#69,idfirst#70,idlast#71] Batched: true, DataFilters: [isnotnull(id#66L), (id#66L < 100)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/D:/Spark/Data/4_files_1000_rows.parquet], PartitionFilters: [], PushedFilters: [IsNotNull(id), LessThan(id,100)], ReadSchema: struct<id:bigint,date:date,timestamp:timestamp,idstring:string,idfirst:string,idlast:string>


