# Today's topic: Introducing Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import gresearch.spark.parquet
import os
import sys


In [2]:
# Make PySpark use the same Python interpreter as this kernel:
# both environment variables are set to sys.executable.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [3]:
# Used tools:
# - parquet-cli: https://github.com/apache/parquet-mr/blob/master/parquet-cli/README.md
# - spark-extension: https://github.com/G-Research/spark-extension/tree/master
# - PyArrow: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_metadata.html#
# - Install Maven: https://maven.apache.org/install.html
spark = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    # Needed to specify the used cores.
    .master("local[4]")
    # Needed for Hive support.
    .enableHiveSupport()
    # Needed for the spark extension package.
    .config(
        "spark.jars.packages",
        "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5",
    )
    .getOrCreate()
)

# Shortcut to the SparkContext, used below to set job descriptions.
sc = spark.sparkContext

In [4]:
# ADJUST THIS PATH TO YOUR OWN ENVIRONMENT.
# Location of the Parquet dataset: the cells below write the generated data
# to this path and read the data and its metadata back from it.
path = "D:/Data/metadata.parquet"

In [5]:
def sdf_generator(num_rows: int, num_partitions: int = None) -> "DataFrame":
    """Create a synthetic DataFrame on top of ``spark.range``.

    Args:
        num_rows: Forwarded to ``spark.range`` as the (exclusive) end of the
            ``id`` range.
        num_partitions: Forwarded to ``spark.range`` as ``numPartitions``;
            ``None`` is passed through unchanged.

    Returns:
        The ``spark.range`` frame (column ``id``) extended with the columns
        ``date``, ``timestamp``, ``idstring``, ``idfirst`` and ``idlast``.
    """
    # Reference to the string version of the id; resolved once the column exists.
    id_text = f.col("idstring")

    generated = spark.range(num_rows, numPartitions=num_partitions)
    generated = generated.withColumn("date", f.current_date())
    generated = generated.withColumn("timestamp", f.current_timestamp())
    generated = generated.withColumn("idstring", f.col("id").cast("string"))
    # One-character substrings taken from the start and the end of the id string.
    generated = generated.withColumn("idfirst", id_text.substr(0, 1))
    generated = generated.withColumn("idlast", id_text.substr(-1, 1))
    return generated

In [6]:
# 100 million rows in 2 partitions, which results in 2 files when written.
sdf = sdf_generator(num_rows=100_000_000, num_partitions=2)

In [7]:
# Label the job, then write the generated frame as Parquet,
# replacing whatever already exists at `path`.
sc.setJobDescription("Write Dataset")
(
    sdf.write
    .mode("overwrite")
    .format("parquet")
    .save(path)
)

In [14]:
# DDL-style schema string passed to the Parquet reads below.
sdf_schema = (
    "id bigint, date date, timestamp timestamp, "
    "idstring string, idfirst string, idlast string"
)

Docs: https://github.com/G-Research/spark-extension/blob/master/python/gresearch/spark/parquet/__init__.py

# File Meta Data
- 2 files with 4 blocks  (row groups)
- File sizes: 424.235.869 bytes (405 MB) and 424.623.387 bytes (405 MB); sum: 848.859.256 bytes (810 MB)
- In explorer it's 816 MB (856.260.608 bytes)
- 50 million rows per file
- no null values
- schema as above

This provides the following per-file information:
- filename (string): The file name
- blocks (int): Number of blocks / RowGroups in the Parquet file
- compressedBytes (long): Number of compressed bytes of all blocks
- uncompressedBytes (long): Number of uncompressed bytes of all blocks
- rows (long): Number of rows in the file
- columns (int): Number of columns in the file
- values (long): Number of values in the file
- nulls (long): Number of null values in the file
- createdBy (string): The createdBy string of the Parquet file, e.g. library used to write the file
- schema (string): The schema
- encryption (string): The encryption
- keyValues (string-to-string map): Key-value data of the file

In [16]:
# Sum of the compressed sizes (in bytes) of the two files.
424_235_869 + 424_623_387

848859256

In [37]:
sc.setJobDescription("read.parquet_metadata")
# Per-file Parquet metadata: one row per file, ordered by file name, with the
# byte counts additionally expressed in MB (bytes / 1024 / 1024, 2 decimals).
# The frame is built first and shown afterwards: `.show()` returns None, so
# ending the assigned chain with it would leave `sdf_meta` set to None.
sdf_meta = (
    spark.read.parquet_metadata(path)
    .dropDuplicates(["filename"])
    .orderBy("filename")
    .withColumn("compressedMB", f.round(f.col("compressedBytes")/1024/1024, 2))
    .withColumn("uncompressedMB", f.round(f.col("uncompressedBytes")/1024/1024, 2))
    .select(['filename','blocks','compressedBytes','uncompressedBytes','compressedMB','uncompressedMB','rows','columns','values','nulls','createdBy','schema','encryption','keyValues'])
)
# Print up to 20 rows without truncating the cell contents.
sdf_meta.show(20, False)

+--------------------------------------------------------------------------------------------------+------+---------------+-----------------+------------+--------------+--------+-------+---------+-----+--------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [23]:
# Display the schema of the generated DataFrame `sdf`.
sdf.schema

StructType([StructField('id', LongType(), False), StructField('date', DateType(), False), StructField('timestamp', TimestampType(), False), StructField('idstring', StringType(), False), StructField('idfirst', StringType(), False), StructField('idlast', StringType(), False)])

- Show stages size and number of rows
- Show SQL size and number of files

In [38]:
# Full read of the dataset with "parquet" listed in useV1SourceList.
sc.setJobDescription("Load Parquet all data V1")
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
sdf_parquet = (
    spark.read
    .schema(sdf_schema)
    .format("parquet")
    .load(path)
)
# Writing to the "noop" format: presumably forces the full read without
# producing any output; check the stage and SQL metrics in the Spark UI.
(
    sdf_parquet.write
    .mode("overwrite")
    .format("noop")
    .save()
)

# Parquet Blocks
- First block starts at 4 bytes (after Header)
- First block is 133949814 bytes (127.74 MB) in size and has 15790100 rows
- We can also see that our assumption of 128 MB per Row Group is correct

This provides the following per-block information:
- filename (string): The file name
- block (int): Block / RowGroup number starting at 1
- blockStart (long): Start position of the block in the Parquet file
- compressedBytes (long): Number of compressed bytes in block
- uncompressedBytes (long): Number of uncompressed bytes in block
- rows (long): Number of rows in block
- columns (int): Number of columns in block
- values (long): Number of values in block
- nulls (long): Number of null values in block

In [36]:
sc.setJobDescription("read.parquet_blocks")
# Per-block (row group) Parquet metadata: one row per (file, block), ordered
# by file name and block number, with the byte counts additionally expressed
# in MB (bytes / 1024 / 1024, 2 decimals).
# The frame is built first and shown afterwards: `.show()` returns None, so
# ending the assigned chain with it would leave `sdf_meta` set to None.
sdf_meta = (
    spark.read.parquet_blocks(path)
    .dropDuplicates(["filename", "block"])
    .orderBy("filename", "block")
    .withColumn("compressedMB", f.round(f.col("compressedBytes")/1024/1024, 2))
    .withColumn("uncompressedMB", f.round(f.col("uncompressedBytes")/1024/1024, 2))
    .select(['filename','block', 'blockStart', 'compressedBytes', 'uncompressedBytes', 'compressedMB', 'uncompressedMB', 'rows', 'columns', 'values', 'nulls'])
)
# Print up to 20 rows without truncating the cell contents.
sdf_meta.show(20, False)

+--------------------------------------------------------------------------------------------------+-----+----------+---------------+-----------------+------------+--------------+--------+-------+--------+-----+
|filename                                                                                          |block|blockStart|compressedBytes|uncompressedBytes|compressedMB|uncompressedMB|rows    |columns|values  |nulls|
+--------------------------------------------------------------------------------------------------+-----+----------+---------------+-----------------+------------+--------------+--------+-------+--------+-----+
|file:/D:/Data/metadata.parquet/part-00000-6b37e43d-21db-4d02-b136-bdb45f90ecb1-c000.snappy.parquet|1    |4         |133949814      |312755950        |127.74      |298.27        |15790100|6      |94740600|0    |
|file:/D:/Data/metadata.parquet/part-00000-6b37e43d-21db-4d02-b136-bdb45f90ecb1-c000.snappy.parquet|2    |133949818 |133895545      |323456519        |1

# Parquet Partitions
- We can see that each partition holds exactly one row group, except that the last row group of each file is small enough for both to share a single partition. That gives 7 partitions.
- Simplified reason: the maximum partition size is roughly the same as the row group size. Only the small last row groups fit together into 128 MB.
- Check this video for more details: https://youtu.be/Inr0vH9EsEY
- length: Value of MaxPartitionBytes in Spark
- compressedBytes the actual data without meta information per partition

This provides the following per-partition information:
- partition (int): The Spark partition id
- start (long): The start position of the partition
- end (long): The end position of the partition
- length (long): The length of the partition
- blocks (int): The number of Parquet blocks / RowGroups in this partition
- compressedBytes (long): The number of compressed bytes in this partition
- uncompressedBytes (long): The number of uncompressed bytes in this partition
- rows (long): The number of rows in this partition
- columns (int): The number of columns in this partition
- values (long): The number of values in this partition
- nulls (long): The number of null values in this partition
- filename (string): The Parquet file name
- fileLength (long): The length of the Parquet file

In [39]:
sc.setJobDescription("read.parquet_partitions")
# Per-partition Parquet metadata. The three *MB columns are derived from the
# corresponding byte columns (bytes / 1024 / 1024, rounded to 2 decimals) and
# placed between the raw byte counts and the row statistics.
sdf_meta = spark.read.parquet_partitions(path).select(
    "partition",
    "start",
    "end",
    "length",
    "blocks",
    "compressedBytes",
    "uncompressedBytes",
    f.round(f.col("length") / 1024 / 1024, 2).alias("partitionLengthMB"),
    f.round(f.col("compressedBytes") / 1024 / 1024, 2).alias("compressedMB"),
    f.round(f.col("uncompressedBytes") / 1024 / 1024, 2).alias("uncompressedMB"),
    "rows",
    "columns",
    "values",
    "nulls",
    "filename",
    "fileLength",
)
# Print up to 20 rows without truncating the cell contents.
sdf_meta.show(20, truncate=False)

+---------+---------+---------+---------+------+---------------+-----------------+-----------------+------------+--------------+--------+-------+--------+-----+--------------------------------------------------------------------------------------------------+----------+
|partition|start    |end      |length   |blocks|compressedBytes|uncompressedBytes|partitionLengthMB|compressedMB|uncompressedMB|rows    |columns|values  |nulls|filename                                                                                          |fileLength|
+---------+---------+---------+---------+------+---------------+-----------------+-----------------+------------+--------------+--------+-------+--------+-----+--------------------------------------------------------------------------------------------------+----------+
|0        |0        |134217728|134217728|1     |133949814      |312755950        |128.0            |127.74      |298.27        |15790100|6      |94740600|0    |file:/D:/Data/metadata.parq

- Splittability on Row Group level

# Parquet Block Columns

This provides the following per-block-column information:
- filename (string): The file name
- block (int): Block / RowGroup number starting at 1
- column (array<string>): Block / RowGroup column name
- codec (string): The codec used to compress the block column values
- type (string): The data type of the block column
- encodings (array<string>): Encodings of the block column
- minValue (string): Minimum value of this column in this block
- maxValue (string): Maximum value of this column in this block
- columnStart (long): Start position of the block column in the Parquet file
- compressedBytes (long): Number of compressed bytes of this block column
- uncompressedBytes (long): Number of uncompressed bytes of this block column
- values (long): Number of values in this block column
- nulls (long): Number of null values in this block column

In [42]:
# Per-block-column Parquet metadata: one row per (file, block, column),
# ordered by file, block and the column's start position in the file.
sdf_meta = spark.read.parquet_block_columns(path)
sdf_meta = sdf_meta.dropDuplicates(["filename", "block", "column"])
sdf_meta = sdf_meta.orderBy("filename", "block", "columnStart")
# Print up to 48 rows without truncating the cell contents.
sdf_meta.show(48, truncate=False)

+--------------------------------------------------------------------------------------------------+-----+-----------+------+---------------------------------+------------------------------+--------------------------+--------------------------+-----------+---------------+-----------------+--------+-----+
|filename                                                                                          |block|column     |codec |type                             |encodings                     |minValue                  |maxValue                  |columnStart|compressedBytes|uncompressedBytes|values  |nulls|
+--------------------------------------------------------------------------------------------------+-----+-----------+------+---------------------------------+------------------------------+--------------------------+--------------------------+-----------+---------------+-----------------+--------+-----+
|file:/D:/Data/metadata.parquet/part-00000-6b37e43d-21db-4d02-b136-bdb45f90ecb1-c0

In [43]:
# Filtered read with id <= 49,999,999.
# NOTE(review): with 50 million rows per file this presumably matches exactly
# the ids of the first file; confirm via the scan metrics in the Spark UI.
sc.setJobDescription("Parquet id <= 49.999.999")
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
sdf_parquet = (
    spark.read
    .schema(sdf_schema)
    .format("parquet")
    .load(path)
    .filter(f.col("id") <= 49999999)
)
sdf_parquet.write.mode("overwrite").format("noop").save()

In [44]:
# Filtered read with id <= 15,790,099, which is the max `id` statistic of the
# first row group as shown in the PyArrow metadata output.
sc.setJobDescription("Parquet id <= 15.790.099")
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
sdf_parquet = (
    spark.read
    .schema(sdf_schema)
    .format("parquet")
    .load(path)
    .filter(f.col("id") <= 15790099)
)
sdf_parquet.write.mode("overwrite").format("noop").save()

In [52]:
# Filtered read with 15,790,100 <= id <= 31,560,199.
# NOTE(review): the lower bound is one above the first row group's max id;
# the upper bound presumably targets the end of the second row group.
# Confirm against the block-column statistics.
sc.setJobDescription("Parquet id >= 15.790.100 and <= 31.560.199")
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
lower_bound_ok = f.col("id") >= 15790100
upper_bound_ok = f.col("id") <= 31560199
sdf_parquet = (
    spark.read
    .schema(sdf_schema)
    .format("parquet")
    .load(path)
    .filter(lower_bound_ok & upper_bound_ok)
)
sdf_parquet.write.mode("overwrite").format("noop").save()

In [47]:
# Filtered read with id <= 15,790,098, one below the max `id` statistic of
# the first row group.
sc.setJobDescription("Parquet id <= 15.790.098")
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
sdf_parquet = (
    spark.read
    .schema(sdf_schema)
    .format("parquet")
    .load(path)
    .filter(f.col("id") <= 15790098)
)
sdf_parquet.write.mode("overwrite").format("noop").save()

In [48]:
# Filtered read with id <= 15,790,100, one above the max `id` statistic of
# the first row group.
sc.setJobDescription("Parquet id <= 15.790.100")
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
sdf_parquet = (
    spark.read
    .schema(sdf_schema)
    .format("parquet")
    .load(path)
    .filter(f.col("id") <= 15790100)
)
sdf_parquet.write.mode("overwrite").format("noop").save()

In [55]:
# Row count with Parquet aggregate pushdown enabled.
# NOTE(review): the V1 source list is emptied here, presumably so that Parquet
# is no longer read through the V1 source and the pushdown can apply; confirm.
sc.setJobDescription("Parquet count aggegratePushdown")
spark.conf.set("spark.sql.parquet.aggregatePushdown", "true")
spark.conf.set("spark.sql.sources.useV1SourceList", "")
sdf_parquet = (
    spark.read
    .schema(sdf_schema)
    .format("parquet")
    .load(path)
)
sdf_parquet.count()

100000000

In [56]:
# Global max(id) with Parquet aggregate pushdown enabled.
# NOTE(review): the V1 source list is emptied here, presumably so that Parquet
# is no longer read through the V1 source and the pushdown can apply; confirm.
sc.setJobDescription("Parquet max aggegratePushdown")
spark.conf.set("spark.sql.parquet.aggregatePushdown", "true")
spark.conf.set("spark.sql.sources.useV1SourceList", "")
sdf_parquet = (
    spark.read
    .schema(sdf_schema)
    .format("parquet")
    .load(path)
)
# Empty groupBy(): aggregate over the whole frame.
sdf_max = sdf_parquet.groupBy().max("id")
sdf_max.show()

+--------+
| max(id)|
+--------+
|99999999|
+--------+



# PyArrow for analysis

In [58]:
import pyarrow.parquet as pq

In [66]:
# Path of a single Parquet part file, inspected with PyArrow below.
# Raw string: the backslash is the Windows path separator, not the start of an
# escape sequence ("\p" is an invalid escape that newer Python versions warn about).
path2 = r"D:\part-00000-6b37e43d-21db-4d02-b136-bdb45f90ecb1-c000.snappy.parquet"

In [96]:
# Read the file-level metadata of the Parquet file at `path2` with PyArrow.
pq.read_metadata(path2)

<pyarrow._parquet.FileMetaData object at 0x0000023E7580AED0>
  created_by: parquet-mr version 1.13.1 (build db4183109d5b734ec5930d870cdae161e408ddba)
  num_columns: 6
  num_rows: 50000000
  num_row_groups: 4
  format_version: 1.0
  serialized_size: 3475

In [97]:
# `.metadata` holds the file's key/value entries (a bytes -> bytes dict, see
# the output below), e.g. the Spark version and the Spark row schema as JSON.
md = pq.read_metadata(path2).metadata
md

{b'org.apache.spark.version': b'3.5.1',
 b'org.apache.spark.sql.parquet.row.metadata': b'{"type":"struct","fields":[{"name":"id","type":"long","nullable":false,"metadata":{}},{"name":"date","type":"date","nullable":false,"metadata":{}},{"name":"timestamp","type":"timestamp","nullable":false,"metadata":{}},{"name":"idstring","type":"string","nullable":false,"metadata":{}},{"name":"idfirst","type":"string","nullable":false,"metadata":{}},{"name":"idlast","type":"string","nullable":false,"metadata":{}}]}'}

In [74]:
# Key/value metadata of the file. Read it from the file metadata object
# directly: `md` already holds this plain dict, which has no `.metadata`
# attribute, so `md.metadata` would raise AttributeError on a top-to-bottom run.
pq.read_metadata(path2).metadata

{b'org.apache.spark.version': b'3.5.1',
 b'org.apache.spark.sql.parquet.row.metadata': b'{"type":"struct","fields":[{"name":"id","type":"long","nullable":false,"metadata":{}},{"name":"date","type":"date","nullable":false,"metadata":{}},{"name":"timestamp","type":"timestamp","nullable":false,"metadata":{}},{"name":"idstring","type":"string","nullable":false,"metadata":{}},{"name":"idfirst","type":"string","nullable":false,"metadata":{}},{"name":"idlast","type":"string","nullable":false,"metadata":{}}]}'}

In [102]:
# Convert the file metadata into a plain dict so the following cells can
# drill down into row groups and columns.
md_dict = pq.read_metadata(path2).to_dict()
md_dict

{'created_by': 'parquet-mr version 1.13.1 (build db4183109d5b734ec5930d870cdae161e408ddba)',
 'num_columns': 6,
 'num_rows': 50000000,
 'num_row_groups': 4,
 'row_groups': [{'num_columns': 6,
   'num_rows': 15790100,
   'total_byte_size': 312755950,
   'columns': [{'file_offset': 4,
     'file_path': '',
     'physical_type': 'INT64',
     'num_values': 15790100,
     'path_in_schema': 'id',
     'is_stats_set': True,
     'statistics': {'has_min_max': True,
      'min': 0,
      'max': 15790099,
      'null_count': 0,
      'distinct_count': None,
      'num_values': 15790100,
      'physical_type': 'INT64'},
     'compression': 'SNAPPY',
     'encodings': ('PLAIN', 'BIT_PACKED'),
     'has_dictionary_page': False,
     'dictionary_page_offset': None,
     'data_page_offset': 4,
     'total_compressed_size': 63229241,
     'total_uncompressed_size': 126343653},
    {'file_offset': 63229270,
     'file_path': '',
     'physical_type': 'INT32',
     'num_values': 15790100,
     'path_in

In [103]:
# Metadata of the first row group only.
md_dict.get("row_groups")[0]

{'num_columns': 6,
 'num_rows': 15790100,
 'total_byte_size': 312755950,
 'columns': [{'file_offset': 4,
   'file_path': '',
   'physical_type': 'INT64',
   'num_values': 15790100,
   'path_in_schema': 'id',
   'is_stats_set': True,
   'statistics': {'has_min_max': True,
    'min': 0,
    'max': 15790099,
    'null_count': 0,
    'distinct_count': None,
    'num_values': 15790100,
    'physical_type': 'INT64'},
   'compression': 'SNAPPY',
   'encodings': ('PLAIN', 'BIT_PACKED'),
   'has_dictionary_page': False,
   'dictionary_page_offset': None,
   'data_page_offset': 4,
   'total_compressed_size': 63229241,
   'total_uncompressed_size': 126343653},
  {'file_offset': 63229270,
   'file_path': '',
   'physical_type': 'INT32',
   'num_values': 15790100,
   'path_in_schema': 'date',
   'is_stats_set': True,
   'statistics': {'has_min_max': True,
    'min': datetime.date(2024, 3, 17),
    'max': datetime.date(2024, 3, 17),
    'null_count': 0,
    'distinct_count': None,
    'num_values': 

In [104]:
# Metadata of the first column of the first row group (the `id` column,
# per `path_in_schema` in the output), including its min/max statistics.
md_dict.get("row_groups")[0].get("columns")[0]

{'file_offset': 4,
 'file_path': '',
 'physical_type': 'INT64',
 'num_values': 15790100,
 'path_in_schema': 'id',
 'is_stats_set': True,
 'statistics': {'has_min_max': True,
  'min': 0,
  'max': 15790099,
  'null_count': 0,
  'distinct_count': None,
  'num_values': 15790100,
  'physical_type': 'INT64'},
 'compression': 'SNAPPY',
 'encodings': ('PLAIN', 'BIT_PACKED'),
 'has_dictionary_page': False,
 'dictionary_page_offset': None,
 'data_page_offset': 4,
 'total_compressed_size': 63229241,
 'total_uncompressed_size': 126343653}