# Today's topic: Introducing Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import gresearch.spark.parquet
import os
import sys


In [2]:
# Point both the PySpark workers and the driver at the interpreter running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    # Needed to specify the used cores
    .master("local[4]")
    # Needed for Hive support
    .enableHiveSupport()
    # Needed for the spark extension package
    .config("spark.jars.packages", "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5")
    .getOrCreate()
)

# Handle to the SparkContext behind the session.
sc = spark.sparkContext

In [11]:
# Location the demo Parquet dataset is written to and read back from.
# ADJUST THE PATH TO YOUR OWN CHOICE: either edit the default below or set the
# PARQUET_DEMO_PATH environment variable before starting the kernel.
path = os.environ.get("PARQUET_DEMO_PATH", "D:/Data/metadata.parquet")

In [12]:
def sdf_generator(num_rows: int, num_partitions: "int | None" = None) -> "DataFrame":
    """Build a synthetic Spark DataFrame with ``num_rows`` rows.

    Starts from ``spark.range(num_rows)`` on the notebook-level ``spark``
    session and appends these columns:

    - ``date``: ``f.current_date()``
    - ``timestamp``: ``f.current_timestamp()``
    - ``idstring``: the ``id`` column cast to string
    - ``idfirst``: first character of ``idstring``
    - ``idlast``: last character of ``idstring``

    Args:
        num_rows: Number of rows to generate.
        num_partitions: Forwarded to ``spark.range`` as ``numPartitions``.
            Defaults to ``None``.

    Returns:
        The generated DataFrame.
    """
    id_as_string = f.col("idstring")
    return (
        spark.range(num_rows, numPartitions=num_partitions)
        .withColumn("date", f.current_date())
        .withColumn("timestamp", f.current_timestamp())
        .withColumn("idstring", f.col("id").cast("string"))
        # Spark substring positions are 1-based, so the first character is at position 1.
        .withColumn("idfirst", id_as_string.substr(1, 1))
        # A negative start counts from the end of the string: -1 is the last character.
        .withColumn("idlast", id_as_string.substr(-1, 1))
    )

In [35]:
# 100 million rows in 2 partitions, i.e. 2 files once written
sdf = sdf_generator(num_rows=100_000_000, num_partitions=2)

In [36]:
# Write the generated data as Parquet to `path`, overwriting any existing output.
(
    sdf.write
    .format("parquet")
    .mode("overwrite")
    .save(path)
)

Docs: https://github.com/G-Research/spark-extension/blob/master/python/gresearch/spark/parquet/__init__.py

# File Meta Data
- 2 files with 4 blocks (row groups) each
- File sizes: 424235869 bytes (404 MB) and 424623387 bytes (404 MB)  
- 50 million rows per file
- no null values
- schema

This provides the following per-file information:
- filename (string): The file name
- blocks (int): Number of blocks / RowGroups in the Parquet file
- compressedBytes (long): Number of compressed bytes of all blocks
- uncompressedBytes (long): Number of uncompressed bytes of all blocks
- rows (long): Number of rows in the file
- columns (int): Number of columns in the file
- values (long): Number of values in the file
- nulls (long): Number of null values in the file
- createdBy (string): The createdBy string of the Parquet file, e.g. library used to write the file
- schema (string): The schema
- encryption (string): The encryption
- keyValues (string-to-string map): Key-value data of the file

In [37]:
# Per-file Parquet metadata; display one row per file, ordered by file name.
sdf_meta = spark.read.parquet_metadata(path)
(
    sdf_meta
    .dropDuplicates(["filename"])
    .orderBy("filename")
    .show(n=20, truncate=False)
)

+--------------------------------------------------------------------------------------------------+------+---------------+-----------------+--------+-------+---------+-----+--------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|filename        

# Parquet Blocks
- First block starts at 4 bytes (after Header)
- First block is 133949814 bytes (127.74 MB) big and has 15790100 rows

This provides the following per-block information:
- filename (string): The file name
- block (int): Block / RowGroup number starting at 1
- blockStart (long): Start position of the block in the Parquet file
- compressedBytes (long): Number of compressed bytes in block
- uncompressedBytes (long): Number of uncompressed bytes in block
- rows (long): Number of rows in block
- columns (int): Number of columns in block
- values (long): Number of values in block
- nulls (long): Number of null values in block

In [40]:
# Per-block (row group) metadata: one row per (filename, block), ordered for reading.
(
    spark.read.parquet_blocks(path)
    .dropDuplicates(["filename", "block"])
    .orderBy("filename", "block")
    .show(n=50, truncate=False)
)

+--------------------------------------------------------------------------------------------------+-----+----------+---------------+-----------------+--------+-------+--------+-----+
|filename                                                                                          |block|blockStart|compressedBytes|uncompressedBytes|rows    |columns|values  |nulls|
+--------------------------------------------------------------------------------------------------+-----+----------+---------------+-----------------+--------+-------+--------+-----+
|file:/D:/Data/metadata.parquet/part-00000-ed21d362-019d-480c-b67f-0fefd2cca93b-c000.snappy.parquet|1    |4         |133949814      |312755950        |15790100|6      |94740600|0    |
|file:/D:/Data/metadata.parquet/part-00000-ed21d362-019d-480c-b67f-0fefd2cca93b-c000.snappy.parquet|2    |133949818 |133895545      |323456519        |15770100|6      |94620600|0    |
|file:/D:/Data/metadata.parquet/part-00000-ed21d362-019d-480c-b67f-0fefd2cca93b-

# Parquet Partitions
- We can see that every partition contains exactly one row group, except for the last row group of each file: these are small enough to end up together in a single partition. This gives 7 partitions in total.
- Simplified reason: the maximum partition size (128 MB) is about the same as the row group size, so only the small last row groups fit together into one 128 MB partition.

This provides the following per-partition information:
- partition (int): The Spark partition id
- start (long): The start position of the partition
- end (long): The end position of the partition
- length (long): The length of the partition
- blocks (int): The number of Parquet blocks / RowGroups in this partition
- compressedBytes (long): The number of compressed bytes in this partition
- uncompressedBytes (long): The number of uncompressed bytes in this partition
- rows (long): The number of rows in this partition
- columns (int): The number of columns in this partition
- values (long): The number of values in this partition
- nulls (long): The number of null values in this partition
- filename (string): The Parquet file name
- fileLength (long): The length of the Parquet file

In [41]:
# Per-partition view of the Parquet files at `path`.
(
    spark.read.parquet_partitions(path)
    .show(n=20, truncate=False)
)

+---------+---------+---------+---------+------+---------------+-----------------+--------+-------+--------+-----+--------------------------------------------------------------------------------------------------+----------+
|partition|start    |end      |length   |blocks|compressedBytes|uncompressedBytes|rows    |columns|values  |nulls|filename                                                                                          |fileLength|
+---------+---------+---------+---------+------+---------------+-----------------+--------+-------+--------+-----+--------------------------------------------------------------------------------------------------+----------+
|0        |0        |134217728|134217728|1     |133949814      |312755950        |15790100|6      |94740600|0    |file:/D:/Data/metadata.parquet/part-00000-ed21d362-019d-480c-b67f-0fefd2cca93b-c000.snappy.parquet|424615468 |
|1        |134217728|268435456|134217728|1     |133895545      |323456519        |15770100|6      |9

# Parquet Block Columns

In [43]:
# Per-column metadata within each block: one row per (filename, block, column).
(
    spark.read.parquet_block_columns(path)
    .dropDuplicates(["filename", "block", "column"])
    .orderBy("filename", "block", "column")
    .show(n=50, truncate=False)
)

+--------------------------------------------------------------------------------------------------+-----+-----------+------+---------------------------------+------------------------------+--------------------------+--------------------------+-----------+---------------+-----------------+--------+-----+
|filename                                                                                          |block|column     |codec |type                             |encodings                     |minValue                  |maxValue                  |columnStart|compressedBytes|uncompressedBytes|values  |nulls|
+--------------------------------------------------------------------------------------------------+-----+-----------+------+---------------------------------+------------------------------+--------------------------+--------------------------+-----------+---------------+-----------------+--------+-----+
|file:/D:/Data/metadata.parquet/part-00000-ed21d362-019d-480c-b67f-0fefd2cca93b-c0

In [None]:
# Parquet CLI reference: https://github.com/apache/parquet-mr/blob/master/parquet-cli/README.md