# Today's topic: How Spark Partitions influence saving data with parquet

# 0. Set-Ups

General hints for this notebook:
- The Spark UI is usually accessible at http://localhost:4040/ or http://localhost:4041/
- A deep dive into the Spark UI follows in later episodes
- sc.setJobDescription("Description") replaces the job description of an action in the Spark UI with your own
- sdf.rdd.getNumPartitions() returns the number of partitions of the current Spark DataFrame
- sdf.write.format("noop").mode("overwrite").save() is a good way to trigger and analyze the actions behind transformations without the side effects of an actual write

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import gresearch.spark.parquet
import math


In [2]:
# Create (or reuse) a local 4-core session with Hive support; the
# spark.jars.packages entry pulls in the G-Research spark-extension package.
spark = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[4]")
    .enableHiveSupport()
    .config(
        "spark.jars.packages",
        "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5",
    )
    .getOrCreate()
)

"""
Reference gresearch:
- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/
- GitHub Spark extension: https://github.com/G-Research/spark-extension
- Parquet methods: 
"""
sc = spark.sparkContext

In [3]:
# Turn off AQE (adaptive query execution): it generates more jobs, which might be confusing for this scenario.
spark.conf.set("spark.sql.adaptive.enabled", "false")
# Do not cache DataFrames; otherwise results may not be repeatable.
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [4]:
def sdf_generator(num_rows: int, num_partitions: int = None) -> "DataFrame":
    """Build a synthetic DataFrame from ``spark.range(num_rows)`` plus derived columns.

    Added columns: ``date`` (current date), ``timestamp`` (current timestamp),
    ``idstring`` (``id`` cast to string), and ``idfirst`` / ``idlast``
    (``substr(0, 1)`` and ``substr(-1, 1)`` of ``idstring``).
    ``num_partitions`` is passed to ``spark.range`` as ``numPartitions``.
    """
    ids = spark.range(num_rows, numPartitions=num_partitions)
    stamped = ids.withColumn("date", f.current_date())
    stamped = stamped.withColumn("timestamp", f.current_timestamp())
    as_text = stamped.withColumn("idstring", f.col("id").cast("string"))
    with_first = as_text.withColumn("idfirst", f.col("idstring").substr(0, 1))
    return with_first.withColumn("idlast", f.col("idstring").substr(-1, 1))

In [5]:
def rows_per_partition(sdf: "DataFrame") -> None:
    """Show, per Spark partition of ``sdf``, the row count and its percentage of all rows."""
    total_rows = sdf.count()
    per_partition = (
        sdf.withColumn("partition_id", f.spark_partition_id())
        .groupBy("partition_id")
        .count()
    )
    # Share of the whole DataFrame held by each partition, in percent.
    share = 100 * f.col("count") / total_rows
    per_partition.withColumn("count_perc", share).orderBy("partition_id").show()

In [6]:
def rows_per_partition_col(sdf: "DataFrame", num_rows: int, col: str) -> None:
    """Show row counts per (Spark partition, ``col`` value) and their percentage of ``num_rows``."""
    grouped = (
        sdf.withColumn("partition_id", f.spark_partition_id())
        .groupBy("partition_id", col)
        .count()
    )
    # Percentages are relative to the caller-supplied total, not a fresh count.
    share = 100 * f.col("count") / num_rows
    grouped.withColumn("count_perc", share).orderBy("partition_id", col).show()


In [7]:
# Root directory under which the generated parquet datasets are written.
BASE_DIR = "D:/Spark/Data"
# Lower-case alias: several later cells still refer to `base_dir`.
base_dir = BASE_DIR

In [8]:
def write_generator(num_rows, num_files):
    """Generate ``num_rows`` rows in ``num_files`` partitions and save them as parquet.

    The data is written (mode "overwrite") to
    ``{BASE_DIR}/{num_files}_files_{num_rows}_rows.parquet``. A short summary
    is printed afterwards and the target path is returned.
    """
    target = f"{BASE_DIR}/{num_files}_files_{num_rows}_rows.parquet"
    frame = sdf_generator(num_rows, num_files)

    # Label the write job in the Spark UI, then set the placeholder label again.
    sc.setJobDescription(f"Write {num_files} files, {num_rows} rows")
    parquet_writer = frame.write.format("parquet").mode("overwrite")
    parquet_writer.save(target)
    sc.setJobDescription("None")

    print(f"Num files written: {num_files}")
    print(f"Num rows written: {num_rows}")
    print(f"Num partitions written: {frame.rdd.getNumPartitions()}")
    print(f"Saved Path: {target}")
    return target

In [125]:
def set_configs(maxPartitionsMB = 128, openCostInMB = 4, minPartitions = 4):
    """Set the three ``spark.sql.files.*`` read-partitioning configs and print them.

    ``maxPartitionsMB`` and ``openCostInMB`` are given in MB, converted to
    bytes (rounded up) and stored with a ``b`` suffix; ``minPartitions`` is
    stored as a plain number.
    """
    max_bytes = math.ceil(maxPartitionsMB * 1024 * 1024)
    open_cost_bytes = math.ceil(openCostInMB * 1024 * 1024)

    settings = {
        "spark.sql.files.maxPartitionBytes": f"{max_bytes}b",
        "spark.sql.files.openCostInBytes": f"{open_cost_bytes}b",
        "spark.sql.files.minPartitionNum": str(minPartitions),
    }
    for key, value in settings.items():
        spark.conf.set(key, value)

    print(f"MaxPartitionSize {maxPartitionsMB} MB or {max_bytes} bytes")
    print(f"OpenCostInBytes {openCostInMB} MB or {open_cost_bytes} bytes")
    print(f"Min Partitions: {minPartitions}")

In [10]:
def get_parquet_meta_data(path):
    """Show per-file parquet metadata for ``path`` (up to 20 rows, untruncated).

    Keeps one row per ``filename`` and adds ``compressedMB`` (compressed bytes
    in MB, one decimal) and ``calcNumBlocks`` (``compressedMB`` divided by 128).
    """
    per_file = spark.read.parquet_metadata(path)
    per_file = per_file.select("filename", "blocks", "compressedBytes", "rows")
    per_file = per_file.dropDuplicates(["filename"])

    megabytes = f.round(f.col("compressedBytes") / 1024 / 1024, 1)
    per_file = per_file.withColumn("compressedMB", megabytes)
    per_file = per_file.withColumn("calcNumBlocks", f.col("compressedMB") / 128)
    per_file.show(20, truncate=False)

In [11]:
def get_parquet_blocks(path):
    """Show the parquet blocks found under ``path`` (up to 20 rows, untruncated).

    Keeps one row per (``filename``, ``block``), ordered by both, and adds
    ``blockEnd``, ``blockMiddle`` and ``compressedMB`` derived from
    ``blockStart`` and ``compressedBytes``.
    """
    block_start = f.col("blockStart")
    block_bytes = f.col("compressedBytes")
    shown_columns = [
        "filename", "block", "blockStart", "blockEnd", "blockMiddle",
        "compressedBytes", "compressedMB", "rows",
    ]

    blocks = spark.read.parquet_blocks(path)
    blocks = blocks.dropDuplicates(["filename", "block"]).orderBy("filename", "block")
    # End offset is inclusive, hence the -1.
    blocks = blocks.withColumn("blockEnd", block_start + block_bytes - 1)
    blocks = blocks.withColumn("blockMiddle", block_start + 0.5 * block_bytes)
    blocks = blocks.withColumn("compressedMB", f.round(block_bytes / 1024 / 1024, 1))
    blocks.select(*shown_columns).show(20, truncate=False)

In [12]:
def get_spark_partitions(path):
    """Show the rows returned by ``parquet_partitions`` for ``path`` (up to 20, untruncated).

    Adds ``compressedMB`` (compressed bytes in MB, one decimal) and puts
    ``filename`` last.
    """
    shown_columns = [
        "partition", "start", "end", "length", "blocks",
        "compressedBytes", "compressedMB", "rows", "filename",
    ]
    partitions = spark.read.parquet_partitions(path)
    megabytes = f.round(f.col("compressedBytes") / 1024 / 1024, 1)
    partitions = partitions.withColumn("compressedMB", megabytes)
    partitions.select(*shown_columns).show(20, truncate=False)

In [13]:
def get_parquet_window_length(path):
    """Print the largest ``length`` value reported by ``parquet_partitions`` for ``path``."""
    partitions = spark.read.parquet_partitions(path)
    result_rows = partitions.select(f.max(partitions["length"])).collect()
    max_length = result_rows[0][0]
    length_mb = round(max_length / 1024 / 1024, 1)
    print(f"Max Parquet window length: {length_mb} MB or {max_length} bytes")

In [14]:
def get_parquet_file_size(path):
    """Print and return the total compressed size, in bytes, of the parquet files under ``path``.

    The size is the sum of ``compressedBytes`` over the ``parquet_metadata``
    rows, counting each ``filename`` once.
    """
    sdf = (
        spark.read.parquet_metadata(path)
        .select("filename", "blocks", "compressedBytes", "rows")
        .dropDuplicates(["filename"])
    )
    # Named `total` rather than `sum` so the builtin is not shadowed.
    total = sdf.select(f.sum(sdf["compressedBytes"]))
    size = total.collect()[0][0]
    print(f"File Size: {round(size/1024/1024, 1)} MB or {size} bytes")
    return size

In [100]:
def round_half_up(n, decimals=0):
    """Round ``n`` to ``decimals`` places, with ties going towards +infinity."""
    scale = 10 ** decimals
    shifted = n * scale + 0.5
    return math.floor(shifted) / scale

# Source: https://realpython.com/python-rounding/

In [105]:
def _conf_bytes(key):
    """Return the Spark SQL config ``key`` as a number of bytes.

    Handles plain numbers ("1") and size-suffixed values ("62914560b", "70m",
    "1g"), so the result does not depend on how the config was set.
    """
    factors = {
        "": 1, "b": 1,
        "k": 1024, "kb": 1024,
        "m": 1024 ** 2, "mb": 1024 ** 2,
        "g": 1024 ** 3, "gb": 1024 ** 3,
        "t": 1024 ** 4, "tb": 1024 ** 4,
        "p": 1024 ** 5, "pb": 1024 ** 5,
    }
    raw = str(spark.conf.get(key)).strip().lower()
    digits = raw.rstrip("abcdefghijklmnopqrstuvwxyz")
    return int(digits) * factors[raw[len(digits):]]


def estimate_num_partitions(file_size, num_files):
    """Print two estimates of how many partitions Spark will use to read a dataset.

    Args:
        file_size: total size of all files in bytes.
        num_files: number of files the dataset consists of.

    Every file is padded with ``spark.sql.files.openCostInBytes``. The split
    size is ``min(maxPartitionBytes, max(openCostInBytes, padded size /
    minPartitionNum))``. "EstimatedPartitions" packs whole average-sized files
    into one split; "EstimatedPartitionsInternet" divides the padded total size
    by the split size.
    """
    maxPartitionBytes = _conf_bytes("spark.sql.files.maxPartitionBytes")
    openCostInBytes = _conf_bytes("spark.sql.files.openCostInBytes")
    minPartitionNum = int(spark.conf.get("spark.sql.files.minPartitionNum"))
    padded_file_size = file_size + num_files * openCostInBytes
    avg_file_size = file_size/num_files
    avg_file_size_padded = avg_file_size + openCostInBytes
    bytesPerCore = file_size / minPartitionNum
    bytesPerCorePadded = padded_file_size / minPartitionNum
    maxSplitPartitionBytes = min(maxPartitionBytes, max(openCostInBytes, bytesPerCorePadded))
    files_per_partition = round_half_up(maxSplitPartitionBytes/avg_file_size_padded, 0)
    estimated_num_partitions_int = math.ceil(padded_file_size/maxSplitPartitionBytes)
    if files_per_partition >= 1:
        estimated_num_partitions = math.ceil(num_files/files_per_partition)
    else:
        # Not even one whole file fits into a split, so the per-file packing
        # estimate is undefined (it would divide by zero); fall back to the
        # size-based estimate.
        estimated_num_partitions = estimated_num_partitions_int
    print(f"Avg file Size: {round(avg_file_size/1024/1024, 1)} MB or {avg_file_size} bytes")
    print(f"Avg file Size Padded: {round(avg_file_size_padded/1024/1024, 1)} MB or {avg_file_size_padded} bytes")
    print(f"Padded File Size: {round(padded_file_size/1024/1024, 1)} MB or {padded_file_size} bytes")
    print(f"SizePerCore: {round(bytesPerCore/1024/1024, 1)} MB or {bytesPerCore} bytes")
    print(f"SizePerCorePadded: {round(bytesPerCorePadded/1024/1024, 1)} MB or {bytesPerCorePadded} bytes")
    print(f"MaxSplitPartitionBytes: {round(maxSplitPartitionBytes/1024/1024, 1)} MB or {maxSplitPartitionBytes} bytes")
    print(f"MaxFilesPerPartition {files_per_partition}")
    print(f"EstimatedPartitions: {math.ceil(estimated_num_partitions)}, unrounded: {estimated_num_partitions}")
    print(f"EstimatedPartitionsInternet: {math.ceil(estimated_num_partitions_int)}, unrounded: {estimated_num_partitions_int}")

In [99]:
round_half_up(2.8)

3.0

In [37]:
def get_actual_num_partitions(path):
    """Print the number of partitions Spark uses to read ``path``, then run a noop write.

    The noop write executes the read as a job labelled "WRITE" in the Spark UI.
    """
    loaded = spark.read.parquet(path)
    partition_count = loaded.rdd.getNumPartitions()
    print(f"ActualNumPartitions: {partition_count}")

    sc.setJobDescription("WRITE")
    noop_writer = loaded.write.format("noop").mode("overwrite")
    noop_writer.save()
    sc.setJobDescription("None")


In [17]:
num_files = 20
num_rows = 6000000
path = write_generator(num_rows, num_files)

Num files written: 20
Num rows written: 6000000
Num partitions written: 20
Saved Path: D:/Spark/Data/20_files_6000000_rows.parquet


In [135]:
set_configs(maxPartitionsMB=60, openCostInMB=12, minPartitions=2)
size = get_parquet_file_size(path)
estimate_num_partitions(size, num_files)
get_actual_num_partitions(path)
#get_parquet_window_length(path)
get_parquet_meta_data(path)
#get_parquet_blocks(path)
get_spark_partitions(path)

MaxPartitionSize 60 MB or 62914560 bytes
OpenCostInBytes 12 MB or 12582912 bytes
Min Partitions: 2
File Size: 48.4 MB or 50768643 bytes
Avg file Size: 2.4 MB or 2538432.15 bytes
Avg file Size Padded: 14.4 MB or 15121344.15 bytes
Padded File Size: 288.4 MB or 302426883 bytes
SizePerCore: 24.2 MB or 25384321.5 bytes
SizePerCorePadded: 144.2 MB or 151213441.5 bytes
MaxSplitPartitionBytes: 60.0 MB or 62914560 bytes
MaxFilesPerPartition 4.0
EstimatedPartitions: 5, unrounded: 5
EstimatedPartitionsInternet: 5, unrounded: 5
ActualNumPartitions: 5
+---------------------------------------------------------------------------------------------------------------------+------+---------------+------+------------+-------------+
|filename                                                                                                             |blocks|compressedBytes|rows  |compressedMB|calcNumBlocks|
+----------------------------------------------------------------------------------------------------

In [65]:
def calPartitions(splitbytes, paddedfilesize):
    

calPartitions(64.2, 6.4)

10.0
2


In [139]:
s = spark.read.parquet_partitions(path)
#s.groupBy("partition").sum("compressedBytes")

ConnectionRefusedError: [WinError 10061] Es konnte keine Verbindung hergestellt werden, da der Zielcomputer die Verbindung verweigerte

In [58]:
32.1/6.4

5.015625

In [264]:
c = 679947769/4
c

169986942.25

In [246]:
t = 87165529 

diff = t-c
diff

271507.234375

In [247]:
diff/8

271507.234375

In [250]:
c = 679947769/16
c

42496735.5625

In [251]:
t = 43582764
diff = t-c
diff

1086028.4375

In [257]:
diff/4

271507.109375

In [233]:
679947769/4

169986942.25

In [201]:
43509334/1024/1024

41.49373435974121

In [223]:
87116716/1024/1024

83.08097457885742

+---------+---------+---------+---------+------+---------------+------------+--------+---------------------------------------------------------------------------------------------------------------------+
|partition|start    |end      |length   |blocks|compressedBytes|compressedMB|rows    |filename                                                                                                             |
+---------+---------+---------+---------+------+---------------+------------+--------+---------------------------------------------------------------------------------------------------------------------+
|0        |0        |107202443|107202443|1     |133949814      |127.7       |15790100|file:/D:/Spark/Data/1_files_50000000_rows.parquet/part-00000-6cde06e8-a2fb-4a79-9488-b3a53bd5e038-c000.snappy.parquet|
|1        |107202443|214404886|107202443|1     |133895545      |127.7       |15770100|file:/D:/Spark/Data/1_files_50000000_rows.parquet/part-00000-6cde06e8-a2fb-4a79-9488-b3a53bd5e

# 1. 1 File - 

https://mageswaran1989.medium.com/a-dive-into-apache-spark-parquet-reader-for-small-file-sizes-fabb9c35f64e#:~:text=maxPartitionBytes%3A%20128MB%20(The%20maximum%20number,sql.
https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala


In [104]:
maxPartitionsMB = 70
openCostInMB = 4
minPartitions = 4
spark.conf.set("spark.sql.files.maxPartitionBytes", str(maxPartitionsMB*1024*1024)+"b")
spark.conf.set("spark.sql.files.openCostInBytes", str(openCostInMB*1024*1024)+"b")
spark.conf.set("spark.sql.files.minPartitionNum", str(minPartitions))
#https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/
#https://issues.apache.org/jira/browse/SPARK-17998
#spark.sql.files.maxPartitionNum = None	
#https://spark.apache.org/docs/latest/sql-performance-tuning.html
#https://db-blog.web.cern.ch/blog/luca-canali/2017-06-diving-spark-and-parquet-workloads-example
#https://www.linkedin.com/pulse/how-initial-number-partitions-determined-pyspark-sugumar-srinivasan#:~:text=Ideally%20partitions%20will%20be%20created,resource%20will%20get%20utilised%20properly.

In [95]:
int(spark.conf.get("spark.sql.files.maxPartitionBytes")[:-1])

134217728

In [105]:
# Manual re-computation of the split size and the resulting partition estimate.
data_size = 404.9*1024*1024
# 20 files: the value that reproduces the recorded output below
# (404.9 MB + 20 * 4 MB open cost = 484.9 MB total).
number_files = 20
defaultMaxSplitBytes = int(spark.conf.get("spark.sql.files.maxPartitionBytes")[:-1])
openCostInBytes = int(spark.conf.get("spark.sql.files.openCostInBytes")[:-1])
minPartitionNum = int(spark.conf.get("spark.sql.files.minPartitionNum"))
# Every file is padded with the open cost before the total is spread over the cores.
totalBytes = data_size + (number_files * openCostInBytes)
bytesPerCore = totalBytes / minPartitionNum
targetSizePerPartition = min(defaultMaxSplitBytes, max(openCostInBytes, bytesPerCore))
noPartitions = totalBytes/targetSizePerPartition
print(defaultMaxSplitBytes/1024/1024)
print(openCostInBytes/1024/1024)
print(minPartitionNum)
print(totalBytes/1024/1024)
print(bytesPerCore/1024/1024)
print(targetSizePerPartition/1024/1024)
print(noPartitions)


70.0
4.0
4
484.9
121.225
70.0
6.927142857142857


In [None]:
// Scala excerpt from Spark's FilePartition.scala (linked above): how the
// maximum split size for file reads is derived from the three configs.
def maxSplitBytes(
      sparkSession: SparkSession,
      selectedPartitions: Seq[PartitionDirectory]): Long = {
    val defaultMaxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes
    val openCostInBytes = sparkSession.sessionState.conf.filesOpenCostInBytes
    val minPartitionNum = sparkSession.sessionState.conf.filesMinPartitionNum
      .getOrElse(sparkSession.leafNodeDefaultParallelism)
    // Every file is padded with the open cost before summing.
    val totalBytes = selectedPartitions.flatMap(_.files.map(_.getLen + openCostInBytes)).sum
    val bytesPerCore = totalBytes / minPartitionNum

    Math.min(defaultMaxSplitBytes, Math.max(openCostInBytes, bytesPerCore))
}

In [85]:
spark.conf.get("spark.sql.leafNodeDefaultParallelism")

In [80]:
spark.conf.set("spark.sql.files.openCostInBytes", "1")


In [78]:
spark.conf.set("spark.sql.files.minPartitionNum", "1")


In [100]:
sdf = sdf_generator(50000000, 1)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Write 1 file, 404,9 MB")
sdf.write.format("parquet").mode("overwrite").save(f"{base_dir}/test_1_file.parquet")
sc.setJobDescription("None")

1


In [102]:
spark.conf.set("spark.sql.files.maxPartitionBytes", str(maxPartitionsMB*1024*1024)+"b")
sc.setJobDescription("Read 1 file with 128m")
sdf_load = spark.read.parquet(f"{base_dir}/test_1_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

4


In [81]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "70m")
sc.setJobDescription("Read 1 file with 70m")
sdf_load = spark.read.parquet(f"{base_dir}/test_1_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

6


In [79]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "51m")
sc.setJobDescription("Read 1 file with 51m")
sdf_load = spark.read.parquet(f"{base_dir}/test_1_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

8


In [77]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "34m")
sc.setJobDescription("Read 1 file with 34m")
sdf_load = spark.read.parquet(f"{base_dir}/test_1_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

12


In [59]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "200m")
sc.setJobDescription("Read 1 file with 200m")
sdf_load = spark.read.parquet(f"{base_dir}/test_1_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

4


In [75]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "400m")
sc.setJobDescription("Read 1 file with 400m")
sdf_load = spark.read.parquet(f"{base_dir}/test_1_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

2


In [76]:
# Read the single-file dataset with a 500 MB maximum partition size.
spark.conf.set("spark.sql.files.maxPartitionBytes", "500m")
# The label now matches the 500m setting (it was copy-pasted as "400m").
sc.setJobDescription("Read 1 file with 500m")
sdf_load = spark.read.parquet(f"{BASE_DIR}/test_1_file.parquet")
print(sdf_load.rdd.getNumPartitions())
# The noop write runs the read as a job without writing any output.
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

1


# 2. 4 files of 101.5 MB each

In [10]:
def read_experiment(maxPartitionsMB, path, file_size=None, num_files=None, min_partitions=4):
    """Read ``path`` with a given ``maxPartitionBytes`` and print the resulting partition count.

    Args:
        maxPartitionsMB: value for ``spark.sql.files.maxPartitionBytes``, in MB.
        path: parquet dataset to read.
        file_size: optional dataset size in the same unit as ``maxPartitionsMB``
            (the two are compared directly). When given, a simple target
            partition size and partition estimate are printed as well.
        num_files: optional number of files, used only in the Spark UI job
            description.
        min_partitions: divisor used to spread ``file_size`` (defaults to 4,
            the value that used to be hard-coded).

    ``file_size`` and ``num_files`` are optional so that the shorter call forms
    used elsewhere in this notebook keep working.
    """
    # int() keeps the config value a whole number of bytes even for float MB input.
    max_partition_bytes = int(maxPartitionsMB * 1024 * 1024)
    spark.conf.set("spark.sql.files.maxPartitionBytes", str(max_partition_bytes) + "b")
    sdf_load = spark.read.parquet(path)
    print(f"Number partitions: {sdf_load.rdd.getNumPartitions()}")
    if num_files is None:
        description = f"Read files with {maxPartitionsMB} MB"
    else:
        description = f"Read {num_files} file with {maxPartitionsMB} MB"
    sc.setJobDescription(description)
    # The noop write runs the read as a job without writing any output.
    sdf_load.write.format("noop").mode("overwrite").save()
    sc.setJobDescription("None")
    if file_size is None:
        return
    partition_size = file_size / min_partitions
    target_partition_size = min(maxPartitionsMB, partition_size)
    print(target_partition_size)
    print(file_size / target_partition_size)

In [59]:
num_files = 1
num_rows = 10000000
size = 406
sdf = sdf_generator(num_rows, 1)
path = f"{base_dir}/test_{num_files}_file.parquet"
print(sdf.rdd.getNumPartitions())
sc.setJobDescription(f"Write {num_files} files, {size} MB")
sdf.write.format("parquet").mode("overwrite").save(path)
sc.setJobDescription("None")

1


In [73]:
write_generator(100, 3)

Num files written: 3
Num rows written: 100
Num partitions written: 3


'D:/Spark/Data/3_files_100_rows.parquet'

In [65]:
spark.read.parquet_metadata(path).dropDuplicates(["filename"]).collect()

[Row(filename='file:/D:/Spark/Data/test_1_file.parquet/part-00000-6ceb4061-1dcd-458e-8453-36d8001bbdd1-c000.snappy.parquet', blocks=1, compressedBytes=84687614, uncompressedBytes=193996611, rows=10000000, columns=6, values=60000000, nulls=0, createdBy='parquet-mr version 1.13.1 (build db4183109d5b734ec5930d870cdae161e408ddba)', schema='message spark_schema {\n  required int64 id;\n  required int32 date (DATE);\n  required int96 timestamp;\n  required binary idstring (STRING);\n  required binary idfirst (STRING);\n  required binary idlast (STRING);\n}\n', encryption='UNENCRYPTED', keyValues={'org.apache.spark.version': '3.5.0', 'org.apache.spark.sql.parquet.row.metadata': '{"type":"struct","fields":[{"name":"id","type":"long","nullable":false,"metadata":{}},{"name":"date","type":"date","nullable":false,"metadata":{}},{"name":"timestamp","type":"timestamp","nullable":false,"metadata":{}},{"name":"idstring","type":"string","nullable":false,"metadata":{}},{"name":"idfirst","type":"string",

In [66]:
84687614/1024/1024

80.76440238952637

In [25]:
spark.read.parquet_partitions(path).collect()

[Row(partition=0, start=0, end=107202443, length=107202443, blocks=1, compressedBytes=133949814, uncompressedBytes=312755950, rows=15790100, columns=6, values=94740600, nulls=0, filename='file:/D:/Spark/Data/test_1_file.parquet/part-00000-fc9294ce-8c57-4be8-a157-fba53a12f902-c000.snappy.parquet', fileLength=424615468),
 Row(partition=1, start=107202443, end=214404886, length=107202443, blocks=1, compressedBytes=133895545, uncompressedBytes=323456519, rows=15770100, columns=6, values=94620600, nulls=0, filename='file:/D:/Spark/Data/test_1_file.parquet/part-00000-fc9294ce-8c57-4be8-a157-fba53a12f902-c000.snappy.parquet', fileLength=424615468),
 Row(partition=2, start=214404886, end=321607329, length=107202443, blocks=0, compressedBytes=0, uncompressedBytes=0, rows=0, columns=0, values=0, nulls=0, filename='file:/D:/Spark/Data/test_1_file.parquet/part-00000-fc9294ce-8c57-4be8-a157-fba53a12f902-c000.snappy.parquet', fileLength=424615468),
 Row(partition=3, start=321607329, end=424615468, l

In [64]:
spark.read.parquet_blocks(path).dropDuplicates(["filename"]).collect()

[Row(filename='file:/D:/Spark/Data/test_1_file.parquet/part-00000-6ceb4061-1dcd-458e-8453-36d8001bbdd1-c000.snappy.parquet', block=1, blockStart=4, compressedBytes=84687614, uncompressedBytes=193996611, rows=10000000, columns=6, values=60000000, nulls=0)]

In [None]:
sc.hadoopConfiguration.setInt("parquet.block.size",blockSize)


In [58]:
spark.conf.get("spark.sql.parquet.block.size")

https://spark.apache.org/docs/latest/sql-data-sources-parquet.html

Py4JJavaError: An error occurred while calling o33.get.
: org.apache.spark.SparkNoSuchElementException: [SQL_CONF_NOT_FOUND] The SQL config "spark.sql.parquet.block.size" cannot be found. Please verify that the config exists.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.sqlConfigNotFoundError(QueryExecutionErrors.scala:1984)
	at org.apache.spark.sql.internal.SQLConf.$anonfun$getConfString$3(SQLConf.scala:5234)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.internal.SQLConf.getConfString(SQLConf.scala:5234)
	at org.apache.spark.sql.RuntimeConfig.get(RuntimeConfig.scala:81)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


In [44]:
d = 120*1024*1024
d

125829120

In [55]:
b1 = 4
b2 = 133949818
b3 = 267845363
b4 = 401778296
e = b4 + 22457577 -1
print(e)

424235872


In [49]:
m1 = (b1+b2)/2
m2 = (b2+b3)/2
m3 = (b3+b4)/2
m4 = (b4+e)/2

print(m1)
print(m2)
print(m3)
print(m4)

66974911.0
200897590.5
334811829.5
413007084.0


In [56]:
4*d

503316480

In [46]:
3*d

377487360

In [45]:
133949818-d

8120698

In [39]:
read_experiment(120, path, 442, 4)

Number partitions: 4
110.5
4.0


In [36]:
133949814/1024/1024

127.74449729919434

In [60]:
spark.read.parquet_partitions(path).collect()

[Row(partition=0, start=0, end=22238867, length=22238867, blocks=0, compressedBytes=0, uncompressedBytes=0, rows=0, columns=0, values=0, nulls=0, filename='file:/D:/Spark/Data/test_1_file.parquet/part-00000-6ceb4061-1dcd-458e-8453-36d8001bbdd1-c000.snappy.parquet', fileLength=84761164),
 Row(partition=1, start=22238867, end=44477734, length=22238867, blocks=1, compressedBytes=84687614, uncompressedBytes=193996611, rows=10000000, columns=6, values=60000000, nulls=0, filename='file:/D:/Spark/Data/test_1_file.parquet/part-00000-6ceb4061-1dcd-458e-8453-36d8001bbdd1-c000.snappy.parquet', fileLength=84761164),
 Row(partition=2, start=44477734, end=66716601, length=22238867, blocks=0, compressedBytes=0, uncompressedBytes=0, rows=0, columns=0, values=0, nulls=0, filename='file:/D:/Spark/Data/test_1_file.parquet/part-00000-6ceb4061-1dcd-458e-8453-36d8001bbdd1-c000.snappy.parquet', fileLength=84761164),
 Row(partition=3, start=66716601, end=84761164, length=18044563, blocks=0, compressedBytes=0,

In [26]:
156390510/1024/1024

149.1456127166748

In [27]:
133949814/1024/1024

127.74449729919434

In [18]:
424235869/1024/1024

404.5828523635864

In [67]:
for size in [10, 20, 30, 70, 100, 128, 200]:
    read_experiment(size, path, size, num_files)

Number partitions: 9
2.5
4.0
Number partitions: 5
5.0
4.0
Number partitions: 4
7.5
4.0
Number partitions: 4
17.5
4.0
Number partitions: 4
25.0
4.0
Number partitions: 4
32.0
4.0
Number partitions: 4
50.0
4.0


In [107]:
sdf = sdf_generator(50000000, 4)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Write 4 file, 101,5 MB")
sdf.write.format("parquet").mode("overwrite").save(f"{base_dir}/test_4_file.parquet")
sc.setJobDescription("None")

4


In [115]:
# Write the 50,000,000-row dataset as 20 parquet files.
sdf = sdf_generator(50000000, 20)
print(sdf.rdd.getNumPartitions())
# The label now matches the 20 partitions written (it was copy-pasted as "4 file").
sc.setJobDescription("Write 20 file, 20 MB")
sdf.write.format("parquet").mode("overwrite").save(f"{BASE_DIR}/test_20_file.parquet")
sc.setJobDescription("None")

20


In [113]:
for size in [8, 16, 32, 64, 128, 256, 512]:
    read_experiment(size, f"{base_dir}/test_4_file.parquet", 406)

Number partitions: 52
8
52.75
Number partitions: 26
16
26.375
Number partitions: 14
32
13.1875
Number partitions: 8
64
6.59375
Number partitions: 4
105.5
4.0
Number partitions: 4
105.5
4.0
Number partitions: 4
105.5
4.0


In [116]:
for size in [8, 16, 32, 64, 128, 256, 512]:
    read_experiment(size, f"{base_dir}/test_20_file.parquet", 406)

Number partitions: 60
8
50.75
Number partitions: 30
16
25.375
Number partitions: 20
32
12.6875
Number partitions: 10
64
6.34375
Number partitions: 4
101.5
4.0
Number partitions: 4
101.5
4.0
Number partitions: 4
101.5
4.0


In [None]:
read_experiment(128, f"{base_dir}/test_4_file.parquet", 406)

In [None]:
read_experiment(64, f"{base_dir}/test_4_file.parquet")

In [62]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "128m")
sc.setJobDescription("Read 4 file with 128m")
sdf_load = spark.read.parquet(f"{base_dir}/test_4_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

4


In [63]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "70m")
sc.setJobDescription("Read 4 file with 70m")
sdf_load = spark.read.parquet(f"{base_dir}/test_4_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

6


In [65]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "51m")
sc.setJobDescription("Read 4 file with 51m")
sdf_load = spark.read.parquet(f"{base_dir}/test_4_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

8


# 3. 20 files

In [67]:
sdf = sdf_generator(50000000, 20)
print(sdf.rdd.getNumPartitions())
sc.setJobDescription("Write 20 file, 20,3 MB")
sdf.write.format("parquet").mode("overwrite").save(f"{base_dir}/test_20_file.parquet")
sc.setJobDescription("None")

20


In [69]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "128m")
sc.setJobDescription("Read 20 file with 128m")
sdf_load = spark.read.parquet(f"{base_dir}/test_20_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

4


In [71]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "70m")
sc.setJobDescription("Read 20 file with 70m")
sdf_load = spark.read.parquet(f"{base_dir}/test_20_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

7


In [70]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "51m")
sc.setJobDescription("Read 20 file with 51m")
sdf_load = spark.read.parquet(f"{base_dir}/test_20_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

10


In [72]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "200m")
sc.setJobDescription("Read 20 file with 200m")
sdf_load = spark.read.parquet(f"{base_dir}/test_20_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

4


In [73]:
spark.conf.set("spark.sql.files.maxPartitionBytes", "400m")
sc.setJobDescription("Read 20 file with 400m")
sdf_load = spark.read.parquet(f"{base_dir}/test_20_file.parquet")
print(sdf_load.rdd.getNumPartitions())
sdf_load.write.format("noop").mode("overwrite").save()
sc.setJobDescription("None")

4
