In [36]:
from pyspark.sql.functions import col, lag, unix_timestamp, avg, min, max, median
from pyspark.sql.window import Window

In [29]:
data_path = "/work/samsung/data/blocks"
blocks_df = spark.read.parquet(data_path)

In [30]:
blocks_df.show(20)  
blocks_df.printSchema()  

+--------------------+--------------------+------------+--------+--------------------+----------+----------------+--------+
|          block_hash|              author|block_number|gas_used|          extra_data| timestamp|base_fee_per_gas|chain_id|
+--------------------+--------------------+------------+--------+--------------------+----------+----------------+--------+
|[C5 FD 92 68 93 8...|[95 22 22 90 DD 7...|    21181452| 9861510|[62 65 61 76 65 7...|1731533231|     67853586232|       1|
|[86 D3 F7 66 04 7...|[48 38 B1 06 FC E...|    21181453|13637078|[54 69 74 61 6E 2...|1731533243|     64948044780|       1|
|[87 C7 B5 F5 B7 5...|[95 22 22 90 DD 7...|    21181454|14187397|[62 65 61 76 65 7...|1731533255|     64210385455|       1|
|[EF 3A 49 FA C2 3...|[48 38 B1 06 FC E...|    21181455|12210092|[54 69 74 61 6E 2...|1731533267|     63775572523|       1|
|[AE AA EE F2 9B A...|[48 38 B1 06 FC E...|    21181456|14737857|[54 69 74 61 6E 2...|1731533279|     62292839357|       1|
|[61 F6 

In [31]:
blocks_df = blocks_df.withColumn("timestamp", (col("timestamp").cast("timestamp")))

In [32]:
window_spec = Window.orderBy("timestamp")
blocks_df = blocks_df.withColumn("time_diff", 
    col("timestamp").cast("long") - lag("timestamp").over(window_spec).cast("long")
)

In [45]:
avg_block_time = blocks_df.select(avg("time_diff")).collect()[0][0]
print(f"mean time between blocks: {avg_block_time} seconds")

mean time between blocks: 56.273475782526084 seconds


25/03/15 00:18:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/15 00:18:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/15 00:18:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/15 00:18:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [37]:
gas_stats = blocks_df.select(
    max("gas_used").alias("max_gas_used"),
    min("gas_used").alias("min_gas_used"),
    avg("gas_used").alias("avg_gas_used"),
    median("gas_used").alias("median_gas_used")
).collect()[0]

In [41]:
print(f"max gas used: {gas_stats['max_gas_used']}")
print(f"min gas used: {gas_stats['min_gas_used']}")
print(f"mean gas used: {gas_stats['avg_gas_used']}")
print(f"median gas used: {gas_stats['median_gas_used']}")

max gas used: 30057615
min gas used: 0
mean gas used: 15117197.1897
median gas used: 14470383.0


In [44]:
from pyspark.sql.functions import count

top_authors = blocks_df.groupBy("author").agg(
    count("*").alias("block_count")
).orderBy(col("block_count").desc())

top_authors.show(10)

+--------------------+-----------+
|              author|block_count|
+--------------------+-----------+
|[95 22 22 90 DD 7...|      13945|
|[48 38 B1 06 FC E...|      12431|
|[1F 90 90 AA E2 8...|        947|
|[38 8C 81 8C A8 B...|        860|
|[DF 99 A0 83 98 1...|        158|
|[7E 2A 2F A2 A0 6...|        115|
|[DA DB 0D 80 17 8...|         64|
|[98 ED 2D 46 A2 7...|         48|
|[E6 88 B8 4B 23 F...|         45|
|[7A DC 0E 86 7E B...|         41|
+--------------------+-----------+
only showing top 10 rows

