In [1]:
import os
from datetime import datetime, timedelta
import pandas as pd
from pathlib import Path
from lib import Lab

# Initialize lab
lab = Lab('beacon-chain-timings', '../config.yaml')
lab.setup()
lab.setup_pandaops_clickhouse()
log = lab.log

# Get notebook specific config
notebook_config = lab.get_notebook_config()

writer = lab.get_data_writer()

pandaops_clickhouse_client = lab.get_pandaops_clickhouse_client()

## Clear the data directory
lab.delete_directory('')

log.info("Good to go!")

2025-01-10 16:31:53,437 - beacon-chain-timings - INFO - Good to go!


In [2]:
beacon_chain_timings_config = lab.get_notebook_config().as_beacon_chain_timings()
beacon_chain_timings_config


BeaconChainTimings(time_windows=[TimeWindow(file='last_30_days', step='6h', label='Last 30d', range='-30d'), TimeWindow(file='last_90_days', step='1d', label='Last 90d', range='-90d')], networks=['mainnet', 'sepolia', 'holesky'], data_dir='../data/beacon-chain-timings')

In [13]:
from sqlalchemy import text
from datetime import datetime, timezone
query = text("""
    WITH time_slots AS (
        SELECT 
            toStartOfInterval(slot_start_date_time, INTERVAL :step_seconds second) as time_slot,
            meta_network_name,
            min(propagation_slot_start_diff) as min_arrival,
            max(propagation_slot_start_diff) as max_arrival,
            avg(propagation_slot_start_diff) as avg_arrival,
            quantile(0.05)(propagation_slot_start_diff) as p05_arrival,
            quantile(0.50)(propagation_slot_start_diff) as p50_arrival,
            quantile(0.95)(propagation_slot_start_diff) as p95_arrival,
            count(*) as total_blocks
        FROM beacon_api_eth_v1_events_block FINAL
        WHERE
            slot_start_date_time BETWEEN toDateTime(:start_date) AND toDateTime(:end_date)
            AND meta_network_name IN (:networks)
            AND propagation_slot_start_diff < 6000
        GROUP BY time_slot, meta_network_name
    )
    SELECT
        time_slot as time,
        meta_network_name,
        min_arrival,
        max_arrival,
        avg_arrival,
        p05_arrival,
        p50_arrival,
        p95_arrival,
        total_blocks
    FROM time_slots
    ORDER BY time_slot ASC
""")

for window in beacon_chain_timings_config.time_windows:
    start_date, end_date = window.get_time_range(datetime.now(timezone.utc))
    step_seconds = window.get_step_seconds()
    
    start_str = start_date.strftime('%Y-%m-%d %H:%M:%S')
    end_str = end_date.strftime('%Y-%m-%d %H:%M:%S')

    log.info(f"Fetching data for {window.file}")
    
    result = pandaops_clickhouse_client.execute(
        query,
        {
            "start_date": start_str,
            "end_date": end_str,
            "networks": beacon_chain_timings_config.networks,
            "step_seconds": step_seconds
        }
    )
    timings = result.fetchall()

    if len(timings) == 0:
        log.warning(f"No data found for time window {window.file}")
        continue

    log.info(f"Found {len(timings)} entries for time window {window.file}")
    
    # Process each network separately
    for network in beacon_chain_timings_config.networks:
        network_timings = [t for t in timings if t[1] == network]
        if not network_timings:
            continue
            
        # Structure data as arrays to save space
        times = []
        mins = []
        maxs = []
        avgs = []
        p05s = []
        p50s = []
        p95s = []
        blocks = []
        
        for t in network_timings:
            times.append(int(t[0].timestamp()))
            mins.append(round(t[2], 3))
            maxs.append(round(t[3], 3))
            avgs.append(round(t[4], 3))
            p05s.append(round(t[5], 3))
            p50s.append(round(t[6], 3))
            p95s.append(round(t[7], 3))
            blocks.append(t[8])
            
        # Write compact array format
        formatted_data = {
            "timestamps": times,
            "mins": mins,
            "maxs": maxs,
            "avgs": avgs,
            "p05s": p05s,
            "p50s": p50s,
            "p95s": p95s,
            "blocks": blocks
        }
            
        # Write to file per time window and network
        lab.write_json(f"block_timings/{network}/{window.file}.json", formatted_data)


2025-01-10 16:56:54,199 - beacon-chain-timings - INFO - Fetching data for last_30_days
2025-01-10 16:56:55,844 - beacon-chain-timings - INFO - Found 363 entries for time window last_30_days
2025-01-10 16:56:55,848 - beacon-chain-timings - INFO - Fetching data for last_90_days
2025-01-10 16:56:56,731 - beacon-chain-timings - INFO - Found 273 entries for time window last_90_days


In [19]:
import numpy as np

# Fetch block sizes, blob data and build CDF for each time window
for window in beacon_chain_timings_config.time_windows:
    start_date, end_date = window.get_time_range(datetime.now(timezone.utc))
    start_str = start_date.strftime('%Y-%m-%d %H:%M:%S')
    end_str = end_date.strftime('%Y-%m-%d %H:%M:%S')

    log.info(f"Fetching block and blob data for {window.file} ({start_str} to {end_str})")

    # Get blob data
    log.info("Querying blob data...")
    blob_query = text("""
        SELECT
            slot,
            COUNT(*) * 131072 as total_blob_bytes -- 128KB per blob
        FROM canonical_beacon_blob_sidecar FINAL
        WHERE
            slot_start_date_time BETWEEN toDateTime(:start_date) AND toDateTime(:end_date)
            AND meta_network_name IN (:networks)
        GROUP BY slot
    """)

    blob_result = pandaops_clickhouse_client.execute(
        blob_query,
        {
            "start_date": start_str,
            "end_date": end_str,
            "networks": beacon_chain_timings_config.networks
        }
    )
    blob_data = {r[0]: r[1] for r in blob_result.fetchall()}
    log.info(f"Found blob data for {len(blob_data)} slots")

    # Get MEV relay data
    log.info("Querying MEV relay data...")
    mev_query = text("""
        SELECT DISTINCT
            slot
        FROM mev_relay_proposer_payload_delivered FINAL
        WHERE
            slot_start_date_time BETWEEN toDateTime(:start_date) AND toDateTime(:end_date)
            AND meta_network_name IN (:networks)
    """)
    
    mev_result = pandaops_clickhouse_client.execute(
        mev_query,
        {
            "start_date": start_str,
            "end_date": end_str,
            "networks": beacon_chain_timings_config.networks
        }
    )
    mev_slots = set(r[0] for r in mev_result.fetchall())
    log.info(f"Found {len(mev_slots)} MEV relay slots")

    # Get block arrival data
    log.info("Querying block arrival data...")
    block_arrival_query = text("""
        SELECT 
            slot,
            meta_network_name,
            min(propagation_slot_start_diff) as arrival_time
        FROM beacon_api_eth_v1_events_block FINAL
        WHERE
            slot_start_date_time BETWEEN toDateTime(:start_date) AND toDateTime(:end_date)
            AND meta_network_name IN (:networks)
        GROUP BY slot, meta_network_name
    """)

    # Get block size data
    log.info("Querying block size data...")
    block_size_query = text("""
        SELECT 
            slot,
            meta_network_name,
            proposer_index,
            block_total_bytes_compressed
        FROM canonical_beacon_block FINAL
        WHERE
            slot_start_date_time BETWEEN toDateTime(:start_date) AND toDateTime(:end_date)
            AND meta_network_name IN (:networks)
    """)

    params = {
        "start_date": start_str,
        "end_date": end_str,
        "networks": beacon_chain_timings_config.networks
    }

    # Execute queries and convert to dataframes
    arrival_df = pd.DataFrame(
        pandaops_clickhouse_client.execute(block_arrival_query, params).fetchall(),
        columns=['slot', 'meta_network_name', 'arrival_time']
    )
    log.info(f"Found arrival data for {len(arrival_df)} blocks")
    
    size_df = pd.DataFrame(
        pandaops_clickhouse_client.execute(block_size_query, params).fetchall(),
        columns=['slot', 'meta_network_name', 'proposer_index', 'block_size']
    )
    log.info(f"Found size data for {len(size_df)} blocks")

    # Get proposer entities
    log.info("Getting proposer entities...")
    proposer_query = text("""
        SELECT 
            `index` as proposer_index,
            entity
        FROM ethseer_validator_entity
        WHERE 
            meta_network_name IN (:networks)
    """)
    proposer_entities = pd.DataFrame(
        pandaops_clickhouse_client.execute(proposer_query, params).fetchall(),
        columns=['proposer_index', 'entity']
    )

    # Merge dataframes and only keep slots that exist in size_df (canonical blocks)
    block_data = pd.merge(
        arrival_df, 
        size_df,
        on=['slot', 'meta_network_name'],
        how='right'
    ).dropna()
    log.info(f"Merged data contains {len(block_data)} blocks")

    # Process each network
    for network in beacon_chain_timings_config.networks:
        log.info(f"Processing network {network}...")
        network_df = block_data[block_data.meta_network_name == network].copy()
        if network_df.empty:
            log.warning(f"No data found for network {network}")
            continue
            
        # Add blob sizes, MEV flag and entity info
        network_df['total_size'] = network_df.apply(
            lambda row: max(row.block_size + blob_data.get(row.slot, 0), 1),  # Ensure minimum size of 1 byte
            axis=1
        )
        network_df['is_mev'] = network_df.slot.isin(mev_slots)
        network_df = pd.merge(network_df, proposer_entities, on='proposer_index', how='left')
        network_df['is_solo'] = network_df.entity == 'solo_stakers'

        # Bucket sizes into 32KB chunks and get average arrival time per bucket
        network_df['size_bucket'] = (network_df.total_size / (32 * 1024)).round() * 32
        network_df['size_bucket'] = network_df['size_bucket'].apply(lambda x: max(x, 32))  # Minimum bucket of 32KB
        
        # Calculate averages for all blocks, MEV blocks, non-MEV blocks, and solo staker blocks
        avg_all = network_df.groupby('size_bucket')['arrival_time'].mean().round().reset_index()
        avg_mev = network_df[network_df.is_mev].groupby('size_bucket')['arrival_time'].mean().round().reset_index()
        avg_non_mev = network_df[~network_df.is_mev].groupby('size_bucket')['arrival_time'].mean().round().reset_index()
        avg_solo_mev = network_df[network_df.is_solo & network_df.is_mev].groupby('size_bucket')['arrival_time'].mean().round().reset_index()
        avg_solo_non_mev = network_df[network_df.is_solo & ~network_df.is_mev].groupby('size_bucket')['arrival_time'].mean().round().reset_index()

        # Write data
        formatted_data = {
            "sizes_kb": avg_all.size_bucket.tolist(),
            "arrival_times_ms": {
                "all": avg_all.arrival_time.tolist(),
                "mev": avg_mev.arrival_time.tolist() if not avg_mev.empty else [],
                "non_mev": avg_non_mev.arrival_time.tolist() if not avg_non_mev.empty else [],
                "solo_mev": avg_solo_mev.arrival_time.tolist() if not avg_solo_mev.empty else [],
                "solo_non_mev": avg_solo_non_mev.arrival_time.tolist() if not avg_solo_non_mev.empty else []
            }
        }

        output_path = f"size_cdf/{network}/{window.file}.json"
        log.info(f"Writing data to {output_path}")
        lab.write_json(output_path, formatted_data)


2025-01-10 17:38:04,941 - beacon-chain-timings - INFO - Fetching block and blob data for last_30_days (2024-12-11 07:38:04 to 2025-01-10 07:38:04)
2025-01-10 17:38:04,942 - beacon-chain-timings - INFO - Querying blob data...
2025-01-10 17:38:09,475 - beacon-chain-timings - INFO - Found blob data for 503400 slots
2025-01-10 17:38:09,476 - beacon-chain-timings - INFO - Querying MEV relay data...
2025-01-10 17:38:11,216 - beacon-chain-timings - INFO - Found 264044 MEV relay slots
2025-01-10 17:38:11,216 - beacon-chain-timings - INFO - Querying block arrival data...
2025-01-10 17:38:11,217 - beacon-chain-timings - INFO - Querying block size data...
2025-01-10 17:38:17,907 - beacon-chain-timings - INFO - Found arrival data for 621757 blocks
2025-01-10 17:38:22,072 - beacon-chain-timings - INFO - Found size data for 619833 blocks
2025-01-10 17:38:22,073 - beacon-chain-timings - INFO - Getting proposer entities...
2025-01-10 17:38:31,121 - beacon-chain-timings - INFO - Merged data contains 61

KeyError: 'solo_stakers'