Detection of blocks that propagated slower than expected given their blob count.

In [None]:
import pandas as pd
import polars as pl
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from IPython.display import HTML

from loaders import load_parquet, display_sql

# Notebook parameter. It is forwarded unchanged to display_sql() and
# load_parquet() in the cells below.
target_date = None  # Set via papermill, or auto-detect from manifest

In [None]:
# NOTE(review): display_sql comes from the project-local `loaders` module;
# presumably it renders the SQL behind the named dataset — confirm there.
display_sql("block_production_timeline", target_date)

In [None]:
# Load the block production timeline and keep only valid blocks: rows with a
# non-null first-seen time (missed slots are excluded) that lies in [0, 60000) ms.
first_seen = pl.col("block_first_seen_ms")
df = pl.from_pandas(load_parquet("block_production_timeline", target_date)).filter(
    first_seen.is_not_null() & (first_seen >= 0) & (first_seen < 60000)
)

# A block is flagged as MEV when it has a winning bid value, otherwise as local.
is_mev = pl.col("winning_bid_value").is_not_null()
df = df.with_columns([
    is_mev.alias("has_mev"),
    pl.when(is_mev).then(pl.lit("MEV")).otherwise(pl.lit("Local")).alias("block_type"),
])

# Largest blob count in the data; the scatter plot cell uses it to size the x axis.
max_blobs = df["blob_count"].max()

# Summary of the split between MEV and locally built blocks.
mev_flags = df["has_mev"]
local_flags = ~mev_flags
print(f"Total valid blocks: {len(df):,}")
print(f"MEV blocks: {mev_flags.sum():,} ({mev_flags.mean()*100:.1f}%)")
print(f"Local blocks: {local_flags.sum():,} ({local_flags.mean()*100:.1f}%)")

## Anomaly detection method

Blocks that are slow *relative to their blob count* are more interesting than blocks that are simply slow. A 500ms block with 15 blobs may be normal; with 0 blobs it's anomalous.

The method:
1. Fit linear regression: `block_first_seen_ms ~ blob_count`
2. Calculate residuals (actual - expected)
3. Flag blocks whose residual exceeds +2σ (the slow side only) as anomalies

Points above the ±2σ band propagated slower than expected given their blob count.

In [None]:
# Conditional outliers: blocks slow relative to their blob count.
#
# Reads `df` (the valid blocks) and defines, for the cells below:
#   df_anomaly   - every valid block plus expected_ms, residual_ms and is_anomaly
#   df_outliers  - the anomalous rows plus relay / proposer / builder display labels
#   slope, intercept, r_value, p_value, std_err - the fitted regression
#   residual_std, n_anomalies, pct_anomalies    - summary numbers
df_anomaly = df.clone()

# Fit regression: block_first_seen_ms ~ blob_count (scipy needs numpy arrays)
blob_count_arr = df_anomaly["blob_count"].cast(pl.Float64).to_numpy()
block_ms_arr = df_anomaly["block_first_seen_ms"].to_numpy()

slope, intercept, r_value, p_value, std_err = stats.linregress(blob_count_arr, block_ms_arr)

# Expected first-seen time from the fit, and the residual (actual - expected).
# A positive residual means the block was seen later than the fit predicts.
expected_ms_expr = pl.lit(intercept) + pl.lit(slope) * pl.col("blob_count").cast(pl.Float64)
df_anomaly = df_anomaly.with_columns([
    expected_ms_expr.alias("expected_ms"),
])
df_anomaly = df_anomaly.with_columns([
    (pl.col("block_first_seen_ms") - pl.col("expected_ms")).alias("residual_ms"),
])

# Spread of the residuals around the fitted line
residual_std = df_anomaly["residual_ms"].std()

# Flag anomalies: residual > 2σ. The test is one-sided on purpose: only
# unexpectedly slow blocks are flagged, unexpectedly fast ones are not.
df_anomaly = df_anomaly.with_columns([
    (pl.col("residual_ms") > 2 * residual_std).alias("is_anomaly"),
])

n_anomalies = df_anomaly["is_anomaly"].sum()
pct_anomalies = n_anomalies / len(df_anomaly) * 100

# Outliers with display labels for the table and breakdown charts
df_outliers = df_anomaly.filter(pl.col("is_anomaly")).with_columns([
    # list.first() yields null for an empty relay list, whereas list.get(0)
    # raises an out-of-bounds error on recent polars releases. Either way a
    # block without a relay entry ends up labelled "Local".
    pl.col("winning_relays").list.first().fill_null("Local").alias("relay"),
    pl.col("proposer_entity").fill_null("Unknown").alias("proposer"),
    # Builder pubkeys are shortened to their first 10 characters for display;
    # a null or empty builder is labelled "Local".
    pl.when(pl.col("winning_builder").is_not_null() & (pl.col("winning_builder") != ""))
        .then(pl.col("winning_builder").str.slice(0, 10) + pl.lit("..."))
        .otherwise(pl.lit("Local"))
        .alias("builder"),
])

print(f"Regression: block_ms = {intercept:.1f} + {slope:.2f} × blob_count (R² = {r_value**2:.3f})")
print(f"Residual σ = {residual_std:.1f}ms")
print(f"Anomalies (>2σ slow): {n_anomalies:,} ({pct_anomalies:.1f}%)")

In [None]:
# Scatter of block first-seen time against blob count, drawn over the fitted
# regression line and its ±2σ band. Anomalies are highlighted and hoverable.

# The fit is linear, so two x positions (0 and the max blob count) define it.
x_range = np.array([0, int(max_blobs)])
y_pred = intercept + slope * x_range
band_half_width = 2 * residual_std
y_upper = y_pred + band_half_width
y_lower = y_pred - band_half_width

# Non-anomalous blocks, capped at 2000 sampled rows (fixed seed) to avoid overplotting
df_normal = df_anomaly.filter(~pl.col("is_anomaly"))
if len(df_normal) > 2000:
    df_normal = df_normal.sample(n=2000, seed=42)

# plotly is fed pandas frames
df_normal_pd = df_normal.to_pandas()
df_outliers_pd = df_outliers.to_pandas()

# Closed polygon: upper edge left-to-right, then lower edge right-to-left
band_trace = go.Scatter(
    x=np.concatenate([x_range, x_range[::-1]]),
    y=np.concatenate([y_upper, y_lower[::-1]]),
    fill="toself",
    fillcolor="rgba(100,100,100,0.2)",
    line={"width": 0},
    name="±2σ band",
    hoverinfo="skip",
)

expected_trace = go.Scatter(
    x=x_range,
    y=y_pred,
    mode="lines",
    line={"color": "white", "width": 2, "dash": "dash"},
    name="Expected",
)

# The legend count refers to all normal blocks, not just the sampled ones
normal_trace = go.Scatter(
    x=df_normal_pd["blob_count"],
    y=df_normal_pd["block_first_seen_ms"],
    mode="markers",
    marker={"size": 4, "color": "rgba(100,150,200,0.4)"},
    name=f"Normal ({len(df_anomaly) - n_anomalies:,})",
    hoverinfo="skip",
)

# customdata columns: [slot, rounded residual, relay] — referenced by the hover text
anomaly_hover = (
    "<b>Slot %{customdata[0]}</b><br>"
    "Blobs: %{x}<br>"
    "Actual: %{y:.0f}ms<br>"
    "+%{customdata[1]}ms vs expected<br>"
    "Relay: %{customdata[2]}<extra></extra>"
)
anomaly_trace = go.Scatter(
    x=df_outliers_pd["blob_count"],
    y=df_outliers_pd["block_first_seen_ms"],
    mode="markers",
    marker={
        "size": 7,
        "color": "#e74c3c",
        "line": {"width": 1, "color": "white"},
    },
    name=f"Anomalies ({n_anomalies:,})",
    customdata=np.column_stack([
        df_outliers_pd["slot"],
        df_outliers_pd["residual_ms"].round(0),
        df_outliers_pd["relay"],
    ]),
    hovertemplate=anomaly_hover,
)

# Trace order sets the draw order: band, line, normal points, anomalies on top
fig = go.Figure()
for trace in (band_trace, expected_trace, normal_trace, anomaly_trace):
    fig.add_trace(trace)

fig.update_layout(
    margin={"l": 60, "r": 30, "t": 30, "b": 60},
    xaxis={"title": "Blob count", "range": [-0.5, int(max_blobs) + 0.5]},
    yaxis={"title": "Block first seen (ms from slot start)"},
    legend={"orientation": "h", "yanchor": "bottom", "y": 1.02, "xanchor": "right", "x": 1},
    height=500,
)
fig.show(config={"responsive": True})

## All propagation anomalies

Blocks that propagated much slower than expected given their blob count, sorted by residual (worst first).

In [None]:
# All anomalies as a scrollable HTML table with selectable text and Lab links,
# sorted by residual (worst first). Prints a message when nothing was flagged.
if n_anomalies > 0:
    # Local stdlib import keeps this cell self-contained.
    from html import escape

    df_table = df_outliers.sort("residual_ms", descending=True).select([
        "slot", "blob_count", "block_first_seen_ms", "expected_ms", "residual_ms", "proposer", "builder", "relay"
    ])
    # Whole milliseconds are enough for display
    df_table = df_table.with_columns([
        pl.col("block_first_seen_ms").round(0).cast(pl.Int64),
        pl.col("expected_ms").round(0).cast(pl.Int64),
        pl.col("residual_ms").round(0).cast(pl.Int64),
    ])

    # Static part of the table: styles, container and header row
    table_open = '''
    <style>
    .anomaly-table { border-collapse: collapse; width: 100%; font-family: monospace; font-size: 13px; }
    .anomaly-table th { background: #2c3e50; color: white; padding: 8px 12px; text-align: left; position: sticky; top: 0; }
    .anomaly-table td { padding: 6px 12px; border-bottom: 1px solid #eee; }
    .anomaly-table tr:hover { background: #f5f5f5; }
    .anomaly-table .num { text-align: right; }
    .anomaly-table .delta { background: #ffebee; color: #c62828; font-weight: bold; }
    .anomaly-table a { color: #1976d2; text-decoration: none; }
    .anomaly-table a:hover { text-decoration: underline; }
    .table-container { max-height: 600px; overflow-y: auto; }
    </style>
    <div class="table-container">
    <table class="anomaly-table">
    <thead>
    <tr><th>Slot</th><th class="num">Blobs</th><th class="num">Actual (ms)</th><th class="num">Expected (ms)</th><th class="num">Δ (ms)</th><th>Proposer</th><th>Builder</th><th>Relay</th></tr>
    </thead>
    <tbody>
    '''

    def _cell(value, css_class=None):
        """Return one <td> element with its text HTML-escaped."""
        class_attr = f' class="{css_class}"' if css_class else ""
        return f"<td{class_attr}>{escape(str(value))}</td>"

    # Proposer, builder and relay labels come straight from the dataset, so every
    # value is escaped before being placed into markup. Rows are collected in a
    # list and joined once rather than grown with repeated string concatenation.
    body_rows = []
    for row in df_table.iter_rows(named=True):
        slot = escape(str(row["slot"]))
        slot_link = (
            f'<a href="https://lab.ethpandaops.io/ethereum/slots/{slot}" '
            f'target="_blank" rel="noopener">{slot}</a>'
        )
        body_rows.append(
            "<tr>"
            + f"<td>{slot_link}</td>"
            + _cell(row["blob_count"], "num")
            + _cell(row["block_first_seen_ms"], "num")
            + _cell(row["expected_ms"], "num")
            # Anomalies always have a positive residual, hence the fixed "+" sign
            + _cell(f'+{row["residual_ms"]}', "num delta")
            + _cell(row["proposer"])
            + _cell(row["builder"])
            + _cell(row["relay"])
            + "</tr>"
        )

    display(HTML(table_open + "".join(body_rows) + "</tbody></table></div>"))
    print(f"\nTotal anomalies: {len(df_table):,}")
else:
    print("No anomalies detected.")

## Anomalies by relay

Which relays have the most propagation anomalies?

In [None]:
# Horizontal bar chart: anomaly count per relay, annotated with the anomaly rate
# (anomalies as a share of all blocks attributed to that relay).
if n_anomalies > 0:
    # Count anomalies by relay
    relay_counts = df_outliers.group_by("relay").agg(pl.len().alias("anomaly_count"))

    # Label every block with its first winning relay to get per-relay totals.
    # list.first() returns null for an empty relay list, whereas list.get(0)
    # raises an out-of-bounds error on recent polars releases. Either way a
    # block without a relay entry ends up labelled "Local".
    df_anomaly = df_anomaly.with_columns([
        pl.col("winning_relays").list.first().fill_null("Local").alias("relay"),
    ])
    total_by_relay = df_anomaly.group_by("relay").agg(pl.len().alias("total_blocks"))

    relay_counts = relay_counts.join(total_by_relay, on="relay")
    relay_counts = relay_counts.with_columns([
        (pl.col("anomaly_count") / pl.col("total_blocks") * 100).alias("anomaly_rate"),
    ])
    # Ascending sort puts the relay with the most anomalies at the top of the chart
    relay_counts = relay_counts.sort("anomaly_count")

    # Convert to pandas for plotly
    relay_counts_pd = relay_counts.to_pandas()

    # Bar annotations, e.g. "12 (3.4%)"
    bar_labels = [
        f"{count} ({rate:.1f}%)"
        for count, rate in zip(relay_counts_pd["anomaly_count"], relay_counts_pd["anomaly_rate"])
    ]

    fig = go.Figure()

    fig.add_trace(go.Bar(
        y=relay_counts_pd["relay"],
        x=relay_counts_pd["anomaly_count"],
        orientation="h",
        marker_color="#e74c3c",
        text=bar_labels,
        textposition="outside",
        hovertemplate="<b>%{y}</b><br>Anomalies: %{x}<br>Total blocks: %{customdata[0]:,}<br>Rate: %{customdata[1]:.1f}%<extra></extra>",
        # customdata columns: [total_blocks, anomaly_rate]
        customdata=np.column_stack([relay_counts_pd["total_blocks"], relay_counts_pd["anomaly_rate"]]),
    ))

    fig.update_layout(
        margin=dict(l=150, r=80, t=30, b=60),
        xaxis=dict(title="Number of anomalies"),
        yaxis=dict(title=""),
        height=350,
    )
    fig.show(config={"responsive": True})

## Anomalies by proposer entity

Which proposer entities have the most propagation anomalies?

In [None]:
# Horizontal bar chart: the 15 proposer entities with the most anomalies,
# annotated with each entity's anomaly rate over all of its blocks.
if n_anomalies > 0:
    # Label every block with its proposer entity ("Unknown" when missing)
    proposer_label = pl.col("proposer_entity").fill_null("Unknown").alias("proposer")
    df_anomaly = df_anomaly.with_columns([proposer_label])

    # Per-proposer totals over all blocks, and counts over anomalous blocks only
    total_by_proposer = df_anomaly.group_by("proposer").agg(pl.len().alias("total_blocks"))
    proposer_counts = df_outliers.group_by("proposer").agg(pl.len().alias("anomaly_count"))

    anomaly_share = pl.col("anomaly_count") / pl.col("total_blocks") * 100
    proposer_counts = (
        proposer_counts
        .join(total_by_proposer, on="proposer")
        .with_columns([anomaly_share.alias("anomaly_rate")])
    )

    # Keep the 15 largest, then sort ascending so the largest bar is drawn at the top
    top_proposers = proposer_counts.sort("anomaly_count", descending=True).head(15)
    proposer_counts = top_proposers.sort("anomaly_count")

    # plotly is fed a pandas frame
    proposer_counts_pd = proposer_counts.to_pandas()

    def _proposer_bar_label(row):
        """Format one bar annotation as '<count> (<rate>%)'."""
        return f"{row['anomaly_count']} ({row['anomaly_rate']:.1f}%)"

    proposer_bars = go.Bar(
        y=proposer_counts_pd["proposer"],
        x=proposer_counts_pd["anomaly_count"],
        orientation="h",
        marker_color="#e74c3c",
        text=proposer_counts_pd.apply(_proposer_bar_label, axis=1),
        textposition="outside",
        hovertemplate="<b>%{y}</b><br>Anomalies: %{x}<br>Total blocks: %{customdata[0]:,}<br>Rate: %{customdata[1]:.1f}%<extra></extra>",
        # customdata columns: [total_blocks, anomaly_rate]
        customdata=np.column_stack([proposer_counts_pd["total_blocks"], proposer_counts_pd["anomaly_rate"]]),
    )

    fig = go.Figure()
    fig.add_trace(proposer_bars)
    fig.update_layout(
        margin={"l": 150, "r": 80, "t": 30, "b": 60},
        xaxis={"title": "Number of anomalies"},
        yaxis={"title": ""},
        height=450,
    )
    fig.show(config={"responsive": True})

## Anomalies by builder

Which builders have the most propagation anomalies? (Truncated pubkeys shown for MEV blocks)

In [None]:
# Horizontal bar chart: the 15 builders with the most anomalies, annotated with
# each builder's anomaly rate over all of its blocks.
if n_anomalies > 0:
    # Builder label: first 10 characters of the pubkey followed by "...",
    # or "Local" when the builder is null or an empty string.
    builder_key = pl.col("winning_builder")
    has_builder = builder_key.is_not_null() & (builder_key != "")
    builder_label = (
        pl.when(has_builder)
        .then(builder_key.str.slice(0, 10) + pl.lit("..."))
        .otherwise(pl.lit("Local"))
        .alias("builder")
    )
    df_anomaly = df_anomaly.with_columns([builder_label])

    # Per-builder totals over all blocks, and counts over anomalous blocks only
    total_by_builder = df_anomaly.group_by("builder").agg(pl.len().alias("total_blocks"))
    builder_counts = df_outliers.group_by("builder").agg(pl.len().alias("anomaly_count"))

    builder_counts = builder_counts.join(total_by_builder, on="builder").with_columns([
        (pl.col("anomaly_count") / pl.col("total_blocks") * 100).alias("anomaly_rate"),
    ])

    # Keep the 15 largest, then sort ascending so the largest bar is drawn at the top
    builder_counts = (
        builder_counts
        .sort("anomaly_count", descending=True)
        .head(15)
        .sort("anomaly_count")
    )

    # plotly is fed a pandas frame
    builder_counts_pd = builder_counts.to_pandas()

    def _builder_bar_label(row):
        """Format one bar annotation as '<count> (<rate>%)'."""
        return f"{row['anomaly_count']} ({row['anomaly_rate']:.1f}%)"

    # customdata columns: [total_blocks, anomaly_rate] — referenced by the hover text
    builder_hover_data = np.column_stack(
        [builder_counts_pd["total_blocks"], builder_counts_pd["anomaly_rate"]]
    )

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=builder_counts_pd["builder"],
        x=builder_counts_pd["anomaly_count"],
        orientation="h",
        marker_color="#e74c3c",
        text=builder_counts_pd.apply(_builder_bar_label, axis=1),
        textposition="outside",
        hovertemplate="<b>%{y}</b><br>Anomalies: %{x}<br>Total blocks: %{customdata[0]:,}<br>Rate: %{customdata[1]:.1f}%<extra></extra>",
        customdata=builder_hover_data,
    ))
    fig.update_layout(
        margin={"l": 150, "r": 80, "t": 30, "b": 60},
        xaxis={"title": "Number of anomalies"},
        yaxis={"title": ""},
        height=450,
    )
    fig.show(config={"responsive": True})

## Anomalies by blob count

Are anomalies more common at certain blob counts?

In [None]:
# Vertical bar chart: number of anomalies at each blob count, with the total
# block count and anomaly rate for that blob count shown on hover.
if n_anomalies > 0:
    # Totals over all blocks, and counts over anomalous blocks only
    blob_total = df_anomaly.group_by("blob_count").agg(pl.len().alias("total_blocks"))
    blob_anomalies = df_outliers.group_by("blob_count").agg(pl.len().alias("anomaly_count"))

    # Left join keeps blob counts that have no anomalies; their missing count becomes 0
    blob_stats = (
        blob_total
        .join(blob_anomalies, on="blob_count", how="left")
        .fill_null(0)
        .with_columns([
            pl.col("anomaly_count").cast(pl.Int64),
            (pl.col("anomaly_count") / pl.col("total_blocks") * 100).alias("anomaly_rate"),
        ])
    )

    # plotly is fed a pandas frame
    blob_stats_pd = blob_stats.to_pandas()

    # customdata columns: [total_blocks, anomaly_rate] — referenced by the hover text
    blob_hover_data = np.column_stack(
        [blob_stats_pd["total_blocks"], blob_stats_pd["anomaly_rate"]]
    )
    blob_bars = go.Bar(
        x=blob_stats_pd["blob_count"],
        y=blob_stats_pd["anomaly_count"],
        marker_color="#e74c3c",
        hovertemplate="<b>%{x} blobs</b><br>Anomalies: %{y}<br>Total: %{customdata[0]:,}<br>Rate: %{customdata[1]:.1f}%<extra></extra>",
        customdata=blob_hover_data,
    )

    fig = go.Figure()
    fig.add_trace(blob_bars)
    fig.update_layout(
        margin={"l": 60, "r": 30, "t": 30, "b": 60},
        # One tick per integer blob count
        xaxis={"title": "Blob count", "dtick": 1},
        yaxis={"title": "Number of anomalies"},
        height=350,
    )
    fig.show(config={"responsive": True})