Analysis of data column propagation timing across the 128 column subnets in PeerDAS.

In [None]:
import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px
import plotly.graph_objects as go

from loaders import load_parquet, display_sql
from plotly_theme import horizontal_colorbar

# Number of data columns in PeerDAS; the cells below use it to build the
# c0..c{NUM_COLUMNS-1} field names and the heatmap row labels.
NUM_COLUMNS = 128

# Date selector handed to display_sql() and load_parquet() in the cells below.
target_date = None  # Set via papermill, or auto-detect from manifest

In [None]:
# Show the query behind the "col_first_seen" dataset for the selected date.
# NOTE(review): presumably renders the SQL text; display_sql lives in the
# project's loaders module — confirm there.
display_sql("col_first_seen", target_date)

In [None]:
# Fetch the "col_first_seen" dataset for the target date and convert the
# returned pandas frame to polars for the computations below.
raw_col_first_seen = load_parquet("col_first_seen", target_date)
df_col_first_seen = pl.from_pandas(raw_col_first_seen)

# One row per slot, so the row count is the number of slots with data.
print(f"Slots with column data: {len(df_col_first_seen)}")

## Column first seen

Heatmap showing when each of the 128 data columns was first observed, measured in milliseconds from slot start. Consistent patterns across columns indicate healthy propagation; outliers may signal network issues.

In [None]:
# Panel 1: heatmap of when each of the 128 columns was first seen (ms from slot start)

# Field names c0..c127; each one becomes a heatmap row
col_names = ["c{}".format(idx) for idx in range(NUM_COLUMNS)]

# Move to pandas and transpose so rows = data columns and columns = timestamps
df_pd = df_col_first_seen.select(["time", "slot", *col_names]).to_pandas()
df_cols = df_pd.loc[:, col_names].transpose()
df_cols.columns = df_pd["time"]

# Slot numbers, used by the hover text
slot_values = df_pd["slot"].values

# customdata mirrors the z grid: the same row of slot numbers repeated once
# per data column, so every heatmap cell can report its slot on hover
customdata = np.array([list(slot_values)] * NUM_COLUMNS)

heatmap = go.Heatmap(
    z=df_cols.values,
    x=df_cols.columns,
    y=list(map(str, range(NUM_COLUMNS))),
    zmin=1500,
    zmax=4000,
    colorbar=horizontal_colorbar("ms"),
    customdata=customdata,
    hovertemplate="<b>Slot:</b> %{customdata}<br><b>Time:</b> %{x}<br><b>Column Index:</b> %{y}<br><b>First Seen:</b> %{z} ms<extra></extra>",
)

fig = go.Figure(data=heatmap)
fig.update_layout(
    margin={"l": 10, "r": 10, "t": 10, "b": 80},
    xaxis={"automargin": True},
    # Reversed axis puts column 0 at the top
    yaxis={"title": "Column", "automargin": True, "autorange": "reversed"},
    height=800,
)
fig.show()

## Delta from fastest column (intraslot, ms)

Shows how much slower each column arrived compared to the fastest column in that slot. Highlights columns that consistently lag behind, which may indicate propagation bottlenecks.

In [None]:
# Per-slot delta: each column's first-seen time minus the fastest column's time
col_names = ["c{}".format(idx) for idx in range(NUM_COLUMNS)]

# Row-wise minimum is stored in a temporary "row_min" field, subtracted from
# every column in a second pass, then dropped again
fastest = pl.min_horizontal(*col_names).alias("row_min")
deltas = [(pl.col(name) - pl.col("row_min")).alias(name) for name in col_names]
df_delta = (
    df_col_first_seen
    .with_columns(fastest)
    .with_columns(deltas)
    .drop("row_min")
)

# Move to pandas and transpose so rows = data columns and columns = timestamps
df_delta_pd = df_delta.select(["time", "slot", *col_names]).to_pandas()
df_delta_cols = df_delta_pd.loc[:, col_names].transpose()
df_delta_cols.columns = df_delta_pd["time"]

# Slot numbers, used by the hover text
slot_values = df_delta_pd["slot"].values

# customdata mirrors the z grid: the same row of slot numbers repeated once
# per data column
customdata = np.array([list(slot_values)] * NUM_COLUMNS)

heatmap = go.Heatmap(
    z=df_delta_cols.values,
    x=df_delta_cols.columns,
    y=list(map(str, range(NUM_COLUMNS))),
    colorscale="Inferno",
    reversescale=False,
    zmin=0,
    zmax=250,
    colorbar=horizontal_colorbar("ms"),
    customdata=customdata,
    hovertemplate="<b>Slot:</b> %{customdata}<br><b>Time:</b> %{x}<br><b>Column Index:</b> %{y}<br><b>Delta:</b> %{z} ms<extra></extra>",
)

fig = go.Figure(data=heatmap)
fig.update_layout(
    margin={"l": 10, "r": 10, "t": 10, "b": 80},
    xaxis={"automargin": True},
    # Reversed axis puts column 0 at the top
    yaxis={"title": "Column", "automargin": True, "autorange": "reversed"},
    height=800,
)
fig.show()

## Delta normalized (0-1)

Same delta data normalized to a 0–1 scale per slot, making it easier to compare relative propagation order regardless of absolute timing. Columns closer to 0 arrived first; those near 1 arrived last.

In [None]:
# Rescale each slot's deltas to the 0-1 range: (value - row_min) / (row_max - row_min)
col_names = ["c{}".format(idx) for idx in range(NUM_COLUMNS)]

# Temporary per-row fields: fastest time, slowest time, and their difference
row_bounds = [
    pl.min_horizontal(*col_names).alias("row_min"),
    pl.max_horizontal(*col_names).alias("row_max"),
]
row_range = (pl.col("row_max") - pl.col("row_min")).alias("row_range")

# A zero range would divide by zero, so those rows become null instead
normalized = [
    pl.when(pl.col("row_range") == 0)
    .then(None)
    .otherwise((pl.col(name) - pl.col("row_min")) / pl.col("row_range"))
    .alias(name)
    for name in col_names
]

df_normalized = (
    df_col_first_seen
    .with_columns(row_bounds)
    .with_columns(row_range)
    .with_columns(normalized)
    .drop(["row_min", "row_max", "row_range"])
)

# Move to pandas and transpose so rows = data columns and columns = timestamps
df_norm_pd = df_normalized.select(["time", "slot", *col_names]).to_pandas()
df_norm_cols = df_norm_pd.loc[:, col_names].transpose()
df_norm_cols.columns = df_norm_pd["time"]

# Slot numbers, used by the hover text
slot_values = df_norm_pd["slot"].values

# customdata mirrors the z grid: the same row of slot numbers repeated once
# per data column
customdata = np.array([list(slot_values)] * NUM_COLUMNS)

heatmap = go.Heatmap(
    z=df_norm_cols.values,
    x=df_norm_cols.columns,
    y=list(map(str, range(NUM_COLUMNS))),
    colorscale="YlGnBu",
    reversescale=True,
    zmin=0,
    zmax=1,
    colorbar=horizontal_colorbar("Normalized"),
    customdata=customdata,
    hovertemplate="<b>Slot:</b> %{customdata}<br><b>Time:</b> %{x}<br><b>Column Index:</b> %{y}<br><b>Normalized:</b> %{z:.2f}<extra></extra>",
)

fig = go.Figure(data=heatmap)
fig.update_layout(
    margin={"l": 10, "r": 10, "t": 10, "b": 80},
    xaxis={"automargin": True},
    # Reversed axis puts column 0 at the top
    yaxis={"title": "Column", "automargin": True, "autorange": "reversed"},
    height=800,
)
fig.show()

## Column arrival spread

Time between when the first column arrives and when the last column arrives for each slot. A wide spread indicates some columns arriving much later than others.

In [None]:
# Column spread per slot: latest first-seen time minus earliest, across all columns
col_names = ["c{}".format(idx) for idx in range(NUM_COLUMNS)]

slowest = pl.max_horizontal(*col_names)
fastest = pl.min_horizontal(*col_names)
df_spread = df_col_first_seen.with_columns(
    (slowest - fastest).alias("column_spread_ms")
)

# Only the fields the charts need, as pandas for plotly express
df_spread_pd = df_spread.select(["time", "slot", "column_spread_ms"]).to_pandas()

# Distribution of the spread across slots
fig = px.histogram(
    df_spread_pd,
    x="column_spread_ms",
    nbins=60,
    color_discrete_sequence=["#EF553B"],
)
fig.update_layout(
    margin={"l": 60, "r": 30, "t": 30, "b": 60},
    xaxis={"title": "Column spread (ms)"},
    yaxis={"title": "Slots"},
    height=400,
)
fig.show(config={"responsive": True})

In [None]:
# Summary statistics using polars
# NOTE(review): `stats` is not read anywhere in this cell; it is kept so the
# notebook-level name stays available for interactive inspection.
stats = df_spread.select("column_spread_ms").to_series().describe()

# Single-row frame of aggregates, unpacked into a tuple in display order
spread = pl.col("column_spread_ms")
percentiles = df_spread.select(
    spread.quantile(0.5).alias("p50"),
    spread.quantile(0.9).alias("p90"),
    spread.quantile(0.95).alias("p95"),
    spread.quantile(0.99).alias("p99"),
    spread.max().alias("max"),
).row(0)

print("Column spread (ms):")
for label, value in zip(("Median", "P90", "P95", "P99", "Max"), percentiles):
    # The aggregates come back as None when there is no spread data (no slots,
    # or every value null); formatting None with ":.0f" would raise TypeError.
    shown = "n/a" if value is None else f"{value:.0f}"
    # Pad "<label>:" to 7 characters so the values line up in one column
    print(f"  {label + ':':<7} {shown}")

## Column spread over time

How column spread varies throughout the day. Useful for spotting periods of degraded propagation.

In [None]:
# Spread per slot plotted against time; reuses df_spread_pd from the histogram cell
hover_fields = {"slot": True, "column_spread_ms": ":.0f", "time": False}

fig = px.scatter(
    df_spread_pd,
    x="time",
    y="column_spread_ms",
    opacity=0.5,
    color_discrete_sequence=["#EF553B"],
    hover_data=hover_fields,
)
fig.update_layout(
    margin={"l": 60, "r": 30, "t": 30, "b": 60},
    # Tick labels show hour:minute only
    xaxis={"title": "Time (UTC)", "tickformat": "%H:%M"},
    yaxis={"title": "Column spread (ms)"},
    height=400,
)
fig.show(config={"responsive": True})

## Missing columns

Slots where specific columns were never observed. Missing columns indicate gaps in network coverage for that column subnet.

In [None]:
# Missing columns heatmap - shows gaps in network coverage
col_names = [f"c{i}" for i in range(NUM_COLUMNS)]

# Count missing data using polars: null count per column, summed across columns
total_missing = df_col_first_seen.select([
    pl.col(col).is_null().sum() for col in col_names
]).sum_horizontal().item()

# Number of rows (slots) in which at least one column is null
slots_with_missing = df_col_first_seen.select(
    pl.any_horizontal([pl.col(col).is_null() for col in col_names])
).sum().item()

# Guard the percentage against an empty dataset, which would otherwise raise
# ZeroDivisionError
total_slots = len(df_col_first_seen)
missing_pct = slots_with_missing / total_slots * 100 if total_slots else 0.0

print(f"Total missing column observations: {total_missing:,}")
print(f"Slots with at least one missing column: {slots_with_missing:,} ({missing_pct:.1f}%)")

if total_missing > 0:
    # Convert to pandas for the heatmap visualization
    df_pd = df_col_first_seen.select(["time", "slot"] + col_names).to_pandas()

    # 0/1 grid: 1 where the column is missing (NaN), transposed so rows = data columns
    df_missing = df_pd[col_names].isna().astype(int).T
    df_missing.columns = df_pd["time"]

    # Slot numbers repeated once per data column so every cell can show its slot
    slot_values = df_pd["slot"].values
    customdata = np.tile(np.asarray(slot_values), (NUM_COLUMNS, 1))

    # Per-cell status label for the hover text. The template previously
    # hard-coded "Missing", which mislabelled cells where the column was seen.
    status_text = np.where(df_missing.values == 1, "Missing", "Present")

    fig = go.Figure(
        data=go.Heatmap(
            z=df_missing.values,
            x=df_missing.columns,
            y=[str(i) for i in range(NUM_COLUMNS)],
            # Two-stop scale: light grey = present (0), near-black = missing (1)
            colorscale=[[0, "#E8E8E8"], [1, "#1E1E1E"]],
            zmin=0,
            zmax=1,
            showscale=False,
            customdata=customdata,
            text=status_text,
            hovertemplate="<b>Slot:</b> %{customdata}<br><b>Time:</b> %{x}<br><b>Column:</b> %{y}<br><b>Status:</b> %{text}<extra></extra>",
        )
    )
    fig.update_layout(
        margin=dict(l=10, r=10, t=10, b=80),
        xaxis=dict(automargin=True),
        yaxis=dict(title="Column", automargin=True, autorange="reversed"),
        height=800,
    )
    fig.show()
else:
    print("No missing columns detected.")