Analysis of the consensus client implementations and versions reported by peers connected to Xatu nodes on Ethereum mainnet.

In [None]:
# Notebook parameter: the date of the dataset to analyze. It is passed as the
# second argument to load_parquet() when the data is loaded below.
# Set via papermill, or auto-detect from manifest when left as None
# (NOTE(review): the auto-detection is not visible in this notebook — presumably
# handled inside load_parquet; confirm).
target_date = None

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from loaders import load_parquet

In [None]:
# Load the "client_versions" dataset for target_date
df = load_parquet("client_versions", target_date)

# Replace missing agent fields with the literal "unknown" so those rows
# remain countable and can be filtered explicitly later on
for agent_col in ("remote_agent_implementation", "remote_agent_version"):
    df[agent_col] = df[agent_col].fillna("unknown")

# Headline numbers for the loaded data
n_connections = len(df)
n_peer_ids = df["remote_peer_id"].nunique()
n_implementations = df["remote_agent_implementation"].nunique()

print(f"Total connections: {n_connections:,}")
print(f"Unique peer IDs: {n_peer_ids:,}")
print(f"Unique client implementations: {n_implementations}")

## Client Implementation Distribution

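Distribution of consensus client implementations observed across all connections. Connections whose implementation is unknown are excluded, and implementations outside the tracked set of known clients are grouped as "Others". This shows the diversity of the Ethereum consensus client ecosystem.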

In [None]:
# Known consensus clients to track individually (canonical lowercase names)
KNOWN_CLIENTS = {"lighthouse", "teku", "nimbus", "erigon", "grandine", "lodestar", "prysm"}

# Filter out unknown implementations for cleaner visualization
df_known = df[df["remote_agent_implementation"] != "unknown"].copy()

# Map non-standard clients to "Others".
# Known clients are labeled with their lowercase name rather than the raw
# agent string: the membership test is case-insensitive, so keeping the raw
# casing would split e.g. "Lighthouse" and "lighthouse" into separate groups
# in every chart and table built from the "client" column.
impl_lower = df_known["remote_agent_implementation"].str.lower()
df_known["client"] = impl_lower.where(impl_lower.isin(KNOWN_CLIENTS), "Others")

# Count connections per client label, largest first
impl_counts = df_known.groupby("client").size().reset_index(name="count")
impl_counts = impl_counts.sort_values("count", ascending=False)

# Pie chart of each client's share of connections
fig = px.pie(
    impl_counts,
    values="count",
    names="client",
    title="Client Implementation Distribution",
    color_discrete_sequence=px.colors.qualitative.Set2,
)
fig.update_traces(textposition="inside", textinfo="percent+label")
fig.update_layout(height=500)
fig.show()

## Connections by Client Implementation

Bar chart showing the number of connections per client implementation.

In [None]:
# Bar chart of the per-client connection counts, one bar (and color) per client
axis_labels = {"client": "Client", "count": "Connections"}
fig = px.bar(
    impl_counts,
    x="client",
    y="count",
    color="client",
    labels=axis_labels,
    title="Connections by Client Implementation",
    color_discrete_sequence=px.colors.qualitative.Set2,
)

# The x axis already names every client, so the legend is hidden
layout_options = dict(showlegend=False, height=500, xaxis_tickangle=-45)
fig.update_layout(**layout_options)
fig.show()

## Client Connections Over Time

Stacked area chart showing how client connections are distributed across implementations over time.

In [None]:
# Truncate each event timestamp to the start of its hour (stored on df_known)
event_times = pd.to_datetime(df_known["event_date_time"])
df_known["hour"] = event_times.dt.floor("h")

# Number of connections for every (hour, client) pair
hourly_impl = (
    df_known.groupby(["hour", "client"])
    .size()
    .reset_index(name="count")
)

# Wide layout for stacking: one row per hour, one column per client,
# with 0 where a client has no connections in that hour
hourly_pivot = hourly_impl.pivot(index="hour", columns="client", values="count").fillna(0)

# One trace per client; sharing a stackgroup stacks them into an area chart
area_traces = [
    go.Scatter(
        x=hourly_pivot.index,
        y=client_series,
        mode="lines",
        stackgroup="one",
        name=client_name,
    )
    for client_name, client_series in hourly_pivot.items()
]
fig = go.Figure(data=area_traces)

# Horizontal legend placed just above the plot, aligned to the right edge
legend_above_plot = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
fig.update_layout(
    title="Client Implementation Connections Over Time",
    xaxis_title="Time",
    yaxis_title="Connections",
    height=500,
    legend=legend_above_plot,
)
fig.show()

## Version Distribution by Client

Detailed breakdown of version distribution for each major client implementation.

In [None]:
# Maximum number of versions shown per client chart (most common first)
MAX_VERSIONS_SHOWN = 15

# Give every client the Set2 color matching its position in impl_counts,
# counting "Others" as well. The bar chart above (color="client") cycles
# through Set2 in that same row order, so a client keeps its color across
# charts. Indexing into the list without "Others" would shift the color of
# every client ranked after it.
palette = px.colors.qualitative.Set2
client_order = impl_counts["client"].tolist()
client_colors = {
    name: palette[position % len(palette)]
    for position, name in enumerate(client_order)
}

# Get known clients (exclude "Others" for version breakdown)
top_clients = [c for c in client_order if c != "Others"]

for client in top_clients:
    # Every client in impl_counts comes from df_known, so this slice is never empty
    df_client = df_known[df_known["client"] == client]

    # Count by version, keeping only the most common ones
    version_counts = df_client.groupby("remote_agent_version").size().reset_index(name="count")
    version_counts = version_counts.sort_values("count", ascending=False).head(MAX_VERSIONS_SHOWN)

    fig = px.bar(
        version_counts,
        x="remote_agent_version",
        y="count",
        title=f"{client.capitalize()} Version Distribution",
        labels={"remote_agent_version": "Version", "count": "Connections"},
        color_discrete_sequence=[client_colors[client]],
    )
    fig.update_layout(
        height=400,
        xaxis_tickangle=-45,
    )
    fig.show()

## Summary Statistics

In [None]:
# Summary table: one row of headline statistics per client label


def _client_summary_row(name, rows):
    """Build the summary record for one client from its rows of df_known.

    Args:
        name: Client label shown in the "Client" column.
        rows: The subset of df_known belonging to that client.

    Returns:
        Dict with the connection count, number of distinct peer IDs, number of
        distinct versions, and the most frequent version ("N/A" if rows is empty).
    """
    versions = rows["remote_agent_version"]
    return {
        "Client": name,
        "Connections": len(rows),
        "Unique Peers": rows["remote_peer_id"].nunique(),
        "Versions": versions.nunique(),
        "Top Version": versions.mode().iloc[0] if len(rows) > 0 else "N/A",
    }


summary_data = [
    _client_summary_row(name, df_known[df_known["client"] == name])
    for name in df_known["client"].unique()
]

# Rank by raw connection count first; the counts are turned into
# thousands-separated strings only afterwards, for display
summary_df = pd.DataFrame(summary_data).sort_values("Connections", ascending=False)
for count_col in ("Connections", "Unique Peers"):
    summary_df[count_col] = summary_df[count_col].apply("{:,}".format)
summary_df