Skip to content
20 changes: 19 additions & 1 deletion pkg/bee/api/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,11 @@ func (n *NodeService) Balances(ctx context.Context) (resp Balances, err error) {
return resp, err
}

// HasChunk returns true/false if node has a chunk.
//
// NOTE: This uses GET /chunks/{addr}, which on bee falls back to network
// retrieval if the chunk is not present locally. It is therefore NOT a clean
// local-only check. Use LocalHasChunk for that.
func (n *NodeService) HasChunk(ctx context.Context, a swarm.Address) (bool, error) {
resp := struct {
Message string `json:"message,omitempty"`
Expand All @@ -95,6 +99,20 @@ func (n *NodeService) HasChunk(ctx context.Context, a swarm.Address) (bool, erro
return true, nil
}

// LocalHasChunk reports whether the chunk with address a is stored locally on
// the node, without triggering a network retrieval. It issues
// HEAD /chunks/{addr}, which maps to storer.ChunkStore().Has on the bee side.
//
// A 404 response is reported as (false, nil); any other request error is
// returned as (false, err); otherwise the result is (true, nil).
func (n *NodeService) LocalHasChunk(ctx context.Context, a swarm.Address) (bool, error) {
	path := "/chunks/" + a.String()

	// No request or response body is needed: only the status matters.
	switch err := n.client.request(ctx, http.MethodHead, path, nil, nil); {
	case IsHTTPStatusErrorCode(err, http.StatusNotFound):
		// Not found is the expected "absent" answer, not a failure.
		return false, nil
	case err != nil:
		return false, err
	default:
		return true, nil
	}
}

// Health represents node's health
type Health struct {
Status string `json:"status"`
Expand Down
10 changes: 10 additions & 0 deletions pkg/bee/api/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ import (

type StatusService service

// Canonical values for the BeeMode field of StatusResponse. They mirror the
// strings emitted by bee/v2/pkg/api.BeeNodeMode.String().
const (
	// BeeModeFull is the mode string for a full node.
	BeeModeFull = "full"
	// BeeModeLight is the mode string for a light node.
	BeeModeLight = "light"
	// BeeModeUltraLight is the mode string for an ultra-light node.
	BeeModeUltraLight = "ultra-light"
	// BeeModeUnknown is the mode string used when the mode is not known.
	BeeModeUnknown = "unknown"
)

type StatusResponse struct {
Overlay string `json:"overlay"`
Proximity uint `json:"proximity"`
Expand All @@ -22,6 +31,7 @@ type StatusResponse struct {
IsReachable bool `json:"isReachable"`
LastSyncedBlock uint64 `json:"lastSyncedBlock"`
CommittedDepth uint8 `json:"committedDepth"`
IsWarmingUp bool `json:"isWarmingUp"`
}

// Ping pings given node
Expand Down
10 changes: 9 additions & 1 deletion pkg/bee/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,11 +265,19 @@ func (c *Client) DownloadActFile(ctx context.Context, a swarm.Address, opts *api
return size, h.Sum(nil), nil
}

// HasChunk returns true/false if node has a chunk. It delegates directly to
// the node API service's HasChunk and returns its result unchanged.
//
// NOTE: Backed by GET /chunks/{addr}, which falls back to network retrieval on
// a local miss. Use LocalHasChunk for a strict local-only check.
func (c *Client) HasChunk(ctx context.Context, a swarm.Address) (bool, error) {
return c.api.Node.HasChunk(ctx, a)
}

// LocalHasChunk is a strict local-only check; see api.NodeService.LocalHasChunk.
// It delegates directly to the node API service and returns its result
// unchanged.
func (c *Client) LocalHasChunk(ctx context.Context, a swarm.Address) (bool, error) {
return c.api.Node.LocalHasChunk(ctx, a)
}

func (c *Client) HasChunks(ctx context.Context, a []swarm.Address) (has []bool, count int, err error) {
has = make([]bool, len(a))
for i, addr := range a {
Expand Down
156 changes: 141 additions & 15 deletions pkg/check/smoke/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,43 @@ import (
)

// metrics groups the Prometheus collectors used by the smoke check. Each
// field is declared exactly once; the collectors themselves are built in
// newMetrics.
type metrics struct {
	// Batch-creation and upload/download outcome counters. Their names and
	// help texts are set in newMetrics.
	BatchCreateErrors   prometheus.Counter
	BatchCreateAttempts prometheus.Counter
	UploadErrors        *prometheus.CounterVec
	UploadAttempts      *prometheus.CounterVec
	UploadSuccess       *prometheus.CounterVec
	DownloadErrors      *prometheus.CounterVec

	// DownloadEOFErrors counts download errors classified as unexpected EOF.
	// Labels: size_bytes, node_name, redundancy_level.
	DownloadEOFErrors *prometheus.CounterVec

	// Remaining download outcome counters plus duration, throughput and byte
	// volume collectors for uploads and downloads.
	DownloadMismatch   *prometheus.CounterVec
	DownloadAttempts   *prometheus.CounterVec
	DownloadSuccess    *prometheus.CounterVec
	UploadDuration     *prometheus.HistogramVec
	DownloadDuration   *prometheus.HistogramVec
	UploadThroughput   *prometheus.GaugeVec
	DownloadThroughput *prometheus.GaugeVec
	UploadedBytes      *prometheus.CounterVec
	DownloadedBytes    *prometheus.CounterVec

	// NodeHealthVerdict is the topology health verdict for a node:
	// 0=unknown, 1=unhealthy, 2=degraded, 3=healthy. Labels: node_name, phase.
	NodeHealthVerdict *prometheus.GaugeVec

	// ClusterFullNodeCount is the number of full (non-bootnode) nodes in the
	// cluster.
	ClusterFullNodeCount prometheus.Gauge

	// ClusterLightNodeCount is the number of light nodes in the cluster.
	ClusterLightNodeCount prometheus.Gauge

	// UnhealthyAbortsPreUp counts iterations aborted because the uploader was
	// unhealthy before upload.
	UnhealthyAbortsPreUp prometheus.Counter

	// UnhealthyAbortsPreDown counts iterations skipped because the downloader
	// was unhealthy before download.
	UnhealthyAbortsPreDown prometheus.Counter

	// Chunk walk: per-chunk presence check across the full upload tree.

	// ChunksChecked counts chunks inspected by the on-failure chunk walk; it
	// is the denominator for the chunks-missing rates.
	ChunksChecked prometheus.Counter

	// ChunksMissingTotal counts chunks not found on their closest full node.
	// Label: position.
	ChunksMissingTotal *prometheus.CounterVec

	// ChunksMissingOutOfAOR counts missing chunks whose closest storer still
	// has PO(chunk, storer) < storageRadius, i.e. a cluster-coverage gap.
	// Label: position.
	ChunksMissingOutOfAOR *prometheus.CounterVec

	// ChunksMissingInAOR counts missing chunks whose closest storer covers
	// the address (PO >= storageRadius) but does not have the chunk
	// (bee#5400 bug 2/3 fingerprint). Label: position.
	ChunksMissingInAOR *prometheus.CounterVec

	// ChunksPresentOutOfAOR counts chunks held by a node whose AOR does not
	// cover them (PO < storageRadius); bee#5400 bug 1 confirmation
	// (out-of-depth storing). Label: position.
	ChunksPresentOutOfAOR *prometheus.CounterVec

	// FilesWithLoss counts files where the on-failure chunk walk found at
	// least one missing chunk.
	FilesWithLoss prometheus.Counter

	// EOFWithCleanWalk counts downloads that failed with unexpected EOF where
	// the chunk walk found nothing missing and no out-of-AOR-present chunks.
	EOFWithCleanWalk prometheus.Counter
}

// Label names attached to the smoke-check metric vectors.
const (
	// labelSizeBytes is the "size_bytes" label.
	labelSizeBytes = "size_bytes"
	// labelNodeName is the "node_name" label.
	labelNodeName = "node_name"
	// labelRedundancyLevel is the "redundancy_level" label.
	labelRedundancyLevel = "redundancy_level"
	// labelPhase is the "phase" label.
	labelPhase = "phase"
	// labelPosition is the "position" label.
	labelPosition = "position"
)

func newMetrics(subsystem string) metrics {
Expand Down Expand Up @@ -166,6 +182,116 @@ func newMetrics(subsystem string) metrics {
},
[]string{labelNodeName, labelRedundancyLevel},
),
DownloadEOFErrors: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "download_eof_errors_count",
Help: "Download errors classified as unexpected EOF, which indicate the chunk is likely missing from the cluster.",
},
[]string{labelSizeBytes, labelNodeName, labelRedundancyLevel},
),
NodeHealthVerdict: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "node_health_verdict",
Help: "Topology health verdict for a node: 0=unknown, 1=unhealthy, 2=degraded, 3=healthy. Sampled per phase (pre_upload, pre_download, on_failure).",
},
[]string{labelNodeName, labelPhase},
),
ClusterFullNodeCount: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "cluster_full_node_count",
Help: "Number of full (non-bootnode) nodes in the cluster.",
},
),
ClusterLightNodeCount: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "cluster_light_node_count",
Help: "Number of light nodes in the cluster.",
},
),
UnhealthyAbortsPreUp: prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "unhealthy_aborts_pre_upload",
Help: "Iterations aborted because the uploader was UNHEALTHY before upload.",
},
),
UnhealthyAbortsPreDown: prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "unhealthy_aborts_pre_download",
Help: "Iterations skipped because the downloader was UNHEALTHY before download.",
},
),
ChunksChecked: prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "chunks_checked_total",
Help: "Total chunks inspected by the on-failure chunk walk (denominator for chunks_missing_* rates).",
},
),
ChunksMissingTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "chunks_missing_total",
Help: "Chunks not found on their closest full node (HEAD /chunks/{addr} returned 404). Labelled by tree position.",
},
[]string{labelPosition},
),
ChunksMissingOutOfAOR: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "chunks_missing_out_of_aor_total",
Help: "Missing chunks whose closest storer in the cluster still has PO(chunk, storer) < storageRadius. Indicates a cluster-coverage gap — the address falls outside every node's AOR. Common in small testnets.",
},
[]string{labelPosition},
),
ChunksMissingInAOR: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "chunks_missing_in_aor_total",
Help: "Missing chunks whose closest storer covers the address (PO >= storageRadius) but does not have the chunk. Bee#5400 bug-2/3 fingerprint: shallow receipt short-circuit or false ChunkSynced.",
},
[]string{labelPosition},
),
ChunksPresentOutOfAOR: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "chunks_present_out_of_aor_total",
Help: "Chunks held by a node whose AOR does not cover them (PO < storageRadius). Direct bee#5400 bug-1 confirmation: out-of-depth storing.",
},
[]string{labelPosition},
),
FilesWithLoss: prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "files_with_chunk_loss_total",
Help: "Files where the on-failure chunk walk found at least one missing chunk.",
},
),
EOFWithCleanWalk: prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: m.Namespace,
Subsystem: subsystem,
Name: "eof_with_clean_walk_total",
Help: "Downloads that failed with unexpected EOF where the chunk walk found nothing missing and no out-of-AOR-present chunks. Strong signal that the EOF was NOT bee#5400 chunk loss but something else (transient retrieval, downloader networking, etc.).",
},
),
}
}

Expand Down
Loading
Loading