From 18d9bfff4756a5bcf2f2fedf4298154612288c96 Mon Sep 17 00:00:00 2001 From: Matthew Slipper Date: Mon, 1 Aug 2022 18:25:54 -0600 Subject: [PATCH] op-node: Derivation metrics Adds support for the following batch derivation metrics: - 0/1 gauge for if the derivation is idle - Count of total pipeline resets - Count of total unsafe payloads received - Count of total derivation errors - Gauge tracking the various L1/L2 safe/unsafe heads --- op-node/metrics/metrics.go | 53 +++++++++++++++++++++++++++++++-- op-node/node/node.go | 2 +- op-node/rollup/driver/driver.go | 6 ++-- op-node/rollup/driver/state.go | 15 +++++++++- 4 files changed, 70 insertions(+), 6 deletions(-) diff --git a/op-node/metrics/metrics.go b/op-node/metrics/metrics.go index 5cec5eb73e51..d1e54f4d53ef 100644 --- a/op-node/metrics/metrics.go +++ b/op-node/metrics/metrics.go @@ -27,14 +27,21 @@ const ( ) type Metrics struct { - Info *prometheus.GaugeVec - Up prometheus.Gauge + Info *prometheus.GaugeVec + Up prometheus.Gauge + RPCServerRequestsTotal *prometheus.CounterVec RPCServerRequestDurationSeconds *prometheus.HistogramVec RPCClientRequestsTotal *prometheus.CounterVec RPCClientRequestDurationSeconds *prometheus.HistogramVec RPCClientResponsesTotal *prometheus.CounterVec + DerivationIdle prometheus.Gauge + PipelineResetsTotal prometheus.Counter + UnsafePayloadsTotal prometheus.Counter + DerivationErrorsTotal prometheus.Counter + Heads *prometheus.GaugeVec + registry *prometheus.Registry } @@ -60,6 +67,7 @@ func NewMetrics(procName string) *Metrics { Name: "up", Help: "1 if the op node has finished starting up", }), + RPCServerRequestsTotal: promauto.With(registry).NewCounterVec(prometheus.CounterOpts{ Namespace: ns, Subsystem: RPCServerSubsystem, @@ -103,6 +111,35 @@ func NewMetrics(procName string) *Metrics { "method", "error", }), + + DerivationIdle: promauto.With(registry).NewGauge(prometheus.GaugeOpts{ + Namespace: ns, + Name: "derivation_idle", + Help: "1 if the derivation pipeline is idle", + }), + PipelineResetsTotal: promauto.With(registry).NewCounter(prometheus.CounterOpts{ + Namespace: ns, + Name: "pipeline_resets_total", + Help: "Count of derivation pipeline resets", + }), + UnsafePayloadsTotal: promauto.With(registry).NewCounter(prometheus.CounterOpts{ + Namespace: ns, + Name: "unsafe_payloads_total", + Help: "Count of unsafe payloads received via p2p", + }), + DerivationErrorsTotal: promauto.With(registry).NewCounter(prometheus.CounterOpts{ + Namespace: ns, + Name: "derivation_errors_total", + Help: "Count of total derivation errors", + }), + Heads: promauto.With(registry).NewGaugeVec(prometheus.GaugeOpts{ + Namespace: ns, + Name: "heads", + Help: "Gauge representing the different L1/L2 heads", + }, []string{ + "type", + }), + registry: registry, } } @@ -166,6 +203,18 @@ func (m *Metrics) RecordRPCClientResponse(method string, err error) { m.RPCClientResponsesTotal.WithLabelValues(method, errStr).Inc() } +func (m *Metrics) SetDerivationIdle(status bool) { + var val float64 + if status { + val = 1 + } + m.DerivationIdle.Set(val) +} + +func (m *Metrics) SetHead(kind string, num uint64) { + m.Heads.WithLabelValues(kind).Set(float64(num)) +} + // Serve starts the metrics server on the given hostname and port. // The server will be closed when the passed-in context is cancelled. func (m *Metrics) Serve(ctx context.Context, hostname string, port int) error { diff --git a/op-node/node/node.go b/op-node/node/node.go index 1c2c0b9c7155..5883e54e7fff 100644 --- a/op-node/node/node.go +++ b/op-node/node/node.go @@ -144,7 +144,7 @@ func (n *OpNode) initL2(ctx context.Context, cfg *Config, snapshotLog log.Logger return err } - n.l2Engine = driver.NewDriver(&cfg.Driver, &cfg.Rollup, source, n.l1Source, n, n.log, snapshotLog) + n.l2Engine = driver.NewDriver(&cfg.Driver, &cfg.Rollup, source, n.l1Source, n, n.log, snapshotLog, n.metrics) return nil } diff --git a/op-node/rollup/driver/driver.go b/op-node/rollup/driver/driver.go index 031b4e2132cd..52789e8a1368 100644 --- a/op-node/rollup/driver/driver.go +++ b/op-node/rollup/driver/driver.go @@ -4,6 +4,8 @@ import ( "context" "math/big" + "github.com/ethereum-optimism/optimism/op-node/metrics" + "github.com/ethereum-optimism/optimism/op-node/eth" "github.com/ethereum-optimism/optimism/op-node/l1" "github.com/ethereum-optimism/optimism/op-node/l2" @@ -56,7 +58,7 @@ type Network interface { PublishL2Payload(ctx context.Context, payload *eth.ExecutionPayload) error } -func NewDriver(driverCfg *Config, cfg *rollup.Config, l2 *l2.Source, l1 *l1.Source, network Network, log log.Logger, snapshotLog log.Logger) *Driver { +func NewDriver(driverCfg *Config, cfg *rollup.Config, l2 *l2.Source, l1 *l1.Source, network Network, log log.Logger, snapshotLog log.Logger, metrics *metrics.Metrics) *Driver { output := &outputImpl{ Config: cfg, dl: l1, @@ -67,7 +69,7 @@ func NewDriver(driverCfg *Config, cfg *rollup.Config, l2 *l2.Source, l1 *l1.Sour var state *state verifConfDepth := NewConfDepth(driverCfg.VerifierConfDepth, func() eth.L1BlockRef { return state.l1Head }, l1) derivationPipeline := derive.NewDerivationPipeline(log, cfg, verifConfDepth, l2) - state = NewState(driverCfg, log, snapshotLog, cfg, l1, l2, output, derivationPipeline, network) + state = NewState(driverCfg, log, snapshotLog, cfg, l1, l2, output, derivationPipeline, network, metrics) return &Driver{s: state} } diff --git a/op-node/rollup/driver/state.go b/op-node/rollup/driver/state.go index 4b6b9162dd0d..a296f9dacddb 100644 --- a/op-node/rollup/driver/state.go +++ b/op-node/rollup/driver/state.go @@ -9,6 +9,7 @@ import ( "time" "github.com/ethereum-optimism/optimism/op-node/eth" + "github.com/ethereum-optimism/optimism/op-node/metrics" "github.com/ethereum-optimism/optimism/op-node/rollup" "github.com/ethereum/go-ethereum/log" ) @@ -67,6 +68,7 @@ type state struct { output outputInterface network Network // may be nil, network for is optional + metrics *metrics.Metrics log log.Logger snapshotLog log.Logger done chan struct{} @@ -77,7 +79,7 @@ type state struct { // NewState creates a new driver state. State changes take effect though // the given output, derivation pipeline and network interfaces. func NewState(driverCfg *Config, log log.Logger, snapshotLog log.Logger, config *rollup.Config, l1Chain L1Chain, l2Chain L2Chain, - output outputInterface, derivationPipeline DerivationPipeline, network Network) *state { + output outputInterface, derivationPipeline DerivationPipeline, network Network, metrics *metrics.Metrics) *state { return &state{ derivation: derivationPipeline, idleDerivation: false, @@ -91,6 +93,7 @@ func NewState(driverCfg *Config, log log.Logger, snapshotLog log.Logger, config l2: l2Chain, output: output, network: network, + metrics: metrics, l1Heads: make(chan eth.L1BlockRef, 10), unsafeL2Payloads: make(chan *eth.ExecutionPayload, 10), } @@ -315,6 +318,7 @@ func (s *state) eventLoop() { cancel() if err != nil { s.log.Error("Error creating new L2 block", "err", err) + s.metrics.DerivationErrorsTotal.Inc() } // We need to catch up to the next origin as quickly as possible. We can do this by @@ -330,14 +334,17 @@ func (s *state) eventLoop() { s.snapshot("New unsafe payload") s.log.Info("Optimistically queueing unsafe L2 execution payload", "id", payload.ID()) s.derivation.AddUnsafePayload(payload) + s.metrics.UnsafePayloadsTotal.Inc() reqStep() case newL1Head := <-s.l1Heads: s.log.Info("new l1 Head") s.snapshot("New L1 Head") s.handleNewL1Block(newL1Head) + s.metrics.SetHead("l1", newL1Head.Number) reqStep() // a new L1 head may mean we have the data to not get an EOF again. case <-stepReqCh: + s.metrics.SetDerivationIdle(false) s.idleDerivation = false s.log.Debug("Derivation process step", "onto_origin", s.derivation.Progress().Origin, "onto_closed", s.derivation.Progress().Closed) stepCtx, cancel := context.WithTimeout(ctx, time.Second*10) // TODO pick a timeout for executing a single step @@ -346,11 +353,14 @@ func (s *state) eventLoop() { if err == io.EOF { s.log.Debug("Derivation process went idle", "progress", s.derivation.Progress().Origin) s.idleDerivation = true + s.metrics.SetDerivationIdle(true) continue } else if err != nil { // If the pipeline corrupts, e.g. due to a reorg, simply reset it s.log.Warn("Derivation pipeline is reset", "err", err) s.derivation.Reset() + s.metrics.PipelineResetsTotal.Inc() + s.metrics.DerivationErrorsTotal.Inc() } else { finalized, safe, unsafe := s.derivation.Finalized(), s.derivation.SafeL2Head(), s.derivation.UnsafeL2Head() // log sync progress when it changes @@ -361,6 +371,9 @@ func (s *state) eventLoop() { s.l2Finalized = finalized s.l2SafeHead = safe s.l2Head = unsafe + s.metrics.SetHead("l2_finalized", finalized.Number) + s.metrics.SetHead("l2_safe", safe.Number) + s.metrics.SetHead("l2_unsafe", unsafe.Number) reqStep() // continue with the next step if we can } case respCh := <-s.syncStatusReq: