-
Notifications
You must be signed in to change notification settings - Fork 108
/
threshold_monitor.go
105 lines (93 loc) · 2.46 KB
/
threshold_monitor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
package metrics
import (
"context"
"strings"
"sync"
"time"
"github.com/drand/drand/log"
)
// ThresholdMonitor collects the addresses of peers for which a failure was
// reported and periodically logs how that count compares to a threshold.
type ThresholdMonitor struct {
	// lock guards threshold and failedConnections.
	lock sync.RWMutex
	// log receives the periodic reports as well as start/stop messages.
	log log.Logger
	// beaconID is attached to every log entry emitted by the monitor.
	beaconID string
	// threshold is the failure count at (or above) which an error is logged.
	threshold int
	// failedConnections is the set of addresses passed to ReportFailure
	// since the map was last reset.
	failedConnections map[string]bool
	// ctx is cancelled by Stop to terminate the background goroutine.
	ctx context.Context
	// cancel cancels ctx.
	cancel context.CancelFunc
	// period is the pause between two consecutive reports.
	period time.Duration
}
// NewThresholdMonitor builds a monitor for the given beacon that logs through l
// and compares failure counts against threshold. The monitor starts with an
// empty failure set, a fresh cancellable context and a one-minute period; it
// does nothing until Start is called.
func NewThresholdMonitor(beaconID string, l log.Logger, threshold int) *ThresholdMonitor {
	monitorCtx, stop := context.WithCancel(context.Background())

	// The zero value of the embedded RWMutex is ready to use, so it needs no
	// explicit initialisation.
	monitor := new(ThresholdMonitor)
	monitor.log = l
	monitor.beaconID = beaconID
	monitor.threshold = threshold
	monitor.failedConnections = make(map[string]bool)
	monitor.ctx = monitorCtx
	monitor.cancel = stop
	monitor.period = 1 * time.Minute

	return monitor
}
// Start logs a startup message and launches a background goroutine that
// reports the recorded connection failures immediately and then once every
// t.period, until the monitor's context is cancelled (see Stop).
//
// Unlike a plain time.Sleep between reports, the wait is interruptible: the
// goroutine exits as soon as the context is cancelled instead of lingering
// for up to a full period.
func (t *ThresholdMonitor) Start() {
	t.log.Infow("starting threshold monitor", "beaconID", t.beaconID)
	go func() {
		defer t.log.Infow("ending threshold monitor", "beaconID", t.beaconID)

		for {
			// Skip the report entirely if the monitor has already been stopped.
			if t.ctx.Err() != nil {
				return
			}

			t.reportAndReset()

			// A timer (rather than time.Sleep) lets cancellation cut the wait
			// short. NewTimer also tolerates a non-positive period, which
			// simply fires immediately, matching time.Sleep's behavior.
			timer := time.NewTimer(t.period)
			select {
			case <-t.ctx.Done():
				timer.Stop()
				return
			case <-timer.C:
			}
		}
	}()
}

// reportAndReset atomically takes the current set of failed connections,
// replaces it with an empty one, and logs the result at a level that depends
// on how the number of failures compares to the threshold.
func (t *ThresholdMonitor) reportAndReset() {
	// Swap the map under a single write lock so that no failure reported
	// between "read" and "reset" can be dropped without ever being logged.
	t.lock.Lock()
	threshold := t.threshold
	failed := t.failedConnections
	t.failedConnections = make(map[string]bool)
	t.lock.Unlock()

	// From here on `failed` is owned exclusively by this goroutine, so the
	// (potentially slow) logging happens without blocking ReportFailure.
	failingNodes := make([]string, 0, len(failed))
	for address := range failed {
		failingNodes = append(failingNodes, address)
	}
	nodes := strings.Join(failingNodes, ",")

	// NOTE(review): with a threshold of 0 or 1, threshold/2 is 0 and the
	// warning (or error) branch is taken even when there are no failures;
	// kept as-is to preserve the existing log output.
	switch {
	case len(failingNodes) >= threshold:
		t.log.Errorw(
			"failed connections crossed threshold in the last minute",
			"beaconID", t.beaconID,
			"threshold", threshold,
			"failures", len(failingNodes),
			"nodes", nodes,
		)
	case len(failingNodes) >= threshold/2:
		t.log.Warnw(
			"failed connections crossed half threshold in the last minute",
			"beaconID", t.beaconID,
			"threshold", threshold,
			"failures", len(failingNodes),
			"nodes", nodes,
		)
	default:
		t.log.Debugw(
			"threshold monitor healthy",
			"threshold", threshold,
			"beaconID", t.beaconID,
			"failures", len(failingNodes),
			"nodes", nodes,
		)
	}
}
// Stop cancels the monitor's context. The goroutine launched by Start exits
// once it observes the cancellation.
func (t *ThresholdMonitor) Stop() {
	stop := t.cancel
	stop()
}
// ReportFailure forwards beaconID and addr to ErrorSendingPartial and then
// marks addr as a failed connection so that it is included in the monitor's
// next report. Reporting the same addr several times counts it only once.
//
// NOTE(review): the beaconID argument is only passed to ErrorSendingPartial;
// the failure is recorded on this monitor regardless of its own beaconID.
func (t *ThresholdMonitor) ReportFailure(beaconID, addr string) {
	ErrorSendingPartial(beaconID, addr)

	t.lock.Lock()
	defer t.lock.Unlock()

	t.failedConnections[addr] = true
}
// UpdateThreshold replaces the failure threshold used by subsequent reports.
// The write is performed under the monitor's lock.
func (t *ThresholdMonitor) UpdateThreshold(newThreshold int) {
	t.lock.Lock()
	defer t.lock.Unlock()

	t.threshold = newThreshold
}