-
Notifications
You must be signed in to change notification settings - Fork 681
/
ambassador.go
191 lines (159 loc) · 6.23 KB
/
ambassador.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
// Copyright 2020 Datawire. All rights reserved.
//
// package acp contains stuff dealing with the Ambassador Control Plane as a whole.
//
// This is the AmbassadorWatcher, which is a class that can keep an eye on a running
// Ambassador as a whole, and tell you whether it's alive and ready, or not.
//
// THE STATE MACHINE AND THE GRACE PERIOD:
// When an Ambassador pod boots, Envoy is _not_ started at boot time: instead, we
// wait for the initial configuration to be generated and only then start Envoy (this
// is because we don't know what port to tell Envoy to listen on until after we've
// generated the initial configuration). However, Envoy can't come up instantly,
// either.
//
// To deal with this, there's a state machine in here. We start in envoyNotStarted
// state, move to envoyStarting when the first snapshot is processed, then move to
// envoyRunning when we successfully get stats back from Envoy. These states inform
// what, exactly, we demand to declare Ambassador alive, dead, ready, or not ready.
//
// If you like pictures instead of words:
//
//	envoyNotStarted
//	      |
//	      | (first snapshot is processed)
//	      V
//	envoyStarting
//	      |
//	      | (we got stats from Envoy)
//	      V
//	envoyRunning
//
// Envoy is currently given 30 seconds to come up after getting its initial
// configuration. This may be the wrong compromise: in practice, Envoy should come
// up _much_ faster than that, but the idea is that this code is more about providing
// a conservative failsafe than providing a finely-tuned hair trigger.
//
// TESTING HOOKS:
// Since time plays a role, you can use AmbassadorWatcher.SetFetchTime to change the
// function that the AmbassadorWatcher uses to fetch times. The default is time.Now.
//
// This hook is NOT meant for you to change the values on the fly in a running
// AmbassadorWatcher. Set it at instantiation if need be, then leave it alone. See
// ambassador_test.go for more.
package acp
import (
"context"
"fmt"
"sync"
"time"
)
// awState enumerates the phases of Envoy startup that the AmbassadorWatcher
// tracks. It is a distinct type so that arbitrary integers cannot be assigned
// to AmbassadorWatcher.state by accident.
type awState int
const (
// envoyNotStarted is the initial state: no snapshot has been processed yet.
envoyNotStarted awState = iota
// envoyStarting is entered by NoteSnapshotProcessed the first time a snapshot
// is processed; the grace period for Envoy to come up starts here.
envoyStarting
// envoyRunning is entered by IsAlive once the EnvoyWatcher reports that Envoy
// is alive.
envoyRunning
)
// AmbassadorWatcher encapsulates state and methods for keeping an eye on a running
// Ambassador, and deciding if it's healthy.
//
// It wraps an EnvoyWatcher and a DiagdWatcher and layers a small startup state
// machine (see awState) on top of them.
type AmbassadorWatcher struct {
// This mutex protects access to our watchers and our state, mostly as
// a matter of rank paranoia.
mutex sync.Mutex
// How shall we fetch the current time? Defaults to time.Now in
// NewAmbassadorWatcher; replaceable via SetFetchTime.
fetchTime timeFetcher
// What's the current Envoy state?
state awState
// We encapsulate an EnvoyWatcher and a DiagdWatcher.
ew *EnvoyWatcher
dw *DiagdWatcher
// At the point that the DiagdWatcher finishes processing the very first
// snapshot, we have to hand the snapshot to Envoy and allow Envoy to start
// up. This takes finite time, so we have to allow for that.
//
// GraceEnd is the instant at which that allowance runs out. It is set by
// NoteSnapshotProcessed when the first snapshot is processed, and holds the
// zero time.Time until then.
GraceEnd time.Time
}
// NewAmbassadorWatcher builds an AmbassadorWatcher around the given EnvoyWatcher
// and DiagdWatcher.
//
// The returned watcher starts in the envoyNotStarted state and uses time.Now as
// its clock. Use SetFetchTime right after construction if a different time
// source is needed.
//
// Honestly, this constructor is slightly pointless -- it mostly exists for
// parallelism with the EnvoyWatcher and the DiagdWatcher.
func NewAmbassadorWatcher(ew *EnvoyWatcher, dw *DiagdWatcher) *AmbassadorWatcher {
	watcher := new(AmbassadorWatcher)

	// Wire in the two watchers we delegate to.
	watcher.ew = ew
	watcher.dw = dw

	// Nothing has been started yet.
	watcher.state = envoyNotStarted

	// time.Now is only the default clock; it can be swapped out later.
	watcher.fetchTime = time.Now

	return watcher
}
// SetFetchTime replaces the function this watcher calls to obtain the current
// time.
//
// Note that this assignment is not made under the watcher's mutex, so it is
// intended to be called once, immediately after construction, rather than on
// a watcher that is already in use.
func (w *AmbassadorWatcher) SetFetchTime(fetchTime timeFetcher) {
	w.fetchTime = fetchTime
}
// FetchEnvoyReady delegates to the wrapped EnvoyWatcher's FetchEnvoyReady, which
// checks whether Envoy's statistics are fetchable. The watcher's mutex is held
// for the duration of the call.
func (w *AmbassadorWatcher) FetchEnvoyReady(ctx context.Context) {
	w.mutex.Lock()
	defer w.mutex.Unlock()

	envoy := w.ew
	envoy.FetchEnvoyReady(ctx)
}
// NoteSnapshotSent records that a snapshot has been sent by forwarding the
// notification to the wrapped DiagdWatcher. The watcher's mutex is held for the
// duration of the call.
func (w *AmbassadorWatcher) NoteSnapshotSent() {
	w.mutex.Lock()
	defer w.mutex.Unlock()

	diagd := w.dw
	diagd.NoteSnapshotSent()
}
// NoteSnapshotProcessed records that a snapshot has been processed.
//
// The notification is always forwarded to the wrapped DiagdWatcher. In addition,
// the very first time it is called (i.e. while the watcher is still in the
// envoyNotStarted state) it moves the watcher to envoyStarting and sets GraceEnd
// to 30 seconds past the current time as reported by the watcher's time fetcher.
func (w *AmbassadorWatcher) NoteSnapshotProcessed() {
	// How long Envoy gets to come up after the first snapshot is processed.
	const startupGrace = 30 * time.Second

	w.mutex.Lock()
	defer w.mutex.Unlock()

	w.dw.NoteSnapshotProcessed()

	// Only the very first processed snapshot kicks off the state machine.
	if w.state != envoyNotStarted {
		return
	}

	// From here on we're waiting for Envoy to start...
	w.state = envoyStarting

	// ...and the clock on its grace period starts now.
	graceStart := w.fetchTime()
	w.GraceEnd = graceStart.Add(startupGrace)
}
// IsAlive reports whether the Ambassador as a whole can be considered alive.
//
// diagd must always be alive. Beyond that, the answer depends on the Envoy
// startup state:
//
//   - envoyNotStarted: diagd being alive is sufficient.
//   - envoyStarting: alive if Envoy is alive (which also advances the state to
//     envoyRunning), or if the grace period ending at GraceEnd has not yet
//     expired.
//   - envoyRunning: alive IFF Envoy is alive.
//
// It panics if the state holds a value outside the known awState constants.
func (w *AmbassadorWatcher) IsAlive() bool {
	w.mutex.Lock()
	defer w.mutex.Unlock()

	// Without diagd, nothing else matters: Ambassador is not alive.
	if diagdAlive := w.dw.IsAlive(); !diagdAlive {
		return false
	}

	// diagd is fine, so what we require of Envoy depends on how far along its
	// startup is.
	switch w.state {
	case envoyNotStarted:
		// Nobody has tried to start Envoy yet, so diagd alone is enough.
		return true

	case envoyRunning:
		// Envoy came up earlier; it has to still be alive now.
		return w.ew.IsAlive()

	case envoyStarting:
		if !w.ew.IsAlive() {
			// Envoy isn't up yet. That's acceptable only while the grace
			// period is still running.
			return w.fetchTime().Before(w.GraceEnd)
		}

		// Envoy has come up: remember that, so later calls hold it to the
		// stricter envoyRunning standard.
		w.state = envoyRunning
		return true

	default:
		// "Impossible": w.state is unexported and typed as awState precisely
		// so that random integers can't be assigned to it. Still, a new state
		// could be added without this switch being updated, so fail loudly.
		panic(fmt.Sprintf("AmbassadorWatcher.state has unknown value %d", w.state))
	}
}
// IsReady reports whether the Ambassador as a whole can be considered ready.
//
// This is much simpler than IsAlive: Ambassador is ready IFF both diagd and
// Envoy are ready. Envoy is only consulted when diagd is ready.
func (w *AmbassadorWatcher) IsReady() bool {
	w.mutex.Lock()
	defer w.mutex.Unlock()

	if !w.dw.IsReady() {
		return false
	}

	return w.ew.IsReady()
}