-
Notifications
You must be signed in to change notification settings - Fork 586
/
proxyhealth.go
382 lines (341 loc) · 12.4 KB
/
proxyhealth.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
package proxyhealth
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"net/url"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/prometheusmetrics"
"github.com/coder/coder/v2/codersdk"
)
// Status is the health state assigned to a workspace proxy by a health check.
// The possible values are the constants declared below.
type Status string
const (
// Unknown should never be returned by the proxy health check.
Unknown Status = "unknown"
// Healthy means the proxy access url is reachable and returns a healthy
// status code.
Healthy Status = "ok"
// Unreachable means the proxy access url is not responding.
Unreachable Status = "unreachable"
// Unhealthy means the proxy access url is responding, but there is some
// problem with the proxy. This problem may or may not be preventing functionality.
Unhealthy Status = "unhealthy"
// Unregistered means the proxy has not registered a url yet. This means
// the proxy was created with the cli, but has not yet been started.
Unregistered Status = "unregistered"
)
// Options configures a ProxyHealth created by New.
type Options struct {
// Interval is the interval at which the proxy health is checked.
// New replaces a value <= 0 with one minute.
Interval time.Duration
// DB is required; New returns an error when it is nil.
DB database.Store
// Logger receives an error entry from Run whenever a periodic check fails.
Logger slog.Logger
// Client is the HTTP client used to query the proxies. When nil, New falls
// back to http.DefaultClient. New always works on a copy of the client whose
// Timeout is set to 5 seconds.
Client *http.Client
// Prometheus is the registry the health metrics are registered with. When
// nil, New creates a new registry.
Prometheus *prometheus.Registry
}
// ProxyHealth runs a go routine that periodically checks the health of all
// workspace proxies. This information is stored in memory, so each coderd
// replica has its own view of the health of the proxies. These views should be
// consistent, and if they are not, it indicates a problem.
type ProxyHealth struct {
// db is used to list the workspace proxies to check.
db database.Store
// interval is the period of the ticker that drives Run.
interval time.Duration
// logger is used by Run to report failed health checks.
logger slog.Logger
// client performs the health request against each proxy.
client *http.Client
// Cached values for quick access to the health of proxies.
// cache holds the statuses from the most recent successful check, keyed by
// proxy ID.
cache *atomic.Pointer[map[uuid.UUID]ProxyStatus]
// proxyHosts holds the non-empty ProxyHost values from those same statuses.
proxyHosts *atomic.Pointer[[]string]
// PromMetrics
// healthCheckDuration observes how long each run of the health check took.
healthCheckDuration prometheus.Histogram
// healthCheckResults records one numeric status value per proxy ID.
healthCheckResults *prometheusmetrics.CachedGaugeVec
}
// New builds a ProxyHealth from opts and registers its Prometheus collectors.
//
// Defaults are written back into opts: a non-positive Interval becomes one
// minute and a nil Prometheus registry is replaced with a new one. A nil DB is
// rejected with an error. The HTTP client (http.DefaultClient when opts.Client
// is nil) is shallow-copied and the copy is given a 5 second timeout, so the
// caller's client is never modified.
func New(opts *Options) (*ProxyHealth, error) {
	if opts.Interval <= 0 {
		opts.Interval = time.Minute
	}
	if opts.DB == nil {
		return nil, xerrors.Errorf("db is required")
	}
	if opts.Prometheus == nil {
		opts.Prometheus = prometheus.NewRegistry()
	}

	base := opts.Client
	if base == nil {
		base = http.DefaultClient
	}
	// Bound every health request so a hanging proxy cannot stall the check
	// forever. The timeout is applied to a copy, not to the shared client.
	httpClient := *base
	httpClient.Timeout = 5 * time.Second

	// Histogram of how long a full health collection takes.
	durationHistogram := prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: "coderd",
		Subsystem: "proxyhealth",
		Name:      "health_check_duration_seconds",
		Help:      "Histogram for duration of proxy health collection in seconds.",
		Buckets:   []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
	})
	opts.Prometheus.MustRegister(durationHistogram)

	// Gauge of the numeric health status, labeled by proxy ID.
	resultsGauge := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "coderd",
			Subsystem: "proxyhealth",
			Name:      "health_check_results",
			Help: "This endpoint returns a number to indicate the health status. " +
				"-3 (unknown), -2 (Unreachable), -1 (Unhealthy), 0 (Unregistered), 1 (Healthy)",
		},
		[]string{"proxy_id"},
	)
	resultsVec := prometheusmetrics.NewCachedGaugeVec(resultsGauge)
	opts.Prometheus.MustRegister(resultsVec)

	health := &ProxyHealth{
		db:                  opts.DB,
		interval:            opts.Interval,
		logger:              opts.Logger,
		client:              &httpClient,
		cache:               new(atomic.Pointer[map[uuid.UUID]ProxyStatus]),
		proxyHosts:          new(atomic.Pointer[[]string]),
		healthCheckDuration: durationHistogram,
		healthCheckResults:  resultsVec,
	}
	return health, nil
}
// Run blocks until ctx is canceled. On every tick of the configured interval
// it performs one health check of all proxies and, when the check succeeds,
// stores the results in the cache. A failed check is logged and leaves the
// previously cached results in place.
func (p *ProxyHealth) Run(ctx context.Context) {
	tick := time.NewTicker(p.interval)
	defer tick.Stop()

	for {
		var checkedAt time.Time
		select {
		case <-ctx.Done():
			return
		case checkedAt = <-tick.C:
		}

		results, err := p.runOnce(ctx, checkedAt)
		if err == nil {
			p.storeProxyHealth(results)
			continue
		}
		p.logger.Error(ctx, "proxy health check failed", slog.Error(err))
	}
}
// storeProxyHealth publishes the given statuses to the cache, together with
// the list of non-empty proxy hosts derived from them.
func (p *ProxyHealth) storeProxyHealth(statuses map[uuid.UUID]ProxyStatus) {
	var hosts []string
	for id := range statuses {
		host := statuses[id].ProxyHost
		if host == "" {
			continue
		}
		hosts = append(hosts, host)
	}

	// The status map is stored first, ahead of any value derived from it.
	p.cache.Store(&statuses)
	p.proxyHosts.Store(&hosts)
}
// ForceUpdate performs one health check immediately, stamped with the current
// time. The cache is replaced only when the check succeeds; on failure the
// error is returned and the cache is left untouched. This is useful to trigger
// an update when a proxy is created or deleted.
func (p *ProxyHealth) ForceUpdate(ctx context.Context) error {
	results, err := p.runOnce(ctx, time.Now())
	if err == nil {
		p.storeProxyHealth(results)
	}
	return err
}
// HealthStatus returns the health status of all proxies currently held in the
// cache. It returns an empty map when the receiver is nil or when no check has
// populated the cache yet.
func (p *ProxyHealth) HealthStatus() map[uuid.UUID]ProxyStatus {
	// A nil receiver can happen because workspace proxies are still an
	// experiment. For the /regions endpoint, this will be nil in those cases.
	if p != nil {
		if cached := p.cache.Load(); cached != nil {
			return *cached
		}
	}
	return map[uuid.UUID]ProxyStatus{}
}
// ProxyStatus is the result of one health check of a single workspace proxy.
type ProxyStatus struct {
// ProxyStatus includes the value of the proxy at the time of checking. This is
// useful to know as it helps determine if the proxy checked has different values
// than the proxy in hand. AKA if the proxy was updated, and the status was for
// an older proxy.
Proxy database.WorkspaceProxy
// ProxyHost is the host:port of the proxy url. This is included in the status
// to make sure the proxy url is a valid URL. It also makes it easier to
// escalate errors if the url.Parse errors (should never happen).
ProxyHost string
// Status is the outcome of the check.
Status Status
// Report is the health report decoded from the proxy's response. Its Errors
// field is also used to carry errors produced by the check itself.
Report codersdk.ProxyHealthReport
// CheckedAt is the timestamp that was passed to the check that produced this
// status.
CheckedAt time.Time
}
// ProxyHosts returns the cached host:port of every proxy whose last stored
// status carries a non-empty ProxyHost. The list is not filtered by health
// status.
// This can be computed from HealthStatus, but is cached to avoid the
// caller needing to loop over all proxies to compute this on all
// static web requests.
//
// Like HealthStatus, it is safe to call on a nil receiver. It returns an empty
// slice in that case and when no health check has been stored yet.
func (p *ProxyHealth) ProxyHosts() []string {
	if p == nil {
		return []string{}
	}
	ptr := p.proxyHosts.Load()
	if ptr == nil {
		return []string{}
	}
	return *ptr
}
// runOnce runs the health check for all workspace proxies. If there is an
// unexpected error, an error is returned. Expected errors will mark a proxy as
// unreachable.
//
// Deleted proxies are skipped. Proxies with an empty URL are reported as
// Unregistered without any request being made. All other proxies are queried
// at "<url>/healthz-report", at most 5 at a time. now is recorded as CheckedAt
// on every status and is the start point for the duration metric.
//
// The returned map is keyed by proxy ID. An error is returned, and no statuses,
// when the proxies cannot be listed or when a health request cannot be
// constructed. The per-proxy gauge values are committed only when every check
// completed without such an error.
func (p *ProxyHealth) runOnce(ctx context.Context, now time.Time) (map[uuid.UUID]ProxyStatus, error) {
	// Record from the given time.
	defer func() { p.healthCheckDuration.Observe(time.Since(now).Seconds()) }()

	//nolint:gocritic // Proxy health is a system service.
	proxies, err := p.db.GetWorkspaceProxies(dbauthz.AsSystemRestricted(ctx))
	if err != nil {
		return nil, xerrors.Errorf("get workspace proxies: %w", err)
	}

	// Just use a mutex to protect map writes.
	var statusMu sync.Mutex
	proxyStatus := map[uuid.UUID]ProxyStatus{}

	grp, gctx := errgroup.WithContext(ctx)
	// Arbitrary parallelism limit.
	grp.SetLimit(5)

	for _, proxy := range proxies {
		if proxy.Deleted {
			// Ignore deleted proxies.
			continue
		}
		// Each proxy needs to have a status set. Make a local copy for the
		// call to be run async.
		proxy := proxy
		status := ProxyStatus{
			Proxy:     proxy,
			CheckedAt: now,
			Status:    Unknown,
		}

		grp.Go(func() error {
			if proxy.Url == "" {
				// Empty URL means the proxy has not registered yet.
				// When the proxy is started, it will update the url.
				statusMu.Lock()
				defer statusMu.Unlock()
				p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 0, proxy.ID.String())
				status.Status = Unregistered
				proxyStatus[proxy.ID] = status
				return nil
			}

			// Try to hit the healthz-report endpoint for a comprehensive health check.
			reqURL := fmt.Sprintf("%s/healthz-report", strings.TrimSuffix(proxy.Url, "/"))
			// The request is bound to the group context here, so no further
			// WithContext call is needed.
			req, err := http.NewRequestWithContext(gctx, http.MethodGet, reqURL, nil)
			if err != nil {
				return xerrors.Errorf("new request: %w", err)
			}

			resp, err := p.client.Do(req)
			if err == nil {
				defer resp.Body.Close()
			}

			// A switch statement felt easier to categorize the different cases than
			// if else statements or nested if statements.
			switch {
			case err == nil && resp.StatusCode == http.StatusOK:
				decodeErr := json.NewDecoder(resp.Body).Decode(&status.Report)
				if decodeErr != nil {
					isCoderErr := xerrors.Errorf("proxy url %q is not a coder proxy instance, verify the url is correct", reqURL)
					if resp.Header.Get(codersdk.BuildVersionHeader) != "" {
						isCoderErr = xerrors.Errorf("proxy url %q is a coder instance, but unable to decode the response payload. Could this be a primary coderd and not a proxy?", reqURL)
					}

					// If the response is not json, then the user likely input a bad url that returns status code 200.
					// This is very common, since most webpages do return a 200. So let's improve the error message.
					if notJSONErr := codersdk.ExpectJSONMime(resp); notJSONErr != nil {
						joined := errors.Join(
							isCoderErr,
							xerrors.Errorf("attempted to query health at %q but got back the incorrect content type: %w", reqURL, notJSONErr),
						)
						status.Report.Errors = []string{
							joined.Error(),
						}
						status.Status = Unhealthy
						break
					}

					// If we cannot read the report, mark the proxy as unhealthy.
					status.Report.Errors = []string{
						errors.Join(
							isCoderErr,
							xerrors.Errorf("received a status code 200, but failed to decode health report body: %w", decodeErr),
						).Error(),
					}
					status.Status = Unhealthy
					break
				}
				if len(status.Report.Errors) > 0 {
					status.Status = Unhealthy
					break
				}
				status.Status = Healthy
			case err == nil && resp.StatusCode != http.StatusOK:
				// Unhealthy as we did reach the proxy but it got an unexpected response.
				status.Status = Unhealthy
				var builder strings.Builder
				// This string is shown on the UI where newlines are respected.
				// This error message is not ever decoded programmatically, so keep it human-
				// readable.
				fmt.Fprintf(&builder, "unexpected status code %d. ", resp.StatusCode)
				fmt.Fprintf(&builder, "\nEncountered error, send a request to %q from the Coderd environment to debug this issue.", reqURL)
				// err will always be non-nil
				err := codersdk.ReadBodyAsError(resp)
				var apiErr *codersdk.Error
				if xerrors.As(err, &apiErr) {
					fmt.Fprintf(&builder, "\nError Message: %s\nError Detail: %s", apiErr.Message, apiErr.Detail)
					for _, v := range apiErr.Validations {
						// Pretty sure this is not possible from the called endpoint, but just in case.
						fmt.Fprintf(&builder, "\n\tValidation: %s=%s", v.Field, v.Detail)
					}
				}
				fmt.Fprintf(&builder, "\nError: %s", err.Error())

				status.Report.Errors = []string{builder.String()}
			case err != nil:
				// Request failed, mark the proxy as unreachable.
				status.Status = Unreachable
				status.Report.Errors = []string{fmt.Sprintf("request to proxy failed: %s", err.Error())}
			default:
				// This should never happen
				status.Status = Unknown
			}

			u, err := url.Parse(proxy.Url)
			if err != nil {
				// This should never happen. This would mean the proxy sent
				// us an invalid url?
				status.Report.Errors = append(status.Report.Errors, fmt.Sprintf("failed to parse proxy url: %s", err.Error()))
				status.Status = Unhealthy
			} else {
				// url.Parse returns a nil URL on error, so the host may only be
				// read on success. On failure ProxyHost stays empty.
				status.ProxyHost = u.Host
			}

			// Set the prometheus metric correctly.
			switch status.Status {
			case Healthy:
				p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 1, proxy.ID.String())
			case Unhealthy:
				p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -1, proxy.ID.String())
			case Unreachable:
				p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -2, proxy.ID.String())
			default:
				// Unknown
				p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -3, proxy.ID.String())
			}

			statusMu.Lock()
			defer statusMu.Unlock()
			proxyStatus[proxy.ID] = status
			return nil
		})
	}

	err = grp.Wait()
	if err != nil {
		return nil, xerrors.Errorf("group run: %w", err)
	}
	p.healthCheckResults.Commit()

	return proxyStatus, nil
}