-
Notifications
You must be signed in to change notification settings - Fork 28
/
metrics.go
494 lines (451 loc) · 15.8 KB
/
metrics.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
// SPDX-License-Identifier: Apache-2.0
package api
import (
"net/http"
"time"
"github.com/gin-gonic/gin"
"github.com/go-vela/server/database"
"github.com/go-vela/server/queue"
"github.com/go-vela/types/constants"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/sirupsen/logrus"
)
// MetricsQueryParameters is the set of opt-in switches accepted as query
// parameters by the metrics endpoint. Each field is bound from the query
// parameter named in its `form` tag; a gauge is only refreshed when its
// switch is true.
type MetricsQueryParameters struct {
	// Platform-wide totals.

	// UserCount requests the total number of platform users.
	UserCount bool `form:"user_count"`
	// RepoCount requests the total number of platform repos.
	RepoCount bool `form:"repo_count"`
	// BuildCount requests the total number of builds.
	BuildCount bool `form:"build_count"`

	// Build totals broken down by status.

	// RunningBuildCount requests the number of builds with status==running.
	RunningBuildCount bool `form:"running_build_count"`
	// PendingBuildCount requests the number of builds with status==pending.
	PendingBuildCount bool `form:"pending_build_count"`
	// QueuedBuildCount requests the number of builds currently in the queue.
	QueuedBuildCount bool `form:"queued_build_count"`
	// FailureBuildCount requests the number of builds with status==failure.
	FailureBuildCount bool `form:"failure_build_count"`
	// KilledBuildCount requests the number of builds with status==killed.
	KilledBuildCount bool `form:"killed_build_count"`
	// SuccessBuildCount requests the number of builds with status==success.
	SuccessBuildCount bool `form:"success_build_count"`
	// ErrorBuildCount requests the number of builds with status==error.
	ErrorBuildCount bool `form:"error_build_count"`

	// Step and service totals.

	// StepImageCount requests the per-image usage counts for steps.
	StepImageCount bool `form:"step_image_count"`
	// StepStatusCount requests the per-status counts for steps.
	StepStatusCount bool `form:"step_status_count"`
	// ServiceImageCount requests the per-image usage counts for services.
	ServiceImageCount bool `form:"service_image_count"`
	// ServiceStatusCount requests the per-status counts for services.
	ServiceStatusCount bool `form:"service_status_count"`

	// Worker totals.

	// WorkerBuildLimit requests the summed build limit of the workers.
	WorkerBuildLimit bool `form:"worker_build_limit"`
	// ActiveWorkerCount requests the number of active workers.
	ActiveWorkerCount bool `form:"active_worker_count"`
	// InactiveWorkerCount requests the number of inactive workers.
	InactiveWorkerCount bool `form:"inactive_worker_count"`
	// IdleWorkerCount requests the number of workers with a status of idle,
	// where worker RunningBuildIDs.length = 0.
	IdleWorkerCount bool `form:"idle_worker_count"`
	// AvailableWorkerCount requests the number of workers with a status of
	// available, where worker RunningBuildIDs.length > 0 and < worker BuildLimit.
	AvailableWorkerCount bool `form:"available_worker_count"`
	// BusyWorkerCount requests the number of workers with a status of busy,
	// where worker BuildLimit == worker RunningBuildIDs.length.
	BusyWorkerCount bool `form:"busy_worker_count"`
	// ErrorWorkerCount requests the number of workers with a status of error.
	ErrorWorkerCount bool `form:"error_worker_count"`
}
// The custom collectors are created once at package level. promauto
// registers each collector as it is created, so building them inside a
// handler would attempt to register the same collector on every call and
// fail with "duplicate metrics collector registration attempted".
var (
	// totals carries one series per (resource, field, value) combination,
	// e.g. ("platform", "count", "users").
	totals = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "vela_totals",
			Help: "The Vela totals collect the total number for a resource type.",
		},
		[]string{"resource", "field", "value"},
	)

	// stepImages carries one series per step image, labelled by "name".
	stepImages = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "step_images",
			Help: "Step Images collect the number of times an image was used in a step.",
		},
		[]string{"name"},
	)

	// serviceImages carries one series per service image, labelled by "name".
	serviceImages = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "service_images",
			Help: "Service Images collect the number of times an image was used in a service.",
		},
		[]string{"name"},
	)
)
// swagger:operation GET /metrics base BaseMetrics
//
// Retrieve metrics from the Vela api
//
// ---
// produces:
// - text/plain
// parameters:
// - in: query
//   name: user_count
//   description: Indicates a request for user count
//   type: boolean
//   default: false
// - in: query
//   name: repo_count
//   description: Indicates a request for repo count
//   type: boolean
//   default: false
// - in: query
//   name: build_count
//   description: Indicates a request for build count
//   type: boolean
//   default: false
// - in: query
//   name: running_build_count
//   description: Indicates a request for running build count
//   type: boolean
//   default: false
// - in: query
//   name: pending_build_count
//   description: Indicates a request for pending build count
//   type: boolean
//   default: false
// - in: query
//   name: queued_build_count
//   description: Indicates a request for queued build count
//   type: boolean
//   default: false
// - in: query
//   name: failure_build_count
//   description: Indicates a request for failure build count
//   type: boolean
//   default: false
// - in: query
//   name: killed_build_count
//   description: Indicates a request for killed build count
//   type: boolean
//   default: false
// - in: query
//   name: success_build_count
//   description: Indicates a request for success build count
//   type: boolean
//   default: false
// - in: query
//   name: error_build_count
//   description: Indicates a request for error build count
//   type: boolean
//   default: false
// - in: query
//   name: step_image_count
//   description: Indicates a request for step image count
//   type: boolean
//   default: false
// - in: query
//   name: step_status_count
//   description: Indicates a request for step status count
//   type: boolean
//   default: false
// - in: query
//   name: service_image_count
//   description: Indicates a request for service image count
//   type: boolean
//   default: false
// - in: query
//   name: service_status_count
//   description: Indicates a request for service status count
//   type: boolean
//   default: false
// - in: query
//   name: worker_build_limit
//   description: Indicates a request for total worker build limit
//   type: boolean
//   default: false
// - in: query
//   name: active_worker_count
//   description: Indicates a request for active worker count
//   type: boolean
//   default: false
// - in: query
//   name: inactive_worker_count
//   description: Indicates a request for inactive worker count
//   type: boolean
//   default: false
// - in: query
//   name: idle_worker_count
//   description: Indicates a request for idle worker count
//   type: boolean
//   default: false
// - in: query
//   name: available_worker_count
//   description: Indicates a request for available worker count
//   type: boolean
//   default: false
// - in: query
//   name: busy_worker_count
//   description: Indicates a request for busy worker count
//   type: boolean
//   default: false
// - in: query
//   name: error_worker_count
//   description: Indicates a request for error worker count
//   type: boolean
//   default: false
// responses:
//   '200':
//     description: Successfully retrieved the Vela metrics
//     schema:
//       type: string

// BaseMetrics hands back the stock handler provided by promhttp, which is
// what ultimately writes the Prometheus metrics to the response.
func BaseMetrics() http.Handler {
	handler := promhttp.Handler()

	return handler
}
// CustomMetrics is the gin handler for the custom Vela metrics. It does no
// work of its own: everything is delegated to recordGauges, which refreshes
// the gauges selected by the request's query parameters.
func CustomMetrics(c *gin.Context) {
	// refresh every gauge the caller opted into
	recordGauges(c)
}
// recordGauges refreshes the custom Prometheus gauges selected by the
// request's query parameters.
//
// Every metric is opt-in: a gauge is only touched when its switch in
// MetricsQueryParameters is true. A failure to bind the query parameters is
// logged and execution continues. When a lookup fails, the error is logged
// and the affected gauge keeps its previous value; the zero result of a
// failed call is never published as if it were a real measurement.
//
//nolint:funlen,gocyclo // ignore function length and cyclomatic complexity
func recordGauges(c *gin.Context) {
	// capture middleware values
	ctx := c.Request.Context()
	db := database.FromContext(c)

	// take incoming request and bind query parameters
	q := MetricsQueryParameters{}
	if err := c.ShouldBindQuery(&q); err != nil {
		logrus.Errorf("unable to get bind query parameters: %v", err)
	}

	// user_count
	if q.UserCount {
		if n, err := db.CountUsers(ctx); err != nil {
			logrus.Errorf("unable to get count of all users: %v", err)
		} else {
			totals.WithLabelValues("platform", "count", "users").Set(float64(n))
		}
	}

	// repo_count
	if q.RepoCount {
		if n, err := db.CountRepos(ctx); err != nil {
			logrus.Errorf("unable to get count of all repos: %v", err)
		} else {
			totals.WithLabelValues("platform", "count", "repos").Set(float64(n))
		}
	}

	// build_count
	if q.BuildCount {
		if n, err := db.CountBuilds(ctx); err != nil {
			logrus.Errorf("unable to get count of all builds: %v", err)
		} else {
			totals.WithLabelValues("platform", "count", "builds").Set(float64(n))
		}
	}

	// running_build_count, pending_build_count, failure_build_count,
	// killed_build_count, success_build_count, error_build_count
	buildStatuses := []struct {
		requested bool
		status    string // status value passed to the database lookup
		label     string // "value" label published on the gauge
	}{
		{q.RunningBuildCount, "running", "running"},
		{q.PendingBuildCount, "pending", "pending"},
		// the published label for failure builds has always been "failed"
		{q.FailureBuildCount, "failure", "failed"},
		{q.KilledBuildCount, "killed", "killed"},
		{q.SuccessBuildCount, "success", "success"},
		{q.ErrorBuildCount, "error", "error"},
	}

	for _, bs := range buildStatuses {
		if !bs.requested {
			continue
		}

		n, err := db.CountBuildsForStatus(ctx, bs.status, nil)
		if err != nil {
			logrus.Errorf("unable to get count of all %s builds: %v", bs.status, err)

			continue
		}

		totals.WithLabelValues("build", "status", bs.label).Set(float64(n))
	}

	// queued_build_count
	if q.QueuedBuildCount {
		// use the request context, like every other lookup in this function
		if n, err := queue.FromContext(c).Length(ctx); err != nil {
			logrus.Errorf("unable to get count of all queued builds: %v", err)
		} else {
			totals.WithLabelValues("build", "status", "queued").Set(float64(n))
		}
	}

	// step_image_count
	if q.StepImageCount {
		if counts, err := db.ListStepImageCount(ctx); err != nil {
			logrus.Errorf("unable to get count of all step images: %v", err)
		} else {
			for image, count := range counts {
				stepImages.WithLabelValues(image).Set(count)
			}
		}
	}

	// step_status_count
	if q.StepStatusCount {
		if counts, err := db.ListStepStatusCount(ctx); err != nil {
			logrus.Errorf("unable to get count of all step statuses: %v", err)
		} else {
			for status, count := range counts {
				totals.WithLabelValues("steps", "status", status).Set(count)
			}
		}
	}

	// service_image_count
	if q.ServiceImageCount {
		if counts, err := db.ListServiceImageCount(ctx); err != nil {
			logrus.Errorf("unable to get count of all service images: %v", err)
		} else {
			for image, count := range counts {
				serviceImages.WithLabelValues(image).Set(count)
			}
		}
	}

	// service_status_count
	if q.ServiceStatusCount {
		if counts, err := db.ListServiceStatusCount(ctx); err != nil {
			logrus.Errorf("unable to get count of all service statuses: %v", err)
		} else {
			for status, count := range counts {
				totals.WithLabelValues("services", "status", status).Set(count)
			}
		}
	}

	// worker_build_limit, active_worker_count, inactive_worker_count,
	// idle_worker_count, available_worker_count, busy_worker_count,
	// error_worker_count: all derived from a single listing of the workers,
	// so skip the lookup entirely when none of them was requested
	workerMetricsRequested := q.WorkerBuildLimit || q.ActiveWorkerCount ||
		q.InactiveWorkerCount || q.IdleWorkerCount || q.AvailableWorkerCount ||
		q.BusyWorkerCount || q.ErrorWorkerCount
	if !workerMetricsRequested {
		return
	}

	// a bare type assertion would panic when the value is absent or of
	// another type; check it and skip the worker metrics instead
	interval, ok := c.Value("worker_active_interval").(time.Duration)
	if !ok {
		logrus.Error("unable to get worker metrics: worker_active_interval is not set")

		return
	}

	now := time.Now()

	// send API call to capture the workers
	workers, err := db.ListWorkers(ctx, "all", now.Unix(), 0)
	if err != nil {
		logrus.Errorf("unable to get workers: %v", err)

		return
	}

	// unix time from worker_active_interval ago; a worker that checked in at
	// or after this instant counts as active
	before := now.Add(-interval).Unix()

	var (
		buildLimit       int64
		activeWorkers    int64
		inactiveWorkers  int64
		idleWorkers      int64
		availableWorkers int64
		busyWorkers      int64
		errorWorkers     int64
	)

	for _, worker := range workers {
		// workers that have not checked in recently only count as inactive
		if worker.GetLastCheckedIn() < before {
			inactiveWorkers++

			continue
		}

		activeWorkers++
		buildLimit += worker.GetBuildLimit()

		// the status breakdown only considers active workers
		switch worker.GetStatus() {
		case constants.WorkerStatusIdle:
			idleWorkers++
		case constants.WorkerStatusAvailable:
			availableWorkers++
		case constants.WorkerStatusBusy:
			busyWorkers++
		case constants.WorkerStatusError:
			errorWorkers++
		}
	}

	// apply metrics based on request query parameters
	// worker_build_limit
	if q.WorkerBuildLimit {
		totals.WithLabelValues("worker", "sum", "build_limit").Set(float64(buildLimit))
	}

	// active_worker_count
	if q.ActiveWorkerCount {
		totals.WithLabelValues("worker", "count", "active").Set(float64(activeWorkers))
	}

	// inactive_worker_count
	if q.InactiveWorkerCount {
		totals.WithLabelValues("worker", "count", "inactive").Set(float64(inactiveWorkers))
	}

	// idle_worker_count
	if q.IdleWorkerCount {
		totals.WithLabelValues("worker", "count", "idle").Set(float64(idleWorkers))
	}

	// available_worker_count
	if q.AvailableWorkerCount {
		totals.WithLabelValues("worker", "count", "available").Set(float64(availableWorkers))
	}

	// busy_worker_count
	if q.BusyWorkerCount {
		totals.WithLabelValues("worker", "count", "busy").Set(float64(busyWorkers))
	}

	// error_worker_count
	if q.ErrorWorkerCount {
		totals.WithLabelValues("worker", "count", "error").Set(float64(errorWorkers))
	}
}