/
metrics.go
192 lines (168 loc) · 6.62 KB
/
metrics.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
// SPDX-FileCopyrightText: 2023 The Crossplane Authors <https://crossplane.io>
//
// SPDX-License-Identifier: Apache-2.0
package metrics
import (
"context"
"sync"
"time"
"github.com/crossplane/crossplane-runtime/pkg/resource"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/tools/cache"
"sigs.k8s.io/controller-runtime/pkg/cluster"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)
// Namespace and subsystem names shared by the Prometheus metrics
// declared in this file.
const (
	// promNSUpjet is the Namespace set on every metric declared below.
	promNSUpjet = "upjet"
	// promSysTF is the Subsystem of the Terraform CLI/process metrics.
	promSysTF = "terraform"
	// promSysResource is the Subsystem of the managed resource metrics.
	promSysResource = "resource"
)
// Metric collectors exposed by this package. All of them are registered
// by this file's init function.
var (
	// CLITime is the histogram of Terraform CLI execution times, in
	// seconds, labeled by "subcommand" and "mode".
	CLITime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysTF,
		Name: "cli_duration",
		Help: "Measures in seconds how long it takes a Terraform CLI invocation to complete",
		Buckets: []float64{1.0, 3, 5, 10, 15, 30, 60, 120, 300},
	}, []string{"subcommand", "mode"})
	// ExternalAPITime is the histogram of SDK processing times, in
	// seconds, labeled by "operation".
	ExternalAPITime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysResource,
		Name: "ext_api_duration",
		Help: "Measures in seconds how long it takes a Cloud SDK call to complete",
		Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600},
	}, []string{"operation"})
	// ExternalAPICalls is a counter metric of the number of external
	// API calls. The "service" and "operation" labels can be used to
	// classify calls into a two-level hierarchy, in which calls are
	// "operations" that belong to a "service". Users should beware of
	// the performance implications of the high cardinality that can
	// occur when there are many services and operations. See:
	// https://prometheus.io/docs/practices/naming/#labels
	ExternalAPICalls = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysResource,
		Name: "external_api_calls_total",
		Help: "The number of external API calls.",
	}, []string{"service", "operation"})
	// DeletionTime is the histogram metric for collecting statistics on
	// the intervals between the deletion timestamp and the moment when
	// the resource is observed to be missing (actually deleted). It is
	// labeled by the resource's "group", "version" and "kind".
	DeletionTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysResource,
		Name: "deletion_seconds",
		Help: "Measures in seconds how long it takes for a resource to be deleted",
		Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600},
	}, []string{"group", "version", "kind"})
	// ReconcileDelay is the histogram metric for collecting statistics on
	// the delays between when the expected reconciles of an up-to-date
	// resource should happen and when the resource is actually
	// reconciled. Only delays from the expected reconcile times are
	// considered. It is labeled by the resource's "group", "version" and
	// "kind".
	ReconcileDelay = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysResource,
		Name: "reconcile_delay_seconds",
		Help: "Measures in seconds how long the reconciles for a resource have been delayed from the configured poll periods",
		Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600},
	}, []string{"group", "version", "kind"})
	// CLIExecutions is the gauge of active (running) Terraform CLI
	// invocations, labeled by "subcommand" and "mode".
	CLIExecutions = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysTF,
		Name: "active_cli_invocations",
		Help: "The number of active (running) Terraform CLI invocations",
	}, []string{"subcommand", "mode"})
	// TFProcesses is the gauge of running Terraform CLI and Terraform
	// provider processes, labeled by "type".
	TFProcesses = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysTF,
		Name: "running_processes",
		Help: "The number of running Terraform CLI and Terraform provider processes",
	}, []string{"type"})
	// TTRMeasurements is the histogram of time-to-readiness (TTR)
	// measurements, in seconds, for the managed resources. It is labeled
	// by the resource's "group", "version" and "kind".
	TTRMeasurements = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: promNSUpjet,
		Subsystem: promSysResource,
		Name: "ttr",
		Help: "Measures in seconds the time-to-readiness (TTR) for managed resources",
		Buckets: []float64{1, 5, 10, 15, 30, 60, 120, 300, 600, 1800, 3600},
	}, []string{"group", "version", "kind"})
)
// Compile-time check that *MetricRecorder implements manager.Runnable.
var _ manager.Runnable = &MetricRecorder{}
// MetricRecorder keeps per-resource bookkeeping needed to report the
// ReconcileDelay metric for resources of a single GroupVersionKind.
type MetricRecorder struct {
	// observations maps a resource name (string) to its *Observations.
	observations sync.Map
	// gvk is the kind whose informer Start subscribes to.
	gvk schema.GroupVersionKind
	// cluster provides the cache from which Start obtains the informer.
	cluster cluster.Cluster
	// pollInterval is added to the current time by SetReconcileTime to
	// compute the next expected reconcile time.
	pollInterval time.Duration
}
// Observations is the per-resource state tracked by a MetricRecorder.
type Observations struct {
	// expectedReconcileTime is when the next reconcile is expected; it is
	// nil until SetReconcileTime has been called for the resource.
	expectedReconcileTime *time.Time
	// observeReconcileDelay is set by SetReconcileTime and cleared by
	// ObserveReconcileDelay once a delay has been recorded, so that each
	// expected reconcile time is observed at most once.
	observeReconcileDelay bool
}
// NewMetricRecorder returns a MetricRecorder for resources of the given
// GroupVersionKind. The supplied cluster is used by Start to obtain the
// informer, and pollInterval is the period added to the current time when
// computing a resource's next expected reconcile time.
func NewMetricRecorder(gvk schema.GroupVersionKind, c cluster.Cluster, pollInterval time.Duration) *MetricRecorder {
	rec := new(MetricRecorder)
	rec.gvk = gvk
	rec.cluster = c
	rec.pollInterval = pollInterval
	return rec
}
// SetReconcileTime records that the resource with the given name is
// expected to be reconciled again one poll interval from now, and arms
// the next ObserveReconcileDelay call for that resource.
//
// It is a no-op on a nil receiver.
func (r *MetricRecorder) SetReconcileTime(name string) {
	if r == nil {
		return
	}
	v, ok := r.observations.Load(name)
	if !ok {
		// Get-or-create must be a single atomic step: with a separate
		// Load and Store, two concurrent callers could each store their
		// own entry and one of them would keep writing to an orphaned
		// object that is no longer in the map.
		v, _ = r.observations.LoadOrStore(name, &Observations{})
	}
	o := v.(*Observations)
	t := time.Now().Add(r.pollInterval)
	// NOTE(review): these field writes are not synchronized with the
	// reads in ObserveReconcileDelay; presumably calls for a given name
	// are serialized by the caller — confirm.
	o.expectedReconcileTime = &t
	o.observeReconcileDelay = true
}
// ObserveReconcileDelay reports, via the ReconcileDelay histogram labeled
// with the given gvk, how late the current reconcile of the named resource
// is relative to the expected reconcile time stored by SetReconcileTime.
// Reconciles that happen earlier than expected are reported as a zero
// delay.
//
// Nothing is reported when the receiver is nil, when no expected reconcile
// time has been stored for the resource, or when the stored time has
// already been observed.
func (r *MetricRecorder) ObserveReconcileDelay(gvk schema.GroupVersionKind, name string) {
	if r == nil {
		return
	}
	entry, _ := r.observations.Load(name)
	if entry == nil {
		return
	}
	obs := entry.(*Observations)
	if !obs.observeReconcileDelay {
		return
	}
	if obs.expectedReconcileTime == nil {
		return
	}
	// Only lateness counts; an early reconcile is clamped to zero.
	seconds := 0.0
	if late := time.Since(*obs.expectedReconcileTime); late > 0 {
		seconds = late.Seconds()
	}
	ReconcileDelay.WithLabelValues(gvk.Group, gvk.Version, gvk.Kind).Observe(seconds)
	// Disarm until SetReconcileTime stores a new expected time.
	obs.observeReconcileDelay = false
}
// Start registers a delete event handler on the informer for the
// recorder's GroupVersionKind so that the observations kept for a resource
// are dropped when the resource is deleted. It then blocks until ctx is
// done, removes the handler and returns nil.
//
// An error is returned if the informer cannot be obtained or the event
// handler cannot be added.
func (r *MetricRecorder) Start(ctx context.Context) error {
	inf, err := r.cluster.GetCache().GetInformerForKind(ctx, r.gvk)
	if err != nil {
		return errors.Wrapf(err, "cannot get informer for metric recorder for resource %s", r.gvk)
	}

	registered, err := inf.AddEventHandler(cache.ResourceEventHandlerFuncs{
		DeleteFunc: func(obj interface{}) {
			// A missed delete is delivered as a tombstone wrapping the
			// last known state of the object.
			if final, ok := obj.(cache.DeletedFinalStateUnknown); ok {
				obj = final.Obj
			}
			// Use a checked assertion: an unexpected or nil object must
			// not panic inside the informer's event handler.
			managed, ok := obj.(resource.Managed)
			if !ok {
				return
			}
			r.observations.Delete(managed.GetName())
		},
	})
	if err != nil {
		return errors.Wrap(err, "cannot add delete event handler to informer for metric recorder")
	}
	defer inf.RemoveEventHandler(registered) //nolint:errcheck // this happens on destruction. We cannot do anything anyway.

	<-ctx.Done()
	return nil
}
func init() {
metrics.Registry.MustRegister(CLITime, CLIExecutions, TFProcesses, TTRMeasurements, ExternalAPITime, ExternalAPICalls, DeletionTime, ReconcileDelay)
}