-
Notifications
You must be signed in to change notification settings - Fork 405
/
stats.go
318 lines (274 loc) · 11 KB
/
stats.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
package agent
import (
"context"
"strings"
"time"
"github.com/fnproject/fn/api/common"
"github.com/sirupsen/logrus"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
)
// Tag keys attached to individual measurements in this file.
var (
// containerStateKey tags container eviction measurements (see statsContainerEvicted).
containerStateKey = common.MakeKey("container_state")
// callStatusKey tags call latency measurements (see statsCallLatency).
callStatusKey = common.MakeKey("call_status")
)
func statsCalls(ctx context.Context) {
stats.Record(ctx, callsMeasure.M(1))
}
// statsEnqueue records +1 against the queued measure; statsDequeue records
// the matching -1.
func statsEnqueue(ctx context.Context) {
	const delta int64 = 1
	stats.Record(ctx, queuedMeasure.M(delta))
}
// statsDequeue records -1 against the queued measure, undoing a prior
// statsEnqueue.
func statsDequeue(ctx context.Context) {
	const delta int64 = -1
	stats.Record(ctx, queuedMeasure.M(delta))
}
// statsStartRun records +1 against the running measure; statsStopRun records
// the matching -1.
func statsStartRun(ctx context.Context) {
	const delta int64 = 1
	stats.Record(ctx, runningMeasure.M(delta))
}
// statsStopRun records -1 against the running measure, undoing a prior
// statsStartRun.
func statsStopRun(ctx context.Context) {
	const delta int64 = -1
	stats.Record(ctx, runningMeasure.M(delta))
}
func statsComplete(ctx context.Context) {
stats.Record(ctx, completedMeasure.M(1))
}
func statsCanceled(ctx context.Context) {
stats.Record(ctx, canceledMeasure.M(1))
}
func statsTimedout(ctx context.Context) {
stats.Record(ctx, timedoutMeasure.M(1))
}
func statsErrors(ctx context.Context) {
stats.Record(ctx, errorsMeasure.M(1))
}
func statsTooBusy(ctx context.Context) {
stats.Record(ctx, serverBusyMeasure.M(1))
}
// statsLBAgentRunnerSchedLatency records dur, truncated to whole
// milliseconds, against the lb_runner_sched_latency measure.
func statsLBAgentRunnerSchedLatency(ctx context.Context, dur time.Duration) {
	millis := int64(dur / time.Millisecond)
	stats.Record(ctx, runnerSchedLatencyMeasure.M(millis))
}
// statsLBAgentRunnerExecLatency records dur, truncated to whole
// milliseconds, against the lb_runner_exec_latency measure.
func statsLBAgentRunnerExecLatency(ctx context.Context, dur time.Duration) {
	millis := int64(dur / time.Millisecond)
	stats.Record(ctx, runnerExecLatencyMeasure.M(millis))
}
// statsContainerEvicted records one container eviction, tagged with
// containerState under the container_state key.
//
// The process is terminated via logrus.Fatal if the tag cannot be applied.
func statsContainerEvicted(ctx context.Context, containerState string) {
	ctx, err := tag.New(ctx,
		tag.Upsert(containerStateKey, containerState),
	)
	if err != nil {
		logrus.Fatal(err)
	}
	// Record 1 per eviction rather than 0. The view registered in
	// RegisterContainerViews uses view.Count(), which counts data points and
	// ignores their value, so the exported count is unchanged; recording 1
	// also keeps the measure meaningful under any value-based aggregation.
	stats.Record(ctx, containerEvictedMeasure.M(1))
}
// statsUtilization records the four fields of util (CPU used/available and
// memory used/available), each converted to int64, against their respective
// utilization measures.
func statsUtilization(ctx context.Context, util ResourceUtilization) {
	cpuUsed := int64(util.CpuUsed)
	cpuAvail := int64(util.CpuAvail)
	memUsed := int64(util.MemUsed)
	memAvail := int64(util.MemAvail)

	stats.Record(ctx, utilCpuUsedMeasure.M(cpuUsed))
	stats.Record(ctx, utilCpuAvailMeasure.M(cpuAvail))
	stats.Record(ctx, utilMemUsedMeasure.M(memUsed))
	stats.Record(ctx, utilMemAvailMeasure.M(memAvail))
}
// statsCallLatency records dur, truncated to whole milliseconds, against the
// lb_call_latency measure, tagged with callStatus under the call_status key.
//
// The process is terminated via logrus.Fatal if the tag cannot be applied.
func statsCallLatency(ctx context.Context, dur time.Duration, callStatus string) {
	tagged, err := tag.New(ctx, tag.Upsert(callStatusKey, callStatus))
	if err != nil {
		logrus.Fatal(err)
	}

	elapsedMs := int64(dur / time.Millisecond)
	stats.Record(tagged, callLatencyMeasure.M(elapsedMs))
}
// Names of the metrics recorded by this file.
const (
//
// WARNING: dual-role metrics. The names below are used both in the
// Runner/Agent and in the LB-Agent, with slightly different meanings.
//
// LB context:
//
// calls - call received in Agent Submit
// queued - LB is reading request from client and attempting to validate/start
// running - LB is forwarding call to runners
// completed - call completed running successfully
// canceled - call canceled (client disconnect)
// timeouts - call timed out
// errors - call failed
// server_busy - server busy responses (retriable)
//
// Agent/Runner context:
//
// calls - calls received in Agent Submit
// queued - reading/validating call from client and waiting for resources/containers to start
// running - call is now running
// completed - call completed running (success)
// canceled - call canceled (client disconnect)
// timeouts - call timed out
// errors - call failed
// server_busy - server busy responses (retriable)
//
queuedMetricName = "queued"
callsMetricName = "calls"
runningMetricName = "running"
completedMetricName = "completed"
canceledMetricName = "canceled"
timedoutMetricName = "timeouts"
errorsMetricName = "errors"
serverBusyMetricName = "server_busy"
// Container evictions; recorded with a container_state tag.
containerEvictedMetricName = "container_evictions"
// Resource utilization (CPU and memory, used and available).
utilCpuUsedMetricName = "util_cpu_used"
utilCpuAvailMetricName = "util_cpu_avail"
utilMemUsedMetricName = "util_mem_used"
utilMemAvailMetricName = "util_mem_avail"
// Reported by LB
runnerSchedLatencyMetricName = "lb_runner_sched_latency"
runnerExecLatencyMetricName = "lb_runner_exec_latency"
callLatencyMetricName = "lb_call_latency"
)
// Measures backing the metrics above. Each is created through
// common.MakeMeasure with a name, a description and a unit string.
var (
// Call lifecycle measures.
queuedMeasure = common.MakeMeasure(queuedMetricName, "calls currently queued against agent", "")
callsMeasure = common.MakeMeasure(callsMetricName, "calls created in agent", "")
runningMeasure = common.MakeMeasure(runningMetricName, "calls currently running in agent", "")
completedMeasure = common.MakeMeasure(completedMetricName, "calls completed in agent", "")
canceledMeasure = common.MakeMeasure(canceledMetricName, "calls canceled in agent", "")
timedoutMeasure = common.MakeMeasure(timedoutMetricName, "calls timed out in agent", "")
errorsMeasure = common.MakeMeasure(errorsMetricName, "calls errored in agent", "")
serverBusyMeasure = common.MakeMeasure(serverBusyMetricName, "calls where server was too busy in agent", "")
// Docker and container measure collections, built by the init* helpers in this file.
dockerMeasures = initDockerMeasures()
containerGaugeMeasures = initContainerGaugeMeasures()
containerTimeMeasures = initContainerTimeMeasures()
// Resource utilization measures; the memory measures use the "By" unit.
utilCpuUsedMeasure = common.MakeMeasure(utilCpuUsedMetricName, "agent cpu in use", "")
utilCpuAvailMeasure = common.MakeMeasure(utilCpuAvailMetricName, "agent cpu available", "")
utilMemUsedMeasure = common.MakeMeasure(utilMemUsedMetricName, "agent memory in use", "By")
utilMemAvailMeasure = common.MakeMeasure(utilMemAvailMetricName, "agent memory available", "By")
containerEvictedMeasure = common.MakeMeasure(containerEvictedMetricName, "containers evicted", "")
// NOTE(review): the latency measures below use the unit string "msecs", while
// initContainerTimeMeasures uses "ms" — confirm whether the difference is intentional.
// Reported By LB: How long does a runner scheduler wait for a committed call? e.g. wait/launch/pull containers
runnerSchedLatencyMeasure = common.MakeMeasure(runnerSchedLatencyMetricName, "Runner Scheduler Latency Reported By LBAgent", "msecs")
// Reported By LB: Function execution time inside a container.
runnerExecLatencyMeasure = common.MakeMeasure(runnerExecLatencyMetricName, "Runner Container Execution Latency Reported By LBAgent", "msecs")
// Reported By LB: Function total call latency (except function execution inside container)
callLatencyMeasure = common.MakeMeasure(callLatencyMetricName, "LB Call Latency Reported By LBAgent", "msecs")
)
// RegisterLBAgentViews creates and registers the distribution views for the
// latency measures reported by the LB agent (runner scheduling latency,
// runner execution latency and call latency), using latencyDist as bucket
// boundaries.
//
// The call latency view is additionally keyed by the call_status tag. The
// process is terminated via logrus if registration fails.
func RegisterLBAgentViews(tagKeys []string, latencyDist []float64) {
	// call_status always comes first; drop any duplicate supplied by the caller.
	callLatencyTags := append(make([]string, 0, len(tagKeys)+1), "call_status")
	for i := range tagKeys {
		if tagKeys[i] == "call_status" {
			continue
		}
		callLatencyTags = append(callLatencyTags, tagKeys[i])
	}

	lbViews := []*view.View{
		common.CreateView(runnerSchedLatencyMeasure, view.Distribution(latencyDist...), tagKeys),
		common.CreateView(runnerExecLatencyMeasure, view.Distribution(latencyDist...), tagKeys),
		common.CreateView(callLatencyMeasure, view.Distribution(latencyDist...), callLatencyTags),
	}
	if err := view.Register(lbViews...); err != nil {
		logrus.WithError(err).Fatal("cannot register view")
	}
}
// RegisterAgentViews creates and registers all agent views
func RegisterAgentViews(tagKeys []string, latencyDist []float64) {
err := view.Register(
common.CreateView(queuedMeasure, view.Sum(), tagKeys),
common.CreateView(callsMeasure, view.Sum(), tagKeys),
common.CreateView(runningMeasure, view.Sum(), tagKeys),
common.CreateView(completedMeasure, view.Sum(), tagKeys),
common.CreateView(canceledMeasure, view.Sum(), tagKeys),
common.CreateView(timedoutMeasure, view.Sum(), tagKeys),
common.CreateView(errorsMeasure, view.Sum(), tagKeys),
common.CreateView(serverBusyMeasure, view.Sum(), tagKeys),
common.CreateView(utilCpuUsedMeasure, view.LastValue(), tagKeys),
common.CreateView(utilCpuAvailMeasure, view.LastValue(), tagKeys),
common.CreateView(utilMemUsedMeasure, view.LastValue(), tagKeys),
common.CreateView(utilMemAvailMeasure, view.LastValue(), tagKeys),
)
if err != nil {
logrus.WithError(err).Fatal("cannot register view")
}
}
// RegisterDockerViews creates and registers a distribution view for every
// Docker measure, using the provided tag keys.
//
// Bucket boundaries are chosen by measure name: ioNetDist for network rx/tx,
// ioDiskDist for disk read/write, memoryDist for memory limit/usage, cpuDist
// for the cpu measures, and latencyDist for anything else. The process is
// terminated via logrus if a registration fails.
func RegisterDockerViews(tagKeys []string, latencyDist, ioNetDist, ioDiskDist, memoryDist, cpuDist []float64) {
	for _, measure := range dockerMeasures {
		// Remember these are sampled by docker in short intervals (approx 1 sec)
		var buckets []float64
		switch measure.Name() {
		case "docker_stats_net_rx", "docker_stats_net_tx":
			buckets = ioNetDist
		case "docker_stats_disk_read", "docker_stats_disk_write":
			buckets = ioDiskDist
		case "docker_stats_mem_limit", "docker_stats_mem_usage":
			buckets = memoryDist
		case "docker_stats_cpu_user", "docker_stats_cpu_total", "docker_stats_cpu_kernel":
			buckets = cpuDist
		default:
			// Not used yet.
			buckets = latencyDist
		}

		dockerView := common.CreateView(measure, view.Distribution(buckets...), tagKeys)
		if err := view.Register(dockerView); err != nil {
			logrus.WithError(err).Fatal("cannot register view")
		}
	}
}
// RegisterContainerViews creates and registers the container views with the
// provided tag keys: a Sum view per container gauge measure, a distribution
// view (bucketed by latencyDist) per container time measure, and a Count view
// for container evictions that is additionally keyed by container_state.
//
// Entries whose key is empty are skipped. The process is terminated via
// logrus if a registration fails.
func RegisterContainerViews(tagKeys []string, latencyDist []float64) {
	// Gauge views, one per non-empty gauge key.
	for idx := range containerGaugeKeys {
		if containerGaugeKeys[idx] == "" {
			continue
		}
		gaugeView := common.CreateView(containerGaugeMeasures[idx], view.Sum(), tagKeys)
		if err := view.Register(gaugeView); err != nil {
			logrus.WithError(err).Fatal("cannot register view")
		}
	}

	// Time-in-state views, one per non-empty time key.
	for idx := range containerTimeKeys {
		if containerTimeKeys[idx] == "" {
			continue
		}
		timeView := common.CreateView(containerTimeMeasures[idx], view.Distribution(latencyDist...), tagKeys)
		if err := view.Register(timeView); err != nil {
			logrus.WithError(err).Fatal("cannot register view")
		}
	}

	// Evictions are keyed by container_state first, followed by the caller's
	// keys with any duplicate container_state removed.
	evictTags := append(make([]string, 0, len(tagKeys)+1), "container_state")
	for idx := range tagKeys {
		if tagKeys[idx] == "container_state" {
			continue
		}
		evictTags = append(evictTags, tagKeys[idx])
	}

	evictView := common.CreateView(containerEvictedMeasure, view.Count(), evictTags)
	if err := view.Register(evictView); err != nil {
		logrus.WithError(err).Fatal("cannot register view")
	}
}
// initDockerMeasures builds the Docker stats measures. The returned map is
// keyed by the short stat name (e.g. "net_rx"), while each measure itself is
// named "docker_stats_" plus that key. Stats whose name contains "cpu" use
// the unit "cpu"; all others use "bytes".
func initDockerMeasures() map[string]*stats.Int64Measure {
	// TODO this is nasty figure out how to use opencensus to not have to declare these
	statNames := []string{
		"net_rx", "net_tx",
		"mem_limit", "mem_usage",
		"disk_read", "disk_write",
		"cpu_user", "cpu_total", "cpu_kernel",
	}

	byName := make(map[string]*stats.Int64Measure, len(statNames))
	for _, name := range statNames {
		var unit string
		if strings.Contains(name, "cpu") {
			unit = "cpu"
		} else {
			unit = "bytes"
		}
		byName[name] = common.MakeMeasure("docker_stats_"+name, "docker container stats for "+name, unit)
	}
	return byName
}
// initContainerGaugeMeasures builds one measure per entry of
// containerGaugeKeys, at the same index. Entries with an empty key are left
// nil.
func initContainerGaugeMeasures() []*stats.Int64Measure {
	measures := make([]*stats.Int64Measure, len(containerGaugeKeys))
	for idx := range containerGaugeKeys {
		name := containerGaugeKeys[idx]
		if name != "" {
			measures[idx] = common.MakeMeasure(name, "containers in state "+name, "")
		}
		// An empty key leaves the slot nil intentionally, let it panic.
	}
	return measures
}
// initContainerTimeMeasures builds one measure (unit "ms") per entry of
// containerTimeKeys, at the same index. Entries with an empty key are left
// nil.
func initContainerTimeMeasures() []*stats.Int64Measure {
	measures := make([]*stats.Int64Measure, len(containerTimeKeys))
	for idx := range containerTimeKeys {
		name := containerTimeKeys[idx]
		if name != "" {
			measures[idx] = common.MakeMeasure(name, "time spent in container state "+name, "ms")
		}
	}
	return measures
}