fn: introducing lb placer basic metrics #1058
@@ -0,0 +1,103 @@
package runnerpool

import (
    "context"
    "math"
    "time"

    "github.com/sirupsen/logrus"
    "go.opencensus.io/stats"
    "go.opencensus.io/stats/view"
    "go.opencensus.io/tag"
)

var (
    attemptCountMeasure      = stats.Int64("lb_placer_attempt_count", "LB Placer Number of Runners Attempted Count", "")
    errorPoolCountMeasure    = stats.Int64("lb_placer_rp_error_count", "LB Placer RunnerPool RunnerList Error Count", "")
    emptyPoolCountMeasure    = stats.Int64("lb_placer_rp_empty_count", "LB Placer RunnerPool RunnerList Empty Count", "")
    cancelCountMeasure       = stats.Int64("lb_placer_client_cancelled_count", "LB Placer Client Cancel Count", "")
    placedErrorCountMeasure  = stats.Int64("lb_placer_placed_error_count", "LB Placer Placed Call Count With Errors", "")
    placedOKCountMeasure     = stats.Int64("lb_placer_placed_ok_count", "LB Placer Placed Call Count Without Errors", "")
    retryTooBusyCountMeasure = stats.Int64("lb_placer_retry_busy_count", "LB Placer Retry Count - Too Busy", "")
    retryErrorCountMeasure   = stats.Int64("lb_placer_retry_error_count", "LB Placer Retry Count - Errors", "")
    placerLatencyMeasure     = stats.Int64("lb_placer_latency", "LB Placer Latency", "msecs")
)

// attemptTracker is a helper struct for tracking LB Placer latency and attempt counts.
type attemptTracker struct {
    ctx             context.Context
    startTime       time.Time
    lastAttemptTime time.Time
    attemptCount    int64
}

func newAttemptTracker(ctx context.Context) *attemptTracker {
    return &attemptTracker{
        ctx:       ctx,
        startTime: time.Now(),
    }
}

func (data *attemptTracker) finalizeAttempts(isSuccess bool) {
    stats.Record(data.ctx, attemptCountMeasure.M(data.attemptCount))

    // IMPORTANT: here we use (lastAttemptTime - startTime). We want to exclude TryExec
    // latency *if* TryExec() completes successfully. The placer latency metric only shows
    // how much time is spent in the placer loop/retries. The metric does include the
    // rtt/latency of *all* unsuccessful NACK (retriable) responses from runners. For
    // example, if the placer loop retries 4 runners (which take 5 msecs each) and then
    // the 5th runner succeeds (but takes 35 seconds to finish execution), we report
    // 20 msecs as our LB latency.
    endTime := data.lastAttemptTime
    if !isSuccess {
        endTime = time.Now()
    }

    stats.Record(data.ctx, placerLatencyMeasure.M(int64(endTime.Sub(data.startTime)/time.Millisecond)))
}

func (data *attemptTracker) recordAttempt() {
    data.lastAttemptTime = time.Now()
    if data.attemptCount != math.MaxInt64 {
        data.attemptCount++
    }
}

func makeKeys(names []string) []tag.Key {
    var tagKeys []tag.Key
    for _, name := range names {
        key, err := tag.NewKey(name)
        if err != nil {
            logrus.WithError(err).Fatalf("cannot create tag key for %v", name)
        }
        tagKeys = append(tagKeys, key)
    }
    return tagKeys
}

func createView(measure stats.Measure, agg *view.Aggregation, tagKeys []string) *view.View {
    return &view.View{
        Name:        measure.Name(),
        Description: measure.Description(),
        TagKeys:     makeKeys(tagKeys),
        Measure:     measure,
        Aggregation: agg,
    }
}

func RegisterPlacerViews(tagKeys []string) {
    err := view.Register(
        createView(attemptCountMeasure, view.Distribution(0, 1, 2, 4, 8, 32, 64, 256), tagKeys),
        createView(errorPoolCountMeasure, view.Count(), tagKeys),
        createView(emptyPoolCountMeasure, view.Count(), tagKeys),
        createView(cancelCountMeasure, view.Count(), tagKeys),
        createView(placedErrorCountMeasure, view.Count(), tagKeys),
        createView(placedOKCountMeasure, view.Count(), tagKeys),
        createView(retryTooBusyCountMeasure, view.Count(), tagKeys),
        createView(retryErrorCountMeasure, view.Count(), tagKeys),
        createView(placerLatencyMeasure, view.Distribution(1, 10, 25, 50, 200, 1000, 10000, 60000), tagKeys),
Contributor

I did something similar in api/server/stats.go. The idea was to register views only if the server is started with …

Showing the limits of my own knowledge with this question. Looked at the docs for …

Member (Author)

Distribution seems to get multiple values in the buckets. (See the bucket sketch after this file.)
    )
    if err != nil {
        logrus.WithError(err).Fatal("cannot create view")
    }
}
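On the Distribution question in the thread above: in OpenCensus Go, view.Distribution(bounds...) defines histogram bucket boundaries, and every recorded value lands in exactly one bucket, so a bucket's count grows as more recordings fall into it. A minimal, self-contained sketch; the measure name and bounds here are illustrative, not fn's:

package main

import (
    "context"
    "log"

    "go.opencensus.io/stats"
    "go.opencensus.io/stats/view"
)

// Hypothetical measure, only for demonstrating bucket semantics.
var latencyMs = stats.Int64("example_latency", "Example latency", "msecs")

func main() {
    // Bounds 1, 10, 25 produce buckets (-inf, 1), [1, 10), [10, 25), [25, +inf).
    v := &view.View{
        Name:        latencyMs.Name(),
        Description: latencyMs.Description(),
        Measure:     latencyMs,
        Aggregation: view.Distribution(1, 10, 25),
    }
    if err := view.Register(v); err != nil {
        log.Fatal(err)
    }

    // Each recorded value increments the count of the single bucket it falls into.
    ctx := context.Background()
    stats.Record(ctx, latencyMs.M(3))  // lands in [1, 10)
    stats.Record(ctx, latencyMs.M(12)) // lands in [10, 25)
    stats.Record(ctx, latencyMs.M(12)) // same bucket again: its count becomes 2
}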
@@ -468,6 +468,12 @@ func WithAgentFromEnv() ServerOption {
        placer = pool.NewNaivePlacer()
    }

    // If prometheus is enabled, add LB placer metrics to the views
    if s.promExporter != nil {
Contributor

don't think we should be doing this as a general rule of thumb -- which exporters we have attached shouldn't affect which metrics we collect (easy to fix...) -- if we need a way to turn these off we kinda have it half plumbed with the register views methods throughout now, seems like we could do similar to as was done in #1057 to have the default set of views and let the user either import the package that turns on all our views or they can add them individually themselves. anyway, not too big a deal (see the sketch after this diff)

Member (Author)

@rdallman thanks for pointing this out, saw your PR and just merged it. :-)
        keys := []string{"fn_appname", "fn_path"}
        pool.RegisterPlacerViews(keys)
    }

    s.agent, err = agent.NewLBAgent(agent.NewCachedDataAccess(cl), runnerPool, placer)
    if err != nil {
        return errors.New("LBAgent creation failed")
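Following up on the review note about decoupling view registration from the exporter: an embedder could register the placer views itself and tag its request contexts, whether or not a Prometheus exporter is attached. A rough sketch; the import path, app name, and path values below are placeholders, not fn's actual wiring:

package main

import (
    "context"
    "log"

    "go.opencensus.io/tag"

    pool "github.com/fnproject/fn/api/runnerpool" // import path assumed
)

func main() {
    // Register the LB placer views regardless of which exporter (if any) is attached.
    pool.RegisterPlacerViews([]string{"fn_appname", "fn_path"})

    // Tag values must be present on the context that reaches stats.Record for the
    // views to break measurements down by these keys.
    appKey, _ := tag.NewKey("fn_appname")
    pathKey, _ := tag.NewKey("fn_path")
    ctx, err := tag.New(context.Background(),
        tag.Upsert(appKey, "myapp"),   // placeholder app name
        tag.Upsert(pathKey, "/hello"), // placeholder route path
    )
    if err != nil {
        log.Fatal(err)
    }
    _ = ctx // this would be the context handed to the placer / attemptTracker
}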
@@ -0,0 +1 @@
prometheus.*.txt
Should this code be conditional on the length of the runners list being greater than zero? Also worth noting: this changes the behavior of this function in the case that no runners are found. You'll now return ErrCallTimeoutServerBusy when previously it would have returned nil.
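To make that concern concrete: if the runner list is empty, a placement loop never attempts anything and falls through to its timeout/busy error, where it previously returned nil. Below is a hypothetical placement loop, not the actual fn placer code, using the attemptTracker and measures introduced in this PR; the runner interface and the local busy error are stand-ins for fn's real types:

package runnerpool

import (
    "context"
    "errors"

    "go.opencensus.io/stats"
)

// runner is a simplified stand-in for fn's Runner interface, used only to
// illustrate the control flow discussed in the review comment.
type runner interface {
    TryExec(ctx context.Context) (bool, error)
}

var errCallTimeoutServerBusy = errors.New("server busy") // stand-in for ErrCallTimeoutServerBusy

// placeCall sketches how the attemptTracker could wrap a placement loop, and
// where a guard on an empty runner list would go.
func placeCall(ctx context.Context, runners []runner) error {
    tracker := newAttemptTracker(ctx)

    // Possible guard from the review comment: with no runners available, record the
    // empty-pool measure and decide explicitly what to return. Returning the busy
    // error here is the new behavior being flagged; returning nil would keep the old one.
    if len(runners) == 0 {
        stats.Record(ctx, emptyPoolCountMeasure.M(1))
        tracker.finalizeAttempts(false)
        return errCallTimeoutServerBusy
    }

    for _, r := range runners {
        tracker.recordAttempt()
        placed, err := r.TryExec(ctx)
        if err != nil {
            stats.Record(ctx, retryErrorCountMeasure.M(1))
            continue
        }
        if placed {
            stats.Record(ctx, placedOKCountMeasure.M(1))
            tracker.finalizeAttempts(true)
            return nil
        }
        // Runner NACKed (too busy): count the retry and move on to the next runner.
        stats.Record(ctx, retryTooBusyCountMeasure.M(1))
    }

    tracker.finalizeAttempts(false)
    return errCallTimeoutServerBusy
}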