feat: instrument the query layer to track rate-limited queries #3894

Merged
Changes from 3 commits
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -28,6 +28,9 @@
* `-alertmanager.alertmanager-client.tls-server-name`
* `-alertmanager.alertmanager-client.tls-insecure-skip-verify`
* [FEATURE] Compactor: added blocks storage per-tenant retention support. This is configured via `-compactor.retention-period`, and can be overridden on a per-tenant basis. #3879
* [ENHANCEMENT] Queries: Instrument queries that were discarded because they hit the configured `max_outstanding_requests_per_tenant` limit. #3894
* `cortex_query_frontend_discarded_queries_total`
* `cortex_query_scheduler_discarded_queries_total`
* [ENHANCEMENT] Ruler: Add TLS and explicit basic authentication configuration options for the HTTP client the ruler uses to communicate with the alertmanager. #3752
* `-ruler.alertmanager-client.basic-auth-username`: Configure the basic authentication username used by the client. Takes precedence over a URL-configured username.
* `-ruler.alertmanager-client.basic-auth-password`: Configure the basic authentication password used by the client. Takes precedence over a URL-configured password.
13 changes: 9 additions & 4 deletions pkg/frontend/v1/frontend.go
@@ -57,9 +57,10 @@ type Frontend struct {
activeUsers *util.ActiveUsersCleanupService

// Metrics.
queueLength *prometheus.GaugeVec
numClients prometheus.GaugeFunc
queueDuration prometheus.Histogram
queueLength *prometheus.GaugeVec
discardedQueries *prometheus.CounterVec
numClients prometheus.GaugeFunc
queueDuration prometheus.Histogram
}

type request struct {
@@ -83,14 +84,18 @@ func New(cfg Config, limits Limits, log log.Logger, registerer prometheus.Regist
Name: "cortex_query_frontend_queue_length",
Help: "Number of queries in the queue.",
}, []string{"user"}),
discardedQueries: promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
Name: "cortex_query_frontend_discarded_queries_total",
Help: "Total number of query requests discarded.",
}, []string{"user", "reason"}),
queueDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
Name: "cortex_query_frontend_queue_duration_seconds",
Help: "Time spend by requests queued.",
Buckets: prometheus.DefBuckets,
}),
}

f.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, f.queueLength)
f.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, f.queueLength, f.discardedQueries)
f.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(f.cleanupInactiveUserMetrics)

f.numClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
7 changes: 5 additions & 2 deletions pkg/frontend/v1/frontend_test.go
@@ -127,8 +127,11 @@ func TestFrontendCheckReady(t *testing.T) {
} {
t.Run(tt.name, func(t *testing.T) {
f := &Frontend{
log: log.NewNopLogger(),
requestQueue: queue.NewRequestQueue(5, prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"})),
log: log.NewNopLogger(),
requestQueue: queue.NewRequestQueue(5,
prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}),
prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user", "reason"}),
),
}
for i := 0; i < tt.connectedClients; i++ {
f.requestQueue.RegisterQuerierConnection("test")
9 changes: 7 additions & 2 deletions pkg/scheduler/queue/queue.go
@@ -7,6 +7,8 @@ import (
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/atomic"

"github.com/cortexproject/cortex/pkg/util/validation"
)

var (
@@ -47,14 +49,16 @@ type RequestQueue struct {
queues *queues
stopped bool

queueLength *prometheus.GaugeVec // Per user.
queueLength *prometheus.GaugeVec // Per user.
discardedQueries *prometheus.CounterVec // Per user and reason.
}

func NewRequestQueue(maxOutstandingPerTenant int, queueLength *prometheus.GaugeVec) *RequestQueue {
func NewRequestQueue(maxOutstandingPerTenant int, queueLength *prometheus.GaugeVec, discardedQueries *prometheus.CounterVec) *RequestQueue {
q := &RequestQueue{
queues: newUserQueues(maxOutstandingPerTenant),
connectedQuerierWorkers: atomic.NewInt32(0),
queueLength: queueLength,
discardedQueries: discardedQueries,
}

q.cond = sync.NewCond(&q.mtx)
Expand Down Expand Up @@ -91,6 +95,7 @@ func (q *RequestQueue) EnqueueRequest(userID string, req Request, maxQueriers in
}
return nil
default:
q.discardedQueries.WithLabelValues(userID, validation.RateLimited).Inc()
Contributor:
I think "rate limited" is misleading here. We only drop the request when the queue is full, which can happen for various reasons (slow queries, crashed queriers, no querier connected, or all queriers busy). I'd suggest starting without the "reason" label.

Also, I don't think we should reuse the "validation" reasons (which are meant for validating incoming samples) here.

Contributor (author):
Makes sense.
return ErrTooManyRequests
}
}
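The conversation above suggests dropping the "reason" label, but as of this commit the queue-full path still increments the counter with `validation.RateLimited`. Below is a minimal test sketch for that path, not part of the PR. It assumes `EnqueueRequest`'s full signature is `EnqueueRequest(userID string, req Request, maxQueriers int, successFn func()) error` (the hunk header above is truncated) and that `validation.RateLimited` is the only reason value emitted at this point in the review.

```go
// Sketch only, not part of this PR: exercises the queue-full path that
// increments the new discarded-queries counter.
package queue

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"

	"github.com/cortexproject/cortex/pkg/util/validation"
)

func TestEnqueueRequestDiscardsWhenTenantQueueIsFull(t *testing.T) {
	discarded := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user", "reason"})
	q := NewRequestQueue(1, // maxOutstandingPerTenant
		prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}),
		discarded,
	)

	// The first request fits within the per-tenant limit of 1.
	require.NoError(t, q.EnqueueRequest("user-1", "request-1", 0, func() {}))

	// The second request overflows the tenant's queue, is rejected, and
	// should bump the counter for this user.
	err := q.EnqueueRequest("user-1", "request-2", 0, func() {})
	require.Equal(t, ErrTooManyRequests, err)
	require.Equal(t, float64(1),
		testutil.ToFloat64(discarded.WithLabelValues("user-1", validation.RateLimited)))
}
```

If the "reason" label is removed as suggested, the counter becomes a plain per-user `CounterVec` and the final assertion would read the `user`-only series instead.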
10 changes: 8 additions & 2 deletions pkg/scheduler/queue/queue_test.go
@@ -17,7 +17,10 @@ func BenchmarkGetNextRequest(b *testing.B) {
queues := make([]*RequestQueue, 0, b.N)

for n := 0; n < b.N; n++ {
queue := NewRequestQueue(maxOutstandingPerTenant, prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}))
queue := NewRequestQueue(maxOutstandingPerTenant,
prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}),
prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user", "reason"}),
)
queues = append(queues, queue)

for ix := 0; ix < queriers; ix++ {
@@ -71,7 +74,10 @@ func BenchmarkQueueRequest(b *testing.B) {
requests := make([]string, 0, numTenants)

for n := 0; n < b.N; n++ {
q := NewRequestQueue(maxOutstandingPerTenant, prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}))
q := NewRequestQueue(maxOutstandingPerTenant,
prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}),
prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user", "reason"}),
)

for ix := 0; ix < queriers; ix++ {
q.RegisterQuerierConnection(fmt.Sprintf("querier-%d", ix))
8 changes: 7 additions & 1 deletion pkg/scheduler/scheduler.go
@@ -55,6 +55,7 @@ type Scheduler struct {

// Metrics.
queueLength *prometheus.GaugeVec
discardedQueries *prometheus.CounterVec
connectedQuerierClients prometheus.GaugeFunc
connectedFrontendClients prometheus.GaugeFunc
queueDuration prometheus.Histogram
@@ -100,7 +101,12 @@ func NewScheduler(cfg Config, limits Limits, log log.Logger, registerer promethe
Name: "cortex_query_scheduler_queue_length",
Help: "Number of queries in the queue.",
}, []string{"user"})
s.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, s.queueLength)

s.discardedQueries = promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
Contributor:
Please clean up this metric in cleanupMetricsForInactiveUser and add it to the tests to verify that.

Contributor (author):
👍
Name: "cortex_query_scheduler_discarded_queries_total",
Help: "Total number of query requests discarded.",
}, []string{"user", "reason"})
s.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, s.queueLength, s.discardedQueries)

s.queueDuration = promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
Name: "cortex_query_scheduler_queue_duration_seconds",
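A minimal sketch of the follow-up the reviewer asks for above, not part of this commit. `cleanupMetricsForInactiveUser` is the hook named in the review comment; its existing body is assumed to already delete the per-user queue-length series, and `validation.RateLimited` is assumed to be the only reason value emitted at this point in the review.

```go
// Sketch only, not part of this PR: per-user metric cleanup extended to the
// new counter, as requested in the review.
func (s *Scheduler) cleanupMetricsForInactiveUser(user string) {
	// Existing behaviour (assumed): drop the queue-length series for the user.
	s.queueLength.DeleteLabelValues(user)
	// New: also drop the discarded-queries series so inactive tenants do not
	// leave stale label values behind.
	s.discardedQueries.DeleteLabelValues(user, validation.RateLimited)
}
```

The frontend has an equivalent hook, `cleanupInactiveUserMetrics` (registered in pkg/frontend/v1/frontend.go above), where the same deletion would apply; a test could then assert with `testutil.CollectAndCount` that a user's series disappear after cleanup, as the reviewer requests.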