feat: instrument the query layer to track rate-limited queries #3894

Merged
Changes from 3 commits
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -28,6 +28,9 @@
* `-alertmanager.alertmanager-client.tls-server-name`
* `-alertmanager.alertmanager-client.tls-insecure-skip-verify`
* [FEATURE] Compactor: added blocks storage per-tenant retention support. This is configured via `-compactor.retention-period`, and can be overridden on a per-tenant basis. #3879
* [ENHANCEMENT] Queries: Instrument queries that were discarded because they hit the configured `max_outstanding_requests_per_tenant` limit. #3894
* `cortex_query_frontend_discarded_queries_total`
* `cortex_query_scheduler_discarded_queries_total`
* [ENHANCEMENT] Ruler: Add TLS and explicit basic authentication configuration options for the HTTP client the ruler uses to communicate with the alertmanager. #3752
* `-ruler.alertmanager-client.basic-auth-username`: Configure the basic authentication username used by the client. Takes precedence over a URL-configured username.
* `-ruler.alertmanager-client.basic-auth-password`: Configure the basic authentication password used by the client. Takes precedence over a URL-configured password.
13 changes: 9 additions & 4 deletions pkg/frontend/v1/frontend.go
@@ -57,9 +57,10 @@ type Frontend struct {
activeUsers *util.ActiveUsersCleanupService

// Metrics.
queueLength *prometheus.GaugeVec
numClients prometheus.GaugeFunc
queueDuration prometheus.Histogram
queueLength *prometheus.GaugeVec
discardedQueries *prometheus.CounterVec
numClients prometheus.GaugeFunc
queueDuration prometheus.Histogram
}

type request struct {
@@ -83,14 +84,18 @@ func New(cfg Config, limits Limits, log log.Logger, registerer prometheus.Regist
Name: "cortex_query_frontend_queue_length",
Help: "Number of queries in the queue.",
}, []string{"user"}),
discardedQueries: promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
Name: "cortex_query_frontend_discarded_queries_total",
Help: "Total number of query requests discarded.",
}, []string{"user", "reason"}),
queueDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
Name: "cortex_query_frontend_queue_duration_seconds",
Help: "Time spend by requests queued.",
Buckets: prometheus.DefBuckets,
}),
}

f.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, f.queueLength)
f.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, f.queueLength, f.discardedQueries)
f.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(f.cleanupInactiveUserMetrics)

f.numClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
7 changes: 5 additions & 2 deletions pkg/frontend/v1/frontend_test.go
@@ -127,8 +127,11 @@ func TestFrontendCheckReady(t *testing.T) {
} {
t.Run(tt.name, func(t *testing.T) {
f := &Frontend{
log: log.NewNopLogger(),
requestQueue: queue.NewRequestQueue(5, prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"})),
log: log.NewNopLogger(),
requestQueue: queue.NewRequestQueue(5,
prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}),
prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user", "reason"}),
),
}
for i := 0; i < tt.connectedClients; i++ {
f.requestQueue.RegisterQuerierConnection("test")
9 changes: 7 additions & 2 deletions pkg/scheduler/queue/queue.go
@@ -7,6 +7,8 @@ import (
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/atomic"

"github.com/cortexproject/cortex/pkg/util/validation"
)

var (
@@ -47,14 +49,16 @@ type RequestQueue struct {
queues *queues
stopped bool

queueLength *prometheus.GaugeVec // Per user.
queueLength *prometheus.GaugeVec // Per user.
discardedQueries *prometheus.CounterVec // Per user and reason.
}

func NewRequestQueue(maxOutstandingPerTenant int, queueLength *prometheus.GaugeVec) *RequestQueue {
func NewRequestQueue(maxOutstandingPerTenant int, queueLength *prometheus.GaugeVec, discardedQueries *prometheus.CounterVec) *RequestQueue {
q := &RequestQueue{
queues: newUserQueues(maxOutstandingPerTenant),
connectedQuerierWorkers: atomic.NewInt32(0),
queueLength: queueLength,
discardedQueries: discardedQueries,
}

q.cond = sync.NewCond(&q.mtx)
Expand Down Expand Up @@ -91,6 +95,7 @@ func (q *RequestQueue) EnqueueRequest(userID string, req Request, maxQueriers in
}
return nil
default:
q.discardedQueries.WithLabelValues(userID, validation.RateLimited).Inc()
Contributor:
I think "rate limited" is misleading here. We only drop the request when the queue is full, which can happen for various reasons (slow queries, crashed queriers, no querier connected, or all queriers busy). I'd suggest starting without the "reason" label.

Also, I don't think we should reuse the "validation" reasons (which are meant for validating incoming samples) here.

Contributor (author):
Makes sense.
return ErrTooManyRequests
}
}
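The conversation above suggests dropping the "reason" label, but as of this commit the queue-full path still increments the counter with `validation.RateLimited`. Below is a minimal test sketch for that path, not part of the PR. It assumes `EnqueueRequest`'s full signature is `EnqueueRequest(userID string, req Request, maxQueriers int, successFn func()) error` (the hunk header above is truncated) and that `validation.RateLimited` is the only reason value emitted at this point in the review.

```go
// Sketch only, not part of this PR: exercises the queue-full path that
// increments the new discarded-queries counter.
package queue

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"

	"github.com/cortexproject/cortex/pkg/util/validation"
)

func TestEnqueueRequestDiscardsWhenTenantQueueIsFull(t *testing.T) {
	discarded := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user", "reason"})
	q := NewRequestQueue(1, // maxOutstandingPerTenant
		prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}),
		discarded,
	)

	// The first request fits within the per-tenant limit of 1.
	require.NoError(t, q.EnqueueRequest("user-1", "request-1", 0, func() {}))

	// The second request overflows the tenant's queue, is rejected, and
	// should bump the counter for this user.
	err := q.EnqueueRequest("user-1", "request-2", 0, func() {})
	require.Equal(t, ErrTooManyRequests, err)
	require.Equal(t, float64(1),
		testutil.ToFloat64(discarded.WithLabelValues("user-1", validation.RateLimited)))
}
```

If the "reason" label is removed as suggested, the counter becomes a plain per-user `CounterVec` and the final assertion would read the `user`-only series instead.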
10 changes: 8 additions & 2 deletions pkg/scheduler/queue/queue_test.go
@@ -17,7 +17,10 @@ func BenchmarkGetNextRequest(b *testing.B) {
queues := make([]*RequestQueue, 0, b.N)

for n := 0; n < b.N; n++ {
queue := NewRequestQueue(maxOutstandingPerTenant, prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}))
queue := NewRequestQueue(maxOutstandingPerTenant,
prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}),
prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user", "reason"}),
)
queues = append(queues, queue)

for ix := 0; ix < queriers; ix++ {
@@ -71,7 +74,10 @@ func BenchmarkQueueRequest(b *testing.B) {
requests := make([]string, 0, numTenants)

for n := 0; n < b.N; n++ {
q := NewRequestQueue(maxOutstandingPerTenant, prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}))
q := NewRequestQueue(maxOutstandingPerTenant,
prometheus.NewGaugeVec(prometheus.GaugeOpts{}, []string{"user"}),
prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user", "reason"}),
)

for ix := 0; ix < queriers; ix++ {
q.RegisterQuerierConnection(fmt.Sprintf("querier-%d", ix))
8 changes: 7 additions & 1 deletion pkg/scheduler/scheduler.go
@@ -55,6 +55,7 @@ type Scheduler struct {

// Metrics.
queueLength *prometheus.GaugeVec
discardedQueries *prometheus.CounterVec
connectedQuerierClients prometheus.GaugeFunc
connectedFrontendClients prometheus.GaugeFunc
queueDuration prometheus.Histogram
@@ -100,7 +101,12 @@ func NewScheduler(cfg Config, limits Limits, log log.Logger, registerer promethe
Name: "cortex_query_scheduler_queue_length",
Help: "Number of queries in the queue.",
}, []string{"user"})
s.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, s.queueLength)

s.discardedQueries = promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
Contributor:
Please clean up this metric in cleanupMetricsForInactiveUser and add it to the tests to verify that.

Contributor (author):
👍
Name: "cortex_query_scheduler_discarded_queries_total",
Help: "Total number of query requests discarded.",
}, []string{"user", "reason"})
s.requestQueue = queue.NewRequestQueue(cfg.MaxOutstandingPerTenant, s.queueLength, s.discardedQueries)

s.queueDuration = promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
Name: "cortex_query_scheduler_queue_duration_seconds",
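A minimal sketch of the follow-up the reviewer asks for above, not part of this commit. `cleanupMetricsForInactiveUser` is the hook named in the review comment; its existing body is assumed to already delete the per-user queue-length series, and `validation.RateLimited` is assumed to be the only reason value emitted at this point in the review.

```go
// Sketch only, not part of this PR: per-user metric cleanup extended to the
// new counter, as requested in the review.
func (s *Scheduler) cleanupMetricsForInactiveUser(user string) {
	// Existing behaviour (assumed): drop the queue-length series for the user.
	s.queueLength.DeleteLabelValues(user)
	// New: also drop the discarded-queries series so inactive tenants do not
	// leave stale label values behind.
	s.discardedQueries.DeleteLabelValues(user, validation.RateLimited)
}
```

The frontend has an equivalent hook, `cleanupInactiveUserMetrics` (registered in pkg/frontend/v1/frontend.go above), where the same deletion would apply; a test could then assert with `testutil.CollectAndCount` that a user's series disappear after cleanup, as the reviewer requests.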