Skip to content

Commit

Permalink
Make results cache TTL configurable and settable per tenant (#4385)
Browse files Browse the repository at this point in the history
* Make results cache ttl configurable.

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>

* Update CHANGELOG.md

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>

* Move new limits close to frontend limits.

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>

* Enhance description on OOO flag.

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>

* Fix tests.

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>

---------

Signed-off-by: Peter Štibraný <pstibrany@gmail.com>
  • Loading branch information
pstibrany committed Mar 9, 2023
1 parent 5fb813d commit 8c6cb52
Show file tree
Hide file tree
Showing 10 changed files with 121 additions and 44 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* [ENHANCEMENT] Query-frontend and ruler: add experimental, more performant protobuf internal query result response format enabled with `-ruler.query-frontend.query-result-response-format=protobuf`. #4331
* [ENHANCEMENT] Ruler: increased tolerance for missed iterations on alerts, reducing the chances of flapping firing alerts during ruler restarts. #4432
* [ENHANCEMENT] Querier and store-gateway: optimized `.*` and `.+` regular expression label matchers. #4432
* [ENHANCEMENT] Query-frontend: results cache TTL is now configurable by using `-query-frontend.results-cache-ttl` and `-query-frontend.results-cache-ttl-for-out-of-order-time-window` options. These values can also be specified per tenant. Default values are unchanged (7 days and 10 minutes respectively). #4385
* [BUGFIX] Querier: Streaming remote read will now continue to return multiple chunks per frame after the first frame. #4423

### Mixin
Expand Down
24 changes: 23 additions & 1 deletion cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -3078,7 +3078,7 @@
"kind": "field",
"name": "out_of_order_time_window",
"required": false,
"desc": "Non-zero value enables out-of-order support for most recent samples that are within the time window in relation to the TSDB's maximum time, i.e., within [db.maxTime-timeWindow, db.maxTime]). The ingester will need more memory as a factor of rate of out-of-order samples being ingested and the number of series that are getting out-of-order samples. A lower TTL of 10 minutes will be set for the query cache entries that overlap with this window.",
"desc": "Non-zero value enables out-of-order support for most recent samples that are within the time window in relation to the TSDB's maximum time, i.e., within [db.maxTime-timeWindow, db.maxTime]). The ingester will need more memory as a factor of rate of out-of-order samples being ingested and the number of series that are getting out-of-order samples. If query falls into this window, cached results will use value from -query-frontend.results-cache-ttl-for-out-of-order-time-window option to specify TTL for resulting cache entry.",
"fieldValue": null,
"fieldDefaultValue": 0,
"fieldFlag": "ingester.out-of-order-time-window",
Expand Down Expand Up @@ -3239,6 +3239,28 @@
"fieldFlag": "query-frontend.max-total-query-length",
"fieldType": "duration"
},
{
"kind": "field",
"name": "results_cache_ttl",
"required": false,
"desc": "Time to live duration for cached query results. If query falls into out-of-order time window, -query-frontend.results-cache-ttl-for-out-of-order-time-window is used instead.",
"fieldValue": null,
"fieldDefaultValue": 604800000000000,
"fieldFlag": "query-frontend.results-cache-ttl",
"fieldType": "duration",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "results_cache_ttl_for_out_of_order_time_window",
"required": false,
"desc": "Time to live duration for cached query results if query falls into out-of-order time window. This is lower than -query-frontend.results-cache-ttl so that incoming out-of-order samples are returned in the query results sooner.",
"fieldValue": null,
"fieldDefaultValue": 600000000000,
"fieldFlag": "query-frontend.results-cache-ttl-for-out-of-order-time-window",
"fieldType": "duration",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "cardinality_analysis_enabled",
Expand Down
6 changes: 5 additions & 1 deletion cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -1184,7 +1184,7 @@ Usage of ./cmd/mimir/mimir:
-ingester.native-histograms-ingestion-enabled
[experimental] Enable ingestion of native histogram samples. If false, native histogram samples are ignored without an error.
-ingester.out-of-order-time-window duration
[experimental] Non-zero value enables out-of-order support for most recent samples that are within the time window in relation to the TSDB's maximum time, i.e., within [db.maxTime-timeWindow, db.maxTime]). The ingester will need more memory as a factor of rate of out-of-order samples being ingested and the number of series that are getting out-of-order samples. A lower TTL of 10 minutes will be set for the query cache entries that overlap with this window.
[experimental] Non-zero value enables out-of-order support for most recent samples that are within the time window in relation to the TSDB's maximum time, i.e., within [db.maxTime-timeWindow, db.maxTime]). The ingester will need more memory as a factor of rate of out-of-order samples being ingested and the number of series that are getting out-of-order samples. If query falls into this window, cached results will use value from -query-frontend.results-cache-ttl-for-out-of-order-time-window option to specify TTL for resulting cache entry.
-ingester.rate-update-period duration
Period with which to update the per-tenant ingestion rates. (default 15s)
-ingester.ring.consul.acl-token string
Expand Down Expand Up @@ -1607,6 +1607,10 @@ Usage of ./cmd/mimir/mimir:
The amount of shards to use when doing parallelisation via query sharding by tenant. 0 to disable query sharding for tenant. Query sharding implementation will adjust the number of query shards based on compactor shards. This allows querier to not search the blocks which cannot possibly have the series for given query shard. (default 16)
-query-frontend.query-stats-enabled
False to disable query statistics tracking. When enabled, a message with some statistics is logged for every query. (default true)
-query-frontend.results-cache-ttl duration
[experimental] Time to live duration for cached query results. If query falls into out-of-order time window, -query-frontend.results-cache-ttl-for-out-of-order-time-window is used instead. (default 1w)
-query-frontend.results-cache-ttl-for-out-of-order-time-window duration
[experimental] Time to live duration for cached query results if query falls into out-of-order time window. This is lower than -query-frontend.results-cache-ttl so that incoming out-of-order samples are returned in the query results sooner. (default 10m)
-query-frontend.results-cache.backend string
Backend for query-frontend results cache, if not empty. Supported values: memcached, redis.
-query-frontend.results-cache.compression string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ The following features are currently experimental:
- Protobuf internal query result payload format
- `-query-frontend.query-result-response-format=protobuf`
- `-ruler.query-frontend.query-result-response-format=protobuf`
- Per-tenant results cache TTL (`-query-frontend.results-cache-ttl`, `-query-frontend.results-cache-ttl-for-out-of-order-time-window`)

## Deprecated features

Expand Down
21 changes: 18 additions & 3 deletions docs/sources/mimir/references/configuration-parameters/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -2596,9 +2596,10 @@ The `limits` block configures default and per-tenant limits imposed by component
# samples that are within the time window in relation to the TSDB's maximum
# time, i.e., within [db.maxTime-timeWindow, db.maxTime]). The ingester will
# need more memory as a factor of rate of out-of-order samples being ingested
# and the number of series that are getting out-of-order samples. A lower TTL of
# 10 minutes will be set for the query cache entries that overlap with this
# window.
# and the number of series that are getting out-of-order samples. If query falls
# into this window, cached results will use value from
# -query-frontend.results-cache-ttl-for-out-of-order-time-window option to
# specify TTL for resulting cache entry.
# CLI flag: -ingester.out-of-order-time-window
[out_of_order_time_window: <duration> | default = 0s]

Expand Down Expand Up @@ -2701,6 +2702,20 @@ The `limits` block configures default and per-tenant limits imposed by component
# CLI flag: -query-frontend.max-total-query-length
[max_total_query_length: <duration> | default = 0s]

# (experimental) Time to live duration for cached query results. If query falls
# into out-of-order time window,
# -query-frontend.results-cache-ttl-for-out-of-order-time-window is used
# instead.
# CLI flag: -query-frontend.results-cache-ttl
[results_cache_ttl: <duration> | default = 1w]

# (experimental) Time to live duration for cached query results if query falls
# into out-of-order time window. This is lower than
# -query-frontend.results-cache-ttl so that incoming out-of-order samples are
# returned in the query results sooner.
# CLI flag: -query-frontend.results-cache-ttl-for-out-of-order-time-window
[results_cache_ttl_for_out_of_order_time_window: <duration> | default = 10m]

# Enables endpoints used for cardinality analysis.
# CLI flag: -querier.cardinality-analysis-enabled
[cardinality_analysis_enabled: <boolean> | default = false]
Expand Down
7 changes: 7 additions & 0 deletions pkg/frontend/querymiddleware/limits.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ type Limits interface {

// NativeHistogramsIngestionEnabled returns whether to ingest native histograms in the ingester
NativeHistogramsIngestionEnabled(userID string) bool

// ResultsCacheTTL returns the TTL for cached results for a query that does not fall into the
// out-of-order window, or if out-of-order ingestion is disabled.
ResultsCacheTTL(userID string) time.Duration

// ResultsCacheTTLForOutOfOrderTimeWindow returns the TTL for cached results for a query that falls into the out-of-order ingestion window.
ResultsCacheTTLForOutOfOrderTimeWindow(userID string) time.Duration
}

type limitsMiddleware struct {
Expand Down
10 changes: 10 additions & 0 deletions pkg/frontend/querymiddleware/limits_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,8 @@ type mockLimits struct {
outOfOrderTimeWindow time.Duration
creationGracePeriod time.Duration
nativeHistogramsIngestionEnabled bool
resultsCacheTTL time.Duration
resultsCacheOutOfOrderWindowTTL time.Duration
}

func (m mockLimits) MaxQueryLookback(string) time.Duration {
Expand Down Expand Up @@ -343,6 +345,14 @@ func (m mockLimits) OutOfOrderTimeWindow(userID string) time.Duration {
return m.outOfOrderTimeWindow
}

// ResultsCacheTTL returns the fixed results-cache TTL configured on this mock.
func (m mockLimits) ResultsCacheTTL(string) time.Duration {
	return m.resultsCacheTTL
}

// ResultsCacheTTLForOutOfOrderTimeWindow returns the fixed TTL configured on
// this mock for cache entries that overlap the out-of-order ingestion window.
func (m mockLimits) ResultsCacheTTLForOutOfOrderTimeWindow(string) time.Duration {
	return m.resultsCacheOutOfOrderWindowTTL
}

// CreationGracePeriod returns the fixed creation grace period configured on this mock.
func (m mockLimits) CreationGracePeriod(string) time.Duration {
	return m.creationGracePeriod
}
Expand Down
11 changes: 3 additions & 8 deletions pkg/frontend/querymiddleware/split_and_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,6 @@ import (
)

const (
// Cache entries for 7 days. We're not disabling TTL because the backend client currently doesn't support it.
resultsCacheTTL = 7 * 24 * time.Hour
// resultsCacheLowerTTL is the smaller TTL used in specific cases. For example OOO queries.
resultsCacheLowerTTL = 10 * time.Minute
notCachableReasonUnalignedTimeRange = "unaligned-time-range"
notCachableReasonTooNew = "too-new"
notCachableReasonModifiersNotCachable = "has-modifiers"
Expand Down Expand Up @@ -382,11 +378,10 @@ func (s *splitAndCacheMiddleware) fetchCacheExtents(ctx context.Context, keys []

// storeCacheExtents stores the extents for given key in the cache.
func (s *splitAndCacheMiddleware) storeCacheExtents(key string, tenantIDs []string, extents []Extent) {
ttl := resultsCacheTTL
ttl := validation.SmallestPositiveNonZeroDurationPerTenant(tenantIDs, s.limits.ResultsCacheTTL)
lowerTTLWithinTimePeriod := validation.MaxDurationPerTenant(tenantIDs, s.limits.OutOfOrderTimeWindow)
if lowerTTLWithinTimePeriod > 0 && len(extents) > 0 &&
extents[len(extents)-1].End >= time.Now().Add(-lowerTTLWithinTimePeriod).UnixMilli() {
ttl = resultsCacheLowerTTL
if lowerTTLWithinTimePeriod > 0 && len(extents) > 0 && extents[len(extents)-1].End >= time.Now().Add(-lowerTTLWithinTimePeriod).UnixMilli() {
ttl = validation.SmallestPositiveNonZeroDurationPerTenant(tenantIDs, s.limits.ResultsCacheTTLForOutOfOrderTimeWindow)
}

buf, err := proto.Marshal(&CachedResponse{
Expand Down
23 changes: 14 additions & 9 deletions pkg/frontend/querymiddleware/split_and_cache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ import (
"github.com/grafana/mimir/pkg/util"
)

// TTL values used by the results-cache tests: the default entry TTL and the
// shorter TTL applied to entries overlapping the out-of-order time window.
const (
	resultsCacheTTL      = 24 * time.Hour
	resultsCacheLowerTTL = 10 * time.Minute
)

func TestSplitAndCacheMiddleware_SplitByInterval(t *testing.T) {
var (
dayOneStartTime = parseTimeRFC3339(t, "2021-10-14T00:00:00Z")
Expand Down Expand Up @@ -233,7 +236,7 @@ func TestSplitAndCacheMiddleware_ResultsCache(t *testing.T) {
true,
24*time.Hour,
false,
mockLimits{maxCacheFreshness: 10 * time.Minute},
mockLimits{maxCacheFreshness: 10 * time.Minute, resultsCacheTTL: resultsCacheTTL, resultsCacheOutOfOrderWindowTTL: resultsCacheLowerTTL},
newTestPrometheusCodec(),
cacheBackend,
ConstSplitter(day),
Expand Down Expand Up @@ -444,7 +447,7 @@ func TestSplitAndCacheMiddleware_ResultsCache_EnabledCachingOfStepUnalignedReque
true,
24*time.Hour,
true, // caching of step-unaligned requests is enabled in this test.
mockLimits{maxCacheFreshness: 10 * time.Minute},
mockLimits{maxCacheFreshness: 10 * time.Minute, resultsCacheTTL: resultsCacheTTL, resultsCacheOutOfOrderWindowTTL: resultsCacheLowerTTL},
newTestPrometheusCodec(),
cacheBackend,
ConstSplitter(day),
Expand Down Expand Up @@ -600,7 +603,7 @@ func TestSplitAndCacheMiddleware_ResultsCache_ShouldNotCacheRequestEarlierThanMa
true,
24*time.Hour,
false,
mockLimits{maxCacheFreshness: maxCacheFreshness},
mockLimits{maxCacheFreshness: maxCacheFreshness, resultsCacheTTL: resultsCacheTTL, resultsCacheOutOfOrderWindowTTL: resultsCacheLowerTTL},
newTestPrometheusCodec(),
cacheBackend,
cacheSplitter,
Expand Down Expand Up @@ -1048,7 +1051,7 @@ func TestSplitAndCacheMiddleware_ResultsCache_ExtentsEdgeCases(t *testing.T) {
true,
24*time.Hour,
false,
mockLimits{},
mockLimits{resultsCacheTTL: resultsCacheTTL, resultsCacheOutOfOrderWindowTTL: resultsCacheLowerTTL},
newTestPrometheusCodec(),
cacheBackend,
cacheSplitter,
Expand Down Expand Up @@ -1093,7 +1096,7 @@ func TestSplitAndCacheMiddleware_StoreAndFetchCacheExtents(t *testing.T) {
true,
24*time.Hour,
false,
mockLimits{},
mockLimits{resultsCacheTTL: resultsCacheTTL, resultsCacheOutOfOrderWindowTTL: resultsCacheLowerTTL},
newTestPrometheusCodec(),
cacheBackend,
ConstSplitter(day),
Expand All @@ -1112,8 +1115,8 @@ func TestSplitAndCacheMiddleware_StoreAndFetchCacheExtents(t *testing.T) {
})

t.Run("fetchCacheExtents() should return a slice with the same number of input keys and some extends filled up on partial cache hit", func(t *testing.T) {
mw.storeCacheExtents("key-1", nil, []Extent{mkExtent(10, 20)})
mw.storeCacheExtents("key-3", nil, []Extent{mkExtent(20, 30), mkExtent(40, 50)})
mw.storeCacheExtents("key-1", []string{"tenant"}, []Extent{mkExtent(10, 20)})
mw.storeCacheExtents("key-3", []string{"tenant"}, []Extent{mkExtent(20, 30), mkExtent(40, 50)})

actual := mw.fetchCacheExtents(ctx, []string{"key-1", "key-2", "key-3"})
expected := [][]Extent{{mkExtent(10, 20)}, nil, {mkExtent(20, 30), mkExtent(40, 50)}}
Expand All @@ -1126,7 +1129,7 @@ func TestSplitAndCacheMiddleware_StoreAndFetchCacheExtents(t *testing.T) {
require.NoError(t, err)
cacheBackend.StoreAsync(map[string][]byte{cacheHashKey("key-1"): buf}, 0)

mw.storeCacheExtents("key-3", nil, []Extent{mkExtent(20, 30), mkExtent(40, 50)})
mw.storeCacheExtents("key-3", []string{"tenant"}, []Extent{mkExtent(20, 30), mkExtent(40, 50)})

actual := mw.fetchCacheExtents(ctx, []string{"key-1", "key-2", "key-3"})
expected := [][]Extent{nil, nil, {mkExtent(20, 30), mkExtent(40, 50)}}
Expand Down Expand Up @@ -1697,7 +1700,9 @@ func TestSplitAndCacheMiddlewareLowerTTL(t *testing.T) {
mcache := cache.NewMockCache()
m := splitAndCacheMiddleware{
limits: mockLimits{
outOfOrderTimeWindow: time.Hour,
outOfOrderTimeWindow: time.Hour,
resultsCacheTTL: resultsCacheTTL,
resultsCacheOutOfOrderWindowTTL: resultsCacheLowerTTL,
},
cache: mcache,
}
Expand Down
Loading

0 comments on commit 8c6cb52

Please sign in to comment.