diff --git a/CHANGELOG.md b/CHANGELOG.md index b2f2ea4b79..6182531869 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * [ENHANCEMENT] Cache: Add per-tenant TTL configuration for query results cache to control cache expiration on a per-tenant basis with separate TTLs for regular and out-of-order data. #7357 * [ENHANCEMENT] Tenant Federation: Add a local cache to regex resolver. #7363 * [ENHANCEMENT] Query Scheduler: Add `cortex_query_scheduler_tracked_requests` metric to track the current number of requests held by the scheduler. #7355 +* [BUGFIX] Fix nil when ingester_query_max_attempts > 1. #7369 ## 1.21.0 in progress diff --git a/pkg/querier/distributor_queryable.go b/pkg/querier/distributor_queryable.go index f57f32bda5..6ab0b592bf 100644 --- a/pkg/querier/distributor_queryable.go +++ b/pkg/querier/distributor_queryable.go @@ -166,6 +166,16 @@ func (q *distributorQuerier) streamingSelect(ctx context.Context, sortSeries, pa return storage.ErrSeriesSet(err) } + // Guard against nil results. This can happen when queryWithRetry's + // backoff loop exits without executing (e.g., context cancelled before + // the first attempt), returning (nil, nil). See #7364. + if results == nil { + if err != nil { + return storage.ErrSeriesSet(err) + } + return storage.EmptySeriesSet() + } + serieses := make([]storage.Series, 0, len(results.Chunkseries)) for _, result := range results.Chunkseries { // Sometimes the ingester can send series that have no data. diff --git a/pkg/querier/distributor_queryable_test.go b/pkg/querier/distributor_queryable_test.go index 825da08860..cbec09d8bc 100644 --- a/pkg/querier/distributor_queryable_test.go +++ b/pkg/querier/distributor_queryable_test.go @@ -387,6 +387,70 @@ func TestDistributorQuerier_Retry(t *testing.T) { } } +// TestDistributorQuerier_Select_CancelledContext_NoRetry verifies that with +// ingesterQueryMaxAttempts=1, a cancelled context does not panic because the +// direct code path (no retry loop) is used. +func TestDistributorQuerier_Select_CancelledContext_NoRetry(t *testing.T) { + t.Parallel() + + ctx := user.InjectOrgID(context.Background(), "0") + ctx, cancel := context.WithCancel(ctx) + cancel() + + d := &MockDistributor{} + d.On("QueryStream", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&client.QueryStreamResponse{}, context.Canceled) + + ingesterQueryMaxAttempts := 1 + queryable := newDistributorQueryable(d, true, true, batch.NewChunkMergeIterator, 0, func(string) bool { + return true + }, ingesterQueryMaxAttempts) + querier, err := queryable.Querier(mint, maxt) + require.NoError(t, err) + + require.NotPanics(t, func() { + seriesSet := querier.Select(ctx, true, &storage.SelectHints{Start: mint, End: maxt}) + _ = seriesSet.Err() + }) +} + +// TestDistributorQuerier_Select_CancelledContext reproduces the panic described +// in https://github.com/cortexproject/cortex/issues/7364. +// +// When ingesterQueryMaxAttempts > 1 and the context is cancelled before the +// retry loop starts (e.g. query timeout or another querier goroutine failing), +// backoff.Ongoing() returns false immediately. The result variable stays nil, +// queryWithRetry returns (nil, nil), and streamingSelect dereferences the nil +// result at line 169 → panic. +func TestDistributorQuerier_Select_CancelledContext(t *testing.T) { + t.Parallel() + + // Create a context that is already cancelled. + ctx := user.InjectOrgID(context.Background(), "0") + ctx, cancel := context.WithCancel(ctx) + cancel() + + d := &MockDistributor{} + // No mock expectations needed — QueryStream should never be called + // because the context is already cancelled. + + ingesterQueryMaxAttempts := 2 + queryable := newDistributorQueryable(d, true, true, batch.NewChunkMergeIterator, 0, func(string) bool { + return true + }, ingesterQueryMaxAttempts) + querier, err := queryable.Querier(mint, maxt) + require.NoError(t, err) + + // This should NOT panic. Before the fix, the cancelled context causes + // queryWithRetry to return (nil, nil), and streamingSelect dereferences + // the nil result: panic: runtime error: invalid memory address or nil + // pointer dereference [signal SIGSEGV ... addr=0x8] + require.NotPanics(t, func() { + seriesSet := querier.Select(ctx, true, &storage.SelectHints{Start: mint, End: maxt}) + // With a cancelled context, we expect either an error or an empty result. + _ = seriesSet.Err() + }) +} + func TestDistributorQuerier_LabelNames(t *testing.T) { t.Parallel()