stats: fix histograms for collated strings
Currently, when encoding the upper bounds of histogram buckets we always
use the datum's key-encoding. However, some datums are composite, and for
those we need to operate on their value encoding as well. This strategy
happens to be OK for floats and decimals because their value part only
stores auxiliary information (like the number of trailing zeroes and the
sign of zero), so even if we lose that auxiliary information, decoding
the upper bound still yields a datum with the same comparison properties.
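
As a standalone illustration of that point (plain Go, not CockroachDB code):
the two floating-point zeros compare equal, so an order-preserving key
encoding may map them to the same bytes; the sign of zero is exactly the kind
of auxiliary information that only the value part needs to carry.

package main

import (
	"fmt"
	"math"
)

func main() {
	// The literal -0.0 is just +0.0 in Go constant arithmetic, so build
	// negative zero explicitly.
	negZero := math.Copysign(0, -1)
	posZero := 0.0

	// The two zeros are indistinguishable for comparison purposes...
	fmt.Println(negZero == posZero) // true

	// ...but the sign bit is only recoverable from the value itself, not from
	// an ordering-equivalent key.
	fmt.Println(math.Signbit(negZero), math.Signbit(posZero)) // true false
}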

Unfortunately, this is not the case for collated strings where
key-encoding and value-encoding can be vastly different. So the fact
that we always use the key-encoding for collated strings makes it so
that when we key-decode upper bounds, we end up with garbage datums. As
a result, our histograms are meaningless for collated strings.
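
A rough standalone sketch of why this happens, using golang.org/x/text/collate
rather than CockroachDB's internals (the collation below is only loosely
analogous to en_US_u_ks_level2): the key-encoding of a collated string is its
collation sort key, an opaque and lossy byte string from which the original
text cannot be recovered, and under a case-insensitive collation distinct
strings can even share the same key.

package main

import (
	"fmt"

	"golang.org/x/text/collate"
	"golang.org/x/text/language"
)

func main() {
	// A case-insensitive English collation (strength level 2 ignores case).
	c := collate.New(language.English, collate.IgnoreCase)

	// 'hello' and 'HELLO' are equal under this collation, so an
	// order-preserving key cannot remember which spelling was stored.
	fmt.Println(c.CompareString("hello", "HELLO")) // 0

	// The sort key is opaque bytes, not a recoverable copy of the text;
	// treating it as if it were the string is what produced garbage datums.
	var buf collate.Buffer
	fmt.Printf("%x\n", c.KeyFromString(&buf, "hello"))
}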

This problem is now fixed by using the value encoding for collated
strings. Doing so for other types that might have composite encoding
doesn't seem necessary at the moment (floats and decimals already work,
and we currently don't collect histograms on JSON or array columns). To
make this change backward-compatible, we gate it on the new histogram
version: if a histogram has the previous version, it must have used the
key-encoding for collated strings, so we keep key-decoding it. Only once
the cluster upgrades to version 24.1 do we start creating histograms
with the new version, which uses the value encoding for collated strings.
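
A minimal, self-contained model of that decode-side gate (all names and the
version number here are hypothetical; the real logic sits behind
stats.DecodeUpperBound, whose call sites appear in the diff below):

package main

import "fmt"

// histVersionCollatedValueEnc stands in for the new histogram version at which
// collated-string upper bounds switch to the value encoding.
const histVersionCollatedValueEnc = 3

// keyDecodeCollated models the old path: the stored bytes are a collation sort
// key, so the original string is unrecoverable.
func keyDecodeCollated(b []byte) string { return fmt.Sprintf("<opaque sort key %x>", b) }

// valueDecodeCollated models the new path, which round-trips the original text.
func valueDecodeCollated(b []byte) string { return string(b) }

// decodeCollatedUpperBound picks the decoder based on the version stored in
// the histogram itself, so histograms written before the upgrade keep working.
func decodeCollatedUpperBound(histVersion int, upperBound []byte) string {
	if histVersion >= histVersionCollatedValueEnc {
		return valueDecodeCollated(upperBound)
	}
	return keyDecodeCollated(upperBound)
}

func main() {
	fmt.Println(decodeCollatedUpperBound(2, []byte{0x36, 0x3c, 0x40})) // old: opaque key
	fmt.Println(decodeCollatedUpperBound(3, []byte("hello")))          // new: hello
}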

Release note (bug fix): CockroachDB now correctly uses histograms on
columns of the COLLATED STRING type. The bug has been present since
before the 22.1 release.
yuzefovich committed Jan 12, 2024
1 parent 12ce879 commit 4d77dd6
Showing 19 changed files with 212 additions and 85 deletions.
41 changes: 41 additions & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/collated_strings
@@ -0,0 +1,41 @@
# LogicTest: local

# Regression test for incorrectly using the key-encoding for collated strings
# as upper bounds of histogram buckets (#98400).
statement ok
CREATE TABLE t98400 (k INT PRIMARY KEY, s STRING COLLATE en_US_u_ks_level2);
INSERT INTO t98400 SELECT i, 'hello' FROM generate_series(1, 10) g(i);
INSERT INTO t98400 SELECT i, 'world' FROM generate_series(11, 12) g(i);
INSERT INTO t98400 SELECT 13, 'foo';

statement ok
ANALYZE t98400;

# We expect that the filter is estimated to match 10 rows.
query T
EXPLAIN (OPT, VERBOSE) SELECT * FROM t98400 WHERE s = 'hello' COLLATE en_US_u_ks_level2;
----
select
├── columns: k:1 s:2
├── stats: [rows=10, distinct(2)=1, null(2)=0]
│ histogram(2)= 0 10
│ <--- 'hello' COLLATE en_US_u_ks_level2
├── cost: 43.02
├── key: (1)
├── fd: ()-->(2)
├── distribution: test
├── prune: (1)
├── scan t98400
│ ├── columns: k:1 s:2
│ ├── stats: [rows=13, distinct(1)=13, null(1)=0, distinct(2)=3, null(2)=0]
│ │ histogram(1)= 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
│ │ <--- 1 --- 2 --- 3 --- 4 --- 5 --- 6 --- 7 --- 8 --- 9 --- 10 --- 11 --- 12 --- 13
│ │ histogram(2)= 0 1 10 2
│ │ <--- 'foo' COLLATE en_US_u_ks_level2 ---- 'world' COLLATE en_US_u_ks_level2
│ ├── cost: 42.86
│ ├── key: (1)
│ ├── fd: (1)-->(2)
│ ├── distribution: test
│ └── prune: (1,2)
└── filters
└── s:2 = 'hello' COLLATE en_US_u_ks_level2 [outer=(2), constraints=(/2: [/'hello' COLLATE en_US_u_ks_level2 - /'hello' COLLATE en_US_u_ks_level2]; tight), fd=()-->(2)]
7 changes: 7 additions & 0 deletions pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go

Some generated files are not rendered by default.

5 changes: 4 additions & 1 deletion pkg/sql/opt/memo/statistics_builder.go
@@ -4825,12 +4825,15 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints(
// Each single-column prefix value from the Spans is a sample value.
// Give each sample value its own bucket, up to a maximum of 200
// buckets, with even distribution.
_, unencodedBuckets, err := stats.EquiDepthHistogram(sb.evalCtx,
_, unencodedBuckets, err := stats.EquiDepthHistogram(
sb.ctx,
sb.evalCtx,
dataType,
values, /* samples */
numRows,
int64(numValues), /* distinctCount */
int(stats.DefaultHistogramBuckets.Get(&sb.evalCtx.Settings.SV)), /* maxBuckets */
sb.evalCtx.Settings,
)
// This shouldn't error out, but if it does, let's not punish the user.
// Just build stats without the histogram in that case.
10 changes: 9 additions & 1 deletion pkg/sql/randgen/mutator.go
@@ -24,6 +24,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/parser"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc/keyside"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc/valueside"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/stats"
"github.com/cockroachdb/cockroach/pkg/sql/types"
@@ -300,6 +301,7 @@ func randHistogram(rng *rand.Rand, colType *types.T) stats.HistogramData {
}
h := stats.HistogramData{
ColumnType: histogramColType,
Version: stats.HistVersion,
}

// Generate random values for histogram bucket upper bounds.
@@ -310,7 +312,13 @@ func randHistogram(rng *rand.Rand, colType *types.T) stats.HistogramData {
encs := encodeInvertedIndexHistogramUpperBounds(colType, upper)
encodedUpperBounds = append(encodedUpperBounds, encs...)
} else {
enc, err := keyside.Encode(nil, upper, encoding.Ascending)
var enc []byte
var err error
if colType.Family() == types.CollatedStringFamily {
enc, err = valueside.Encode(nil /* appendTo */, valueside.NoColumnID, upper, nil /* scratch */)
} else {
enc, err = keyside.Encode(nil, upper, encoding.Ascending)
}
if err != nil {
panic(err)
}
4 changes: 2 additions & 2 deletions pkg/sql/rowexec/sample_aggregator.go
@@ -653,14 +653,14 @@ func (s *sampleAggregator) generateHistogram(
}

if lowerBound != nil {
h, buckets, err := stats.ConstructExtremesHistogram(evalCtx, colType, values, numRows, distinctCount, maxBuckets, lowerBound)
h, buckets, err := stats.ConstructExtremesHistogram(ctx, evalCtx, colType, values, numRows, distinctCount, maxBuckets, lowerBound, evalCtx.Settings)
_ = buckets
return h, err
}

// TODO(michae2): Instead of using the flowCtx's evalCtx, investigate
// whether this can use a nil *eval.Context.
h, _, err := stats.EquiDepthHistogram(evalCtx, colType, values, numRows, distinctCount, maxBuckets)
h, _, err := stats.EquiDepthHistogram(ctx, evalCtx, colType, values, numRows, distinctCount, maxBuckets, evalCtx.Settings)
return h, err
}

13 changes: 3 additions & 10 deletions pkg/sql/rowexec/sample_aggregator_test.go
@@ -20,13 +20,11 @@ import (
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/gossip"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/catenumpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
"github.com/cockroachdb/cockroach/pkg/sql/randgen"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/stats"
@@ -249,21 +247,16 @@ func runSampleAggregator(
t.Fatal(err)
}

var a tree.DatumAlloc
for _, b := range h.Buckets {
ed, _, err := rowenc.EncDatumFromBuffer(
catenumpb.DatumEncoding_ASCENDING_KEY, b.UpperBound,
)
datum, err := stats.DecodeUpperBound(h.Version, histEncType, &a, b.UpperBound)
if err != nil {
t.Fatal(err)
}
var d tree.DatumAlloc
if err := ed.EnsureDecoded(histEncType, &d); err != nil {
t.Fatal(err)
}
r.buckets = append(r.buckets, resultBucket{
numEq: int(b.NumEq),
numRange: int(b.NumRange),
upper: int(*ed.Datum.(*tree.DInt)),
upper: int(*datum.(*tree.DInt)),
})
}

9 changes: 3 additions & 6 deletions pkg/sql/show_histogram.go
@@ -14,11 +14,9 @@ import (
"context"
"fmt"

"github.com/cockroachdb/cockroach/pkg/sql/catalog/catenumpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/typedesc"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
"github.com/cockroachdb/cockroach/pkg/sql/stats"
@@ -80,16 +78,15 @@ func (p *planner) ShowHistogram(ctx context.Context, n *tree.ShowHistogram) (pla
if err := typedesc.EnsureTypeIsHydrated(ctx, histogram.ColumnType, &resolver); err != nil {
return nil, err
}
var a tree.DatumAlloc
for _, b := range histogram.Buckets {
ed, _, err := rowenc.EncDatumFromBuffer(
catenumpb.DatumEncoding_ASCENDING_KEY, b.UpperBound,
)
datum, err := stats.DecodeUpperBound(histogram.Version, histogram.ColumnType, &a, b.UpperBound)
if err != nil {
v.Close(ctx)
return nil, err
}
row := tree.Datums{
tree.NewDString(ed.String(histogram.ColumnType)),
tree.NewDString(datum.String()),
tree.NewDInt(tree.DInt(b.NumRange)),
tree.NewDFloat(tree.DFloat(b.DistinctRange)),
tree.NewDInt(tree.DInt(b.NumEq)),
4 changes: 2 additions & 2 deletions pkg/sql/show_stats.go
@@ -201,7 +201,7 @@ func (p *planner) ShowTableStats(ctx context.Context, n *tree.ShowTableStats) (p
}

if withMerge {
merged := stats.MergedStatistics(ctx, statsList)
merged := stats.MergedStatistics(ctx, statsList, p.ExtendedEvalContext().Settings)
statsList = append(merged, statsList...)
// Iterate in reverse order to match the ORDER BY "columnIDs".
for i := len(merged) - 1; i >= 0; i-- {
@@ -214,7 +214,7 @@ func (p *planner) ShowTableStats(ctx context.Context, n *tree.ShowTableStats) (p
}

if withForecast {
forecasts := stats.ForecastTableStatistics(ctx, &p.ExtendedEvalContext().Settings.SV, statsList)
forecasts := stats.ForecastTableStatistics(ctx, p.ExtendedEvalContext().Settings, statsList)
forecastRows := make([]tree.Datums, 0, len(forecasts))
// Iterate in reverse order to match the ORDER BY "columnIDs".
for i := len(forecasts) - 1; i >= 0; i-- {
2 changes: 2 additions & 0 deletions pkg/sql/stats/BUILD.bazel
@@ -22,6 +22,7 @@ go_library(
importpath = "github.com/cockroachdb/cockroach/pkg/sql/stats",
visibility = ["//visibility:public"],
deps = [
"//pkg/clusterversion",
"//pkg/jobs/jobspb",
"//pkg/keys",
"//pkg/kv/kvclient/rangefeed",
@@ -40,6 +41,7 @@ go_library(
"//pkg/sql/parser",
"//pkg/sql/rowenc",
"//pkg/sql/rowenc/keyside",
"//pkg/sql/rowenc/valueside",
"//pkg/sql/sem/eval",
"//pkg/sql/sem/tree",
"//pkg/sql/sessiondata",
11 changes: 6 additions & 5 deletions pkg/sql/stats/forecast.go
@@ -21,6 +21,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
@@ -70,7 +71,7 @@ const minGoodnessOfFit = 0.95
// ForecastTableStatistics is deterministic: given the same observations it will
// return the same forecasts.
func ForecastTableStatistics(
ctx context.Context, sv *settings.Values, observed []*TableStatistic,
ctx context.Context, st *cluster.Settings, observed []*TableStatistic,
) []*TableStatistic {
// Group observed statistics by column set, skipping over partial statistics
// and statistics with inverted histograms.
@@ -113,7 +114,7 @@ func ForecastTableStatistics(
latest := observedByCols[colKey][0].CreatedAt
at := latest.Add(avgRefresh)

forecast, err := forecastColumnStatistics(ctx, sv, observedByCols[colKey], at, minGoodnessOfFit)
forecast, err := forecastColumnStatistics(ctx, st, observedByCols[colKey], at, minGoodnessOfFit)
if err != nil {
log.VEventf(
ctx, 2, "could not forecast statistics for table %v columns %s: %v",
@@ -146,7 +147,7 @@ func ForecastTableStatistics(
// forecast time, it will return the same forecast.
func forecastColumnStatistics(
ctx context.Context,
sv *settings.Values,
st *cluster.Settings,
observed []*TableStatistic,
at time.Time,
minRequiredFit float64,
@@ -290,7 +291,7 @@ func forecastColumnStatistics(
hist.adjustCounts(compareCtx, observed[0].HistogramData.ColumnType, nonNullRowCount, nonNullDistinctCount)

// Finally, convert back to HistogramData.
histData, err := hist.toHistogramData(observed[0].HistogramData.ColumnType)
histData, err := hist.toHistogramData(ctx, observed[0].HistogramData.ColumnType, st)
if err != nil {
return nil, err
}
@@ -343,7 +344,7 @@ func forecastColumnStatistics(
"forecasted histogram had first bucket with non-zero NumRange or DistinctRange: %s",
debugging,
)
sentryutil.SendReport(ctx, sv, err)
sentryutil.SendReport(ctx, &st.SV, err)
return nil, err
}
if bucket.UpperBound != tree.DNull {
22 changes: 17 additions & 5 deletions pkg/sql/stats/forecast_test.go
@@ -18,6 +18,7 @@ import (
"time"

"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/sem/catid"
"github.com/cockroachdb/cockroach/pkg/sql/types"
@@ -605,19 +606,24 @@ func TestForecastColumnStatistics(t *testing.T) {
},
}
ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
var fullStatID, partialStatID uint64
for i, tc := range testCases {
t.Run(strconv.Itoa(i), func(t *testing.T) {

// Set up observed TableStatistics in CreatedAt desc order.
observed := make([]*TableStatistic, len(tc.observed))
for j := range tc.observed {
observed[len(observed)-j-1] = tc.observed[j].toTableStatistic("testStat", i, descpb.ColumnIDs{1}, fullStatID, partialStatID)
observed[len(observed)-j-1] = tc.observed[j].toTableStatistic(
ctx, "testStat", i, descpb.ColumnIDs{1}, fullStatID, partialStatID, st,
)
}
expected := tc.forecast.toTableStatistic(jobspb.ForecastStatsName, i, descpb.ColumnIDs{1}, fullStatID, partialStatID)
expected := tc.forecast.toTableStatistic(
ctx, jobspb.ForecastStatsName, i, descpb.ColumnIDs{1}, fullStatID, partialStatID, st,
)
at := testStatTime(tc.at)

forecast, err := forecastColumnStatistics(ctx, nil /* sv */, observed, at, 1)
forecast, err := forecastColumnStatistics(ctx, st, observed, at, 1)
if err != nil {
if !tc.err {
t.Errorf("test case %d unexpected forecastColumnStatistics err: %v", i, err)
@@ -642,7 +648,13 @@ type testStat struct {
}

func (ts *testStat) toTableStatistic(
name string, tableID int, columnIDs descpb.ColumnIDs, statID uint64, fullStatID uint64,
ctx context.Context,
name string,
tableID int,
columnIDs descpb.ColumnIDs,
statID uint64,
fullStatID uint64,
st *cluster.Settings,
) *TableStatistic {
if ts == nil {
return nil
@@ -663,7 +675,7 @@ }
}
if ts.hist != nil {
hist := ts.hist.toHistogram()
histData, err := hist.toHistogramData(types.Float)
histData, err := hist.toHistogramData(ctx, types.Float, st)
if err != nil {
panic(err)
}
