stats: fix histograms for collated strings
Currently, when encoding the upper bounds of histogram buckets we always
use the datum's key-encoding. However, some datums are composite, and for
those we need to operate on their value encoding as well. This strategy
happens to be OK for floats and decimals because their value part only
stores auxiliary information (like the number of trailing zeroes and the
sign of zero), so even if we lose that auxiliary information, decoding
the upper bound still yields a datum with the same comparison properties.
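
As a standalone illustration of that point (plain Go, not CockroachDB code):
the two floating-point zeros compare equal, so an order-preserving key
encoding may map them to the same bytes; the sign of zero is exactly the kind
of auxiliary information that only the value part needs to carry.

package main

import (
	"fmt"
	"math"
)

func main() {
	// The literal -0.0 is just +0.0 in Go constant arithmetic, so build
	// negative zero explicitly.
	negZero := math.Copysign(0, -1)
	posZero := 0.0

	// The two zeros are indistinguishable for comparison purposes...
	fmt.Println(negZero == posZero) // true

	// ...but the sign bit is only recoverable from the value itself, not from
	// an ordering-equivalent key.
	fmt.Println(math.Signbit(negZero), math.Signbit(posZero)) // true false
}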

Unfortunately, this is not the case for collated strings where
key-encoding and value-encoding can be vastly different. So the fact
that we always use the key-encoding for collated strings makes it so
that when we key-decode upper bounds, we end up with garbage datums. As
a result, our histograms are meaningless for collated strings.
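
A rough standalone sketch of why this happens, using golang.org/x/text/collate
rather than CockroachDB's internals (the collation below is only loosely
analogous to en_US_u_ks_level2): the key-encoding of a collated string is its
collation sort key, an opaque and lossy byte string from which the original
text cannot be recovered, and under a case-insensitive collation distinct
strings can even share the same key.

package main

import (
	"fmt"

	"golang.org/x/text/collate"
	"golang.org/x/text/language"
)

func main() {
	// A case-insensitive English collation (strength level 2 ignores case).
	c := collate.New(language.English, collate.IgnoreCase)

	// 'hello' and 'HELLO' are equal under this collation, so an
	// order-preserving key cannot remember which spelling was stored.
	fmt.Println(c.CompareString("hello", "HELLO")) // 0

	// The sort key is opaque bytes, not a recoverable copy of the text;
	// treating it as if it were the string is what produced garbage datums.
	var buf collate.Buffer
	fmt.Printf("%x\n", c.KeyFromString(&buf, "hello"))
}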

This problem is now fixed by using the value encoding for collated
strings. Doing so for other types that might have composite encoding
doesn't seem necessary at the moment (floats and decimals already work,
and we currently don't collect histograms on JSON or array columns). To
make this change backward-compatible, we gate it on the new histogram
version: if a histogram has the previous version, it must have used the
key-encoding for collated strings, so we keep key-decoding it. Only once
the cluster upgrades to version 24.1 do we start creating histograms
with the new version, which uses the value encoding for collated strings.
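
A minimal, self-contained model of that decode-side gate (all names and the
version number here are hypothetical; the real logic sits behind
stats.DecodeUpperBound, whose call sites appear in the diff below):

package main

import "fmt"

// histVersionCollatedValueEnc stands in for the new histogram version at which
// collated-string upper bounds switch to the value encoding.
const histVersionCollatedValueEnc = 3

// keyDecodeCollated models the old path: the stored bytes are a collation sort
// key, so the original string is unrecoverable.
func keyDecodeCollated(b []byte) string { return fmt.Sprintf("<opaque sort key %x>", b) }

// valueDecodeCollated models the new path, which round-trips the original text.
func valueDecodeCollated(b []byte) string { return string(b) }

// decodeCollatedUpperBound picks the decoder based on the version stored in
// the histogram itself, so histograms written before the upgrade keep working.
func decodeCollatedUpperBound(histVersion int, upperBound []byte) string {
	if histVersion >= histVersionCollatedValueEnc {
		return valueDecodeCollated(upperBound)
	}
	return keyDecodeCollated(upperBound)
}

func main() {
	fmt.Println(decodeCollatedUpperBound(2, []byte{0x36, 0x3c, 0x40})) // old: opaque key
	fmt.Println(decodeCollatedUpperBound(3, []byte("hello")))          // new: hello
}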

Release note (bug fix): CockroachDB now correctly uses histograms on
columns of the COLLATED STRING type. The bug has been present since
before the 22.1 release.
yuzefovich committed Jan 12, 2024
1 parent 12ce879 commit 4d77dd6
Showing 19 changed files with 212 additions and 85 deletions.
41 changes: 41 additions & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/collated_strings
@@ -0,0 +1,41 @@
# LogicTest: local

# Regression test for incorrectly using the key-encoding for collated strings
# as upper bounds of histogram buckets (#98400).
statement ok
CREATE TABLE t98400 (k INT PRIMARY KEY, s STRING COLLATE en_US_u_ks_level2);
INSERT INTO t98400 SELECT i, 'hello' FROM generate_series(1, 10) g(i);
INSERT INTO t98400 SELECT i, 'world' FROM generate_series(11, 12) g(i);
INSERT INTO t98400 SELECT 13, 'foo';

statement ok
ANALYZE t98400;

# We expect that the filter is estimated to match 10 rows.
query T
EXPLAIN (OPT, VERBOSE) SELECT * FROM t98400 WHERE s = 'hello' COLLATE en_US_u_ks_level2;
----
select
├── columns: k:1 s:2
├── stats: [rows=10, distinct(2)=1, null(2)=0]
│ histogram(2)= 0 10
│ <--- 'hello' COLLATE en_US_u_ks_level2
├── cost: 43.02
├── key: (1)
├── fd: ()-->(2)
├── distribution: test
├── prune: (1)
├── scan t98400
│ ├── columns: k:1 s:2
│ ├── stats: [rows=13, distinct(1)=13, null(1)=0, distinct(2)=3, null(2)=0]
│ │ histogram(1)= 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
│ │ <--- 1 --- 2 --- 3 --- 4 --- 5 --- 6 --- 7 --- 8 --- 9 --- 10 --- 11 --- 12 --- 13
│ │ histogram(2)= 0 1 10 2
│ │ <--- 'foo' COLLATE en_US_u_ks_level2 ---- 'world' COLLATE en_US_u_ks_level2
│ ├── cost: 42.86
│ ├── key: (1)
│ ├── fd: (1)-->(2)
│ ├── distribution: test
│ └── prune: (1,2)
└── filters
└── s:2 = 'hello' COLLATE en_US_u_ks_level2 [outer=(2), constraints=(/2: [/'hello' COLLATE en_US_u_ks_level2 - /'hello' COLLATE en_US_u_ks_level2]; tight), fd=()-->(2)]
7 changes: 7 additions & 0 deletions pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go

Some generated files are not rendered by default.

5 changes: 4 additions & 1 deletion pkg/sql/opt/memo/statistics_builder.go
@@ -4825,12 +4825,15 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints(
// Each single-column prefix value from the Spans is a sample value.
// Give each sample value its own bucket, up to a maximum of 200
// buckets, with even distribution.
_, unencodedBuckets, err := stats.EquiDepthHistogram(sb.evalCtx,
_, unencodedBuckets, err := stats.EquiDepthHistogram(
sb.ctx,
sb.evalCtx,
dataType,
values, /* samples */
numRows,
int64(numValues), /* distinctCount */
int(stats.DefaultHistogramBuckets.Get(&sb.evalCtx.Settings.SV)), /* maxBuckets */
sb.evalCtx.Settings,
)
// This shouldn't error out, but if it does, let's not punish the user.
// Just build stats without the histogram in that case.
10 changes: 9 additions & 1 deletion pkg/sql/randgen/mutator.go
@@ -24,6 +24,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/parser"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc/keyside"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc/valueside"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/stats"
"github.com/cockroachdb/cockroach/pkg/sql/types"
@@ -300,6 +301,7 @@ func randHistogram(rng *rand.Rand, colType *types.T) stats.HistogramData {
}
h := stats.HistogramData{
ColumnType: histogramColType,
Version: stats.HistVersion,
}

// Generate random values for histogram bucket upper bounds.
@@ -310,7 +312,13 @@ func randHistogram(rng *rand.Rand, colType *types.T) stats.HistogramData {
encs := encodeInvertedIndexHistogramUpperBounds(colType, upper)
encodedUpperBounds = append(encodedUpperBounds, encs...)
} else {
enc, err := keyside.Encode(nil, upper, encoding.Ascending)
var enc []byte
var err error
if colType.Family() == types.CollatedStringFamily {
enc, err = valueside.Encode(nil /* appendTo */, valueside.NoColumnID, upper, nil /* scratch */)
} else {
enc, err = keyside.Encode(nil, upper, encoding.Ascending)
}
if err != nil {
panic(err)
}
4 changes: 2 additions & 2 deletions pkg/sql/rowexec/sample_aggregator.go
@@ -653,14 +653,14 @@ func (s *sampleAggregator) generateHistogram(
}

if lowerBound != nil {
h, buckets, err := stats.ConstructExtremesHistogram(evalCtx, colType, values, numRows, distinctCount, maxBuckets, lowerBound)
h, buckets, err := stats.ConstructExtremesHistogram(ctx, evalCtx, colType, values, numRows, distinctCount, maxBuckets, lowerBound, evalCtx.Settings)
_ = buckets
return h, err
}

// TODO(michae2): Instead of using the flowCtx's evalCtx, investigate
// whether this can use a nil *eval.Context.
h, _, err := stats.EquiDepthHistogram(evalCtx, colType, values, numRows, distinctCount, maxBuckets)
h, _, err := stats.EquiDepthHistogram(ctx, evalCtx, colType, values, numRows, distinctCount, maxBuckets, evalCtx.Settings)
return h, err
}

13 changes: 3 additions & 10 deletions pkg/sql/rowexec/sample_aggregator_test.go
@@ -20,13 +20,11 @@ import (
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/gossip"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/catenumpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
"github.com/cockroachdb/cockroach/pkg/sql/randgen"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/stats"
@@ -249,21 +247,16 @@ func runSampleAggregator(
t.Fatal(err)
}

var a tree.DatumAlloc
for _, b := range h.Buckets {
ed, _, err := rowenc.EncDatumFromBuffer(
catenumpb.DatumEncoding_ASCENDING_KEY, b.UpperBound,
)
datum, err := stats.DecodeUpperBound(h.Version, histEncType, &a, b.UpperBound)
if err != nil {
t.Fatal(err)
}
var d tree.DatumAlloc
if err := ed.EnsureDecoded(histEncType, &d); err != nil {
t.Fatal(err)
}
r.buckets = append(r.buckets, resultBucket{
numEq: int(b.NumEq),
numRange: int(b.NumRange),
upper: int(*ed.Datum.(*tree.DInt)),
upper: int(*datum.(*tree.DInt)),
})
}

9 changes: 3 additions & 6 deletions pkg/sql/show_histogram.go
@@ -14,11 +14,9 @@ import (
"context"
"fmt"

"github.com/cockroachdb/cockroach/pkg/sql/catalog/catenumpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/typedesc"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
"github.com/cockroachdb/cockroach/pkg/sql/stats"
@@ -80,16 +78,15 @@ func (p *planner) ShowHistogram(ctx context.Context, n *tree.ShowHistogram) (pla
if err := typedesc.EnsureTypeIsHydrated(ctx, histogram.ColumnType, &resolver); err != nil {
return nil, err
}
var a tree.DatumAlloc
for _, b := range histogram.Buckets {
ed, _, err := rowenc.EncDatumFromBuffer(
catenumpb.DatumEncoding_ASCENDING_KEY, b.UpperBound,
)
datum, err := stats.DecodeUpperBound(histogram.Version, histogram.ColumnType, &a, b.UpperBound)
if err != nil {
v.Close(ctx)
return nil, err
}
row := tree.Datums{
tree.NewDString(ed.String(histogram.ColumnType)),
tree.NewDString(datum.String()),
tree.NewDInt(tree.DInt(b.NumRange)),
tree.NewDFloat(tree.DFloat(b.DistinctRange)),
tree.NewDInt(tree.DInt(b.NumEq)),
4 changes: 2 additions & 2 deletions pkg/sql/show_stats.go
@@ -201,7 +201,7 @@ func (p *planner) ShowTableStats(ctx context.Context, n *tree.ShowTableStats) (p
}

if withMerge {
merged := stats.MergedStatistics(ctx, statsList)
merged := stats.MergedStatistics(ctx, statsList, p.ExtendedEvalContext().Settings)
statsList = append(merged, statsList...)
// Iterate in reverse order to match the ORDER BY "columnIDs".
for i := len(merged) - 1; i >= 0; i-- {
@@ -214,7 +214,7 @@ func (p *planner) ShowTableStats(ctx context.Context, n *tree.ShowTableStats) (p
}

if withForecast {
forecasts := stats.ForecastTableStatistics(ctx, &p.ExtendedEvalContext().Settings.SV, statsList)
forecasts := stats.ForecastTableStatistics(ctx, p.ExtendedEvalContext().Settings, statsList)
forecastRows := make([]tree.Datums, 0, len(forecasts))
// Iterate in reverse order to match the ORDER BY "columnIDs".
for i := len(forecasts) - 1; i >= 0; i-- {
2 changes: 2 additions & 0 deletions pkg/sql/stats/BUILD.bazel
@@ -22,6 +22,7 @@ go_library(
importpath = "github.com/cockroachdb/cockroach/pkg/sql/stats",
visibility = ["//visibility:public"],
deps = [
"//pkg/clusterversion",
"//pkg/jobs/jobspb",
"//pkg/keys",
"//pkg/kv/kvclient/rangefeed",
@@ -40,6 +41,7 @@ go_library(
"//pkg/sql/parser",
"//pkg/sql/rowenc",
"//pkg/sql/rowenc/keyside",
"//pkg/sql/rowenc/valueside",
"//pkg/sql/sem/eval",
"//pkg/sql/sem/tree",
"//pkg/sql/sessiondata",
11 changes: 6 additions & 5 deletions pkg/sql/stats/forecast.go
@@ -21,6 +21,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
@@ -70,7 +71,7 @@ const minGoodnessOfFit = 0.95
// ForecastTableStatistics is deterministic: given the same observations it will
// return the same forecasts.
func ForecastTableStatistics(
ctx context.Context, sv *settings.Values, observed []*TableStatistic,
ctx context.Context, st *cluster.Settings, observed []*TableStatistic,
) []*TableStatistic {
// Group observed statistics by column set, skipping over partial statistics
// and statistics with inverted histograms.
@@ -113,7 +114,7 @@ func ForecastTableStatistics(
latest := observedByCols[colKey][0].CreatedAt
at := latest.Add(avgRefresh)

forecast, err := forecastColumnStatistics(ctx, sv, observedByCols[colKey], at, minGoodnessOfFit)
forecast, err := forecastColumnStatistics(ctx, st, observedByCols[colKey], at, minGoodnessOfFit)
if err != nil {
log.VEventf(
ctx, 2, "could not forecast statistics for table %v columns %s: %v",
@@ -146,7 +147,7 @@ func ForecastTableStatistics(
// forecast time, it will return the same forecast.
func forecastColumnStatistics(
ctx context.Context,
sv *settings.Values,
st *cluster.Settings,
observed []*TableStatistic,
at time.Time,
minRequiredFit float64,
@@ -290,7 +291,7 @@ func forecastColumnStatistics(
hist.adjustCounts(compareCtx, observed[0].HistogramData.ColumnType, nonNullRowCount, nonNullDistinctCount)

// Finally, convert back to HistogramData.
histData, err := hist.toHistogramData(observed[0].HistogramData.ColumnType)
histData, err := hist.toHistogramData(ctx, observed[0].HistogramData.ColumnType, st)
if err != nil {
return nil, err
}
@@ -343,7 +344,7 @@ func forecastColumnStatistics(
"forecasted histogram had first bucket with non-zero NumRange or DistinctRange: %s",
debugging,
)
sentryutil.SendReport(ctx, sv, err)
sentryutil.SendReport(ctx, &st.SV, err)
return nil, err
}
if bucket.UpperBound != tree.DNull {
22 changes: 17 additions & 5 deletions pkg/sql/stats/forecast_test.go
@@ -18,6 +18,7 @@ import (
"time"

"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/sem/catid"
"github.com/cockroachdb/cockroach/pkg/sql/types"
@@ -605,19 +606,24 @@ func TestForecastColumnStatistics(t *testing.T) {
},
}
ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
var fullStatID, partialStatID uint64
for i, tc := range testCases {
t.Run(strconv.Itoa(i), func(t *testing.T) {

// Set up observed TableStatistics in CreatedAt desc order.
observed := make([]*TableStatistic, len(tc.observed))
for j := range tc.observed {
observed[len(observed)-j-1] = tc.observed[j].toTableStatistic("testStat", i, descpb.ColumnIDs{1}, fullStatID, partialStatID)
observed[len(observed)-j-1] = tc.observed[j].toTableStatistic(
ctx, "testStat", i, descpb.ColumnIDs{1}, fullStatID, partialStatID, st,
)
}
expected := tc.forecast.toTableStatistic(jobspb.ForecastStatsName, i, descpb.ColumnIDs{1}, fullStatID, partialStatID)
expected := tc.forecast.toTableStatistic(
ctx, jobspb.ForecastStatsName, i, descpb.ColumnIDs{1}, fullStatID, partialStatID, st,
)
at := testStatTime(tc.at)

forecast, err := forecastColumnStatistics(ctx, nil /* sv */, observed, at, 1)
forecast, err := forecastColumnStatistics(ctx, st, observed, at, 1)
if err != nil {
if !tc.err {
t.Errorf("test case %d unexpected forecastColumnStatistics err: %v", i, err)
@@ -642,7 +648,13 @@ type testStat struct {
}

func (ts *testStat) toTableStatistic(
name string, tableID int, columnIDs descpb.ColumnIDs, statID uint64, fullStatID uint64,
ctx context.Context,
name string,
tableID int,
columnIDs descpb.ColumnIDs,
statID uint64,
fullStatID uint64,
st *cluster.Settings,
) *TableStatistic {
if ts == nil {
return nil
@@ -663,7 +675,7 @@ }
}
if ts.hist != nil {
hist := ts.hist.toHistogram()
histData, err := hist.toHistogramData(types.Float)
histData, err := hist.toHistogramData(ctx, types.Float, st)
if err != nil {
panic(err)
}
