diff --git a/pkg/sql/opt/constraint/constraint.go b/pkg/sql/opt/constraint/constraint.go index 9bb19e172fb5..ddd12849ca9b 100644 --- a/pkg/sql/opt/constraint/constraint.go +++ b/pkg/sql/opt/constraint/constraint.go @@ -448,6 +448,28 @@ func (c *Constraint) ExactPrefix(evalCtx *tree.EvalContext) int { } } +// ConstrainedColumns returns the number of columns which are constrained by +// the Constraint. For example: +// /a/b/c: [/1/1 - /1] [/3 - /3] +// has 2 constrained columns. This may be less than the total number of columns +// in the constraint, especially if it represents an index constraint. +func (c *Constraint) ConstrainedColumns(evalCtx *tree.EvalContext) int { + count := 0 + for i := 0; i < c.Spans.Count(); i++ { + sp := c.Spans.Get(i) + start := sp.StartKey() + end := sp.EndKey() + if start.Length() > count { + count = start.Length() + } + if end.Length() > count { + count = end.Length() + } + } + + return count +} + // Prefix returns the length of the longest prefix of columns for which all the // spans have the same start and end values. For example: // /a/b/c: [/1/1/1 - /1/1/2] [/3/3/3 - /3/3/4] diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go index a291d99d3c90..71d7fe4fd4b7 100644 --- a/pkg/sql/opt/memo/statistics_builder.go +++ b/pkg/sql/opt/memo/statistics_builder.go @@ -454,22 +454,18 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa if scan.Constraint != nil { // Calculate distinct counts for constrained columns // ------------------------------------------------- - applied := sb.applyConstraint(scan.Constraint, scan, relProps) + numUnappliedConjuncts := sb.applyIndexConstraint(scan.Constraint, scan, relProps) var cols opt.ColSet - for i := 0; i < scan.Constraint.Columns.Count(); i++ { + for i := 0; i < scan.Constraint.ConstrainedColumns(sb.evalCtx); i++ { cols.Add(int(scan.Constraint.Columns.Get(i).ID())) } // Calculate row count and selectivity // ----------------------------------- inputRowCount := s.RowCount - if applied { - s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, scan, s)) - } else { - numUnappliedConjuncts := sb.numConjunctsInConstraint(scan.Constraint) - s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts)) - } + s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, scan, s)) + s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts)) // Set null counts to 0 for non-nullable columns // ------------------------------------------------- @@ -1846,18 +1842,26 @@ func (sb *statisticsBuilder) applyFilter( return numUnappliedConjuncts, constrainedCols } -func (sb *statisticsBuilder) applyConstraint( +func (sb *statisticsBuilder) applyIndexConstraint( c *constraint.Constraint, e RelExpr, relProps *props.Relational, -) (applied bool) { +) (numUnappliedConjuncts float64) { // If unconstrained, then no constraint could be derived from the expression, // so fall back to estimate. // If a contradiction, then optimizations must not be enabled (say for // testing), or else this would have been reduced. if c.IsUnconstrained() || c.IsContradiction() { - return false /* applied */ + return 0 /* numUnappliedConjuncts */ } - return sb.updateDistinctCountsFromConstraint(c, e, relProps) + applied := sb.updateDistinctCountsFromConstraint(c, e, relProps) + for i, n := applied, c.ConstrainedColumns(sb.evalCtx); i < n; i++ { + // Unlike the constraints found in Select and Join filters, an index + // constraint may represent multiple conjuncts. Therefore, we need to + // calculate the number of unapplied conjuncts for each constrained column. + numUnappliedConjuncts += sb.numConjunctsInConstraint(c, i) + } + + return numUnappliedConjuncts } func (sb *statisticsBuilder) applyConstraintSet( @@ -1874,13 +1878,13 @@ func (sb *statisticsBuilder) applyConstraintSet( numUnappliedConjuncts = 0 for i := 0; i < cs.Length(); i++ { applied := sb.updateDistinctCountsFromConstraint(cs.Constraint(i), e, relProps) - if !applied { + if applied == 0 { // If a constraint cannot be applied, it may represent an // inequality like x < 1. As a result, distinctCounts does not fully // represent the selectivity of the constraint set. // We return an estimate of the number of unapplied conjuncts to the // caller function to be used for selectivity calculation. - numUnappliedConjuncts += sb.numConjunctsInConstraint(cs.Constraint(i)) + numUnappliedConjuncts += sb.numConjunctsInConstraint(cs.Constraint(i), 0 /* nth */) } } @@ -1923,10 +1927,10 @@ func (sb *statisticsBuilder) updateNullCountsFromProps( // updateDistinctCountsFromConstraint updates the distinct count for each // column in a constraint that can be determined to have a finite number of -// possible values. It returns a boolean indicating if the constraint was -// applied (i.e., the distinct count for at least one column could be inferred -// from the constraint). If the same column appears in multiple constraints, -// the distinct count is the minimum for that column across all constraints. +// possible values. It returns the number of columns for which the distinct +// count could be inferred from the constraint. If the same column appears +// in multiple constraints, the distinct count is the minimum for that column +// across all constraints. // // For example, consider the following constraint set: // @@ -1952,7 +1956,7 @@ func (sb *statisticsBuilder) updateNullCountsFromProps( // discrepancy must be resolved by the calling function. func (sb *statisticsBuilder) updateDistinctCountsFromConstraint( c *constraint.Constraint, e RelExpr, relProps *props.Relational, -) (applied bool) { +) (applied int) { // All of the columns that are part of the prefix have a finite number of // distinct values. prefix := c.Prefix(sb.evalCtx) @@ -2024,7 +2028,7 @@ func (sb *statisticsBuilder) updateDistinctCountsFromConstraint( colID := c.Columns.Get(col).ID() sb.ensureColStat(util.MakeFastIntSet(int(colID)), distinctCount, e, relProps) - applied = true + applied = col + 1 } return applied @@ -2072,8 +2076,8 @@ func (sb *statisticsBuilder) updateDistinctNullCountsFromEquivalency( } // selectivityFromDistinctCounts calculates the selectivity of a filter by -// taking the product of selectivities of each constrained column. In the general case, -// this can be represented by the formula: +// taking the product of selectivities of each constrained column. In the +// general case, this can be represented by the formula: // // ┬-┬ ⎛ new distinct(i) ⎞ // selectivity = │ │ ⎜ --------------- ⎟ @@ -2102,7 +2106,7 @@ func (sb *statisticsBuilder) selectivityFromDistinctCounts( oldDistinct := inputStat.DistinctCount if oldDistinct != 0 && newDistinct < oldDistinct { - selectivity *= newDistinct / oldDistinct + selectivity *= min(newDistinct/oldDistinct, unknownFilterSelectivity) } } @@ -2313,9 +2317,9 @@ func isEqualityWithTwoVars(cond opt.ScalarExpr) bool { } // numConjunctsInConstraint returns a rough estimate of the number of conjuncts -// used to build the given constraint. +// used to build the given constraint for the column at position nth. func (sb *statisticsBuilder) numConjunctsInConstraint( - c *constraint.Constraint, + c *constraint.Constraint, nth int, ) (numConjuncts float64) { if c.Spans.Count() == 0 { return 0 /* numConjuncts */ @@ -2325,25 +2329,20 @@ func (sb *statisticsBuilder) numConjunctsInConstraint( for i := 0; i < c.Spans.Count(); i++ { span := c.Spans.Get(i) numSpanConjuncts := float64(0) - // The first start and end keys in each span are the only ones that matter - // for determining selectivity when we have no knowledge of the data - // distribution. Technically, /a/b: [/5 - ] is more selective than - // /a/b: [/4/5 - ], which is more selective than /a/b: [/4 - ]. But we - // treat them all the same, with selectivity=1/3. - if span.StartKey().Length() > 0 { + if span.StartKey().Length() > nth { // Cases of NULL in a constraint should be ignored. For example, // without knowledge of the data distribution, /a: (/NULL - /10] should // have the same estimated selectivity as /a: [/10 - ]. Selectivity // of NULL constraints is handled in selectivityFromNullCounts. - if c.Columns.Get(0).Descending() || - span.StartKey().Value(0) != tree.DNull { + if c.Columns.Get(nth).Descending() || + span.StartKey().Value(nth) != tree.DNull { numSpanConjuncts++ } } - if span.EndKey().Length() > 0 { + if span.EndKey().Length() > nth { // Ignore cases of NULL in constraints. (see above comment). - if !c.Columns.Get(0).Descending() || - span.EndKey().Value(0) != tree.DNull { + if !c.Columns.Get(nth).Descending() || + span.EndKey().Value(nth) != tree.DNull { numSpanConjuncts++ } } diff --git a/pkg/sql/opt/memo/testdata/stats/project b/pkg/sql/opt/memo/testdata/stats/project index 329bf145dcf6..a590a5765c1a 100644 --- a/pkg/sql/opt/memo/testdata/stats/project +++ b/pkg/sql/opt/memo/testdata/stats/project @@ -142,7 +142,7 @@ SELECT * FROM (SELECT y + 3 AS v FROM a) WHERE v >= 1 AND v <= 100 ---- select ├── columns: v:5(int!null) - ├── stats: [rows=990, distinct(5)=100, null(5)=0] + ├── stats: [rows=660, distinct(5)=100, null(5)=0] ├── project │ ├── columns: v:5(int) │ ├── stats: [rows=2000, distinct(5)=200, null(5)=20] diff --git a/pkg/sql/opt/memo/testdata/stats/scan b/pkg/sql/opt/memo/testdata/stats/scan index f7186c1b56a2..4a1d1a28d12f 100644 --- a/pkg/sql/opt/memo/testdata/stats/scan +++ b/pkg/sql/opt/memo/testdata/stats/scan @@ -41,7 +41,7 @@ SELECT * FROM a WHERE b ---- select ├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null) b:5(bool!null) - ├── stats: [rows=990, distinct(1)=990, null(1)=0, distinct(4)=199.804688, null(4)=0, distinct(5)=1, null(5)=0] + ├── stats: [rows=660, distinct(1)=660, null(1)=0, distinct(4)=196.531694, null(4)=0, distinct(5)=1, null(5)=0] ├── key: (1) ├── fd: ()-->(5), (1)-->(2-4), (3,4)~~>(1,2) ├── scan a @@ -246,13 +246,13 @@ index-join a ├── fd: (1)-->(2-5), (3,4)-->(1,2,5) └── select ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null) - ├── stats: [rows=111.111111, distinct(1)=110.489355, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=97.0976681, null(4)=0] + ├── stats: [rows=37.037037, distinct(1)=36.9747958, null(1)=0, distinct(3)=1.99999999, null(3)=0, distinct(4)=35.7721483, null(4)=0] ├── key: (1) ├── fd: (1)-->(3,4), (3,4)-->(1) ├── scan a@secondary │ ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null) │ ├── constraint: /-3/4: [ - /'foobar'/5.0] [/'foo' - /'bar'/5.0] - │ ├── stats: [rows=1000, distinct(1)=911.337892, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=294.797541, null(4)=0] + │ ├── stats: [rows=333.333333, distinct(1)=323.895037, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=207.616156, null(4)=0] │ ├── key: (1) │ └── fd: (1)-->(3,4), (3,4)-->(1) └── filters @@ -407,15 +407,60 @@ index-join a ├── fd: (1)-->(2-5), (3,4)-->(1,2,5) └── select ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null) - ├── stats: [rows=74.0740741, distinct(1)=74.0740741, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=68.9343053, null(4)=0] + ├── stats: [rows=24.691358, distinct(1)=24.691358, null(1)=0, distinct(3)=1.99999586, null(3)=0, distinct(4)=24.5913408, null(4)=0] ├── key: (1) ├── fd: (1)-->(3,4), (3,4)-->(1) ├── scan a@secondary │ ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null) │ ├── constraint: /-3/4: [ - /'foobar'/5.0] [/'foo' - /'bar'/5.0] - │ ├── stats: [rows=666.666667, distinct(1)=666.666667, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=294.797541, null(4)=0] + │ ├── stats: [rows=222.222222, distinct(1)=222.222222, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=207.616156, null(4)=0] │ ├── key: (1) │ └── fd: (1)-->(3,4), (3,4)-->(1) └── filters ├── (s <= 'foo') OR (s >= 'foobar') [type=bool, outer=(3)] └── d <= 5.0 [type=bool, outer=(4), constraints=(/4: (/NULL - /5.0]; tight)] + +exec-ddl +CREATE TABLE abcde ( + a INT PRIMARY KEY, + b INT, + c STRING, + d INT, + e INT, + INDEX bad(b, d), + INDEX good(b, c, d) +) +---- +TABLE abcde + ├── a int not null + ├── b int + ├── c string + ├── d int + ├── e int + ├── INDEX primary + │ └── a int not null + ├── INDEX bad + │ ├── b int + │ ├── d int + │ └── a int not null + └── INDEX good + ├── b int + ├── c string + ├── d int + └── a int not null + +# Regression test for #31929. Ensure that the good index is chosen. +opt +SELECT * FROM abcde WHERE b = 1 AND c LIKE '+1-1000%' +---- +index-join abcde + ├── columns: a:1(int!null) b:2(int!null) c:3(string) d:4(int) e:5(int) + ├── stats: [rows=3.3, distinct(1)=3.3, null(1)=0, distinct(2)=1, null(2)=0] + ├── key: (1) + ├── fd: ()-->(2), (1)-->(3-5) + └── scan abcde@good + ├── columns: a:1(int!null) b:2(int!null) c:3(string!null) d:4(int) + ├── constraint: /2/3/4/1: [/1/'+1-1000' - /1/'+1-1001') + ├── stats: [rows=1.089, distinct(1)=1.089, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1.089, null(3)=0] + ├── key: (1) + └── fd: ()-->(2), (1)-->(3,4) diff --git a/pkg/sql/opt/memo/testdata/stats/values b/pkg/sql/opt/memo/testdata/stats/values index dfc0d5c0948e..11447a51cf5c 100644 --- a/pkg/sql/opt/memo/testdata/stats/values +++ b/pkg/sql/opt/memo/testdata/stats/values @@ -4,7 +4,7 @@ SELECT * FROM (VALUES (1, 2), (1, 2), (1, 3), (2, 3)) AS q(x, y) WHERE x = 5 AND select ├── columns: x:1(int!null) y:2(int!null) ├── cardinality: [0 - 4] - ├── stats: [rows=1, distinct(1)=1, null(1)=0, distinct(2)=1, null(2)=0] + ├── stats: [rows=0.444444444, distinct(1)=0.444444444, null(1)=0, distinct(2)=0.444444444, null(2)=0] ├── fd: ()-->(1,2) ├── values │ ├── columns: column1:1(int) column2:2(int) @@ -54,7 +54,7 @@ SELECT * FROM (VALUES (1), (1), (1), (2)) AS q(x) WHERE x = 1 select ├── columns: x:1(int!null) ├── cardinality: [0 - 4] - ├── stats: [rows=2, distinct(1)=1, null(1)=0] + ├── stats: [rows=1.33333333, distinct(1)=1, null(1)=0] ├── fd: ()-->(1) ├── values │ ├── columns: column1:1(int) diff --git a/pkg/sql/opt/xform/testdata/rules/select b/pkg/sql/opt/xform/testdata/rules/select index e5d45d277bba..71b52b4123ea 100644 --- a/pkg/sql/opt/xform/testdata/rules/select +++ b/pkg/sql/opt/xform/testdata/rules/select @@ -562,19 +562,19 @@ memo (optimized, ~4KB) ├── G1: (select G2 G3) (select G4 G3) │ └── [presentation: k:1,u:2,v:3,j:4] │ ├── best: (select G4 G3) - │ └── cost: 407.09 + │ └── cost: 45.23 ├── G2: (scan b) ├── G3: (filters G5 G6) ├── G4: (index-join G7 b,cols=(1-4)) │ └── [] │ ├── best: (index-join G7 b,cols=(1-4)) - │ └── cost: 406.30 + │ └── cost: 45.14 ├── G5: (gt G8 G9) ├── G6: (lt G8 G10) ├── G7: (scan b@u,cols=(1,2),constrained) │ └── [] │ ├── best: (scan b@u,cols=(1,2),constrained) - │ └── cost: 82.37 + │ └── cost: 9.15 ├── G8: (tuple G11) ├── G9: (tuple G12) ├── G10: (tuple G13)