opt: fix selectivity estimates for index constraints
Unlike the constraints found in Select and Join filters, an index
constraint may represent multiple conjuncts. Therefore, the selectivity
estimate for a Scan should account for the selectivity of each
constrained column in the index constraint. This commit fixes the
selectivity estimation in the optimizer to properly account for
each constrained column in a Scan.

Fixes #31929

Release note (bug fix): In some cases the optimizer was choosing
the wrong index for a scan because of incorrect selectivity
estimation. This estimation error has been fixed.
rytaft committed Oct 26, 2018
1 parent a0bde06 commit 841498f
Showing 6 changed files with 113 additions and 47 deletions.
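
To make the per-column scheme concrete, here is a small, self-contained Go sketch (illustrative only, not CockroachDB code; the helper name, the 100-value distinct count, and the 1/3 default conjunct selectivity are assumptions based on the comments in the diff below):

package main

import "fmt"

// unknownFilterSelectivity mirrors the 1/3 default selectivity the optimizer
// assumes for a conjunct whose selectivity cannot be derived from distinct
// counts (assumed name and value).
const unknownFilterSelectivity = 1.0 / 3.0

// estimateScanRows sketches the post-fix behavior: every constrained column of
// an index constraint contributes to the Scan selectivity, either through a
// distinct-count ratio or through the default per-conjunct selectivity.
func estimateScanRows(tableRows float64, distinctRatios []float64, numUnappliedConjuncts int) float64 {
	selectivity := 1.0
	for _, r := range distinctRatios {
		selectivity *= r // new distinct count / old distinct count
	}
	for i := 0; i < numUnappliedConjuncts; i++ {
		selectivity *= unknownFilterSelectivity
	}
	return tableRows * selectivity
}

func main() {
	// For a constraint such as /b/c: [/1/'+1-1000' - /1/'+1-1001'), the
	// equality b = 1 is applied via distinct counts (1 of an assumed 100
	// values) and the range on c is counted as one unapplied conjunct.
	fmt.Printf("estimated rows: %.2f\n", estimateScanRows(1000, []float64{1.0 / 100.0}, 1))
}
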
22 changes: 22 additions & 0 deletions pkg/sql/opt/constraint/constraint.go
@@ -448,6 +448,28 @@ func (c *Constraint) ExactPrefix(evalCtx *tree.EvalContext) int {
}
}

// ConstrainedColumns returns the number of columns which are constrained by
// the Constraint. For example:
// /a/b/c: [/1/1 - /1] [/3 - /3]
// has 2 constrained columns. This may be less than the total number of columns
// in the constraint, especially if it represents an index constraint.
func (c *Constraint) ConstrainedColumns(evalCtx *tree.EvalContext) int {
count := 0
for i := 0; i < c.Spans.Count(); i++ {
sp := c.Spans.Get(i)
start := sp.StartKey()
end := sp.EndKey()
if start.Length() > count {
count = start.Length()
}
if end.Length() > count {
count = end.Length()
}
}

return count
}

// Prefix returns the length of the longest prefix of columns for which all the
// spans have the same start and end values. For example:
// /a/b/c: [/1/1/1 - /1/1/2] [/3/3/3 - /3/3/4]
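
As a quick sanity check of the new ConstrainedColumns method, the following standalone sketch (hypothetical, simplified types, not the real constraint package) applies the same longest-key rule to the example /a/b/c: [/1/1 - /1] [/3 - /3] from the comment above:

package main

import "fmt"

// span is a simplified stand-in that records only the start and end key lengths.
type span struct{ startLen, endLen int }

// constrainedColumns returns the longest start or end key across all spans,
// which is the rule the new method uses.
func constrainedColumns(spans []span) int {
	count := 0
	for _, sp := range spans {
		if sp.startLen > count {
			count = sp.startLen
		}
		if sp.endLen > count {
			count = sp.endLen
		}
	}
	return count
}

func main() {
	// [/1/1 - /1] has a start key of length 2 and an end key of length 1;
	// [/3 - /3] has length-1 keys on both sides, so 2 columns are constrained.
	fmt.Println(constrainedColumns([]span{{2, 1}, {1, 1}})) // prints 2
}
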
71 changes: 35 additions & 36 deletions pkg/sql/opt/memo/statistics_builder.go
@@ -454,22 +454,18 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relational
if scan.Constraint != nil {
// Calculate distinct counts for constrained columns
// -------------------------------------------------
applied := sb.applyConstraint(scan.Constraint, scan, relProps)
numUnappliedConjuncts := sb.applyIndexConstraint(scan.Constraint, scan, relProps)

var cols opt.ColSet
for i := 0; i < scan.Constraint.Columns.Count(); i++ {
for i := 0; i < scan.Constraint.ConstrainedColumns(sb.evalCtx); i++ {
cols.Add(int(scan.Constraint.Columns.Get(i).ID()))
}

// Calculate row count and selectivity
// -----------------------------------
inputRowCount := s.RowCount
if applied {
s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, scan, s))
} else {
numUnappliedConjuncts := sb.numConjunctsInConstraint(scan.Constraint)
s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
}
s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, scan, s))
s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))

// Set null counts to 0 for non-nullable columns
// -------------------------------------------------
@@ -1846,18 +1842,26 @@ func (sb *statisticsBuilder) applyFilter(
return numUnappliedConjuncts, constrainedCols
}

func (sb *statisticsBuilder) applyConstraint(
func (sb *statisticsBuilder) applyIndexConstraint(
c *constraint.Constraint, e RelExpr, relProps *props.Relational,
) (applied bool) {
) (numUnappliedConjuncts float64) {
// If unconstrained, then no constraint could be derived from the expression,
// so fall back to estimate.
// If a contradiction, then optimizations must not be enabled (say for
// testing), or else this would have been reduced.
if c.IsUnconstrained() || c.IsContradiction() {
return false /* applied */
return 0 /* numUnappliedConjuncts */
}

return sb.updateDistinctCountsFromConstraint(c, e, relProps)
applied := sb.updateDistinctCountsFromConstraint(c, e, relProps)
for i, n := applied, c.ConstrainedColumns(sb.evalCtx); i < n; i++ {
// Unlike the constraints found in Select and Join filters, an index
// constraint may represent multiple conjuncts. Therefore, we need to
// calculate the number of unapplied conjuncts for each constrained column.
numUnappliedConjuncts += sb.numConjunctsInConstraint(c, i)
}

return numUnappliedConjuncts
}
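
For intuition, a hypothetical trace of applyIndexConstraint on the regression-test constraint added below, /2/3/4/1: [/1/'+1-1000' - /1/'+1-1001') (a runnable sketch with hard-coded values; the real code consults the column statistics and span keys):

package main

import "fmt"

func main() {
	applied := 1         // updateDistinctCountsFromConstraint infers distinct(b) = 1 from the equality prefix
	constrainedCols := 2 // ConstrainedColumns: both b and c appear in the span keys
	numUnappliedConjuncts := 0.0
	for i := applied; i < constrainedCols; i++ {
		// numConjunctsInConstraint for column c: the string range cannot be
		// turned into a distinct count, so it counts as one extra conjunct.
		numUnappliedConjuncts++
	}
	fmt.Println(numUnappliedConjuncts) // 1
}
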

func (sb *statisticsBuilder) applyConstraintSet(
@@ -1874,13 +1878,13 @@ func (sb *statisticsBuilder) applyConstraintSet(
numUnappliedConjuncts = 0
for i := 0; i < cs.Length(); i++ {
applied := sb.updateDistinctCountsFromConstraint(cs.Constraint(i), e, relProps)
if !applied {
if applied == 0 {
// If a constraint cannot be applied, it may represent an
// inequality like x < 1. As a result, distinctCounts does not fully
// represent the selectivity of the constraint set.
// We return an estimate of the number of unapplied conjuncts to the
// caller function to be used for selectivity calculation.
numUnappliedConjuncts += sb.numConjunctsInConstraint(cs.Constraint(i))
numUnappliedConjuncts += sb.numConjunctsInConstraint(cs.Constraint(i), 0 /* nth */)
}
}

@@ -1923,10 +1927,10 @@ func (sb *statisticsBuilder) updateNullCountsFromProps(

// updateDistinctCountsFromConstraint updates the distinct count for each
// column in a constraint that can be determined to have a finite number of
// possible values. It returns a boolean indicating if the constraint was
// applied (i.e., the distinct count for at least one column could be inferred
// from the constraint). If the same column appears in multiple constraints,
// the distinct count is the minimum for that column across all constraints.
// possible values. It returns the number of columns for which the distinct
// count could be inferred from the constraint. If the same column appears
// in multiple constraints, the distinct count is the minimum for that column
// across all constraints.
//
// For example, consider the following constraint set:
//
@@ -1952,7 +1956,7 @@ func (sb *statisticsBuilder) updateNullCountsFromProps(
// discrepancy must be resolved by the calling function.
func (sb *statisticsBuilder) updateDistinctCountsFromConstraint(
c *constraint.Constraint, e RelExpr, relProps *props.Relational,
) (applied bool) {
) (applied int) {
// All of the columns that are part of the prefix have a finite number of
// distinct values.
prefix := c.Prefix(sb.evalCtx)
@@ -2024,7 +2028,7 @@ func (sb *statisticsBuilder) updateDistinctCountsFromConstraint(

colID := c.Columns.Get(col).ID()
sb.ensureColStat(util.MakeFastIntSet(int(colID)), distinctCount, e, relProps)
applied = true
applied = col + 1
}

return applied
@@ -2072,8 +2076,8 @@ func (sb *statisticsBuilder) updateDistinctNullCountsFromEquivalency(
}

// selectivityFromDistinctCounts calculates the selectivity of a filter by
// taking the product of selectivities of each constrained column. In the general case,
// this can be represented by the formula:
// taking the product of selectivities of each constrained column. In the
// general case, this can be represented by the formula:
//
// ┬-┬ ⎛ new distinct(i) ⎞
// selectivity = │ │ ⎜ --------------- ⎟
@@ -2102,7 +2106,7 @@ func (sb *statisticsBuilder) selectivityFromDistinctCounts(
oldDistinct := inputStat.DistinctCount

if oldDistinct != 0 && newDistinct < oldDistinct {
selectivity *= newDistinct / oldDistinct
selectivity *= min(newDistinct/oldDistinct, unknownFilterSelectivity)
}
}

@@ -2313,9 +2317,9 @@ func isEqualityWithTwoVars(cond opt.ScalarExpr) bool {
}

// numConjunctsInConstraint returns a rough estimate of the number of conjuncts
// used to build the given constraint.
// used to build the given constraint for the column at position nth.
func (sb *statisticsBuilder) numConjunctsInConstraint(
c *constraint.Constraint,
c *constraint.Constraint, nth int,
) (numConjuncts float64) {
if c.Spans.Count() == 0 {
return 0 /* numConjuncts */
@@ -2325,25 +2329,20 @@
for i := 0; i < c.Spans.Count(); i++ {
span := c.Spans.Get(i)
numSpanConjuncts := float64(0)
// The first start and end keys in each span are the only ones that matter
// for determining selectivity when we have no knowledge of the data
// distribution. Technically, /a/b: [/5 - ] is more selective than
// /a/b: [/4/5 - ], which is more selective than /a/b: [/4 - ]. But we
// treat them all the same, with selectivity=1/3.
if span.StartKey().Length() > 0 {
if span.StartKey().Length() > nth {
// Cases of NULL in a constraint should be ignored. For example,
// without knowledge of the data distribution, /a: (/NULL - /10] should
// have the same estimated selectivity as /a: [/10 - ]. Selectivity
// of NULL constraints is handled in selectivityFromNullCounts.
if c.Columns.Get(0).Descending() ||
span.StartKey().Value(0) != tree.DNull {
if c.Columns.Get(nth).Descending() ||
span.StartKey().Value(nth) != tree.DNull {
numSpanConjuncts++
}
}
if span.EndKey().Length() > 0 {
if span.EndKey().Length() > nth {
// Ignore cases of NULL in constraints. (see above comment).
if !c.Columns.Get(0).Descending() ||
span.EndKey().Value(0) != tree.DNull {
if !c.Columns.Get(nth).Descending() ||
span.EndKey().Value(nth) != tree.DNull {
numSpanConjuncts++
}
}
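
The per-column, NULL-aware counting above can be illustrated with a standalone sketch (simplified stand-ins for the key and column types; it ignores the ascending/descending handling of the real function):

package main

import "fmt"

// key is a simplified stand-in for a constraint key: a list of datum strings,
// where "NULL" marks a NULL boundary value.
type key []string

// conjunctsForColumn mirrors the idea of numConjunctsInConstraint for a single
// span: a boundary only counts for column nth if its key is long enough and
// the boundary value is not NULL.
func conjunctsForColumn(start, end key, nth int) float64 {
	var n float64
	if len(start) > nth && start[nth] != "NULL" {
		n++
	}
	if len(end) > nth && end[nth] != "NULL" {
		n++
	}
	return n
}

func main() {
	// /a: (/NULL - /10]: the NULL start boundary is ignored, the end boundary
	// counts, so a filter like a <= 10 is one conjunct for column 0.
	fmt.Println(conjunctsForColumn(key{"NULL"}, key{"10"}, 0)) // 1
	// /a/b: [/2 - /5/6]: column 0 is bounded on both sides (2 conjuncts),
	// column 1 only by the end key (1 conjunct).
	fmt.Println(conjunctsForColumn(key{"2"}, key{"5", "6"}, 0), conjunctsForColumn(key{"2"}, key{"5", "6"}, 1)) // 2 1
}
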
2 changes: 1 addition & 1 deletion pkg/sql/opt/memo/testdata/stats/project
@@ -142,7 +142,7 @@ SELECT * FROM (SELECT y + 3 AS v FROM a) WHERE v >= 1 AND v <= 100
----
select
├── columns: v:5(int!null)
├── stats: [rows=990, distinct(5)=100, null(5)=0]
├── stats: [rows=660, distinct(5)=100, null(5)=0]
├── project
│ ├── columns: v:5(int)
│ ├── stats: [rows=2000, distinct(5)=200, null(5)=20]
55 changes: 50 additions & 5 deletions pkg/sql/opt/memo/testdata/stats/scan
@@ -41,7 +41,7 @@ SELECT * FROM a WHERE b
----
select
├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null) b:5(bool!null)
├── stats: [rows=990, distinct(1)=990, null(1)=0, distinct(4)=199.804688, null(4)=0, distinct(5)=1, null(5)=0]
├── stats: [rows=660, distinct(1)=660, null(1)=0, distinct(4)=196.531694, null(4)=0, distinct(5)=1, null(5)=0]
├── key: (1)
├── fd: ()-->(5), (1)-->(2-4), (3,4)~~>(1,2)
├── scan a
@@ -246,13 +246,13 @@ index-join a
├── fd: (1)-->(2-5), (3,4)-->(1,2,5)
└── select
├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
├── stats: [rows=111.111111, distinct(1)=110.489355, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=97.0976681, null(4)=0]
├── stats: [rows=37.037037, distinct(1)=36.9747958, null(1)=0, distinct(3)=1.99999999, null(3)=0, distinct(4)=35.7721483, null(4)=0]
├── key: (1)
├── fd: (1)-->(3,4), (3,4)-->(1)
├── scan a@secondary
│ ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
│ ├── constraint: /-3/4: [ - /'foobar'/5.0] [/'foo' - /'bar'/5.0]
│ ├── stats: [rows=1000, distinct(1)=911.337892, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=294.797541, null(4)=0]
│ ├── stats: [rows=333.333333, distinct(1)=323.895037, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=207.616156, null(4)=0]
│ ├── key: (1)
│ └── fd: (1)-->(3,4), (3,4)-->(1)
└── filters
@@ -407,15 +407,60 @@ index-join a
├── fd: (1)-->(2-5), (3,4)-->(1,2,5)
└── select
├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
├── stats: [rows=74.0740741, distinct(1)=74.0740741, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=68.9343053, null(4)=0]
├── stats: [rows=24.691358, distinct(1)=24.691358, null(1)=0, distinct(3)=1.99999586, null(3)=0, distinct(4)=24.5913408, null(4)=0]
├── key: (1)
├── fd: (1)-->(3,4), (3,4)-->(1)
├── scan a@secondary
│ ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
│ ├── constraint: /-3/4: [ - /'foobar'/5.0] [/'foo' - /'bar'/5.0]
│ ├── stats: [rows=666.666667, distinct(1)=666.666667, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=294.797541, null(4)=0]
│ ├── stats: [rows=222.222222, distinct(1)=222.222222, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=207.616156, null(4)=0]
│ ├── key: (1)
│ └── fd: (1)-->(3,4), (3,4)-->(1)
└── filters
├── (s <= 'foo') OR (s >= 'foobar') [type=bool, outer=(3)]
└── d <= 5.0 [type=bool, outer=(4), constraints=(/4: (/NULL - /5.0]; tight)]

exec-ddl
CREATE TABLE abcde (
a INT PRIMARY KEY,
b INT,
c STRING,
d INT,
e INT,
INDEX bad(b, d),
INDEX good(b, c, d)
)
----
TABLE abcde
├── a int not null
├── b int
├── c string
├── d int
├── e int
├── INDEX primary
│ └── a int not null
├── INDEX bad
│ ├── b int
│ ├── d int
│ └── a int not null
└── INDEX good
├── b int
├── c string
├── d int
└── a int not null

# Regression test for #31929. Ensure that the good index is chosen.
opt
SELECT * FROM abcde WHERE b = 1 AND c LIKE '+1-1000%'
----
index-join abcde
├── columns: a:1(int!null) b:2(int!null) c:3(string) d:4(int) e:5(int)
├── stats: [rows=3.3, distinct(1)=3.3, null(1)=0, distinct(2)=1, null(2)=0]
├── key: (1)
├── fd: ()-->(2), (1)-->(3-5)
└── scan abcde@good
├── columns: a:1(int!null) b:2(int!null) c:3(string!null) d:4(int)
├── constraint: /2/3/4/1: [/1/'+1-1000' - /1/'+1-1001')
├── stats: [rows=1.089, distinct(1)=1.089, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1.089, null(3)=0]
├── key: (1)
└── fd: ()-->(2), (1)-->(3,4)
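
In rough numbers, and assuming for simplicity that each conjunct contributes the 1/3 default selectivity (the plan's exact figures come from the cost model and default statistics, not this arithmetic), the per-column accounting is what makes the good index cheaper:

package main

import "fmt"

func main() {
	const tableRows, defaultSel = 1000.0, 1.0 / 3.0
	// Index bad(b, d): only b = 1 can constrain the scan, so c must be
	// filtered after the index join.
	badScanRows := tableRows * defaultSel
	// Index good(b, c, d): the constraint /2/3/4/1: [/1/'+1-1000' - /1/'+1-1001')
	// has two constrained columns, so the LIKE prefix on c also shrinks the
	// scan estimate.
	goodScanRows := tableRows * defaultSel * defaultSel
	fmt.Println(badScanRows, goodScanRows) // the good scan reads ~3x fewer rows
}
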
4 changes: 2 additions & 2 deletions pkg/sql/opt/memo/testdata/stats/values
@@ -4,7 +4,7 @@ SELECT * FROM (VALUES (1, 2), (1, 2), (1, 3), (2, 3)) AS q(x, y) WHERE x = 5 AND
select
├── columns: x:1(int!null) y:2(int!null)
├── cardinality: [0 - 4]
├── stats: [rows=1, distinct(1)=1, null(1)=0, distinct(2)=1, null(2)=0]
├── stats: [rows=0.444444444, distinct(1)=0.444444444, null(1)=0, distinct(2)=0.444444444, null(2)=0]
├── fd: ()-->(1,2)
├── values
│ ├── columns: column1:1(int) column2:2(int)
@@ -54,7 +54,7 @@ SELECT * FROM (VALUES (1), (1), (1), (2)) AS q(x) WHERE x = 1
select
├── columns: x:1(int!null)
├── cardinality: [0 - 4]
├── stats: [rows=2, distinct(1)=1, null(1)=0]
├── stats: [rows=1.33333333, distinct(1)=1, null(1)=0]
├── fd: ()-->(1)
├── values
│ ├── columns: column1:1(int)
6 changes: 3 additions & 3 deletions pkg/sql/opt/xform/testdata/rules/select
@@ -562,19 +562,19 @@ memo (optimized, ~4KB)
├── G1: (select G2 G3) (select G4 G3)
│ └── [presentation: k:1,u:2,v:3,j:4]
│ ├── best: (select G4 G3)
│ └── cost: 407.09
│ └── cost: 45.23
├── G2: (scan b)
├── G3: (filters G5 G6)
├── G4: (index-join G7 b,cols=(1-4))
│ └── []
│ ├── best: (index-join G7 b,cols=(1-4))
│ └── cost: 406.30
│ └── cost: 45.14
├── G5: (gt G8 G9)
├── G6: (lt G8 G10)
├── G7: (scan b@u,cols=(1,2),constrained)
│ └── []
│ ├── best: (scan b@u,cols=(1,2),constrained)
│ └── cost: 82.37
│ └── cost: 9.15
├── G8: (tuple G11)
├── G9: (tuple G12)
├── G10: (tuple G13)
