From 841498ff589a678906e27c8e4bde4238af52c125 Mon Sep 17 00:00:00 2001
From: Rebecca Taft <becca@cockroachlabs.com>
Date: Fri, 26 Oct 2018 16:07:42 -0400
Subject: [PATCH] opt: fix selectivity estimates for index constraints

Unlike the constraints found in Select and Join filters, an index
constraint may represent multiple conjuncts. Therefore, the selectivity
estimate for a Scan should account for the selectivity of each
constrained column in the index constraint. This commit fixes the
selectivity estimation in the optimizer to properly account for
each constrained column in a Scan.

Fixes #31929

Release note (bug fix): In some cases the optimizer was choosing
the wrong index for a scan because of incorrect selectivity
estimation. This estimation error has been fixed.
---
 pkg/sql/opt/constraint/constraint.go    | 22 ++++++++
 pkg/sql/opt/memo/statistics_builder.go  | 71 ++++++++++++-------------
 pkg/sql/opt/memo/testdata/stats/project |  2 +-
 pkg/sql/opt/memo/testdata/stats/scan    | 55 +++++++++++++++++--
 pkg/sql/opt/memo/testdata/stats/values  |  4 +-
 pkg/sql/opt/xform/testdata/rules/select |  6 +--
 6 files changed, 113 insertions(+), 47 deletions(-)

diff --git a/pkg/sql/opt/constraint/constraint.go b/pkg/sql/opt/constraint/constraint.go
index 9bb19e172fb5..ddd12849ca9b 100644
--- a/pkg/sql/opt/constraint/constraint.go
+++ b/pkg/sql/opt/constraint/constraint.go
@@ -448,6 +448,28 @@ func (c *Constraint) ExactPrefix(evalCtx *tree.EvalContext) int {
 	}
 }
 
+// ConstrainedColumns returns the number of leading columns constrained by the
+// Constraint (the longest start or end key over all spans). For example:
+//   /a/b/c: [/1/1 - /1] [/3 - /3]
+// has 2 constrained columns. This may be less than the total number of columns
+// in the constraint, e.g. for an index constraint. evalCtx is unused here.
+func (c *Constraint) ConstrainedColumns(evalCtx *tree.EvalContext) int {
+	count := 0
+	for i := 0; i < c.Spans.Count(); i++ {
+		sp := c.Spans.Get(i)
+		start := sp.StartKey()
+		end := sp.EndKey()
+		if start.Length() > count {
+			count = start.Length()
+		}
+		if end.Length() > count {
+			count = end.Length()
+		}
+	}
+
+	return count
+}
+
 // Prefix returns the length of the longest prefix of columns for which all the
 // spans have the same start and end values. For example:
 //   /a/b/c: [/1/1/1 - /1/1/2] [/3/3/3 - /3/3/4]
diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go
index a291d99d3c90..71d7fe4fd4b7 100644
--- a/pkg/sql/opt/memo/statistics_builder.go
+++ b/pkg/sql/opt/memo/statistics_builder.go
@@ -454,22 +454,18 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa
 	if scan.Constraint != nil {
 		// Calculate distinct counts for constrained columns
 		// -------------------------------------------------
-		applied := sb.applyConstraint(scan.Constraint, scan, relProps)
+		numUnappliedConjuncts := sb.applyIndexConstraint(scan.Constraint, scan, relProps)
 
 		var cols opt.ColSet
-		for i := 0; i < scan.Constraint.Columns.Count(); i++ {
+		for i := 0; i < scan.Constraint.ConstrainedColumns(sb.evalCtx); i++ {
 			cols.Add(int(scan.Constraint.Columns.Get(i).ID()))
 		}
 
 		// Calculate row count and selectivity
 		// -----------------------------------
 		inputRowCount := s.RowCount
-		if applied {
-			s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, scan, s))
-		} else {
-			numUnappliedConjuncts := sb.numConjunctsInConstraint(scan.Constraint)
-			s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
-		}
+		s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, scan, s))
+		s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
 
 		// Set null counts to 0 for non-nullable columns
 		// -------------------------------------------------
@@ -1846,18 +1842,26 @@ func (sb *statisticsBuilder) applyFilter(
 	return numUnappliedConjuncts, constrainedCols
 }
 
-func (sb *statisticsBuilder) applyConstraint(
+func (sb *statisticsBuilder) applyIndexConstraint(
 	c *constraint.Constraint, e RelExpr, relProps *props.Relational,
-) (applied bool) {
+) (numUnappliedConjuncts float64) {
 	// If unconstrained, then no constraint could be derived from the expression,
 	// so fall back to estimate.
 	// If a contradiction, then optimizations must not be enabled (say for
 	// testing), or else this would have been reduced.
 	if c.IsUnconstrained() || c.IsContradiction() {
-		return false /* applied */
+		return 0 /* numUnappliedConjuncts */
 	}
 
-	return sb.updateDistinctCountsFromConstraint(c, e, relProps)
+	applied := sb.updateDistinctCountsFromConstraint(c, e, relProps)
+	for i, n := applied, c.ConstrainedColumns(sb.evalCtx); i < n; i++ {
+		// Unlike the constraints found in Select and Join filters, an index
+		// constraint may represent multiple conjuncts. Therefore, we need to
+		// calculate the number of unapplied conjuncts for each constrained column.
+		numUnappliedConjuncts += sb.numConjunctsInConstraint(c, i)
+	}
+
+	return numUnappliedConjuncts
 }
 
 func (sb *statisticsBuilder) applyConstraintSet(
@@ -1874,13 +1878,13 @@ func (sb *statisticsBuilder) applyConstraintSet(
 	numUnappliedConjuncts = 0
 	for i := 0; i < cs.Length(); i++ {
 		applied := sb.updateDistinctCountsFromConstraint(cs.Constraint(i), e, relProps)
-		if !applied {
+		if applied == 0 {
 			// If a constraint cannot be applied, it may represent an
 			// inequality like x < 1. As a result, distinctCounts does not fully
 			// represent the selectivity of the constraint set.
 			// We return an estimate of the number of unapplied conjuncts to the
 			// caller function to be used for selectivity calculation.
-			numUnappliedConjuncts += sb.numConjunctsInConstraint(cs.Constraint(i))
+			numUnappliedConjuncts += sb.numConjunctsInConstraint(cs.Constraint(i), 0 /* nth */)
 		}
 	}
 
@@ -1923,10 +1927,10 @@ func (sb *statisticsBuilder) updateNullCountsFromProps(
 
 // updateDistinctCountsFromConstraint updates the distinct count for each
 // column in a constraint that can be determined to have a finite number of
-// possible values. It returns a boolean indicating if the constraint was
-// applied (i.e., the distinct count for at least one column could be inferred
-// from the constraint). If the same column appears in multiple constraints,
-// the distinct count is the minimum for that column across all constraints.
+// possible values. It returns the number of columns for which the distinct
+// count could be inferred from the constraint. If the same column appears
+// in multiple constraints, the distinct count is the minimum for that column
+// across all constraints.
 //
 // For example, consider the following constraint set:
 //
@@ -1952,7 +1956,7 @@ func (sb *statisticsBuilder) updateNullCountsFromProps(
 // discrepancy must be resolved by the calling function.
 func (sb *statisticsBuilder) updateDistinctCountsFromConstraint(
 	c *constraint.Constraint, e RelExpr, relProps *props.Relational,
-) (applied bool) {
+) (applied int) {
 	// All of the columns that are part of the prefix have a finite number of
 	// distinct values.
 	prefix := c.Prefix(sb.evalCtx)
@@ -2024,7 +2028,7 @@ func (sb *statisticsBuilder) updateDistinctCountsFromConstraint(
 
 		colID := c.Columns.Get(col).ID()
 		sb.ensureColStat(util.MakeFastIntSet(int(colID)), distinctCount, e, relProps)
-		applied = true
+		applied = col + 1
 	}
 
 	return applied
@@ -2072,8 +2076,8 @@ func (sb *statisticsBuilder) updateDistinctNullCountsFromEquivalency(
 }
 
 // selectivityFromDistinctCounts calculates the selectivity of a filter by
-// taking the product of selectivities of each constrained column. In the general case,
-// this can be represented by the formula:
+// taking the product of selectivities of each constrained column. In the
+// general case, this can be represented by the formula:
 //
 //                  ┬-┬ ⎛ new distinct(i) ⎞
 //   selectivity =  │ │ ⎜ --------------- ⎟
@@ -2102,7 +2106,7 @@ func (sb *statisticsBuilder) selectivityFromDistinctCounts(
 		oldDistinct := inputStat.DistinctCount
 
 		if oldDistinct != 0 && newDistinct < oldDistinct {
-			selectivity *= newDistinct / oldDistinct
+			selectivity *= min(newDistinct/oldDistinct, unknownFilterSelectivity)
 		}
 	}
 
@@ -2313,9 +2317,9 @@ func isEqualityWithTwoVars(cond opt.ScalarExpr) bool {
 }
 
 // numConjunctsInConstraint returns a rough estimate of the number of conjuncts
-// used to build the given constraint.
+// used to build the given constraint for the column at position nth.
 func (sb *statisticsBuilder) numConjunctsInConstraint(
-	c *constraint.Constraint,
+	c *constraint.Constraint, nth int,
 ) (numConjuncts float64) {
 	if c.Spans.Count() == 0 {
 		return 0 /* numConjuncts */
@@ -2325,25 +2329,20 @@ func (sb *statisticsBuilder) numConjunctsInConstraint(
 	for i := 0; i < c.Spans.Count(); i++ {
 		span := c.Spans.Get(i)
 		numSpanConjuncts := float64(0)
-		// The first start and end keys in each span are the only ones that matter
-		// for determining selectivity when we have no knowledge of the data
-		// distribution. Technically, /a/b: [/5 - ] is more selective than
-		// /a/b: [/4/5 - ], which is more selective than /a/b: [/4 - ]. But we
-		// treat them all the same, with selectivity=1/3.
-		if span.StartKey().Length() > 0 {
+		if span.StartKey().Length() > nth {
 			// Cases of NULL in a constraint should be ignored. For example,
 			// without knowledge of the data distribution, /a: (/NULL - /10] should
 			// have the same estimated selectivity as /a: [/10 - ]. Selectivity
 			// of NULL constraints is handled in selectivityFromNullCounts.
-			if c.Columns.Get(0).Descending() ||
-				span.StartKey().Value(0) != tree.DNull {
+			if c.Columns.Get(nth).Descending() ||
+				span.StartKey().Value(nth) != tree.DNull {
 				numSpanConjuncts++
 			}
 		}
-		if span.EndKey().Length() > 0 {
+		if span.EndKey().Length() > nth {
 			// Ignore cases of NULL in constraints. (see above comment).
-			if !c.Columns.Get(0).Descending() ||
-				span.EndKey().Value(0) != tree.DNull {
+			if !c.Columns.Get(nth).Descending() ||
+				span.EndKey().Value(nth) != tree.DNull {
 				numSpanConjuncts++
 			}
 		}
diff --git a/pkg/sql/opt/memo/testdata/stats/project b/pkg/sql/opt/memo/testdata/stats/project
index 329bf145dcf6..a590a5765c1a 100644
--- a/pkg/sql/opt/memo/testdata/stats/project
+++ b/pkg/sql/opt/memo/testdata/stats/project
@@ -142,7 +142,7 @@ SELECT * FROM (SELECT y + 3 AS v FROM a) WHERE v >= 1 AND v <= 100
 ----
 select
  ├── columns: v:5(int!null)
- ├── stats: [rows=990, distinct(5)=100, null(5)=0]
+ ├── stats: [rows=660, distinct(5)=100, null(5)=0]
  ├── project
  │    ├── columns: v:5(int)
  │    ├── stats: [rows=2000, distinct(5)=200, null(5)=20]
diff --git a/pkg/sql/opt/memo/testdata/stats/scan b/pkg/sql/opt/memo/testdata/stats/scan
index f7186c1b56a2..4a1d1a28d12f 100644
--- a/pkg/sql/opt/memo/testdata/stats/scan
+++ b/pkg/sql/opt/memo/testdata/stats/scan
@@ -41,7 +41,7 @@ SELECT * FROM a WHERE b
 ----
 select
  ├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null) b:5(bool!null)
- ├── stats: [rows=990, distinct(1)=990, null(1)=0, distinct(4)=199.804688, null(4)=0, distinct(5)=1, null(5)=0]
+ ├── stats: [rows=660, distinct(1)=660, null(1)=0, distinct(4)=196.531694, null(4)=0, distinct(5)=1, null(5)=0]
  ├── key: (1)
  ├── fd: ()-->(5), (1)-->(2-4), (3,4)~~>(1,2)
  ├── scan a
@@ -246,13 +246,13 @@ index-join a
  ├── fd: (1)-->(2-5), (3,4)-->(1,2,5)
  └── select
       ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
-      ├── stats: [rows=111.111111, distinct(1)=110.489355, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=97.0976681, null(4)=0]
+      ├── stats: [rows=37.037037, distinct(1)=36.9747958, null(1)=0, distinct(3)=1.99999999, null(3)=0, distinct(4)=35.7721483, null(4)=0]
       ├── key: (1)
       ├── fd: (1)-->(3,4), (3,4)-->(1)
       ├── scan a@secondary
       │    ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
       │    ├── constraint: /-3/4: [ - /'foobar'/5.0] [/'foo' - /'bar'/5.0]
-      │    ├── stats: [rows=1000, distinct(1)=911.337892, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=294.797541, null(4)=0]
+      │    ├── stats: [rows=333.333333, distinct(1)=323.895037, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=207.616156, null(4)=0]
       │    ├── key: (1)
       │    └── fd: (1)-->(3,4), (3,4)-->(1)
       └── filters
@@ -407,15 +407,60 @@ index-join a
  ├── fd: (1)-->(2-5), (3,4)-->(1,2,5)
  └── select
       ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
-      ├── stats: [rows=74.0740741, distinct(1)=74.0740741, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=68.9343053, null(4)=0]
+      ├── stats: [rows=24.691358, distinct(1)=24.691358, null(1)=0, distinct(3)=1.99999586, null(3)=0, distinct(4)=24.5913408, null(4)=0]
       ├── key: (1)
       ├── fd: (1)-->(3,4), (3,4)-->(1)
       ├── scan a@secondary
       │    ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
       │    ├── constraint: /-3/4: [ - /'foobar'/5.0] [/'foo' - /'bar'/5.0]
-      │    ├── stats: [rows=666.666667, distinct(1)=666.666667, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=294.797541, null(4)=0]
+      │    ├── stats: [rows=222.222222, distinct(1)=222.222222, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=207.616156, null(4)=0]
       │    ├── key: (1)
       │    └── fd: (1)-->(3,4), (3,4)-->(1)
       └── filters
            ├── (s <= 'foo') OR (s >= 'foobar') [type=bool, outer=(3)]
            └── d <= 5.0 [type=bool, outer=(4), constraints=(/4: (/NULL - /5.0]; tight)]
+
+exec-ddl
+CREATE TABLE abcde (
+  a INT PRIMARY KEY,
+  b INT,
+  c STRING,
+  d INT,
+  e INT,
+  INDEX bad(b, d),
+  INDEX good(b, c, d)
+)
+----
+TABLE abcde
+ ├── a int not null
+ ├── b int
+ ├── c string
+ ├── d int
+ ├── e int
+ ├── INDEX primary
+ │    └── a int not null
+ ├── INDEX bad
+ │    ├── b int
+ │    ├── d int
+ │    └── a int not null
+ └── INDEX good
+      ├── b int
+      ├── c string
+      ├── d int
+      └── a int not null
+
+# Regression test for #31929. Ensure that the good index is chosen.
+opt
+SELECT * FROM abcde WHERE b = 1 AND c LIKE '+1-1000%'
+----
+index-join abcde
+ ├── columns: a:1(int!null) b:2(int!null) c:3(string) d:4(int) e:5(int)
+ ├── stats: [rows=3.3, distinct(1)=3.3, null(1)=0, distinct(2)=1, null(2)=0]
+ ├── key: (1)
+ ├── fd: ()-->(2), (1)-->(3-5)
+ └── scan abcde@good
+      ├── columns: a:1(int!null) b:2(int!null) c:3(string!null) d:4(int)
+      ├── constraint: /2/3/4/1: [/1/'+1-1000' - /1/'+1-1001')
+      ├── stats: [rows=1.089, distinct(1)=1.089, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1.089, null(3)=0]
+      ├── key: (1)
+      └── fd: ()-->(2), (1)-->(3,4)
diff --git a/pkg/sql/opt/memo/testdata/stats/values b/pkg/sql/opt/memo/testdata/stats/values
index dfc0d5c0948e..11447a51cf5c 100644
--- a/pkg/sql/opt/memo/testdata/stats/values
+++ b/pkg/sql/opt/memo/testdata/stats/values
@@ -4,7 +4,7 @@ SELECT * FROM (VALUES (1, 2), (1, 2), (1, 3), (2, 3)) AS q(x, y) WHERE x = 5 AND
 select
  ├── columns: x:1(int!null) y:2(int!null)
  ├── cardinality: [0 - 4]
- ├── stats: [rows=1, distinct(1)=1, null(1)=0, distinct(2)=1, null(2)=0]
+ ├── stats: [rows=0.444444444, distinct(1)=0.444444444, null(1)=0, distinct(2)=0.444444444, null(2)=0]
  ├── fd: ()-->(1,2)
  ├── values
  │    ├── columns: column1:1(int) column2:2(int)
@@ -54,7 +54,7 @@ SELECT * FROM (VALUES (1), (1), (1), (2)) AS q(x) WHERE x = 1
 select
  ├── columns: x:1(int!null)
  ├── cardinality: [0 - 4]
- ├── stats: [rows=2, distinct(1)=1, null(1)=0]
+ ├── stats: [rows=1.33333333, distinct(1)=1, null(1)=0]
  ├── fd: ()-->(1)
  ├── values
  │    ├── columns: column1:1(int)
diff --git a/pkg/sql/opt/xform/testdata/rules/select b/pkg/sql/opt/xform/testdata/rules/select
index e5d45d277bba..71b52b4123ea 100644
--- a/pkg/sql/opt/xform/testdata/rules/select
+++ b/pkg/sql/opt/xform/testdata/rules/select
@@ -562,19 +562,19 @@ memo (optimized, ~4KB)
  ├── G1: (select G2 G3) (select G4 G3)
  │    └── [presentation: k:1,u:2,v:3,j:4]
  │         ├── best: (select G4 G3)
- │         └── cost: 407.09
+ │         └── cost: 45.23
  ├── G2: (scan b)
  ├── G3: (filters G5 G6)
  ├── G4: (index-join G7 b,cols=(1-4))
  │    └── []
  │         ├── best: (index-join G7 b,cols=(1-4))
- │         └── cost: 406.30
+ │         └── cost: 45.14
  ├── G5: (gt G8 G9)
  ├── G6: (lt G8 G10)
  ├── G7: (scan b@u,cols=(1,2),constrained)
  │    └── []
  │         ├── best: (scan b@u,cols=(1,2),constrained)
- │         └── cost: 82.37
+ │         └── cost: 9.15
  ├── G8: (tuple G11)
  ├── G9: (tuple G12)
  ├── G10: (tuple G13)