opt: fix selectivity estimates for index constraints
Unlike the constraints found in Select and Join filters, an index
constraint may represent multiple conjuncts. Therefore, the selectivity
estimate for a Scan should account for the selectivity of each
constrained column in the index constraint. This commit fixes the
selectivity estimation in the optimizer to properly account for
each constrained column in a Scan.

Fixes #31929

Release note (bug fix): In some cases the optimizer was choosing
the wrong index for a scan because of incorrect selectivity
estimation. This estimation error has been fixed.
rytaft committed Oct 26, 2018
1 parent a0bde06 commit 841498f
Showing 6 changed files with 113 additions and 47 deletions.
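
To make the per-column scheme concrete, here is a small, self-contained Go sketch (illustrative only, not CockroachDB code; the helper name, the 100-value distinct count, and the 1/3 default conjunct selectivity are assumptions based on the comments in the diff below):

package main

import "fmt"

// unknownFilterSelectivity mirrors the 1/3 default selectivity the optimizer
// assumes for a conjunct whose selectivity cannot be derived from distinct
// counts (assumed name and value).
const unknownFilterSelectivity = 1.0 / 3.0

// estimateScanRows sketches the post-fix behavior: every constrained column of
// an index constraint contributes to the Scan selectivity, either through a
// distinct-count ratio or through the default per-conjunct selectivity.
func estimateScanRows(tableRows float64, distinctRatios []float64, numUnappliedConjuncts int) float64 {
	selectivity := 1.0
	for _, r := range distinctRatios {
		selectivity *= r // new distinct count / old distinct count
	}
	for i := 0; i < numUnappliedConjuncts; i++ {
		selectivity *= unknownFilterSelectivity
	}
	return tableRows * selectivity
}

func main() {
	// For a constraint such as /b/c: [/1/'+1-1000' - /1/'+1-1001'), the
	// equality b = 1 is applied via distinct counts (1 of an assumed 100
	// values) and the range on c is counted as one unapplied conjunct.
	fmt.Printf("estimated rows: %.2f\n", estimateScanRows(1000, []float64{1.0 / 100.0}, 1))
}
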
22 changes: 22 additions & 0 deletions pkg/sql/opt/constraint/constraint.go
@@ -448,6 +448,28 @@ func (c *Constraint) ExactPrefix(evalCtx *tree.EvalContext) int {
}
}

// ConstrainedColumns returns the number of columns which are constrained by
// the Constraint. For example:
// /a/b/c: [/1/1 - /1] [/3 - /3]
// has 2 constrained columns. This may be less than the total number of columns
// in the constraint, especially if it represents an index constraint.
func (c *Constraint) ConstrainedColumns(evalCtx *tree.EvalContext) int {
count := 0
for i := 0; i < c.Spans.Count(); i++ {
sp := c.Spans.Get(i)
start := sp.StartKey()
end := sp.EndKey()
if start.Length() > count {
count = start.Length()
}
if end.Length() > count {
count = end.Length()
}
}

return count
}

// Prefix returns the length of the longest prefix of columns for which all the
// spans have the same start and end values. For example:
// /a/b/c: [/1/1/1 - /1/1/2] [/3/3/3 - /3/3/4]
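
As a quick sanity check of the new ConstrainedColumns method, the following standalone sketch (hypothetical, simplified types, not the real constraint package) applies the same longest-key rule to the example /a/b/c: [/1/1 - /1] [/3 - /3] from the comment above:

package main

import "fmt"

// span is a simplified stand-in that records only the start and end key lengths.
type span struct{ startLen, endLen int }

// constrainedColumns returns the longest start or end key across all spans,
// which is the rule the new method uses.
func constrainedColumns(spans []span) int {
	count := 0
	for _, sp := range spans {
		if sp.startLen > count {
			count = sp.startLen
		}
		if sp.endLen > count {
			count = sp.endLen
		}
	}
	return count
}

func main() {
	// [/1/1 - /1] has a start key of length 2 and an end key of length 1;
	// [/3 - /3] has length-1 keys on both sides, so 2 columns are constrained.
	fmt.Println(constrainedColumns([]span{{2, 1}, {1, 1}})) // prints 2
}
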
71 changes: 35 additions & 36 deletions pkg/sql/opt/memo/statistics_builder.go
@@ -454,22 +454,18 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relational
if scan.Constraint != nil {
// Calculate distinct counts for constrained columns
// -------------------------------------------------
applied := sb.applyConstraint(scan.Constraint, scan, relProps)
numUnappliedConjuncts := sb.applyIndexConstraint(scan.Constraint, scan, relProps)

var cols opt.ColSet
for i := 0; i < scan.Constraint.Columns.Count(); i++ {
for i := 0; i < scan.Constraint.ConstrainedColumns(sb.evalCtx); i++ {
cols.Add(int(scan.Constraint.Columns.Get(i).ID()))
}

// Calculate row count and selectivity
// -----------------------------------
inputRowCount := s.RowCount
if applied {
s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, scan, s))
} else {
numUnappliedConjuncts := sb.numConjunctsInConstraint(scan.Constraint)
s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
}
s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, scan, s))
s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))

// Set null counts to 0 for non-nullable columns
// -------------------------------------------------
@@ -1846,18 +1842,26 @@ func (sb *statisticsBuilder) applyFilter(
return numUnappliedConjuncts, constrainedCols
}

func (sb *statisticsBuilder) applyConstraint(
func (sb *statisticsBuilder) applyIndexConstraint(
c *constraint.Constraint, e RelExpr, relProps *props.Relational,
) (applied bool) {
) (numUnappliedConjuncts float64) {
// If unconstrained, then no constraint could be derived from the expression,
// so fall back to estimate.
// If a contradiction, then optimizations must not be enabled (say for
// testing), or else this would have been reduced.
if c.IsUnconstrained() || c.IsContradiction() {
return false /* applied */
return 0 /* numUnappliedConjuncts */
}

return sb.updateDistinctCountsFromConstraint(c, e, relProps)
applied := sb.updateDistinctCountsFromConstraint(c, e, relProps)
for i, n := applied, c.ConstrainedColumns(sb.evalCtx); i < n; i++ {
// Unlike the constraints found in Select and Join filters, an index
// constraint may represent multiple conjuncts. Therefore, we need to
// calculate the number of unapplied conjuncts for each constrained column.
numUnappliedConjuncts += sb.numConjunctsInConstraint(c, i)
}

return numUnappliedConjuncts
}
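
For intuition, a hypothetical trace of applyIndexConstraint on the regression-test constraint added below, /2/3/4/1: [/1/'+1-1000' - /1/'+1-1001') (a runnable sketch with hard-coded values; the real code consults the column statistics and span keys):

package main

import "fmt"

func main() {
	applied := 1         // updateDistinctCountsFromConstraint infers distinct(b) = 1 from the equality prefix
	constrainedCols := 2 // ConstrainedColumns: both b and c appear in the span keys
	numUnappliedConjuncts := 0.0
	for i := applied; i < constrainedCols; i++ {
		// numConjunctsInConstraint for column c: the string range cannot be
		// turned into a distinct count, so it counts as one extra conjunct.
		numUnappliedConjuncts++
	}
	fmt.Println(numUnappliedConjuncts) // 1
}
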

func (sb *statisticsBuilder) applyConstraintSet(
@@ -1874,13 +1878,13 @@ func (sb *statisticsBuilder) applyConstraintSet(
numUnappliedConjuncts = 0
for i := 0; i < cs.Length(); i++ {
applied := sb.updateDistinctCountsFromConstraint(cs.Constraint(i), e, relProps)
if !applied {
if applied == 0 {
// If a constraint cannot be applied, it may represent an
// inequality like x < 1. As a result, distinctCounts does not fully
// represent the selectivity of the constraint set.
// We return an estimate of the number of unapplied conjuncts to the
// caller function to be used for selectivity calculation.
numUnappliedConjuncts += sb.numConjunctsInConstraint(cs.Constraint(i))
numUnappliedConjuncts += sb.numConjunctsInConstraint(cs.Constraint(i), 0 /* nth */)
}
}

@@ -1923,10 +1927,10 @@ func (sb *statisticsBuilder) updateNullCountsFromProps(

// updateDistinctCountsFromConstraint updates the distinct count for each
// column in a constraint that can be determined to have a finite number of
// possible values. It returns a boolean indicating if the constraint was
// applied (i.e., the distinct count for at least one column could be inferred
// from the constraint). If the same column appears in multiple constraints,
// the distinct count is the minimum for that column across all constraints.
// possible values. It returns the number of columns for which the distinct
// count could be inferred from the constraint. If the same column appears
// in multiple constraints, the distinct count is the minimum for that column
// across all constraints.
//
// For example, consider the following constraint set:
//
@@ -1952,7 +1956,7 @@ func (sb *statisticsBuilder) updateNullCountsFromProps(
// discrepancy must be resolved by the calling function.
func (sb *statisticsBuilder) updateDistinctCountsFromConstraint(
c *constraint.Constraint, e RelExpr, relProps *props.Relational,
) (applied bool) {
) (applied int) {
// All of the columns that are part of the prefix have a finite number of
// distinct values.
prefix := c.Prefix(sb.evalCtx)
@@ -2024,7 +2028,7 @@ func (sb *statisticsBuilder) updateDistinctCountsFromConstraint(

colID := c.Columns.Get(col).ID()
sb.ensureColStat(util.MakeFastIntSet(int(colID)), distinctCount, e, relProps)
applied = true
applied = col + 1
}

return applied
@@ -2072,8 +2076,8 @@ func (sb *statisticsBuilder) updateDistinctNullCountsFromEquivalency(
}

// selectivityFromDistinctCounts calculates the selectivity of a filter by
// taking the product of selectivities of each constrained column. In the general case,
// this can be represented by the formula:
// taking the product of selectivities of each constrained column. In the
// general case, this can be represented by the formula:
//
// ┬-┬ ⎛ new distinct(i) ⎞
// selectivity = │ │ ⎜ --------------- ⎟
@@ -2102,7 +2106,7 @@ func (sb *statisticsBuilder) selectivityFromDistinctCounts(
oldDistinct := inputStat.DistinctCount

if oldDistinct != 0 && newDistinct < oldDistinct {
selectivity *= newDistinct / oldDistinct
selectivity *= min(newDistinct/oldDistinct, unknownFilterSelectivity)
}
}

@@ -2313,9 +2317,9 @@ func isEqualityWithTwoVars(cond opt.ScalarExpr) bool {
}

// numConjunctsInConstraint returns a rough estimate of the number of conjuncts
// used to build the given constraint.
// used to build the given constraint for the column at position nth.
func (sb *statisticsBuilder) numConjunctsInConstraint(
c *constraint.Constraint,
c *constraint.Constraint, nth int,
) (numConjuncts float64) {
if c.Spans.Count() == 0 {
return 0 /* numConjuncts */
@@ -2325,25 +2329,20 @@
for i := 0; i < c.Spans.Count(); i++ {
span := c.Spans.Get(i)
numSpanConjuncts := float64(0)
// The first start and end keys in each span are the only ones that matter
// for determining selectivity when we have no knowledge of the data
// distribution. Technically, /a/b: [/5 - ] is more selective than
// /a/b: [/4/5 - ], which is more selective than /a/b: [/4 - ]. But we
// treat them all the same, with selectivity=1/3.
if span.StartKey().Length() > 0 {
if span.StartKey().Length() > nth {
// Cases of NULL in a constraint should be ignored. For example,
// without knowledge of the data distribution, /a: (/NULL - /10] should
// have the same estimated selectivity as /a: [/10 - ]. Selectivity
// of NULL constraints is handled in selectivityFromNullCounts.
if c.Columns.Get(0).Descending() ||
span.StartKey().Value(0) != tree.DNull {
if c.Columns.Get(nth).Descending() ||
span.StartKey().Value(nth) != tree.DNull {
numSpanConjuncts++
}
}
if span.EndKey().Length() > 0 {
if span.EndKey().Length() > nth {
// Ignore cases of NULL in constraints. (see above comment).
if !c.Columns.Get(0).Descending() ||
span.EndKey().Value(0) != tree.DNull {
if !c.Columns.Get(nth).Descending() ||
span.EndKey().Value(nth) != tree.DNull {
numSpanConjuncts++
}
}
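
The per-column, NULL-aware counting above can be illustrated with a standalone sketch (simplified stand-ins for the key and column types; it ignores the ascending/descending handling of the real function):

package main

import "fmt"

// key is a simplified stand-in for a constraint key: a list of datum strings,
// where "NULL" marks a NULL boundary value.
type key []string

// conjunctsForColumn mirrors the idea of numConjunctsInConstraint for a single
// span: a boundary only counts for column nth if its key is long enough and
// the boundary value is not NULL.
func conjunctsForColumn(start, end key, nth int) float64 {
	var n float64
	if len(start) > nth && start[nth] != "NULL" {
		n++
	}
	if len(end) > nth && end[nth] != "NULL" {
		n++
	}
	return n
}

func main() {
	// /a: (/NULL - /10]: the NULL start boundary is ignored, the end boundary
	// counts, so a filter like a <= 10 is one conjunct for column 0.
	fmt.Println(conjunctsForColumn(key{"NULL"}, key{"10"}, 0)) // 1
	// /a/b: [/2 - /5/6]: column 0 is bounded on both sides (2 conjuncts),
	// column 1 only by the end key (1 conjunct).
	fmt.Println(conjunctsForColumn(key{"2"}, key{"5", "6"}, 0), conjunctsForColumn(key{"2"}, key{"5", "6"}, 1)) // 2 1
}
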
2 changes: 1 addition & 1 deletion pkg/sql/opt/memo/testdata/stats/project
@@ -142,7 +142,7 @@ SELECT * FROM (SELECT y + 3 AS v FROM a) WHERE v >= 1 AND v <= 100
----
select
├── columns: v:5(int!null)
├── stats: [rows=990, distinct(5)=100, null(5)=0]
├── stats: [rows=660, distinct(5)=100, null(5)=0]
├── project
│ ├── columns: v:5(int)
│ ├── stats: [rows=2000, distinct(5)=200, null(5)=20]
55 changes: 50 additions & 5 deletions pkg/sql/opt/memo/testdata/stats/scan
@@ -41,7 +41,7 @@ SELECT * FROM a WHERE b
----
select
├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null) b:5(bool!null)
├── stats: [rows=990, distinct(1)=990, null(1)=0, distinct(4)=199.804688, null(4)=0, distinct(5)=1, null(5)=0]
├── stats: [rows=660, distinct(1)=660, null(1)=0, distinct(4)=196.531694, null(4)=0, distinct(5)=1, null(5)=0]
├── key: (1)
├── fd: ()-->(5), (1)-->(2-4), (3,4)~~>(1,2)
├── scan a
@@ -246,13 +246,13 @@ index-join a
├── fd: (1)-->(2-5), (3,4)-->(1,2,5)
└── select
├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
├── stats: [rows=111.111111, distinct(1)=110.489355, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=97.0976681, null(4)=0]
├── stats: [rows=37.037037, distinct(1)=36.9747958, null(1)=0, distinct(3)=1.99999999, null(3)=0, distinct(4)=35.7721483, null(4)=0]
├── key: (1)
├── fd: (1)-->(3,4), (3,4)-->(1)
├── scan a@secondary
│ ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
│ ├── constraint: /-3/4: [ - /'foobar'/5.0] [/'foo' - /'bar'/5.0]
│ ├── stats: [rows=1000, distinct(1)=911.337892, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=294.797541, null(4)=0]
│ ├── stats: [rows=333.333333, distinct(1)=323.895037, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=207.616156, null(4)=0]
│ ├── key: (1)
│ └── fd: (1)-->(3,4), (3,4)-->(1)
└── filters
@@ -407,15 +407,60 @@ index-join a
├── fd: (1)-->(2-5), (3,4)-->(1,2,5)
└── select
├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
├── stats: [rows=74.0740741, distinct(1)=74.0740741, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=68.9343053, null(4)=0]
├── stats: [rows=24.691358, distinct(1)=24.691358, null(1)=0, distinct(3)=1.99999586, null(3)=0, distinct(4)=24.5913408, null(4)=0]
├── key: (1)
├── fd: (1)-->(3,4), (3,4)-->(1)
├── scan a@secondary
│ ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
│ ├── constraint: /-3/4: [ - /'foobar'/5.0] [/'foo' - /'bar'/5.0]
│ ├── stats: [rows=666.666667, distinct(1)=666.666667, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=294.797541, null(4)=0]
│ ├── stats: [rows=222.222222, distinct(1)=222.222222, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=207.616156, null(4)=0]
│ ├── key: (1)
│ └── fd: (1)-->(3,4), (3,4)-->(1)
└── filters
├── (s <= 'foo') OR (s >= 'foobar') [type=bool, outer=(3)]
└── d <= 5.0 [type=bool, outer=(4), constraints=(/4: (/NULL - /5.0]; tight)]

exec-ddl
CREATE TABLE abcde (
a INT PRIMARY KEY,
b INT,
c STRING,
d INT,
e INT,
INDEX bad(b, d),
INDEX good(b, c, d)
)
----
TABLE abcde
├── a int not null
├── b int
├── c string
├── d int
├── e int
├── INDEX primary
│ └── a int not null
├── INDEX bad
│ ├── b int
│ ├── d int
│ └── a int not null
└── INDEX good
├── b int
├── c string
├── d int
└── a int not null

# Regression test for #31929. Ensure that the good index is chosen.
opt
SELECT * FROM abcde WHERE b = 1 AND c LIKE '+1-1000%'
----
index-join abcde
├── columns: a:1(int!null) b:2(int!null) c:3(string) d:4(int) e:5(int)
├── stats: [rows=3.3, distinct(1)=3.3, null(1)=0, distinct(2)=1, null(2)=0]
├── key: (1)
├── fd: ()-->(2), (1)-->(3-5)
└── scan abcde@good
├── columns: a:1(int!null) b:2(int!null) c:3(string!null) d:4(int)
├── constraint: /2/3/4/1: [/1/'+1-1000' - /1/'+1-1001')
├── stats: [rows=1.089, distinct(1)=1.089, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1.089, null(3)=0]
├── key: (1)
└── fd: ()-->(2), (1)-->(3,4)
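
In rough numbers, and assuming for simplicity that each conjunct contributes the 1/3 default selectivity (the plan's exact figures come from the cost model and default statistics, not this arithmetic), the per-column accounting is what makes the good index cheaper:

package main

import "fmt"

func main() {
	const tableRows, defaultSel = 1000.0, 1.0 / 3.0
	// Index bad(b, d): only b = 1 can constrain the scan, so c must be
	// filtered after the index join.
	badScanRows := tableRows * defaultSel
	// Index good(b, c, d): the constraint /2/3/4/1: [/1/'+1-1000' - /1/'+1-1001')
	// has two constrained columns, so the LIKE prefix on c also shrinks the
	// scan estimate.
	goodScanRows := tableRows * defaultSel * defaultSel
	fmt.Println(badScanRows, goodScanRows) // the good scan reads ~3x fewer rows
}
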
4 changes: 2 additions & 2 deletions pkg/sql/opt/memo/testdata/stats/values
@@ -4,7 +4,7 @@ SELECT * FROM (VALUES (1, 2), (1, 2), (1, 3), (2, 3)) AS q(x, y) WHERE x = 5 AND
select
├── columns: x:1(int!null) y:2(int!null)
├── cardinality: [0 - 4]
├── stats: [rows=1, distinct(1)=1, null(1)=0, distinct(2)=1, null(2)=0]
├── stats: [rows=0.444444444, distinct(1)=0.444444444, null(1)=0, distinct(2)=0.444444444, null(2)=0]
├── fd: ()-->(1,2)
├── values
│ ├── columns: column1:1(int) column2:2(int)
@@ -54,7 +54,7 @@ SELECT * FROM (VALUES (1), (1), (1), (2)) AS q(x) WHERE x = 1
select
├── columns: x:1(int!null)
├── cardinality: [0 - 4]
├── stats: [rows=2, distinct(1)=1, null(1)=0]
├── stats: [rows=1.33333333, distinct(1)=1, null(1)=0]
├── fd: ()-->(1)
├── values
│ ├── columns: column1:1(int)
6 changes: 3 additions & 3 deletions pkg/sql/opt/xform/testdata/rules/select
@@ -562,19 +562,19 @@ memo (optimized, ~4KB)
├── G1: (select G2 G3) (select G4 G3)
│ └── [presentation: k:1,u:2,v:3,j:4]
│ ├── best: (select G4 G3)
│ └── cost: 407.09
│ └── cost: 45.23
├── G2: (scan b)
├── G3: (filters G5 G6)
├── G4: (index-join G7 b,cols=(1-4))
│ └── []
│ ├── best: (index-join G7 b,cols=(1-4))
│ └── cost: 406.30
│ └── cost: 45.14
├── G5: (gt G8 G9)
├── G6: (lt G8 G10)
├── G7: (scan b@u,cols=(1,2),constrained)
│ └── []
│ ├── best: (scan b@u,cols=(1,2),constrained)
│ └── cost: 82.37
│ └── cost: 9.15
├── G8: (tuple G11)
├── G9: (tuple G12)
├── G10: (tuple G13)
