opt: fix selectivity estimates for index constraints #31937

Merged 1 commit on Oct 30, 2018
22 changes: 22 additions & 0 deletions pkg/sql/opt/constraint/constraint.go
@@ -448,6 +448,28 @@ func (c *Constraint) ExactPrefix(evalCtx *tree.EvalContext) int {
}
}

// ConstrainedColumns returns the number of columns which are constrained by
// the Constraint. For example:
// /a/b/c: [/1/1 - /1] [/3 - /3]
// has 2 constrained columns. This may be less than the total number of columns
// in the constraint, especially if it represents an index constraint.
func (c *Constraint) ConstrainedColumns(evalCtx *tree.EvalContext) int {
count := 0
for i := 0; i < c.Spans.Count(); i++ {
sp := c.Spans.Get(i)
start := sp.StartKey()
end := sp.EndKey()
if start.Length() > count {
count = start.Length()
}
if end.Length() > count {
count = end.Length()
}
}

return count
}

// Prefix returns the length of the longest prefix of columns for which all the
// spans have the same start and end values. For example:
// /a/b/c: [/1/1/1 - /1/1/2] [/3/3/3 - /3/3/4]
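The new ConstrainedColumns helper simply takes the longest start or end key over all spans. As a rough illustration of that counting (using stand-in types, not the real opt/constraint API, which needs an EvalContext and proper key machinery), the following standalone sketch models spans by their key lengths and reproduces the 2-column answer from the doc comment's /a/b/c example:

package main

import "fmt"

// span models only the key lengths of a constraint span. The real
// constraint.Span carries full keys, but the lengths are all that the
// ConstrainedColumns counting needs; this is an illustrative stand-in,
// not the actual opt/constraint types.
type span struct {
	startLen, endLen int
}

// constrainedColumns mirrors the loop added above: the result is the
// maximum start- or end-key length over all spans.
func constrainedColumns(spans []span) int {
	count := 0
	for _, sp := range spans {
		if sp.startLen > count {
			count = sp.startLen
		}
		if sp.endLen > count {
			count = sp.endLen
		}
	}
	return count
}

func main() {
	// /a/b/c: [/1/1 - /1] [/3 - /3] from the doc comment: the longest key
	// has two values, so only 2 of the 3 index columns are constrained.
	spans := []span{
		{startLen: 2, endLen: 1}, // [/1/1 - /1]
		{startLen: 1, endLen: 1}, // [/3 - /3]
	}
	fmt.Println(constrainedColumns(spans)) // 2
}

Because no span key in that example is longer than two values, only the first two index columns are actually constrained, even though the constraint itself spans three columns.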
69 changes: 34 additions & 35 deletions pkg/sql/opt/memo/statistics_builder.go
@@ -454,22 +454,18 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa
if scan.Constraint != nil {
// Calculate distinct counts for constrained columns
// -------------------------------------------------
applied := sb.applyConstraint(scan.Constraint, scan, relProps)
numUnappliedConjuncts := sb.applyIndexConstraint(scan.Constraint, scan, relProps)

var cols opt.ColSet
for i := 0; i < scan.Constraint.Columns.Count(); i++ {
for i, n := 0, scan.Constraint.ConstrainedColumns(sb.evalCtx); i < n; i++ {
cols.Add(int(scan.Constraint.Columns.Get(i).ID()))
}

// Calculate row count and selectivity
// -----------------------------------
inputRowCount := s.RowCount
if applied {
s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, scan, s))
} else {
numUnappliedConjuncts := sb.numConjunctsInConstraint(scan.Constraint)
s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
}
s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, scan, s))
s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))

// Set null counts to 0 for non-nullable columns
// -------------------------------------------------
@@ -1846,18 +1842,26 @@ func (sb *statisticsBuilder) applyFilter(
return numUnappliedConjuncts, constrainedCols
}

func (sb *statisticsBuilder) applyConstraint(
func (sb *statisticsBuilder) applyIndexConstraint(
c *constraint.Constraint, e RelExpr, relProps *props.Relational,
) (applied bool) {
) (numUnappliedConjuncts float64) {
// If unconstrained, then no constraint could be derived from the expression,
// so fall back to estimate.
// If a contradiction, then optimizations must not be enabled (say for
// testing), or else this would have been reduced.
if c.IsUnconstrained() || c.IsContradiction() {
return false /* applied */
return 0 /* numUnappliedConjuncts */
}

return sb.updateDistinctCountsFromConstraint(c, e, relProps)
applied := sb.updateDistinctCountsFromConstraint(c, e, relProps)
for i, n := applied, c.ConstrainedColumns(sb.evalCtx); i < n; i++ {
// Unlike the constraints found in Select and Join filters, an index
// constraint may represent multiple conjuncts. Therefore, we need to
// calculate the number of unapplied conjuncts for each constrained column.
numUnappliedConjuncts += sb.numConjunctsInConstraint(c, i)
}

return numUnappliedConjuncts
}

func (sb *statisticsBuilder) applyConstraintSet(
@@ -1874,13 +1878,13 @@ func (sb *statisticsBuilder) applyConstraintSet(
numUnappliedConjuncts = 0
for i := 0; i < cs.Length(); i++ {
applied := sb.updateDistinctCountsFromConstraint(cs.Constraint(i), e, relProps)
if !applied {
if applied == 0 {
// If a constraint cannot be applied, it may represent an
// inequality like x < 1. As a result, distinctCounts does not fully
// represent the selectivity of the constraint set.
// We return an estimate of the number of unapplied conjuncts to the
// caller function to be used for selectivity calculation.
numUnappliedConjuncts += sb.numConjunctsInConstraint(cs.Constraint(i))
numUnappliedConjuncts += sb.numConjunctsInConstraint(cs.Constraint(i), 0 /* nth */)
}
}

@@ -1923,10 +1927,10 @@ func (sb *statisticsBuilder) updateNullCountsFromProps(

// updateDistinctCountsFromConstraint updates the distinct count for each
// column in a constraint that can be determined to have a finite number of
// possible values. It returns a boolean indicating if the constraint was
// applied (i.e., the distinct count for at least one column could be inferred
// from the constraint). If the same column appears in multiple constraints,
// the distinct count is the minimum for that column across all constraints.
// possible values. It returns the number of columns for which the distinct
// count could be inferred from the constraint. If the same column appears
// in multiple constraints, the distinct count is the minimum for that column
// across all constraints.
//
// For example, consider the following constraint set:
//
@@ -1952,7 +1956,7 @@ func (sb *statisticsBuilder) updateNullCountsFromProps(
// discrepancy must be resolved by the calling function.
func (sb *statisticsBuilder) updateDistinctCountsFromConstraint(
c *constraint.Constraint, e RelExpr, relProps *props.Relational,
) (applied bool) {
) (applied int) {
// All of the columns that are part of the prefix have a finite number of
// distinct values.
prefix := c.Prefix(sb.evalCtx)
@@ -2024,7 +2028,7 @@ func (sb *statisticsBuilder) updateDistinctCountsFromConstraint(

colID := c.Columns.Get(col).ID()
sb.ensureColStat(util.MakeFastIntSet(int(colID)), distinctCount, e, relProps)
applied = true
applied = col + 1
}

return applied
@@ -2072,8 +2076,8 @@ func (sb *statisticsBuilder) updateDistinctNullCountsFromEquivalency(
}

// selectivityFromDistinctCounts calculates the selectivity of a filter by
// taking the product of selectivities of each constrained column. In the general case,
// this can be represented by the formula:
// taking the product of selectivities of each constrained column. In the
// general case, this can be represented by the formula:
//
// ┬-┬ ⎛ new distinct(i) ⎞
// selectivity = │ │ ⎜ --------------- ⎟
@@ -2313,9 +2317,9 @@ func isEqualityWithTwoVars(cond opt.ScalarExpr) bool {
}

// numConjunctsInConstraint returns a rough estimate of the number of conjuncts
// used to build the given constraint.
// used to build the given constraint for the column at position nth.
func (sb *statisticsBuilder) numConjunctsInConstraint(
c *constraint.Constraint,
c *constraint.Constraint, nth int,
) (numConjuncts float64) {
if c.Spans.Count() == 0 {
return 0 /* numConjuncts */
@@ -2325,25 +2329,20 @@
for i := 0; i < c.Spans.Count(); i++ {
span := c.Spans.Get(i)
numSpanConjuncts := float64(0)
// The first start and end keys in each span are the only ones that matter
// for determining selectivity when we have no knowledge of the data
// distribution. Technically, /a/b: [/5 - ] is more selective than
// /a/b: [/4/5 - ], which is more selective than /a/b: [/4 - ]. But we
// treat them all the same, with selectivity=1/3.
if span.StartKey().Length() > 0 {
if span.StartKey().Length() > nth {
// Cases of NULL in a constraint should be ignored. For example,
// without knowledge of the data distribution, /a: (/NULL - /10] should
// have the same estimated selectivity as /a: [/10 - ]. Selectivity
// of NULL constraints is handled in selectivityFromNullCounts.
if c.Columns.Get(0).Descending() ||
span.StartKey().Value(0) != tree.DNull {
if c.Columns.Get(nth).Descending() ||
span.StartKey().Value(nth) != tree.DNull {
numSpanConjuncts++
}
}
if span.EndKey().Length() > 0 {
if span.EndKey().Length() > nth {
// Ignore cases of NULL in constraints. (see above comment).
if !c.Columns.Get(0).Descending() ||
span.EndKey().Value(0) != tree.DNull {
if !c.Columns.Get(nth).Descending() ||
span.EndKey().Value(nth) != tree.DNull {
numSpanConjuncts++
}
}
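numConjunctsInConstraint now takes the column position nth instead of always looking at position 0, so applyIndexConstraint can charge one conjunct per bounded side of each constrained-but-unapplied column. Below is a self-contained sketch of that per-column count under hypothetical stand-in types; it handles a single span only, whereas the real function also aggregates over all spans of the constraint:

package main

import "fmt"

// A span here is just a pair of string-encoded keys; "NULL" stands in for a
// NULL datum. These are hypothetical stand-ins for the constraint.Span and
// key types used by the real function.
type span struct {
	start, end []string
}

// conjunctsForColumn mirrors the per-span counting in numConjunctsInConstraint
// for the column at position nth: each side of the span that actually bounds
// that column counts as one conjunct, except for NULL bounds, which are
// ignored (selectivity of NULL constraints is handled via null counts).
func conjunctsForColumn(sp span, nth int, descending bool) float64 {
	n := float64(0)
	if len(sp.start) > nth && (descending || sp.start[nth] != "NULL") {
		n++
	}
	if len(sp.end) > nth && (!descending || sp.end[nth] != "NULL") {
		n++
	}
	return n
}

func main() {
	// The good-index constraint from the regression test below:
	//   /2/3/4/1: [/1/'+1-1000' - /1/'+1-1001')
	sp := span{
		start: []string{"1", "+1-1000"},
		end:   []string{"1", "+1-1001"},
	}
	fmt.Println(conjunctsForColumn(sp, 1, false)) // 2: c is bounded on both ends
}

For the good-index constraint in the regression test below, the LIKE-derived range on c bounds that column on both ends and therefore counts as two conjuncts.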
53 changes: 49 additions & 4 deletions pkg/sql/opt/memo/testdata/stats/scan
@@ -246,13 +246,13 @@ index-join a
├── fd: (1)-->(2-5), (3,4)-->(1,2,5)
└── select
├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
├── stats: [rows=111.111111, distinct(1)=110.489355, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=97.0976681, null(4)=0]
├── stats: [rows=37.037037, distinct(1)=36.9747958, null(1)=0, distinct(3)=1.99999999, null(3)=0, distinct(4)=35.7721483, null(4)=0]
├── key: (1)
├── fd: (1)-->(3,4), (3,4)-->(1)
├── scan a@secondary
│ ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
│ ├── constraint: /-3/4: [ - /'foobar'/5.0] [/'foo' - /'bar'/5.0]
│ ├── stats: [rows=1000, distinct(1)=911.337892, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=294.797541, null(4)=0]
│ ├── stats: [rows=333.333333, distinct(1)=323.895037, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=207.616156, null(4)=0]
│ ├── key: (1)
│ └── fd: (1)-->(3,4), (3,4)-->(1)
└── filters
@@ -407,15 +407,60 @@ index-join a
├── fd: (1)-->(2-5), (3,4)-->(1,2,5)
└── select
├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
├── stats: [rows=74.0740741, distinct(1)=74.0740741, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=68.9343053, null(4)=0]
├── stats: [rows=24.691358, distinct(1)=24.691358, null(1)=0, distinct(3)=1.99999586, null(3)=0, distinct(4)=24.5913408, null(4)=0]
├── key: (1)
├── fd: (1)-->(3,4), (3,4)-->(1)
├── scan a@secondary
│ ├── columns: x:1(int!null) s:3(string!null) d:4(decimal!null)
│ ├── constraint: /-3/4: [ - /'foobar'/5.0] [/'foo' - /'bar'/5.0]
│ ├── stats: [rows=666.666667, distinct(1)=666.666667, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=294.797541, null(4)=0]
│ ├── stats: [rows=222.222222, distinct(1)=222.222222, null(1)=0, distinct(3)=2, null(3)=0, distinct(4)=207.616156, null(4)=0]
│ ├── key: (1)
│ └── fd: (1)-->(3,4), (3,4)-->(1)
└── filters
├── (s <= 'foo') OR (s >= 'foobar') [type=bool, outer=(3)]
└── d <= 5.0 [type=bool, outer=(4), constraints=(/4: (/NULL - /5.0]; tight)]

exec-ddl
CREATE TABLE abcde (
a INT PRIMARY KEY,
b INT,
c STRING,
d INT,
e INT,
INDEX bad(b, d),
INDEX good(b, c, d)
)
----
TABLE abcde
├── a int not null
├── b int
├── c string
├── d int
├── e int
├── INDEX primary
│ └── a int not null
├── INDEX bad
│ ├── b int
│ ├── d int
│ └── a int not null
└── INDEX good
├── b int
├── c string
├── d int
└── a int not null

# Regression test for #31929. Ensure that the good index is chosen.
opt
SELECT * FROM abcde WHERE b = 1 AND c LIKE '+1-1000%'
----
index-join abcde
├── columns: a:1(int!null) b:2(int!null) c:3(string!null) d:4(int) e:5(int)
├── stats: [rows=1.089, distinct(1)=1.089, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1.089, null(3)=0]
├── key: (1)
├── fd: ()-->(2), (1)-->(3-5)
└── scan abcde@good
├── columns: a:1(int!null) b:2(int!null) c:3(string!null) d:4(int)
├── constraint: /2/3/4/1: [/1/'+1-1000' - /1/'+1-1001')
├── stats: [rows=1.089, distinct(1)=1.089, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1.089, null(3)=0]
├── key: (1)
└── fd: ()-->(2), (1)-->(3,4)
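To see roughly how the new buildScan path arrives at the small row estimate above, the sketch below multiplies the distinct-count selectivity of the applied prefix (b = 1) by the 1/3-per-conjunct factor for the unapplied LIKE range on c. The 1000-row and 100-distinct-value inputs are assumed defaults rather than values read from the testdata, so the result only approximates the 1.089 rows shown (the real builder also applies null-count adjustments):

package main

import (
	"fmt"
	"math"
)

func main() {
	// Assumed inputs: a default 1000-row table and a default 100 distinct
	// values for b. The 1/3 per-conjunct figure is the one referenced in the
	// statistics_builder comments above.
	const unappliedConjunctSelectivity = 1.0 / 3.0
	inputRows := 1000.0
	distinctB := 100.0

	selFromB := 1.0 / distinctB  // applied prefix: b = 1
	numUnappliedConjuncts := 2.0 // the LIKE-derived range bounds c on both ends

	rows := inputRows * selFromB *
		math.Pow(unappliedConjunctSelectivity, numUnappliedConjuncts)
	fmt.Printf("estimated rows ~ %.2f\n", rows) // ~1.11; the testdata shows 1.089
}

Under the old code path the constraint counted as fully applied and the (1/3)^2 factor was never charged, so the good index's scan looked no more selective than the bad index's, which presumably is what let the bad index win in #31929.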
6 changes: 3 additions & 3 deletions pkg/sql/opt/xform/testdata/rules/select
@@ -601,7 +601,7 @@ memo (optimized, ~4KB, required=[presentation: k:1,u:2,v:3,j:4])
├── G1: (select G2 G3) (select G4 G3)
│ └── [presentation: k:1,u:2,v:3,j:4]
│ ├── best: (select G4 G3)
│ └── cost: 407.09
│ └── cost: 45.23
├── G2: (scan b)
│ └── []
│ ├── best: (scan b)
@@ -610,13 +610,13 @@ memo (optimized, ~4KB, required=[presentation: k:1,u:2,v:3,j:4])
├── G4: (index-join G7 b,cols=(1-4))
│ └── []
│ ├── best: (index-join G7 b,cols=(1-4))
│ └── cost: 406.30
│ └── cost: 45.14
├── G5: (gt G8 G9)
├── G6: (lt G8 G10)
├── G7: (scan b@u,cols=(1,2),constrained)
│ └── []
│ ├── best: (scan b@u,cols=(1,2),constrained)
│ └── cost: 82.37
│ └── cost: 9.15
├── G8: (tuple G11)
├── G9: (tuple G12)
├── G10: (tuple G13)