Skip to content

Commit

Permalink
opt: calculate more accurate selectivity for disjunctions
Browse files Browse the repository at this point in the history
The optimal plan for a query with a disjunctive filter is often a
DistinctOn+UnionAll of the results of two index scans, when two separate
indexes can satisfy either side of the disjunction. For example:

    SELECT * FROM t WHERE a = 'foo' OR b = 'foo' LIMIT 5

    limit
     ├── distinct-on
     │    └── union-all
     │         ├── scan t@a_idx
     │         │    └── constraint: /a: [/'foo' - /'foo']
     │         └── scan t@a_idx
     │              └── constraint: /b: [/'foo' - /'foo']
     └── 5

However, this optimal plan is not always chosen. A `constraint.Set`
cannot be built for a disjunction involving multiple columns, so the
statistics builder assumes the selectivity of the disjunction to be 1/3,
often resulting in an over-estimated row count. When the DistinctOn is
built during exploration it is added to the same memo group as the
Select that it replaces, so it shares the same row count estimate. Even
though the UnionAll's cost is low because it produces a small subset of
the table, the DistinctOn's cost is high because the coster is under the
assumption that the DistinctOn will produce 1/3 of the rows in the
table. The overhead of producing so many rows adds significant overhead
to the overall cost, preventing this plan from being chosen by the
optimizer.

This commit fixes the issue by attempting to build a constraint set for
each side of a disjunction in a filter. By unioning the selectivity of
each constraint set, a more accurate row count estimate is calculated
for the filter. As a result, the cost of the DistinctOn is more accurate
and the optimal plan is chosen.

This fix is only enabled if the cluster setting is enabled:
`sql.defaults.optimizer_improve_disjunction_selectivity.enabled`.

Informs #58744

Release note (performance improvement): A new cluster setting
`sql.defaults.optimizer_improve_disjunction_selectivity.enabled` enables
more accurate selectivity estimation of query filters with OR
expressions. This improves query plans in some cases. The cluster
setting is disabled by default.
  • Loading branch information
mgartner committed Jul 16, 2021
1 parent b7abd20 commit d2b1d9b
Show file tree
Hide file tree
Showing 5 changed files with 330 additions and 11 deletions.
192 changes: 192 additions & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/select
Expand Up @@ -1791,3 +1791,195 @@ vectorized: true
missing stats
table: s@primary
spans: FULL SCAN

statement ok
CREATE TABLE disjunction (
k INT PRIMARY KEY,
a STRING,
b STRING,
c STRING,
INDEX a_idx (a),
INDEX b_idx (b),
INDEX c_idx (c)
)

statement ok
ALTER TABLE disjunction INJECT STATISTICS '[
{
"columns": ["a"],
"created_at": "2018-01-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 5010,
"histo_col_type": "string",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "aaa"},
{"num_eq": 10, "num_range": 990, "distinct_range": 999, "upper_bound": "foo"},
{"num_eq": 990, "num_range": 10, "distinct_range": 9, "upper_bound": "fop"},
{"num_eq": 0, "num_range": 8000, "distinct_range": 4000, "upper_bound": "zoo"}
]
},
{
"columns": ["b"],
"created_at": "2018-01-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 5010,
"histo_col_type": "string",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "aaa"},
{"num_eq": 10, "num_range": 990, "distinct_range": 999, "upper_bound": "foo"},
{"num_eq": 990, "num_range": 10, "distinct_range": 9, "upper_bound": "fop"},
{"num_eq": 0, "num_range": 8000, "distinct_range": 4000, "upper_bound": "zoo"}
]
},
{
"columns": ["c"],
"created_at": "2018-01-01 1:00:00.00000+00:00",
"row_count": 10000,
"distinct_count": 5010,
"histo_col_type": "string",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "aaa"},
{"num_eq": 10, "num_range": 990, "distinct_range": 999, "upper_bound": "foo"},
{"num_eq": 990, "num_range": 10, "distinct_range": 9, "upper_bound": "fop"},
{"num_eq": 0, "num_range": 8000, "distinct_range": 4000, "upper_bound": "zoo"}
]
}
]'

# Single disjunction. A plan with a UNION of two constrained scans is not selected
# because the selectivity of the disjunction is not accurately estimated.
query T
EXPLAIN SELECT * FROM disjunction WHERE a LIKE 'foo%' OR b LIKE 'foo%' LIMIT 5
----
distribution: local
vectorized: true
·
• limit
│ estimated row count: 5
│ count: 5
└── • filter
│ estimated row count: 3,333
│ filter: (a LIKE 'foo%') OR (b LIKE 'foo%')
└── • scan
estimated row count: 10,000 (100% of the table; stats collected <hidden> ago)
table: disjunction@primary
spans: FULL SCAN

# More accurate estimation of the disjunction's selectivity leads to a plan with
# a union of constrained scans. See #58744.
statement ok
SET CLUSTER SETTING sql.defaults.optimizer_improve_disjunction_selectivity.enabled = true

# Single disjunction.
query T
EXPLAIN SELECT * FROM disjunction WHERE a LIKE 'foo%' OR b LIKE 'foo%' LIMIT 5
----
distribution: local
vectorized: true
·
• limit
│ estimated row count: 5
│ count: 5
└── • distinct
│ estimated row count: 40
│ distinct on: k
└── • union all
│ estimated row count: 40
├── • index join
│ │ estimated row count: 20
│ │ table: disjunction@primary
│ │
│ └── • scan
│ estimated row count: 20 (0.20% of the table; stats collected <hidden> ago)
│ table: disjunction@a_idx
│ spans: [/'foo' - /'fop')
└── • index join
│ estimated row count: 20
│ table: disjunction@primary
└── • scan
estimated row count: 20 (0.20% of the table; stats collected <hidden> ago)
table: disjunction@b_idx
spans: [/'foo' - /'fop')

# Multiple disjunctions.
query T
EXPLAIN SELECT * FROM disjunction WHERE a LIKE 'foo%' OR b LIKE 'foo%' OR c LIKE 'foo%' LIMIT 5
----
distribution: local
vectorized: true
·
• limit
│ estimated row count: 5
│ count: 5
└── • distinct
│ estimated row count: 60
│ distinct on: k
└── • union all
│ estimated row count: 60
├── • index join
│ │ estimated row count: 20
│ │ table: disjunction@primary
│ │
│ └── • scan
│ estimated row count: 20 (0.20% of the table; stats collected <hidden> ago)
│ table: disjunction@a_idx
│ spans: [/'foo' - /'fop')
└── • distinct
│ estimated row count: 40
│ distinct on: k
└── • union all
│ estimated row count: 40
├── • index join
│ │ estimated row count: 20
│ │ table: disjunction@primary
│ │
│ └── • scan
│ estimated row count: 20 (0.20% of the table; stats collected <hidden> ago)
│ table: disjunction@c_idx
│ spans: [/'foo' - /'fop')
└── • index join
│ estimated row count: 20
│ table: disjunction@primary
└── • scan
estimated row count: 20 (0.20% of the table; stats collected <hidden> ago)
table: disjunction@b_idx
spans: [/'foo' - /'fop')

# The row count should not exceed the number of rows in the table when
# disjunctions have high selectivity values.
query T
EXPLAIN SELECT * FROM disjunction WHERE a > 'fop' OR b > 'fop' OR c > 'fop' LIMIT 5
----
distribution: local
vectorized: true
·
• limit
│ estimated row count: 5
│ count: 5
└── • filter
│ estimated row count: 10,000
│ filter: ((a > 'fop') OR (b > 'fop')) OR (c > 'fop')
└── • scan
estimated row count: 10,000 (100% of the table; stats collected <hidden> ago)
table: disjunction@primary
spans: FULL SCAN

statement ok
RESET CLUSTER SETTING sql.defaults.optimizer_improve_disjunction_selectivity.enabled
1 change: 1 addition & 0 deletions pkg/sql/opt/memo/BUILD.bazel
Expand Up @@ -24,6 +24,7 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//pkg/geo/geoindex",
"//pkg/settings",
"//pkg/sql/catalog/colinfo",
"//pkg/sql/catalog/descpb",
"//pkg/sql/inverted",
Expand Down
136 changes: 131 additions & 5 deletions pkg/sql/opt/memo/statistics_builder.go
Expand Up @@ -15,6 +15,7 @@ import (
"reflect"

"github.com/cockroachdb/cockroach/pkg/geo/geoindex"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/sql/opt"
"github.com/cockroachdb/cockroach/pkg/sql/opt/constraint"
"github.com/cockroachdb/cockroach/pkg/sql/opt/props"
Expand All @@ -25,6 +26,16 @@ import (
"github.com/cockroachdb/errors"
)

// improveDisjunctionSelectivityEnabled indicates whether we should try to
// improve selectivity calculations for filters with disjunctions by unioning
// the selectivity of each side of the disjunction. This may lead to more
// efficient query plans in some cases.
var improveDisjunctionSelectivityEnabled = settings.RegisterBoolSetting(
"sql.defaults.optimizer_improve_disjunction_selectivity.enabled",
"enables improved selectivity calculations for queries with disjunctions",
false,
)

var statsAnnID = opt.NewTableAnnID()

// statisticsBuilder is responsible for building the statistics that are
Expand Down Expand Up @@ -2966,11 +2977,13 @@ func (sb *statisticsBuilder) applyFiltersItem(
// want to make sure that we don't include columns that were only present in
// equality conjuncts such as var1=var2. The selectivity of these conjuncts
// will be accounted for in selectivityFromEquivalencies.
s := &relProps.Stats
scalarProps := filter.ScalarProps()
constrainedCols.UnionWith(scalarProps.OuterCols)
if scalarProps.Constraints != nil {
switch {
case scalarProps.Constraints != nil:
histColsLocal := sb.applyConstraintSet(
scalarProps.Constraints, scalarProps.TightConstraints, e, relProps,
scalarProps.Constraints, scalarProps.TightConstraints, e, relProps, s,
)
histCols.UnionWith(histColsLocal)
if !scalarProps.TightConstraints {
Expand All @@ -2982,13 +2995,127 @@ func (sb *statisticsBuilder) applyFiltersItem(
numUnappliedConjuncts++
}
}
} else {
case improveDisjunctionSelectivityEnabled.Get(&sb.evalCtx.Settings.SV):
if constraintUnion := sb.buildDisjunctionConstraints(filter); len(constraintUnion) > 0 {
// The filters are one or more disjunctions and tight constraint sets
// could be built for each.
var tmpStats, unionStats props.Statistics
unionStats.CopyFrom(s)

// Get the stats for each constraint set, apply the selectivity to a
// temporary stats struct, and union the selectivity and row counts.
sb.constrainExpr(e, constraintUnion[0], relProps, &unionStats)
for i := 1; i < len(constraintUnion); i++ {
tmpStats.CopyFrom(s)
sb.constrainExpr(e, constraintUnion[i], relProps, &tmpStats)
unionStats.UnionWith(&tmpStats)
}

// The stats are unioned naively; the selectivity may be greater than 1
// and the row count may be greater than the row count of the input
// stats. We use the minimum selectivity and row count of the unioned
// stats and the input stats.
// TODO(mgartner): Calculate and set the column statistics based on
// constraintUnion.
s.Selectivity = props.MinSelectivity(s.Selectivity, unionStats.Selectivity)
s.RowCount = min(s.RowCount, unionStats.RowCount)
} else {
numUnappliedConjuncts++
}
default:
numUnappliedConjuncts++
}

return numUnappliedConjuncts, constrainedCols, histCols
}

// buildDisjunctionConstraints returns a slice of tight constraint sets that are
// built from one or more adjacent Or expressions in filter. This allows more
// accurate stats to be calculated for disjunctions. If any adjacent Or cannot
// be tightly constrained, then nil is returned.
func (sb *statisticsBuilder) buildDisjunctionConstraints(filter *FiltersItem) []*constraint.Set {
expr := filter.Condition

// If the expression is not an Or, we cannot build disjunction constraint
// sets.
or, ok := expr.(*OrExpr)
if !ok {
return nil
}

cb := constraintsBuilder{md: sb.md, evalCtx: sb.evalCtx}

unconstrained := false
var constraints []*constraint.Set
var collectConstraints func(opt.ScalarExpr)
collectConstraints = func(e opt.ScalarExpr) {
// If a constraint can be built from e, collect the constraint and its
// tightness.
c, tight := cb.buildConstraints(e)
if !c.IsUnconstrained() && tight {
constraints = append(constraints, c)
return
}

innerOr, ok := e.(*OrExpr)
if !ok {
// If a tight constraint could not be built and the expression is
// not an Or, set unconstrained so we can return nil.
unconstrained = true
return
}

// If a constraint could not be built and the expression is an Or,
// attempt to build a constraint for the left and right children.
collectConstraints(innerOr.Left)
collectConstraints(innerOr.Right)
}

// We intentionally call collectConstraints on the left and right of the
// top-level Or expression here. collectConstraints attempts to build a
// constraint for the given expression before recursing. This would be
// wasted computation because if a constraint could have been built for the
// top-level or expression, we would not have reached this point -
// applyFiltersItem would have handled this case before calling
// buildDisjunctionConstraints.
collectConstraints(or.Left)
collectConstraints(or.Right)

if unconstrained {
return nil
}

return constraints
}

// constrainExpr calculates the stats for a relational expression based on the
// constraint set. The constraint set must be tight.
func (sb *statisticsBuilder) constrainExpr(
e RelExpr, cs *constraint.Set, relProps *props.Relational, s *props.Statistics,
) {
constrainedCols := cs.ExtractCols()

// Calculate distinct counts and histograms for constrained columns
// ----------------------------------------------------------------
histCols := sb.applyConstraintSet(cs, true /* tight */, e, relProps, s)

// Set null counts to 0 for non-nullable columns
// ---------------------------------------------
notNullCols := relProps.NotNullCols.Copy()
// Add any not-null columns from this constraint set.
notNullCols.UnionWith(cs.ExtractNotNullCols(sb.evalCtx))
sb.updateNullCountsFromNotNullCols(notNullCols, s)

// Calculate row count and selectivity
// -----------------------------------
s.ApplySelectivity(sb.selectivityFromHistograms(histCols, e, s))
s.ApplySelectivity(sb.selectivityFromMultiColDistinctCounts(constrainedCols, e, s))
s.ApplySelectivity(sb.selectivityFromNullsRemoved(e, notNullCols, constrainedCols))

// Adjust the selectivity so we don't double-count the histogram columns.
s.UnapplySelectivity(sb.selectivityFromSingleColDistinctCounts(histCols, e, s))
}

// applyIndexConstraint is used to update the distinct counts and histograms
// for the constrained columns in an index constraint. Returns the set of
// constrained columns and the set of columns with a filtered histogram.
Expand Down Expand Up @@ -3058,7 +3185,7 @@ func (sb *statisticsBuilder) applyIndexConstraint(
// for the constrained columns in a constraint set. Returns the set of
// columns with a filtered histogram.
func (sb *statisticsBuilder) applyConstraintSet(
cs *constraint.Set, tight bool, e RelExpr, relProps *props.Relational,
cs *constraint.Set, tight bool, e RelExpr, relProps *props.Relational, s *props.Statistics,
) (histCols opt.ColSet) {
// If unconstrained, then no constraint could be derived from the expression,
// so fall back to estimate.
Expand All @@ -3068,7 +3195,6 @@ func (sb *statisticsBuilder) applyConstraintSet(
return opt.ColSet{}
}

s := &relProps.Stats
for i := 0; i < cs.Length(); i++ {
c := cs.Constraint(i)
col := c.Columns.Get(0).ID()
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/opt/memo/statistics_builder_test.go
Expand Up @@ -118,7 +118,7 @@ func TestGetStatsFromConstraint(t *testing.T) {
s.Init(relProps)

// Calculate distinct counts.
sb.applyConstraintSet(cs, true /* tight */, sel, relProps)
sb.applyConstraintSet(cs, true /* tight */, sel, relProps, &relProps.Stats)

// Calculate row count and selectivity.
s.RowCount = scan.Relational().Stats.RowCount
Expand Down

0 comments on commit d2b1d9b

Please sign in to comment.