-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
statistics.go
204 lines (176 loc) · 6.51 KB
/
statistics.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
// Copyright 2018 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package props
import (
"bytes"
"fmt"
"math"
"sort"
"github.com/cockroachdb/cockroach/pkg/sql/opt"
)
// Statistics is a collection of measurements and statistics that is used by
// the coster to estimate the cost of expressions. Statistics are collected
// for tables and indexes and are exposed to the optimizer via opt.Catalog
// interfaces.
//
// As logical properties are derived bottom-up for each expression, the
// estimated row count is derived bottom-up for each relational expression.
// The column statistics (stored in ColStats and MultiColStats) are derived
// lazily, and only as needed to determine the row count for the current
// expression or a parent expression. For example:
//
// SELECT y FROM a WHERE x=1
//
// The only column that affects the row count of this query is x, since the
// distribution of values in x is what determines the selectivity of the
// predicate. As a result, column statistics will be derived for column x but
// not for column y.
//
// See memo/statistics_builder.go for more information about how statistics are
// calculated.
type Statistics struct {
// RowCount is the estimated number of rows returned by the expression.
// Note that - especially when there are no stats available - the scaling of
// the row counts can be unpredictable; thus, a row count of 0.001 should be
// considered 1000 times better than a row count of 1, even though if this was
// a true row count they would be pretty much the same thing.
RowCount float64
// ColStats is a collection of statistics that pertain to columns in an
// expression or table. It is keyed by a set of one or more columns over which
// the statistic is defined.
ColStats ColStatsMap
// Selectivity is a value between 0 and 1 representing the estimated
// reduction in number of rows for the top-level operator in this
// expression.
Selectivity float64
}
// Init initializes the data members of Statistics.
func (s *Statistics) Init(relProps *Relational) (zeroCardinality bool) {
if relProps.Cardinality.IsZero() {
s.RowCount = 0
s.Selectivity = 0
return true
}
s.Selectivity = 1
return false
}
// ApplySelectivity applies a given selectivity to the statistics. RowCount and
// Selectivity are updated. Note that DistinctCounts are not updated, other than
// limiting them to the new RowCount. See ColumnStatistic.ApplySelectivity for
// updating distinct counts.
func (s *Statistics) ApplySelectivity(selectivity float64) {
if selectivity == 0 {
s.RowCount = 0
for i, n := 0, s.ColStats.Count(); i < n; i++ {
s.ColStats.Get(i).DistinctCount = 0
}
return
}
s.RowCount *= selectivity
s.Selectivity *= selectivity
// Make sure none of the distinct / null counts are larger than the row count.
for i, n := 0, s.ColStats.Count(); i < n; i++ {
colStat := s.ColStats.Get(i)
if colStat.DistinctCount > s.RowCount {
colStat.DistinctCount = s.RowCount
}
if colStat.NullCount > s.RowCount {
colStat.NullCount = s.RowCount
}
}
}
func (s *Statistics) String() string {
var buf bytes.Buffer
fmt.Fprintf(&buf, "[rows=%.9g", s.RowCount)
colStats := make(ColumnStatistics, s.ColStats.Count())
for i := 0; i < s.ColStats.Count(); i++ {
colStats[i] = s.ColStats.Get(i)
}
sort.Sort(colStats)
for _, col := range colStats {
fmt.Fprintf(&buf, ", distinct%s=%.9g", col.Cols.String(), col.DistinctCount)
fmt.Fprintf(&buf, ", null%s=%.9g", col.Cols.String(), col.NullCount)
}
buf.WriteString("]")
return buf.String()
}
// ColumnStatistic is a collection of statistics that applies to a particular
// set of columns. In theory, a table could have a ColumnStatistic object
// for every possible subset of columns. In practice, it is only worth
// maintaining statistics on a few columns and column sets that are frequently
// used in predicates, group by columns, etc.
type ColumnStatistic struct {
// Cols is the set of columns whose data are summarized by this
// ColumnStatistic struct.
Cols opt.ColSet
// DistinctCount is the estimated number of distinct values of this
// set of columns for this expression.
DistinctCount float64
// NullCount is the estimated number of null values of this set of
// columns for this expression. For multi-column stats, this null
// count tracks all instances of at least one null value in the
// column set.
NullCount float64
}
// ApplySelectivity updates the distinct count according to a given selectivity.
func (c *ColumnStatistic) ApplySelectivity(selectivity, inputRows float64) {
if selectivity == 1 || c.DistinctCount == 0 {
return
}
if selectivity == 0 {
c.DistinctCount = 0
return
}
n := inputRows
d := c.DistinctCount
// If each distinct value appears n/d times, and the probability of a
// row being filtered out is (1 - selectivity), the probability that all
// n/d rows are filtered out is (1 - selectivity)^(n/d). So the expected
// number of values that are filtered out is d*(1 - selectivity)^(n/d).
//
// This formula returns d * selectivity when d=n but is closer to d
// when d << n.
c.DistinctCount = d - d*math.Pow(1-selectivity, n/d)
// Since the null count is a more simpler count of all null rows, we can
// just multiply the selectivity with it.
c.NullCount *= selectivity
}
// ColumnStatistics is a slice of pointers to ColumnStatistic values.
type ColumnStatistics []*ColumnStatistic
// Len returns the number of ColumnStatistic values.
func (c ColumnStatistics) Len() int { return len(c) }
// Less is part of the Sorter interface.
func (c ColumnStatistics) Less(i, j int) bool {
if c[i].Cols.Len() != c[j].Cols.Len() {
return c[i].Cols.Len() < c[j].Cols.Len()
}
prev := 0
for {
nextI, ok := c[i].Cols.Next(prev)
if !ok {
return false
}
// No need to check if ok since both ColSets are the same length and
// so far have had the same elements.
nextJ, _ := c[j].Cols.Next(prev)
if nextI != nextJ {
return nextI < nextJ
}
prev = nextI
}
}
// Swap is part of the Sorter interface.
func (c ColumnStatistics) Swap(i, j int) {
c[i], c[j] = c[j], c[i]
}