forked from polarsignals/frostdb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter.go
266 lines (231 loc) · 6.19 KB
/
filter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
package expr
import (
"errors"
"fmt"
"strings"
"sync"
"sync/atomic"
"github.com/parquet-go/parquet-go"
"github.com/dreamsxin/frostdb/pqarrow"
"github.com/dreamsxin/frostdb/query/logicalplan"
)
// PreExprVisitorFunc adapts a plain function so it can be handed to an
// expression's Accept method as a visitor. Only PreVisit forwards to the
// wrapped function; Visit and PostVisit unconditionally report false.
type PreExprVisitorFunc func(expr logicalplan.Expr) bool

// PreVisit invokes the wrapped function with expr and returns its result.
func (f PreExprVisitorFunc) PreVisit(expr logicalplan.Expr) bool { return f(expr) }

// Visit ignores its argument and always returns false.
func (PreExprVisitorFunc) Visit(logicalplan.Expr) bool { return false }

// PostVisit ignores its argument and always returns false.
func (PreExprVisitorFunc) PostVisit(logicalplan.Expr) bool { return false }
// Particulate abstracts something that can be filtered: a parquet schema
// paired with its column chunks. A parquet.RowGroup is a Particulate that can
// be filtered as-is, and wrapping a parquet.File with ParquetFileParticulate
// allows a whole file to be filtered.
type Particulate interface {
	// ColumnChunks returns the column chunks that filters inspect.
	ColumnChunks() []parquet.ColumnChunk
	// Schema returns the parquet schema describing the columns.
	Schema() *parquet.Schema
}
// TrueNegativeFilter evaluates a Particulate and reports a boolean verdict.
// NOTE(review): judging by the name, a false result presumably means the
// Particulate definitely holds no matching data and can be skipped, while true
// means it may match and must be processed further — confirm with callers.
type TrueNegativeFilter interface {
	// Eval should be safe to call concurrently.
	Eval(p Particulate) (bool, error)
}
// AlwaysTrueFilter is a TrueNegativeFilter that never rejects anything.
type AlwaysTrueFilter struct{}

// Eval ignores the Particulate and always returns true with a nil error.
func (*AlwaysTrueFilter) Eval(Particulate) (bool, error) { return true, nil }
// binaryBooleanExpr translates a logical-plan binary expression into a
// TrueNegativeFilter.
//
//   - Comparison operators (==, !=, <, <=, >, >=) become a BinaryScalarExpr
//     comparing a column on the left against a scalar on the right. An error is
//     returned when no *logicalplan.Column is found on the left, or when the
//     literal on the right cannot be converted to a parquet value.
//   - AND / OR recurse into both operands through BooleanExpr and combine the
//     results in an AndExpr / OrExpr; an error from the left operand is
//     returned before the right operand is looked at.
//   - Every other operator yields an AlwaysTrueFilter.
func binaryBooleanExpr(expr *logicalplan.BinaryExpr) (TrueNegativeFilter, error) {
	switch expr.Op {
	case logicalplan.OpEq,
		logicalplan.OpNotEq,
		logicalplan.OpLt,
		logicalplan.OpLtEq,
		logicalplan.OpGt,
		logicalplan.OpGtEq:
		// Locate the column referenced by the left operand.
		var column *ColumnRef
		expr.Left.Accept(PreExprVisitorFunc(func(node logicalplan.Expr) bool {
			if c, ok := node.(*logicalplan.Column); ok {
				column = &ColumnRef{ColumnName: c.ColumnName}
				return false
			}
			return true
		}))
		if column == nil {
			return nil, errors.New("left side of binary expression must be a column")
		}

		// Locate the literal in the right operand and convert it. If no
		// literal is visited, scalar stays the zero parquet.Value.
		var (
			scalar    parquet.Value
			scalarErr error
		)
		expr.Right.Accept(PreExprVisitorFunc(func(node logicalplan.Expr) bool {
			lit, ok := node.(*logicalplan.LiteralExpr)
			if !ok {
				return true
			}
			scalar, scalarErr = pqarrow.ArrowScalarToParquetValue(lit.Value)
			return false
		}))
		if scalarErr != nil {
			return nil, scalarErr
		}

		return &BinaryScalarExpr{Left: column, Op: expr.Op, Right: scalar}, nil

	case logicalplan.OpAnd, logicalplan.OpOr:
		lhs, err := BooleanExpr(expr.Left)
		if err != nil {
			return nil, err
		}
		rhs, err := BooleanExpr(expr.Right)
		if err != nil {
			return nil, err
		}
		if expr.Op == logicalplan.OpAnd {
			return &AndExpr{Left: lhs, Right: rhs}, nil
		}
		return &OrExpr{Left: lhs, Right: rhs}, nil
	}

	// Operators without a dedicated filter cannot rule anything out.
	return &AlwaysTrueFilter{}, nil
}
// aggregationExpr translates an aggregation function into a
// TrueNegativeFilter. Only the max aggregation has a dedicated filter (MaxAgg);
// every other aggregation yields an AlwaysTrueFilter. The returned error is
// always nil.
func aggregationExpr(expr *logicalplan.AggregationFunction) (TrueNegativeFilter, error) {
	if expr.Func != logicalplan.AggFuncMax {
		return &AlwaysTrueFilter{}, nil
	}

	agg := &MaxAgg{}
	// Record which column the aggregation targets, and whether it is dynamic.
	expr.Expr.Accept(PreExprVisitorFunc(func(node logicalplan.Expr) bool {
		switch col := node.(type) {
		case *logicalplan.Column:
			agg.columnName = col.ColumnName
			return false
		case *logicalplan.DynamicColumn:
			agg.columnName = col.ColumnName
			agg.dynamic = true
			return false
		}
		return true
	}))
	return agg, nil
}
// MaxAgg is a TrueNegativeFilter for a max aggregation over a single column
// (or, when dynamic is set, over every concrete column whose name starts with
// columnName followed by a dot). It remembers the largest max value seen so
// far per concrete column and only asks for further processing of a
// Particulate whose column index advertises a larger value.
type MaxAgg struct {
	// maxMap maps a concrete column name to an *atomic.Pointer[parquet.Value]
	// holding the largest max value observed for that column so far.
	maxMap sync.Map

	// It would be nicer to use a dynamic-aware ColumnRef here, but that would
	// introduce allocations (slices for indexes and concrete names), so for
	// performance reasons we execute the column lookup manually.
	columnName string
	dynamic    bool
}

// Eval reports whether p must be processed further: true when at least one
// matching column's max (taken from the column index) is greater than the max
// memoized for that column, or when nothing has been memoized for it yet. It
// returns false when no column matches, when every matching chunk contains
// only nulls, or when no matching column beats its memoized max.
//
// An error is returned if a matching chunk's column index cannot be
// retrieved; the underlying error is wrapped so callers can inspect it with
// errors.Is / errors.As.
func (a *MaxAgg) Eval(p Particulate) (bool, error) {
	processFurther := false
	for i, f := range p.Schema().Fields() {
		if (a.dynamic && !strings.HasPrefix(f.Name(), a.columnName+".")) || (!a.dynamic && f.Name() != a.columnName) {
			continue
		}

		chunk := p.ColumnChunks()[i]
		index, err := chunk.ColumnIndex()
		if err != nil {
			return false, fmt.Errorf("error retrieving column index in MaxAgg.Eval: %w", err)
		}
		if NullCount(index) == chunk.NumValues() {
			// This chunk is full of nulls. Nothing to do since we can't trust
			// min/max index values. This chunk should not be processed since
			// it can't contribute to min/max unless another column can.
			continue
		}

		columnPointer, _ := a.maxMap.LoadOrStore(f.Name(), &atomic.Pointer[parquet.Value]{})
		atomicMax := columnPointer.(*atomic.Pointer[parquet.Value])

		v := Max(index)
		// Retry the compare-and-swap until either v is installed as the new
		// max or another caller has stored a value that v no longer exceeds.
		for globalMax := atomicMax.Load(); globalMax == nil || compare(v, *globalMax) > 0; globalMax = atomicMax.Load() {
			if atomicMax.CompareAndSwap(globalMax, &v) {
				// At least one column exceeded the current max so this chunk
				// satisfies the filter. Note that we do not break out of
				// scanning the rest of the columns since we do want to memoize
				// the max for other columns as well.
				processFurther = true
				break
			}
		}

		if !a.dynamic && processFurther {
			// No need to look at the remaining columns if we're only looking
			// for a single concrete column.
			break
		}
	}
	return processFurther, nil
}
// AndExpr is a TrueNegativeFilter that is true only when both operands are.
type AndExpr struct {
	Left  TrueNegativeFilter
	Right TrueNegativeFilter
}

// Eval evaluates Left and then Right against p. It short-circuits: when Left
// fails or reports false, Right is never evaluated. Any error is returned
// together with a false result.
func (a *AndExpr) Eval(p Particulate) (bool, error) {
	if ok, err := a.Left.Eval(p); err != nil || !ok {
		return false, err
	}
	if ok, err := a.Right.Eval(p); err != nil || !ok {
		return false, err
	}
	return true, nil
}
// OrExpr is a TrueNegativeFilter that is true when either operand is.
type OrExpr struct {
	Left  TrueNegativeFilter
	Right TrueNegativeFilter
}

// Eval evaluates Left and, only if it reported false without error, Right.
// Any error is returned together with a false result.
func (o *OrExpr) Eval(p Particulate) (bool, error) {
	leftOK, err := o.Left.Eval(p)
	switch {
	case err != nil:
		return false, err
	case leftOK:
		// Left already accepts p; Right does not need to run.
		return true, nil
	}

	rightOK, err := o.Right.Eval(p)
	if err != nil {
		return false, err
	}
	return rightOK, nil
}
// BooleanExpr converts a logical-plan expression into a TrueNegativeFilter.
// A nil expression yields an AlwaysTrueFilter, binary expressions and
// aggregation functions are delegated to their dedicated builders, and any
// other expression type results in an error.
func BooleanExpr(expr logicalplan.Expr) (TrueNegativeFilter, error) {
	switch node := expr.(type) {
	case nil:
		// No expression means there is nothing to filter on.
		return &AlwaysTrueFilter{}, nil
	case *logicalplan.BinaryExpr:
		return binaryBooleanExpr(node)
	case *logicalplan.AggregationFunction:
		// NOTE: Aggregations are optimized in the case of no grouping columns
		// or other filters.
		return aggregationExpr(node)
	default:
		return nil, fmt.Errorf("unsupported boolean expression %T", node)
	}
}