/
bloom_tester.go
306 lines (261 loc) · 8.74 KB
/
bloom_tester.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
package v1
import (
"unicode/utf8"
"github.com/grafana/regexp"
"github.com/grafana/loki/v3/pkg/logql/log"
"github.com/grafana/loki/v3/pkg/logql/log/pattern"
"github.com/grafana/loki/v3/pkg/logql/syntax"
"github.com/grafana/loki/v3/pkg/storage/bloom/v1/filter"
)
type BloomTest interface {
Matches(bloom filter.Checker) bool
MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool
}
type BloomTests []BloomTest
func (b BloomTests) Matches(bloom filter.Checker) bool {
for _, test := range b {
if !test.Matches(bloom) {
return false
}
}
return true
}
func (b BloomTests) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool {
for _, test := range b {
if !test.MatchesWithPrefixBuf(bloom, buf, prefixLen) {
return false
}
}
return true
}
// ExtractTestableLineFilters extracts all line filters from an expression
// that can be tested against a bloom filter. This will skip any line filters
// after a line format expression. A line format expression might add content
// that the query later matches against, which can't be tested with a bloom filter.
// E.g. For {app="fake"} |= "foo" | line_format "thisNewTextShouldMatch" |= "thisNewTextShouldMatch"
// this function will return only the line filter for "foo" since the line filter for "thisNewTextShouldMatch"
// wouldn't match against the bloom filter but should match against the query.
func ExtractTestableLineFilters(expr syntax.Expr) []syntax.LineFilterExpr {
if expr == nil {
return nil
}
var filters []syntax.LineFilterExpr
var lineFmtFound bool
visitor := &syntax.DepthFirstTraversal{
VisitLineFilterFn: func(v syntax.RootVisitor, e *syntax.LineFilterExpr) {
if e != nil && !lineFmtFound {
filters = append(filters, *e)
}
},
VisitLineFmtFn: func(v syntax.RootVisitor, e *syntax.LineFmtExpr) {
if e != nil {
lineFmtFound = true
}
},
}
expr.Accept(visitor)
return filters
}
// FiltersToBloomTest converts a list of line filters to a BloomTest.
// Note that all the line filters should be testable against a bloom filter.
// Use ExtractTestableLineFilters to extract testable line filters from an expression.
// TODO(owen-d): limits the number of bloom lookups run.
// An arbitrarily high number can overconsume cpu and is a DoS vector.
// TODO(owen-d): use for loop not recursion to protect callstack
func FiltersToBloomTest(b NGramBuilder, filters ...syntax.LineFilterExpr) BloomTest {
tests := make(BloomTests, 0, len(filters))
for _, f := range filters {
if f.Left != nil {
tests = append(tests, FiltersToBloomTest(b, *f.Left))
}
if f.Or != nil {
left := FiltersToBloomTest(b, *f.Or)
right := simpleFilterToBloomTest(b, f.LineFilter)
tests = append(tests, newOrTest(left, right))
continue
}
tests = append(tests, simpleFilterToBloomTest(b, f.LineFilter))
}
return tests
}
func simpleFilterToBloomTest(b NGramBuilder, filter syntax.LineFilter) BloomTest {
switch filter.Ty {
case log.LineMatchNotEqual, log.LineMatchNotRegexp, log.LineMatchNotPattern:
// We cannot test _negated_ filters with a bloom filter since blooms are probabilistic
// filters that can only tell us if a string _might_ exist.
// For example, for `!= "foo"`, the bloom filter might tell us that the string "foo" might exist
// but because we are not sure, we cannot discard that chunk because it might actually not be there.
// Therefore, we return a test that always returns true.
return MatchAll
case log.LineMatchEqual:
return newStringTest(b, filter.Match)
case log.LineMatchRegexp:
return MatchAll
case log.LineMatchPattern:
return newPatternTest(b, filter.Match)
default:
return MatchAll
}
}
type bloomCheckerWrapper struct {
bloom filter.Checker
}
// Test implements the log.Checker interface
func (b bloomCheckerWrapper) Test(line []byte, _ bool, _ bool) bool {
return b.bloom.Test(line)
}
// TestRegex implements the log.Checker interface
func (b bloomCheckerWrapper) TestRegex(_ *regexp.Regexp) bool {
// We won't support regexes in bloom filters so we just return true
return true
}
type logCheckerWrapper struct {
checker log.Checker
}
// Test implements the filter.Checker interface
func (l logCheckerWrapper) Test(data []byte) bool {
return l.checker.Test(data, true, false)
}
type matcherFilterWrapper struct {
filter log.Matcher
}
func (m matcherFilterWrapper) Matches(bloom filter.Checker) bool {
return m.filter.Matches(bloomCheckerWrapper{bloom})
}
func (m matcherFilterWrapper) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool {
return m.filter.Matches(bloomCheckerWrapper{prefixedChecker{
checker: bloom,
buf: buf,
prefixLen: prefixLen,
}})
}
type prefixedChecker struct {
checker filter.Checker
buf []byte
prefixLen int
}
func (p prefixedChecker) Test(data []byte) bool {
return p.checker.Test(append(p.buf[:p.prefixLen], data...))
}
type matchAllTest struct{}
var MatchAll = matchAllTest{}
func (n matchAllTest) Matches(_ filter.Checker) bool {
return true
}
func (n matchAllTest) MatchesWithPrefixBuf(_ filter.Checker, _ []byte, _ int) bool {
return true
}
// NGramBuilder is an interface for tokenizing strings into ngrams
// Extracting this interface allows us to test the bloom filter without having to use the actual tokenizer
// TODO: This should be moved to tokenizer.go
type NGramBuilder interface {
Tokens(line string) Iterator[[]byte]
N() int
SkipFactor() int
}
type stringTest struct {
ngrams [][]byte
}
func newStringTest(b NGramBuilder, search string) (res BloomTest) {
// search string must be longer than the combined ngram length and skip factor
// in order for all possible skip offsets to have at least 1 ngram
skip := b.SkipFactor()
if ct := utf8.RuneCountInString(search); ct < b.N()+skip {
return MatchAll
}
tests := make([]stringTest, 0, skip)
for i := 0; i < skip+1; i++ {
searchWithOffset := search
for j := 0; j < i; j++ {
_, size := utf8.DecodeRuneInString(searchWithOffset)
// NB(owen-d): small bounds check for invalid utf8
searchWithOffset = searchWithOffset[min(size, len(searchWithOffset)):]
}
var test stringTest
it := b.Tokens(searchWithOffset)
for it.Next() {
ngram := make([]byte, len(it.At()))
copy(ngram, it.At())
test.ngrams = append(test.ngrams, ngram)
}
tests = append(tests, test)
}
res = tests[0]
for _, t := range tests[1:] {
res = newOrTest(res, t)
}
return res
}
// Matches implements the BloomTest interface
func (b stringTest) Matches(bloom filter.Checker) bool {
for _, ngram := range b.ngrams {
if !bloom.Test(ngram) {
return false
}
}
return true
}
// MatchesWithPrefixBuf implements the BloomTest interface
func (b stringTest) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool {
for _, ngram := range b.ngrams {
buf = append(buf[:prefixLen], ngram...)
if !bloom.Test(buf) {
return false
}
}
return true
}
type stringMatcherFilter struct {
test BloomTest
}
// Matches implements the log.Filterer interface
func (b stringMatcherFilter) Matches(test log.Checker) bool {
return b.test.Matches(logCheckerWrapper{test})
}
func newStringFilterFunc(b NGramBuilder) log.NewMatcherFiltererFunc {
return func(match []byte, caseInsensitive bool) log.MatcherFilterer {
return log.WrapMatcher(stringMatcherFilter{
test: newStringTest(b, string(match)),
})
}
}
type orTest struct {
left, right BloomTest
}
// In addition to common `|= "foo" or "bar"`,
// orTest is particularly useful when testing skip-factors>0, which
// can result in different "sequences" of ngrams for a particular line
// and if either sequence matches the filter, the chunk is considered a match.
// For instance, with n=3,skip=1, the line "foobar" generates ngrams:
// ["foo", "oob", "oba", "bar"]
// Now let's say we want to search for the same "foobar".
// Note: we don't know which offset in the line this match may be,
// so we check every possible offset. The filter will match the ngrams:
// offset_0 creates ["foo", "oba"]
// offset_1 creates ["oob", "bar"]
// If either sequences are found in the bloom filter, the chunk is considered a match.
// Expanded, this is
// match == (("foo" && "oba") || ("oob" && "bar"))
func newOrTest(left, right BloomTest) orTest {
return orTest{
left: left,
right: right,
}
}
func (o orTest) Matches(bloom filter.Checker) bool {
return o.left.Matches(bloom) || o.right.Matches(bloom)
}
func (o orTest) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool {
return o.left.MatchesWithPrefixBuf(bloom, buf, prefixLen) || o.right.MatchesWithPrefixBuf(bloom, buf, prefixLen)
}
func newPatternTest(b NGramBuilder, match string) BloomTest {
lit, err := pattern.ParseLiterals(match)
if err != nil {
return MatchAll
}
var res BloomTests
for _, l := range lit {
res = append(res, newStringTest(b, string(l)))
}
return res
}