-
Notifications
You must be signed in to change notification settings - Fork 0
/
fts_edgegram.go
76 lines (65 loc) · 1.58 KB
/
fts_edgegram.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
package v1
import (
"bytes"
"github.com/clipperhouse/jargon"
"github.com/clipperhouse/uax29/graphemes"
"github.com/ductone/protoc-gen-pgdb/internal/slice"
)
// edgegramStream builds a jargon.Filter that wraps an incoming token stream
// in an edgegram expander whose size is n. Each stream passed to the filter
// gets its own edgegram state, and the resulting stream pulls its tokens from
// that state's next method.
func edgegramStream(n int) jargon.Filter {
	var filter jargon.Filter = func(incoming *jargon.TokenStream) *jargon.TokenStream {
		state := &edgegram{
			incoming: incoming,
			size:     n,
		}
		return jargon.NewTokenStream(state.next)
	}
	return filter
}
// edgegram holds the state of one gram-expanding token stream: the upstream
// tokens it reads from and the grams it has produced but not yet handed out.
type edgegram struct {
	// size is the grapheme-cluster length of the trailing window gram that
	// next emits once a prefix grows longer than size clusters.
	size int
	// incoming is the upstream token stream being expanded.
	incoming *jargon.TokenStream
	// pending is the FIFO queue of tokens generated from the most recently
	// expanded incoming token; shiftPending pops from its front.
	pending []*jargon.Token
}
// shiftPending pops the token at the front of the pending queue and returns
// it. When the queue is empty it returns (nil, nil). The error result is
// always nil; it exists so the signature lines up with next.
func (t *edgegram) shiftPending() (*jargon.Token, error) {
	if len(t.pending) == 0 {
		return nil, nil
	}

	head, rest := t.pending[0], t.pending[1:]
	t.pending = rest
	return head, nil
}
// next returns the next token of the expanded stream, or (nil, nil) once the
// incoming stream is exhausted.
//
// Tokens still queued from the previously expanded token are drained first.
// Otherwise the next incoming token is read: punctuation and whitespace
// tokens are passed through unchanged, and any other token is replaced by the
// tokens produced by expand. Errors from the incoming stream are returned
// unchanged.
func (t *edgegram) next() (*jargon.Token, error) {
	// Loop instead of returning straight after one expansion: a token that
	// expands to no grams (an empty value) would otherwise surface as a nil
	// token, which the consumer interprets as the end of the stream.
	for {
		if tk, err := t.shiftPending(); tk != nil || err != nil {
			return tk, err
		}

		token, err := t.incoming.Next()
		if err != nil {
			return nil, err
		}
		if token == nil {
			return nil, nil
		}
		if token.IsPunct() || token.IsSpace() {
			return token, nil
		}

		t.pending = t.expand(token.String())
	}
}

// expand splits v into grapheme clusters and builds the grams for it: every
// prefix of 1..len clusters, plus, for each prefix longer than t.size
// clusters, the window of the last t.size clusters of that prefix. The grams
// are passed through slice.Unique and slice.Sort and then wrapped with
// jargon.NewToken(gram, true). An empty v yields an empty result.
//
// NOTE(review): t.size is assumed to be positive. A negative size makes the
// window slice bounds invalid (panic) and a size of 0 yields an empty gram;
// confirm callers never pass such values.
func (t *edgegram) expand(v string) []*jargon.Token {
	sg := graphemes.SegmentAll([]byte(v))

	// At most two grams per cluster position: one prefix and one window.
	grams := make([]string, 0, 2*len(sg))
	for i := 1; i <= len(sg); i++ {
		grams = append(grams, string(bytes.Join(sg[:i], nil)))
		if i > t.size {
			grams = append(grams, string(bytes.Join(sg[i-t.size:i], nil)))
		}
	}
	grams = slice.Unique(grams)
	slice.Sort(grams)

	tokens := make([]*jargon.Token, 0, len(grams))
	for _, k := range grams {
		tokens = append(tokens, jargon.NewToken(k, true))
	}
	return tokens
}