/
tokenizer.go
103 lines (93 loc) · 2.08 KB
/
tokenizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
package gse
import (
	"errors"
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
	"github.com/go-ego/gse"
)
// Name is the identifier under which this tokenizer is registered in
// the bleve tokenizer registry.
const Name = "gse"
// GseTokenizer is a bleve analysis.Tokenizer backed by a go-ego/gse
// segmenter.
type GseTokenizer struct {
	// seg is the configured segmenter; loaded once in NewGseTokenizer.
	seg *gse.Segmenter
}
// NewGseTokenizer creates a gse cut tokenizer.
//
// dictPath is the path to a dictionary file; when empty, the embedded
// dictionary shipped with gse is loaded instead. stopPath works the
// same way for stop words. alpha enables the segmenter's alpha-numeric
// handling.
//
// It returns an error if any dictionary or stop-word load fails.
func NewGseTokenizer(dictPath, stopPath string, alpha bool) (*GseTokenizer, error) {
	var seg gse.Segmenter
	seg.SkipLog = true
	// Direct assignment: the zero value of AlphaNum is already false.
	seg.AlphaNum = alpha

	// Load the main dictionary: from file if a path is given, otherwise
	// the embedded default.
	var err error
	if dictPath != "" {
		err = seg.LoadDict(dictPath)
	} else {
		err = seg.LoadDictEmbed()
	}
	if err != nil {
		return nil, fmt.Errorf("loading gse dictionary: %w", err)
	}

	// Same pattern for stop words.
	if stopPath != "" {
		err = seg.LoadStop(stopPath)
	} else {
		err = seg.LoadStopEmbed()
	}
	if err != nil {
		return nil, fmt.Errorf("loading gse stop words: %w", err)
	}

	return &GseTokenizer{seg: &seg}, nil
}
// Tokenize cuts the text into a bleve token stream.
//
// The text is segmented with gse's search-mode cut, trimmed, and then
// analyzed to recover byte offsets and positions. A token whose runes
// are all printable ASCII (0x20–0x7E) is typed AlphaNumeric; anything
// else is typed Ideographic.
func (g *GseTokenizer) Tokenize(text []byte) analysis.TokenStream {
	t := string(text)
	cuts := g.seg.Trim(g.seg.CutSearch(t, true))
	azs := g.seg.Analyze(cuts, t)

	// Presize: exactly one bleve token per analyzed segment.
	result := make(analysis.TokenStream, 0, len(azs))
	for _, az := range azs {
		typ := analysis.Ideographic
		if isPrintableASCII(az.Text) {
			typ = analysis.AlphaNumeric
		}
		result = append(result, &analysis.Token{
			Term:     []byte(az.Text),
			Start:    az.Start,
			End:      az.End,
			Position: az.Position,
			Type:     typ,
		})
	}
	return result
}

// isPrintableASCII reports whether every rune of s lies in the
// printable ASCII range (space through tilde). The empty string is
// vacuously printable ASCII, matching the original classification.
func isPrintableASCII(s string) bool {
	for _, r := range s {
		if r < 32 || r > 126 {
			return false
		}
	}
	return true
}
// tokenizerConstructor builds a GseTokenizer from a bleve tokenizer
// config map. Three keys are required: "dict_path" (string),
// "stop_words" (string), and "alpha" (bool); a missing or wrongly
// typed key yields an error.
func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	dictPath, ok := config["dict_path"].(string)
	if !ok {
		return nil, errors.New("config dict_path not found")
	}

	stopPath, ok := config["stop_words"].(string)
	if !ok {
		return nil, errors.New("config stop_words not found")
	}

	alpha, ok := config["alpha"].(bool)
	if !ok {
		return nil, errors.New("config alpha not found")
	}

	return NewGseTokenizer(dictPath, stopPath, alpha)
}
// init registers this tokenizer with the bleve registry under Name,
// making it available via the standard bleve tokenizer lookup.
func init() {
	registry.RegisterTokenizer(Name, tokenizerConstructor)
}