-
Notifications
You must be signed in to change notification settings - Fork 5
/
gnfinder.go
89 lines (75 loc) · 2.31 KB
/
gnfinder.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
package gnfinder
import (
"time"
"github.com/gnames/bayes"
"github.com/gnames/gnfinder/config"
"github.com/gnames/gnfinder/ent/heuristic"
"github.com/gnames/gnfinder/ent/lang"
"github.com/gnames/gnfinder/ent/nlp"
"github.com/gnames/gnfinder/ent/output"
"github.com/gnames/gnfinder/ent/token"
"github.com/gnames/gnfinder/io/dict"
"github.com/gnames/gnlib/ent/gnvers"
)
// gnfinder holds the configuration, dictionaries and Bayes weights used for
// name-finding. New returns a pointer to it as a GNfinder.
type gnfinder struct {
// Config is embedded, so its fields are accessible directly on gnfinder.
config.Config
// TextOdds captures the "concentration" of names found in the whole text
// by heuristic name-finding. It should be close enough to the real
// number of names in the text. We use it when we do not have a local
// concentration of names in a region of text.
TextOdds bayes.LabelFreq
// Dictionary contains black, grey, and white list dictionaries.
*dict.Dictionary
// bayesWeights holds the weights obtained from Bayes training, keyed by
// language.
bayesWeights map[lang.Language]*bayes.NaiveBayes
}
// New creates a GNfinder from the given configuration, dictionaries and
// Bayes weights. If the configuration has WithBayes enabled and weights is
// nil, the weights returned by nlp.BayesWeights are used instead.
func New(
	cfg config.Config,
	dictionaries *dict.Dictionary,
	weights map[lang.Language]*bayes.NaiveBayes,
) GNfinder {
	res := gnfinder{
		Config:       cfg,
		Dictionary:   dictionaries,
		bayesWeights: weights,
	}

	// Bayes tagging was requested but the caller supplied no weights.
	needsDefaultWeights := res.WithBayes && res.bayesWeights == nil
	if needsDefaultWeights {
		res.bayesWeights = nlp.BayesWeights()
	}

	return &res
}
// Find takes a text as a string, detects names in it and returns the found
// names as an output.Output.
//
// The file argument is stored in the output metadata as the input file. The
// time spent in this method is reported, in seconds, in the output's
// NameFindingSec field.
func (gnf gnfinder) Find(file, txt string) output.Output {
	start := time.Now()

	text := []rune(txt)
	tokens := token.Tokenize(text)

	// The receiver is a value, so this assignment is made on the copy of gnf
	// that is used for the remainder of this call.
	if gnf.WithLanguageDetection {
		gnf.Language, gnf.LanguageDetected = lang.DetectLanguage(text)
	}

	// Heuristic tagging always runs; Bayes tagging refines it when enabled.
	heuristic.TagTokens(tokens, gnf.Dictionary)
	if gnf.WithBayes {
		nb := gnf.bayesWeights[gnf.Language]
		nlp.TagTokens(tokens, gnf.Dictionary, nb, gnf.BayesOddsThreshold)
	}

	o := output.TokensToOutput(tokens, text, Version, gnf.GetConfig())
	o.Meta.InputFile = file
	// Duration.Seconds divides in float64, so precision is lost only once,
	// in the final conversion to float32.
	o.NameFindingSec = float32(time.Since(start).Seconds())
	return o
}
// GetConfig returns a copy of the configuration held by gnfinder.
func (gnf gnfinder) GetConfig() config.Config {
	cfg := gnf.Config
	return cfg
}
// ChangeConfig applies the given options to the configuration and returns
// the resulting GNfinder. The receiver is a value, so the options are applied
// to a copy of the configuration and callers have to use the returned
// GNfinder to see the changes.
//
// If the options leave WithBayes enabled while no Bayes weights are set, the
// weights returned by nlp.BayesWeights are loaded, the same way New does it.
// This keeps Find from looking up weights in a nil map.
func (gnf gnfinder) ChangeConfig(opts ...config.Option) GNfinder {
	for _, opt := range opts {
		opt(&gnf.Config)
	}

	// Options may have switched Bayes on after construction; make sure the
	// weights it needs are available.
	if gnf.WithBayes && gnf.bayesWeights == nil {
		gnf.bayesWeights = nlp.BayesWeights()
	}

	return gnf
}
// GetVersion reports the version and build of gnfinder, taken from the
// package-level Version and Build values.
func (gnf gnfinder) GetVersion() gnvers.Version {
	v := gnvers.Version{
		Version: Version,
		Build:   Build,
	}
	return v
}