/
gnfinder.go
76 lines (65 loc) · 1.85 KB
/
gnfinder.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
//go:generate statik -f -src=./data/files
package gnfinder
import (
"github.com/gnames/gnfinder/dict"
"github.com/gnames/gnfinder/heuristic"
"github.com/gnames/gnfinder/lang"
"github.com/gnames/gnfinder/nlp"
"github.com/gnames/gnfinder/token"
"github.com/gnames/gnfinder/util"
)
// FindNamesJSON is the byte-oriented entry point of the finder. It builds a
// model from opts, decodes data into runes, runs FindNames over them and
// returns whatever Output.ToJSON produces for the result.
func FindNamesJSON(data []byte, dict *dict.Dictionary,
	opts ...util.Opt) []byte {
	// Inside this function the dict parameter shadows the imported dict
	// package.
	model := util.NewModel(opts...)
	runes := []rune(string(data))
	found := FindNames(runes, dict, model)
	return found.ToJSON()
}
// FindNames tokenizes text, tags the tokens and assembles an Output from
// them.
//
// The model m is modified in place: when m.Language is lang.NotSet it is
// replaced by the result of lang.DetectLanguage, and m.Bayes is switched on
// whenever the resulting language is not lang.UnknownLanguage. Heuristic
// tagging always runs; nlp tagging runs only when m.Bayes is set.
func FindNames(text []rune, d *dict.Dictionary, m *util.Model) Output {
	toks := token.Tokenize(text)

	if m.Language == lang.NotSet {
		m.Language = lang.DetectLanguage(text)
	}
	// A recognized language turns Bayes on; a flag that was already set by
	// the caller is never cleared here.
	m.Bayes = m.Bayes || m.Language != lang.UnknownLanguage

	heuristic.TagTokens(toks, d, m)
	if m.Bayes {
		nlp.TagTokens(toks, d, m)
	}

	return CollectOutput(toks, text, m)
}
// CollectOutput walks the tagged tokens ts and builds the finder Output.
//
// Tokens whose Decision is token.NotName are skipped. For every other token
// a Name is built by TokensToName from the window of tokens that starts at
// that token and ends at util.UpperIndex. The Name is kept when its Odds are
// exactly 0 or greater than 1, or when its Type is "Binomial" or
// "Trinomial". The kept names, the tokens and the model are handed to
// NewOutput.
func CollectOutput(ts []token.Token, text []rune, m *util.Model) Output {
	var names []Name
	total := len(ts)
	for i := 0; i < total; i++ {
		if ts[i].Decision == token.NotName {
			continue
		}

		window := ts[i:util.UpperIndex(i, total)]
		candidate := TokensToName(window, text)

		// Comparisons are kept in their positive form on purpose: negating
		// them would change the outcome for a NaN Odds value.
		oddsPass := candidate.Odds == 0.0 || candidate.Odds > 1.0
		multiWord := candidate.Type == "Binomial" ||
			candidate.Type == "Trinomial"
		if oddsPass || multiWord {
			names = append(names, candidate)
		}
	}
	return NewOutput(names, ts, m)
}
// UniqueNameStrings returns the distinct name-strings (the Name field) found
// in names. Every name-string appears exactly once, in the order of its first
// occurrence in names, so the result is the same on every call for the same
// input. A nil slice is returned when names is empty.
func UniqueNameStrings(names []Name) []string {
	seen := make(map[string]struct{}, len(names))
	// Left nil on purpose so that empty input still yields a nil slice.
	var uniqueNames []string
	for i := range names {
		s := names[i].Name
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		// Appending at first sight, instead of ranging over the map
		// afterwards, keeps the output order deterministic.
		uniqueNames = append(uniqueNames, s)
	}
	return uniqueNames
}