/
trainer.go
169 lines (149 loc) · 4.13 KB
/
trainer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
package main
import (
"bytes"
"fmt"
"log"
"os"
"path/filepath"
"github.com/gnames/bayes"
"github.com/gnames/bayes/ent/feature"
"github.com/gnames/gnfinder/pkg/ent/heuristic"
"github.com/gnames/gnfinder/pkg/ent/lang"
"github.com/gnames/gnfinder/pkg/ent/nlp"
"github.com/gnames/gnfinder/pkg/ent/token"
"github.com/gnames/gnfinder/pkg/io/dict"
jsoniter "github.com/json-iterator/go"
)
// inGenusButNoName records cleaned forms of capitalized tokens whose
// uninomial "uniDict" feature was "inGenus" while the token was not part
// of a known name occurrence (filled in by getFeatures).
// NOTE(review): package-level mutable state; it is written here but never
// read in this file — presumably inspected elsewhere or kept for debugging.
var inGenusButNoName = make(map[string]struct{})

// FileName is the base name (without extension) of a training file pair
// ("<name>.txt" and "<name>.json").
type FileName string

// TrainingLanguageData associates a Language with training data
type TrainingLanguageData map[lang.Language]TrainingData

// TrainingData maps a training file's base name to the text and known
// name positions loaded from that file pair.
type TrainingData map[FileName]*TextData

// TextData holds one training text as runes together with the positions
// of the scientific names known to occur in it.
type TextData struct {
	Text []rune
	NamesPositions
}

// NamesPositions lists known name occurrences in a text.
// NOTE(review): processText walks this slice in order against the token
// stream — assumes it is sorted by Start; confirm the JSON input is sorted.
type NamesPositions []NameData

// NameData describes one known scientific name occurrence: the name
// string and its start/end offsets in the text.
type NameData struct {
	Name  string `json:"name"`
	Start int    `json:"start"`
	End   int    `json:"end"`
}
// Train builds a naive Bayes classifier from the supplied training data,
// using the dictionary to tag tokens before feature extraction.
func Train(td TrainingData, d *dict.Dictionary) bayes.Bayes {
	nb := bayes.New()
	nb.Train(processTrainingData(td, d))
	return nb
}
// NewTrainingLanguageData loads training data for every supported
// language; each language's training files are expected in a
// subdirectory of dir named after the language.
func NewTrainingLanguageData(dir string) TrainingLanguageData {
	tld := make(TrainingLanguageData)
	// The loop variable is named l (not lang) so it does not shadow the
	// imported lang package.
	for l := range lang.LanguagesSet {
		path := filepath.Join(dir, l.String())
		tld[l] = NewTrainingData(path)
	}
	return tld
}
// NewTrainingData assembles text and name occurrence information from several
// files that contain no names at all, or are botanical and zoological research
// papers that do contain names.
//
// For every base name it reads "<name>.txt" (the text) and "<name>.json"
// (the positions of known names in that text). Any read or decode failure
// aborts the program with the offending file path in the message.
func NewTrainingData(path string) TrainingData {
	td := make(TrainingData)
	// files := [...]string{"no_names", "names", "phyto1", "phyto2", "zoo1",
	// 	"zoo2", "zoo3", "zoo4"}
	files := [...]string{"no_names", "names"}
	for _, v := range files {
		txt := fmt.Sprintf("%s.txt", v)
		txtPath := filepath.Join(path, txt)
		txtBytes, err := os.ReadFile(txtPath)
		if err != nil {
			// Include the path so a missing or unreadable file is identifiable.
			log.Fatalf("reading training text %q: %v", txtPath, err)
		}
		text := []rune(string(txtBytes))

		json := fmt.Sprintf("%s.json", v)
		jsonPath := filepath.Join(path, json)
		namesBytes, err := os.ReadFile(jsonPath)
		if err != nil {
			log.Fatalf("reading name positions %q: %v", jsonPath, err)
		}
		var nps NamesPositions
		r := bytes.NewReader(namesBytes)
		if err = jsoniter.NewDecoder(r).Decode(&nps); err != nil {
			log.Fatalf("decoding name positions %q: %v", jsonPath, err)
		}
		td[FileName(v)] = &TextData{Text: text, NamesPositions: nps}
	}
	return td
}
// processTrainingData takes data from several training texts, ignores
// the name of the file and collects training information from names in
// the texts.
func processTrainingData(
	td TrainingData,
	d *dict.Dictionary,
) []feature.ClassFeatures {
	var res []feature.ClassFeatures
	for _, textData := range td {
		res = append(res, processText(textData, d)...)
	}
	return res
}
// processText tokenizes one training text, tags tokens with heuristic
// dictionary matches, and converts the token stream plus the known name
// positions into Bayes class features via getFeatures.
func processText(t *TextData, d *dict.Dictionary) []feature.ClassFeatures {
	var lfs, lfsText []feature.ClassFeatures
	var nd NameData
	ts := token.Tokenize(t.Text)
	heuristic.TagTokens(ts, d)
	l := len(t.NamesPositions)
	var nameIdx, i int
	for {
		// When the text has known names, hand getFeatures the current one;
		// otherwise nd stays the zero value (Name == ""), which getFeatures
		// treats as "no name available".
		if l > 0 {
			nd = t.NamesPositions[nameIdx]
		}
		i, lfsText = getFeatures(i, ts, &nd)
		lfs = append(lfs, lfsText...)
		nameIdx++
		// Stop after the last known name has been consumed, or after
		// getFeatures ran off the end of the token slice (i == -1).
		// NOTE(review): tokens after the final known name never get
		// collected as not-name features — confirm this is intended.
		if nameIdx == l || i == -1 {
			break
		}
	}
	return lfs
}
// getFeatures collects features for non-names that happen before a
// known name. It takes index of the first token to traverse, tokens, and
// currently available name metadata, if any. It returns all the features
// and a new index to continue collecting data, or -1 when the token
// slice was exhausted without reaching the name.
func getFeatures(
	i int,
	ts []token.TokenSN,
	nd *NameData,
) (int, []feature.ClassFeatures) {
	var lfs []feature.ClassFeatures
	class := nlp.IsNotName
	for j := i; j < len(ts); j++ {
		t := ts[j]
		// Only capitalized tokens are candidate name starts; all other
		// tokens are skipped and contribute no features at all.
		if !t.Features().IsCapitalized {
			continue
		}
		upperIndex := token.UpperIndex(j, len(ts))
		featureSet := nlp.NewFeatureSet(ts[j:upperIndex])
		// A token ending past the known name's start is taken to be that
		// name: record it with the IsName class and return the index just
		// past it so the caller can move on to the next known name.
		if nd.Name != "" && t.End() > nd.Start {
			class = nlp.IsName
			lfs = append(lfs, feature.ClassFeatures{Features: featureSet.Flatten(),
				Class: class})
			return j + 1, lfs
		}
		// Side effect: remember capitalized words the dictionary marks as
		// "inGenus" that are NOT part of a known name occurrence.
		for _, v := range featureSet.Uninomial {
			if v.Name == "uniDict" && v.Value == "inGenus" {
				inGenusButNoName[t.Cleaned()] = struct{}{}
			}
		}
		// Everything reaching this point is a not-name training example.
		lfs = append(lfs, feature.ClassFeatures{Features: featureSet.Flatten(),
			Class: class})
	}
	return -1, lfs
}