/
trainer.go
152 lines (132 loc) · 3.82 KB
/
trainer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
package nlp
import (
"bytes"
"fmt"
"io/ioutil"
"path/filepath"
"github.com/gnames/gnfinder/heuristic"
"github.com/gnames/bayes"
"github.com/gnames/gnfinder/dict"
"github.com/gnames/gnfinder/lang"
"github.com/gnames/gnfinder/token"
"github.com/gnames/gnfinder/util"
jsoniter "github.com/json-iterator/go"
)
// FileName is the key under which a training text is stored in TrainingData.
// NewTrainingData uses the base name of the source files (without extension).
type FileName string
// TrainingLanguageData associates a Language with the training data
// collected for that language.
type TrainingLanguageData map[lang.Language]TrainingData
// TrainingData maps the name of a training text to its content and the
// names annotated in it.
type TrainingData map[FileName]*TextData
// TextData holds one training text together with the names annotated in it.
type TextData struct {
// Text is the content of the training text as a slice of runes.
Text []rune
// NamesPositions (embedded) lists the names annotated in Text.
NamesPositions
}
// NamesPositions is the list of name annotations that belong to one text.
// NOTE(review): processText consumes the entries front to back while only
// moving forward through the tokens, so the list presumably has to be sorted
// by Start -- confirm against the training data.
type NamesPositions []NameData
// NameData describes a single annotated name. It is decoded from JSON objects
// that carry the keys "name", "start" and "end".
type NameData struct {
// Name is the annotated name string; getFeatures treats an empty Name as
// "no name available".
Name string `json:"name"`
// Start marks where the name begins in the text.
// NOTE(review): presumably an offset in runes into TextData.Text, because it
// is compared with a token's End -- confirm.
Start int `json:"start"`
// End marks where the name ends in the text (presumably the same unit as
// Start -- confirm).
End int `json:"end"`
}
// Train converts the training texts in td into labeled features, using the
// dictionary d while tagging tokens, and feeds those features to
// bayes.TrainNB. The resulting naive Bayes classifier is returned.
func Train(td TrainingData, d *dict.Dictionary) *bayes.NaiveBayes {
	return bayes.TrainNB(processTrainingData(td, d))
}
// NewTrainingLanguageData loads training data for every language known to the
// lang package. It iterates over the lang.Language values from 1 up to, but
// not including, lang.NotSet. For each of them it reads the training files
// from the subdirectory of dir named after the language's String() value.
//
// Loading is delegated to NewTrainingData, which hands any read or decode
// error to util.Check.
func NewTrainingLanguageData(dir string) TrainingLanguageData {
	tld := make(TrainingLanguageData)
	for i := 1; i < int(lang.NotSet); i++ {
		// The local is deliberately not called "lang": that name would shadow
		// the imported lang package for the rest of the loop body.
		l := lang.Language(i)
		path := filepath.Join(dir, l.String())
		tld[l] = NewTrainingData(path)
	}
	return tld
}
// NewTrainingData assembles text and name occurrence information from the
// training files located in the directory path. For every base name in the
// built-in list it reads "<name>.txt" (the text itself) and "<name>.json"
// (the annotated names of that text) and stores both under the base name.
// The texts either contain no names at all or do contain names.
//
// Every read or decode error is passed to util.Check.
func NewTrainingData(path string) TrainingData {
	// The additional corpora "phyto1", "phyto2", "zoo1", "zoo2", "zoo3" and
	// "zoo4" are disabled for now.
	baseNames := []string{"no_names", "names"}

	result := make(TrainingData)
	for _, base := range baseNames {
		rawText, err := ioutil.ReadFile(
			filepath.Join(path, fmt.Sprintf("%s.txt", base)))
		util.Check(err)

		rawNames, err := ioutil.ReadFile(
			filepath.Join(path, fmt.Sprintf("%s.json", base)))
		util.Check(err)

		var positions NamesPositions
		err = jsoniter.NewDecoder(bytes.NewReader(rawNames)).Decode(&positions)
		util.Check(err)

		result[FileName(base)] = &TextData{
			Text:           []rune(string(rawText)),
			NamesPositions: positions,
		}
	}
	return result
}
// processTrainingData runs processText over every training text in td and
// concatenates the labeled features it produces. File names are ignored.
// Because td is a map, the order in which texts contribute their features is
// not fixed.
func processTrainingData(td TrainingData,
	d *dict.Dictionary) []bayes.LabeledFeatures {
	var collected []bayes.LabeledFeatures
	for name := range td {
		collected = append(collected, processText(td[name], d)...)
	}
	return collected
}
// processText tokenizes one training text, tags its tokens with
// heuristic.TagTokens and collects labeled features for it.
//
// The annotated names are consumed one at a time: each getFeatures call
// resumes at the token index returned by the previous call and receives the
// next annotation. The loop ends when getFeatures reports that the tokens are
// exhausted (index -1) or when the last annotation has been handed over, so
// tokens located after the last annotated name are not visited. With no
// annotations at all, getFeatures is called once with an empty NameData.
func processText(t *TextData, d *dict.Dictionary) []bayes.LabeledFeatures {
	tokens := token.Tokenize(t.Text)
	heuristic.TagTokens(tokens, d, util.NewModel())

	var (
		features []bayes.LabeledFeatures
		current  NameData
	)
	total := len(t.NamesPositions)
	resumeAt := 0
	for n := 0; ; n++ {
		if total > 0 {
			current = t.NamesPositions[n]
		}
		var batch []bayes.LabeledFeatures
		resumeAt, batch = getFeatures(resumeAt, tokens, &current)
		features = append(features, batch...)
		if n+1 == total || resumeAt == -1 {
			break
		}
	}
	return features
}
// getFeatures walks the tokens starting at index i and produces labeled
// features for every capitalized token; other tokens are skipped. Tokens met
// before the name described by nd are labeled NotName. The first capitalized
// token whose End lies beyond nd.Start is labeled Name (only when nd.Name is
// not empty), and the function returns right after it.
//
// It returns the index of the token to continue from together with the
// features collected during this call. The index is -1 when the end of ts was
// reached without finding the name.
func getFeatures(i int, ts []token.Token,
	nd *NameData) (int, []bayes.LabeledFeatures) {
	var collected []bayes.LabeledFeatures
	count := len(ts)
	for pos := i; pos < count; pos++ {
		tok := &ts[pos]
		if !tok.Capitalized {
			continue
		}

		// The feature set is built from the tokens starting at the current
		// one and ending at the bound that util.UpperIndex provides.
		window := ts[pos:util.UpperIndex(pos, count)]
		fs := NewFeatureSet(window)

		isName := nd.Name != "" && tok.End > nd.Start
		label := NotName
		if isName {
			label = Name
		}
		collected = append(collected, bayes.LabeledFeatures{
			Features: fs.Flatten(),
			Label:    label,
		})
		if isName {
			return pos + 1, collected
		}
	}
	return -1, collected
}