-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.go
299 lines (247 loc) · 7.51 KB
/
config.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
package config
import (
"fmt"
"os"
"path/filepath"
"github.com/gnames/gnfmt"
"github.com/gnames/gnsys"
"github.com/rs/zerolog/log"
)
// Config contains data needed for BHLnames functionality. Use New to create
// a Config with default values and to derive all file and directory paths
// from InputDir.
type Config struct {
	// BHLDumpURL contains the URL of the Biodiversity Heritage Library
	// dump files. These files provide metadata necessary for connecting
	// name occurrences with BHL pages.
	BHLDumpURL string

	// BHLNamesURL provides the URL to BHLindex data. This data provides name
	// occurrences and verifications. Together with data from BHL dumps it
	// allows connecting a name to pages in BHL.
	BHLNamesURL string

	// CoLDataURL provides a URL to the Catalogue of Life data in Darwin Core
	// format.
	CoLDataURL string

	// DbHost provides an IP or host name where PostgreSQL is located. The
	// database is used as the major data store for the project.
	DbHost string

	// DbUser is the username in the PostgreSQL database. The user must have
	// write permissions to the database.
	DbUser string

	// DbPass is the password for DbUser.
	DbPass string

	// DbDatabase is the name of the database to keep BHLnames data. By default
	// it is `bhlnames`.
	DbDatabase string

	// JobsNum provides the concurrency value for finding references that
	// contain specified names. New sets it to 4 by default.
	JobsNum int

	// PortREST is used for BHLnames RESTful service port. New sets it to 8888
	// by default.
	PortREST int

	// Format determines the format of the output data. New sets it to
	// gnfmt.CSV by default.
	Format gnfmt.Format

	// Delimiter allows setting a delimiter for ingesting input CSV files.
	// These files contain names and other metadata to use for matching names
	// and citations to BHL pages. New sets it to ',' by default.
	Delimiter rune

	// WithSynonyms determines whether synonyms of a name are provided in the
	// output.
	WithSynonyms bool

	// WithRebuild determines if BHL dump data need to be downloaded again, or
	// the data from the local cache can be used. If `true` then the local
	// cache is ignored and data is downloaded from BHLDumpURL.
	WithRebuild bool

	// WithCoLRecalc indicates that the tables of CoL nomenclatural events
	// will be emptied, and CoL nomenclatural data will be reimported
	// before linking to BHL data. New forces it to `true` when WithRebuild
	// is set.
	WithCoLRecalc bool

	// SortDesc determines the order of sorting the output data. If `true`
	// data are sorted by year from latest to earliest. If `false` then from
	// earliest to latest.
	SortDesc bool

	// WithShortenedOutput determines if reference details will be provided.
	// If it is `true`, found references are not provided, only the metadata
	// about them.
	WithShortenedOutput bool

	// InputDir provides the `root` directory where all the BHLnames files are
	// created. All the paths below are derived from it by New.
	InputDir string

	// DownloadBHLFile provides the path where the BHL dump compressed file
	// will be stored.
	DownloadBHLFile string

	// DownloadNamesFile provides the path where the BHLindex names compressed
	// file (downloaded from BHLNamesURL) will be stored.
	DownloadNamesFile string

	// DownloadCoLFile provides the path where the CoL DwCA compressed file
	// will be stored.
	DownloadCoLFile string

	// DownloadDir is the directory where BHLnames extracts data from
	// the BHL dump.
	DownloadDir string

	// PageDir provides the directory where BHLnames keeps a key-value database
	// for pages information. We do not have the file name of a page connected
	// to the page ID in the BHL data dump. So we have to calculate this ID by
	// using the page sequence in a title. We find out the page ID by
	// concatenation of "FileNum|TitleID" fields.
	//
	// This key-value store is generated using the data dump from the BHL
	// database.
	PageDir string

	// PageFileDir provides the directory of a key-value store database that
	// connects BHL's PageID to the page's file name in the BHL corpus
	// directory structure.
	//
	// It is generated using the bhlindex page dump and the key-value store
	// from PageDir.
	PageFileDir string

	// PartDir is another key-value database to keep data about BHL's `parts`.
	// A `part` is usually a distinct entity in an `item`, for example it can
	// be a scientific paper.
	PartDir string

	// AhoCorasickDir provides a directory where the Aho-Corasick algorithm
	// stores its cached data.
	AhoCorasickDir string

	// AhoCorKeyValDir provides a directory to keep a key-value store used by
	// the Aho-Corasick library.
	AhoCorKeyValDir string
}
// Option is a function that modifies a Config. Options are passed to New,
// which applies them in order on top of the default BHLnames settings.
type Option func(*Config)
// OptBHLDumpURL returns an Option that sets Config.BHLDumpURL to s.
func OptBHLDumpURL(s string) Option {
	return func(c *Config) { c.BHLDumpURL = s }
}
// OptBHLNamesURL returns an Option that sets Config.BHLNamesURL to s.
func OptBHLNamesURL(s string) Option {
	return func(c *Config) { c.BHLNamesURL = s }
}
// OptCoLDataURL returns an Option that sets Config.CoLDataURL to s.
func OptCoLDataURL(s string) Option {
	return func(c *Config) { c.CoLDataURL = s }
}
// OptWithCoLRecalc returns an Option that sets Config.WithCoLRecalc to b.
// Note that New overrides this flag to true whenever WithRebuild is set.
func OptWithCoLRecalc(b bool) Option {
	return func(c *Config) { c.WithCoLRecalc = b }
}
// OptDelimiter returns an Option that sets Config.Delimiter, the delimiter
// used for input CSV files, to r.
func OptDelimiter(r rune) Option {
	return func(c *Config) { c.Delimiter = r }
}
// OptInputDir returns an Option that sets Config.InputDir to s after passing
// it through gnsys.ConvertTilda (presumably expanding a leading `~` to the
// user's home directory -- confirm against gnsys).
//
// If the conversion fails, the error is logged at the fatal level, which in
// zerolog terminates the program; InputDir is not meant to be set from an
// unresolvable path.
func OptInputDir(s string) Option {
	return func(cfg *Config) {
		dir, err := gnsys.ConvertTilda(s)
		if err != nil {
			// Use plain %w: the `#` flag would render the wrapped error in
			// Go-syntax form (as %#v does), producing an unreadable message.
			err = fmt.Errorf("config.OptInputDir: %w", err)
			log.Fatal().Err(err).Msg("OptInputDir")
		}
		cfg.InputDir = dir
	}
}
// OptDbHost returns an Option that sets Config.DbHost to s.
func OptDbHost(s string) Option {
	return func(c *Config) { c.DbHost = s }
}
// OptDbUser returns an Option that sets Config.DbUser to s.
func OptDbUser(s string) Option {
	return func(c *Config) { c.DbUser = s }
}
// OptDbPass returns an Option that sets Config.DbPass to s.
func OptDbPass(s string) Option {
	return func(c *Config) { c.DbPass = s }
}
// OptDbName returns an Option that sets Config.DbDatabase, the name of the
// database, to s.
func OptDbName(s string) Option {
	return func(c *Config) { c.DbDatabase = s }
}
// OptFormat returns an Option that sets Config.Format, the output format,
// to f.
func OptFormat(f gnfmt.Format) Option {
	return func(c *Config) { c.Format = f }
}
// OptWithRebuild returns an Option that sets Config.WithRebuild to b. When
// the flag ends up true, New also turns on WithCoLRecalc.
func OptWithRebuild(b bool) Option {
	return func(c *Config) { c.WithRebuild = b }
}
// OptJobsNum returns an Option that sets Config.JobsNum to i. The value is
// stored as given; no range validation is performed here.
func OptJobsNum(i int) Option {
	return func(c *Config) { c.JobsNum = i }
}
// OptSortDesc returns an Option that sets Config.SortDesc to b.
func OptSortDesc(b bool) Option {
	return func(c *Config) { c.SortDesc = b }
}
// OptShort returns an Option that sets Config.WithShortenedOutput to b. It
// writes the same field as OptWithShortenedOutput.
func OptShort(b bool) Option {
	return func(c *Config) { c.WithShortenedOutput = b }
}
// OptWithSynonyms returns an Option that sets Config.WithSynonyms to b.
func OptWithSynonyms(b bool) Option {
	return func(c *Config) { c.WithSynonyms = b }
}
// OptPortREST returns an Option that sets Config.PortREST to i. Zero and
// negative values are ignored, leaving the previously configured port
// unchanged.
func OptPortREST(i int) Option {
	return func(c *Config) {
		if i <= 0 {
			// Not a usable port number; keep the existing value.
			return
		}
		c.PortREST = i
	}
}
// OptWithShortenedOutput returns an Option that sets
// Config.WithShortenedOutput to b.
func OptWithShortenedOutput(b bool) Option {
	return func(c *Config) { c.WithShortenedOutput = b }
}
// InputDir returns the default root directory for BHLnames files: a
// `bhlnames` subdirectory of the user's cache directory. If the user cache
// directory cannot be determined, the system temporary directory is used as
// the parent instead.
func InputDir() string {
	const appDir = "bhlnames"

	if cacheRoot, err := os.UserCacheDir(); err == nil {
		return filepath.Join(cacheRoot, appDir)
	}
	return filepath.Join(os.TempDir(), appDir)
}
// New creates a Config filled with default values, applies the given options
// in the order they are supplied, and finally derives all download, cache and
// key-value store paths from the resulting InputDir.
func New(opts ...Option) Config {
	res := Config{
		// Remote data sources.
		BHLDumpURL:  "http://opendata.globalnames.org/dumps/bhl-data.zip",
		BHLNamesURL: "http://opendata.globalnames.org/dumps/bhlindex-latest.zip",
		CoLDataURL:  "https://api.checklistbank.org/dataset/3LR/export?format=dwca",

		// Database connection.
		DbHost:     "0.0.0.0",
		DbUser:     "postgres",
		DbPass:     "postgres",
		DbDatabase: "bhlnames",

		// Input and output settings.
		InputDir:            InputDir(),
		Delimiter:           ',',
		Format:              gnfmt.CSV,
		JobsNum:             4,
		PortREST:            8888,
		WithSynonyms:        true,
		WithRebuild:         false,
		SortDesc:            false,
		WithShortenedOutput: false,
	}

	for i := range opts {
		opts[i](&res)
	}

	// A rebuild always implies that CoL data is recalculated as well.
	if res.WithRebuild {
		res.WithCoLRecalc = true
	}

	// Paths are computed after the options run, so that a custom InputDir is
	// reflected in every derived location.
	inRoot := func(name string) string {
		return filepath.Join(res.InputDir, name)
	}
	res.DownloadBHLFile = inRoot("bhl-data.zip")
	res.DownloadNamesFile = inRoot("bhlindex-latest.zip")
	res.DownloadCoLFile = inRoot("col.zip")
	res.DownloadDir = inRoot("Data")
	res.PageDir = inRoot("page")
	res.PageFileDir = inRoot("page-file")
	res.PartDir = inRoot("part")
	res.AhoCorasickDir = inRoot("ac")
	res.AhoCorKeyValDir = inRoot("ackv")

	return res
}