/
stemskv.go
127 lines (115 loc) · 3.04 KB
/
stemskv.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
package trie
import (
"bytes"
"database/sql"
"encoding/gob"
"io/ioutil"
"github.com/dgraph-io/badger/v2"
mlib "github.com/gnames/gnlib/domain/entity/matcher"
"github.com/gnames/gnlib/sys"
log "github.com/sirupsen/logrus"
)
// initStemsKV creates a key-value store that maps every stem to the
// gob-encoded slice of mlib.MatchItem values (canonical ID and canonical
// name) that share that stem.
//
// path is the directory of the store; it is created when missing. When the
// directory already contains files the store is considered built and the
// function returns without modifying it. db is queried for all
// stem/canonical combinations. Every failure terminates the program through
// log.Fatal/log.Fatalf.
func initStemsKV(path string, db *sql.DB) {
	if err := sys.MakeDir(path); err != nil {
		log.Fatalf("Cannot create %s: %s", path, err)
	}
	if keyValExists(path) {
		log.Info("Stems key-value store already exists, skipping.")
		return
	}
	kv := connectKeyVal(path)
	defer kv.Close()

	// Rows have to arrive ordered by stem: the loop below folds consecutive
	// rows with the same stem into one value. Ordering by "name" would sort by
	// the canonical name (the output column called "name"), which can split a
	// stem into several runs so that a later run overwrites an earlier one.
	q := `SELECT s.name as name_stem, c.name, c.id
FROM canonical_stems s
JOIN name_strings ns
ON ns.canonical_stem_id = s.id
JOIN canonicals c
ON ns.canonical_id = c.id
GROUP BY c.name, c.id, s.name
ORDER BY name_stem`
	rows, err := db.Query(q)
	if err != nil {
		log.Fatalf("Cannot get stems from DB: %s.", err)
	}
	defer rows.Close()

	kvTxn := kv.NewTransaction(true)
	// Discard is safe after Commit and releases whichever transaction is
	// current when the function returns.
	defer func() { kvTxn.Discard() }()

	// pending counts the keys set since the last commit.
	pending := 0

	// storeStem saves the gob-encoded items under stem and commits the
	// transaction in batches so that a single transaction does not grow
	// without bound.
	storeStem := func(stem string, items []mlib.MatchItem) {
		var b bytes.Buffer
		if err := gob.NewEncoder(&b).Encode(items); err != nil {
			log.Fatalf("Cannot marshal canonicals: %s.", err)
		}
		if err := kvTxn.Set([]byte(stem), b.Bytes()); err != nil {
			log.Fatalf("Transaction failed to set key: %s.", err)
		}
		pending++
		if pending > 10_000 {
			if err := kvTxn.Commit(); err != nil {
				log.Fatalf("Transaction commit faied: %s.", err)
			}
			pending = 0
			kvTxn = kv.NewTransaction(true)
		}
	}

	var stemRes []mlib.MatchItem
	var currentStem, stem, name, id string
	for rows.Next() {
		if err := rows.Scan(&stem, &name, &id); err != nil {
			log.Fatalf("Cannot read stem data from query: %s.", err)
		}
		// An empty currentStem means that no group has been started yet.
		if currentStem == "" {
			currentStem = stem
		}
		if stem != currentStem {
			// The stem changed: the collected group is complete.
			storeStem(currentStem, stemRes)
			currentStem = stem
			stemRes = nil
		}
		stemRes = append(stemRes, mlib.MatchItem{ID: id, MatchStr: name})
	}
	// rows.Next also returns false on failure, so distinguish that from a
	// normal end of the result set.
	if err := rows.Err(); err != nil {
		log.Fatalf("Cannot read stem data from query: %s.", err)
	}

	// A group is stored only when the next stem shows up, therefore the last
	// group is still unsaved when the rows run out.
	if len(stemRes) > 0 {
		storeStem(currentStem, stemRes)
	}

	if err := kvTxn.Commit(); err != nil {
		log.Fatal(err)
	}
}
// connectKeyVal opens the badger key-value store located in the path
// directory and returns its handle. Badger's own logger is switched off.
// When the store cannot be opened the failure is reported with log.Fatalf.
func connectKeyVal(path string) *badger.DB {
	// For an in-memory store use badger.DefaultOptions("").WithInMemory(true).
	opts := badger.DefaultOptions(path)
	opts.Logger = nil

	store, openErr := badger.Open(opts)
	if openErr != nil {
		log.Fatalf("Cannot connect to key-value store: %s.", openErr)
	}
	return store
}
// getValue looks up key in the key-value store kv inside a read-only
// transaction. It returns a copy of the bytes stored under that key, or nil
// when the key is absent. Any other lookup failure is reported with
// log.Fatal.
func getValue(kv *badger.DB, key string) []byte {
	var stored []byte

	lookup := func(txn *badger.Txn) error {
		item, getErr := txn.Get([]byte(key))
		if getErr == badger.ErrKeyNotFound {
			// A missing key is not a failure; the result stays nil.
			return nil
		}
		if getErr != nil {
			log.Fatal(getErr)
		}
		return item.Value(func(val []byte) error {
			// Copy the bytes so they can be used after the callback returns.
			stored = append([]byte{}, val...)
			return nil
		})
	}

	if viewErr := kv.View(lookup); viewErr != nil {
		log.Fatal(viewErr)
	}
	return stored
}
// keyValExists reports whether the directory at path can be read and
// contains at least one entry. An unreadable or missing directory counts as
// "does not exist".
func keyValExists(path string) bool {
	entries, err := ioutil.ReadDir(path)
	if err != nil {
		return false
	}
	return len(entries) != 0
}