/
worker.go
148 lines (134 loc) · 3.61 KB
/
worker.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
package htindex
import (
"archive/zip"
"crypto/sha256"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
"github.com/gnames/gnfinder"
"github.com/gnames/gnfinder/lang"
"github.com/gnames/gnfinder/output"
)
// isPage determines if a file represents a page with text from the title.
var isPage = regexp.MustCompile(`\d{6}\.txt`)
// page allows to presort pages from zip file in case if they are given
// in a wrong order.
type page struct {
id string
text []byte
res *output.Output
}
// title represents data and metadata from a title/book/volume.
type title struct {
id string
sha256 string
path string
pages []page
namesNum int
pagesNumBadNames int
}
type htiError struct {
ts string
titleID string
pageID string
msg string
}
// byID allows to sort page slice using its `id` field.
type byID []page
func (b byID) Len() int { return len(b) }
func (b byID) Less(i, j int) bool { return b[i].id < b[j].id }
func (b byID) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
// worker is the maing workhorse of the app. It reads zip file, extracts
// data from pages, and prepares title data and results of name-finding for
// the output. In case if some errors happened during processing, they will be
// prepared for logging.
func (hti *HTindex) worker(inCh <-chan string, outCh chan<- *title,
errCh chan<- *htiError, wg *sync.WaitGroup) {
defer wg.Done()
opts := []gnfinder.Option{
gnfinder.OptDict(hti.Dict),
gnfinder.OptBayes(true),
gnfinder.OptTokensAround(hti.WordsAround),
gnfinder.OptLanguage(lang.English),
}
gnf := gnfinder.NewGNfinder(opts...)
for zipPath := range inCh {
t := title{id: getID(zipPath), path: zipPath}
path := filepath.Join(hti.RootPrefix, zipPath)
r, err := zip.OpenReader(path)
if err != nil {
errCh <- &htiError{msg: err.Error(), titleID: t.id, ts: ts()}
}
pcs, pagesNumBadNames := pagesContent(r, errCh)
t.pages = make([]page, len(pcs))
if pagesNumBadNames > 0 {
msg := fmt.Sprintf("non-standard naming for %d pages", pagesNumBadNames)
errCh <- &htiError{msg: msg, titleID: t.id, ts: ts()}
}
t.pagesNumBadNames = pagesNumBadNames
if len(pcs) == 0 {
errCh <- &htiError{ts: ts(), titleID: t.id, msg: "no pages detected"}
continue
}
for i, p := range pcs {
t.pages[i].id = p.id
t.pages[i].res = gnf.FindNames(p.text)
t.namesNum += len(t.pages[i].res.Names)
}
r.Close()
t.sha256 = getSHA256(path)
outCh <- &t
}
}
func getSHA256(path string) string {
f, err := os.Open(path)
if err != nil {
return "n/a"
}
defer f.Close()
h := sha256.New()
if _, err := io.Copy(h, f); err != nil {
return "n/a"
}
return fmt.Sprintf("%x", h.Sum(nil))
}
// pagesContent generates a list of all pages with their texts sorted according
// to their position in the title.
func pagesContent(r *zip.ReadCloser, errCh chan<- *htiError) ([]page, int) {
badPageName := 0
var pages []page
for _, f := range r.File {
fn := f.Name
fnl := len(fn)
if fnl < 12 || !isPage.MatchString(fn[fnl-12:fnl]) {
continue
}
zf, err := f.Open()
if err != nil {
errCh <- &htiError{msg: err.Error()}
}
id := fn[fnl-12 : fnl-4]
if !strings.HasPrefix(id, "00") {
badPageName++
}
text, err := ioutil.ReadAll(zf)
if err != nil {
errCh <- &htiError{msg: err.Error()}
}
pages = append(pages, page{id: id, text: text})
zf.Close()
}
sort.Sort(byID(pages))
return pages, badPageName
}
// getID generates the id of a title from its filepath.
func getID(p string) string {
el := strings.Split(p, "/")
return fmt.Sprintf("%s.%s", el[0], el[len(el)-2])
}