/
gndoc.go
118 lines (103 loc) · 2.51 KB
/
gndoc.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package gndoc
import (
"context"
"fmt"
"io"
"os"
"strings"
"time"
"github.com/gnames/gndoc/io/gnhttp"
"github.com/gnames/gnsys"
"github.com/google/go-tika/tika"
)
// timeout is the deadline applied to the context that GetText passes to the
// Tika client for a single parse request.
var timeout = time.Second * 15
// gndoc is the concrete type that New returns as a GNdoc. It bundles the
// Tika client used for conversions with the text kept for later retrieval.
type gndoc struct {
// tclient is the Tika client that GetText uses to parse its input.
tclient *tika.Client
// text is the text stored on the receiver (GetText sets it after a
// successful parse); it is the value returned by Text.
text string
}
// New builds a GNdoc whose conversions go through a Tika client pointed at
// tikaURL. nil is passed as the client's first argument, so no custom HTTP
// client is supplied.
func New(tikaURL string) GNdoc {
	doc := new(gndoc)
	doc.tclient = tika.NewClient(nil, tikaURL)
	return doc
}
// TextFromFile reads the file at path and returns its content as text
// together with the elapsed time in seconds.
//
// When plainInput is true the bytes of the file are returned as they are,
// without involving Tika. Otherwise the opened file is handed to GetText for
// conversion. In both cases the resulting text is stored on the receiver, so
// a following call to Text returns it.
//
// On any failure (the existence check fails, the file does not exist or
// cannot be opened, reading or conversion fails) the returned text is empty,
// the elapsed time is 0 and the error is non-nil.
func (d *gndoc) TextFromFile(
	path string,
	plainInput bool,
) (string, float32, error) {
	start := time.Now()
	// elapsed reports the seconds passed since the call began.
	elapsed := func() float32 {
		return float32(time.Since(start)) / float32(time.Second)
	}

	exists, err := gnsys.FileExists(path)
	if err != nil {
		return "", 0, err
	}
	if !exists {
		return "", 0, fmt.Errorf("file '%s' does not exist", path)
	}

	f, err := os.Open(path)
	if err != nil {
		return "", 0, err
	}
	defer f.Close()

	if plainInput {
		bs, err := io.ReadAll(f)
		if err != nil {
			return "", 0, err
		}
		// Remember the plain text as well, so Text() agrees with what was
		// just returned; GetText already does this for converted input.
		d.text = string(bs)
		return d.text, elapsed(), nil
	}

	txt, err := d.GetText(f)
	if err != nil {
		return "", 0, err
	}
	return txt, elapsed(), nil
}
// TextFromURL fetches the resource at url and converts it to plain text.
//
// Only responses whose reported MIME type contains "text/html" are accepted;
// anything else is rejected with an error. On success it returns the
// converted text, the time in seconds spent on fetching and converting, and
// a nil error. On failure it returns an empty string, a zero duration and
// the error.
func (d *gndoc) TextFromURL(url string) (string, float32, error) {
	began := time.Now()

	h := gnhttp.New()
	_, mime, body, err := h.Get(url)
	if err != nil {
		return "", 0, err
	}

	if isHTML := strings.Contains(mime, "text/html"); !isHTML {
		return "", 0, fmt.Errorf("not an HTML text: %s", mime)
	}

	converted, err := d.GetText(body)
	if err != nil {
		return "", 0, err
	}

	secs := float32(time.Since(began)) / float32(time.Second)
	return converted, secs, nil
}
// GetText sends the content of input (for example an opened file) to the
// Tika client and returns the text it produces. The request runs under a
// context that expires after the package-level timeout.
//
// After a successful parse the text is also stored on the receiver, where
// Text can return it later. When parsing fails the stored text is left
// untouched and whatever the client returned is passed back with the error.
func (d *gndoc) GetText(input io.Reader) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	parsed, err := d.tclient.Parse(ctx, input)
	if err != nil {
		return parsed, err
	}

	d.text = parsed
	return parsed, nil
}
// Text returns the text currently stored on the receiver, for example the
// result of an earlier successful GetText call. It performs no conversion
// itself and returns an empty string when nothing has been stored yet.
func (d *gndoc) Text() string {
return d.text
}