/
title.go
88 lines (74 loc) · 1.99 KB
/
title.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
package service
import (
"io"
"net/http"
log "github.com/go-pkgz/lgr"
"github.com/go-pkgz/rest/cache"
"github.com/pkg/errors"
"golang.org/x/net/html"
)
// teMaxCachedRecs is the key limit passed to the memory cache via cache.MaxKeys.
const teMaxCachedRecs = 1000
// TitleExtractor fetches a remote page and returns its html title.
// Lookups go through a cache keyed by the page url.
type TitleExtractor struct {
// client performs the GET request for the page
client http.Client
// cache holds the title bytes loaded for each url
cache cache.LoadingCache
}
// NewTitleExtractor returns a TitleExtractor that loads pages with the given client.
// It asks for a memory cache limited to teMaxCachedRecs keys; if that cache can't be
// created, a warning is logged and cache.Nop is used in its place.
func NewTitleExtractor(client http.Client) *TitleExtractor {
	memCache, err := cache.NewMemoryCache(cache.MaxKeys(teMaxCachedRecs))
	if err != nil {
		log.Printf("[WARN] failed to make cache, %v", err)
		return &TitleExtractor{client: client, cache: &cache.Nop{}}
	}
	return &TitleExtractor{client: client, cache: memCache}
}
// Get returns the title of the page at url.
// The lookup goes through the cache under a "site" key with the url as its ID.
// The loading function handed to the cache downloads the page, requires a 200
// response and extracts the title from the body. Any error reported by the
// cache call is returned together with an empty title.
func (t *TitleExtractor) Get(url string) (string, error) {
	// loader fetches the page and turns its title into the bytes stored by the cache
	loader := func() ([]byte, error) {
		resp, getErr := t.client.Get(url)
		if getErr != nil {
			return nil, errors.Wrapf(getErr, "failed to load page %s", url)
		}
		defer resp.Body.Close() //nolint

		if resp.StatusCode != http.StatusOK {
			return nil, errors.Errorf("can't load page %s, code %d", url, resp.StatusCode)
		}

		if title, found := t.getTitle(resp.Body); found {
			return []byte(title), nil
		}
		return nil, errors.Errorf("can't get title for %s", url)
	}

	data, err := t.cache.Get(cache.NewKey("site").ID(url), loader)
	if err != nil {
		return "", err
	}
	return string(data), nil
}
// getTitle parses r as html and searches the resulting tree for a title element.
// A parse failure is logged as a warning and reported as ("", false).
func (t *TitleExtractor) getTitle(r io.Reader) (string, bool) {
	root, parseErr := html.Parse(r)
	if parseErr == nil {
		return t.traverse(root)
	}
	log.Printf("[WARN] can't get header, %+v", parseErr)
	return "", false
}
// isTitleElement reports whether n is an element node whose tag name is "title".
func (t *TitleExtractor) isTitleElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "title"
}
// traverse walks the tree rooted at n depth-first, in document order, and returns
// the Data of the first child of the first non-empty title element it finds.
// The boolean result is false when no such element exists under n.
func (t *TitleExtractor) traverse(n *html.Node) (string, bool) {
	// An empty <title></title> has no children; dereferencing FirstChild there
	// would panic, so such an element is skipped and the search continues.
	if t.isTitleElement(n) && n.FirstChild != nil {
		return n.FirstChild.Data, true
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if result, ok := t.traverse(c); ok {
			return result, true
		}
	}
	return "", false
}