/
title.go
117 lines (102 loc) · 2.6 KB
/
title.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
package httpx
import (
"bytes"
"fmt"
"io"
"regexp"
"strings"
"golang.org/x/net/html"
)
// Package-level helpers shared by the title-extraction code in this file.
var (
// cutset lists the control/whitespace characters stripped from both ends of
// an extracted title (newline, tab, vertical tab, form feed, carriage return).
cutset = "\n\t\v\f\r"
// reTitle matches a whole <title ...>...</title> element (case-insensitive).
// ExtractTitle uses it on the raw response when DOM parsing yields no title.
// "." does not match newlines here, so the element must sit on a single line.
reTitle = regexp.MustCompile(`(?im)<\s*title.*>(.*?)<\s*/\s*title>`)
// reContentType captures a charset value declared in the response body.
// Group 1 is the quoted form charset="..."; group 2 is the form charset=..."
// where the value runs up to a closing double quote (as inside a meta
// content="...; charset=..." attribute). Single-quoted and fully unquoted
// declarations are not matched.
reContentType = regexp.MustCompile(`(?im)\s*charset="(.*?)"|charset=(.*?)"\s*`)
)
// ExtractTitle returns the page title found in the response r, or an empty
// string when no title can be located.
//
// The title is taken from the parsed DOM when possible; if that fails, the
// first reTitle match in r.Raw is used instead. The surrounding tags are then
// removed, HTML entities are unescaped, leading/trailing whitespace is trimmed
// and any remaining line breaks are dropped.
//
// When the response carries a Content-Type header and either that header or a
// charset declaration in the body names gb2312/gbk, the title is passed through
// Decodegbk (presumably a GBK-to-UTF-8 conversion; confirm against its
// definition). If decoding fails, the undecoded title is returned.
func ExtractTitle(r *Response) (title string) {
	if titleDom, err := getTitleWithDom(r); err == nil {
		title = renderNode(titleDom)
	} else {
		// Regex fallback: only the first match matters, so do not collect them all.
		// FindString yields "" when nothing matches, which flows through unchanged.
		title = reTitle.FindString(r.Raw)
	}

	title = html.UnescapeString(trimTitleTags(title))

	// Remove unwanted characters: trim the edges, then drop embedded line breaks.
	title = strings.TrimSpace(strings.Trim(title, cutset))
	title = strings.ReplaceAll(title, "\n", "")
	title = strings.ReplaceAll(title, "\r", "")

	// Charset handling is only attempted for responses with a Content-Type header.
	contentTypes, ok := r.Headers["Content-Type"]
	if !ok {
		return title
	}

	// First look at the header itself.
	headerValue := strings.ToLower(strings.Join(contentTypes, ";"))
	needsGBK := strings.Contains(headerValue, "charset=gb2312") ||
		strings.Contains(headerValue, "charset=gbk")

	// Otherwise look for a charset declared inside the document body.
	if !needsGBK {
		var bodyCharset string
		groups := reContentType.FindSubmatch(r.Data)
		// Index 0 is the whole match; of the capture groups, the last non-empty wins.
		for i := 1; i < len(groups); i++ {
			if len(groups[i]) > 0 {
				bodyCharset = string(groups[i])
			}
		}
		bodyCharset = strings.ToLower(bodyCharset)
		needsGBK = strings.Contains(bodyCharset, "gb2312") || strings.Contains(bodyCharset, "gbk")
	}

	if !needsGBK {
		return title
	}

	decoded, err := Decodegbk([]byte(title))
	if err != nil {
		// Best effort: keep the undecoded title rather than dropping it.
		return title
	}
	return string(decoded)
}
// getTitleWithDom parses r.Data as HTML and returns the first <title> element
// in document (depth-first, pre-order) order.
//
// An error is returned when the document cannot be parsed or when it contains
// no <title> element at all.
func getTitleWithDom(r *Response) (*html.Node, error) {
	htmlDoc, err := html.Parse(bytes.NewReader(r.Data))
	if err != nil {
		return nil, err
	}

	// findTitle stops at the first match. Continuing the walk would let a later
	// <title> (for example one inside an inline <svg>) override the document's
	// own title.
	var findTitle func(*html.Node) *html.Node
	findTitle = func(node *html.Node) *html.Node {
		if node.Type == html.ElementNode && node.Data == "title" {
			return node
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			if found := findTitle(child); found != nil {
				return found
			}
		}
		return nil
	}

	if title := findTitle(htmlDoc); title != nil {
		return title, nil
	}
	return nil, fmt.Errorf("title not found")
}
// renderNode serializes the node n (and its subtree) back to HTML markup and
// returns the result as a string.
func renderNode(n *html.Node) string {
	rendered := new(bytes.Buffer)
	var sink io.Writer = rendered
	// The render error is deliberately ignored: whatever was written to the
	// buffer before a failure is still returned to the caller.
	html.Render(sink, n) //nolint
	return rendered.String()
}
// trimTitleTags strips the enclosing tags from a rendered title element,
// turning "<title ...>text</title>" into "text".
//
// It returns the text between the first ">" and the first "</" that follows
// it. If either delimiter is missing, title is returned unchanged.
func trimTitleTags(title string) string {
	titleBegin := strings.Index(title, ">")
	if titleBegin < 0 {
		return title
	}

	// Look for the closing tag only after the opening tag's ">". Searching the
	// whole string could find a "</" that precedes it (e.g. "<title</a>x</title>")
	// and produce an inverted slice range, which panics.
	rest := title[titleBegin+1:]
	titleEnd := strings.Index(rest, "</")
	if titleEnd < 0 {
		return title
	}
	return rest[:titleEnd]
}