-
-
Notifications
You must be signed in to change notification settings - Fork 136
/
unescape.go
129 lines (116 loc) · 2.45 KB
/
unescape.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
package html
import "unicode/utf8"
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src.
//
// Numeric references ("&#60;", "&#x3c;") are decoded to their code point. A
// numeric reference whose value is zero or >= 0x10ffff, or that is too short
// to carry a value, is not decoded: only the '&' is copied and both cursors
// advance by one, so the caller copies the rest verbatim.
//
// Of the named references only lt, gt, amp and quot are decoded (the trailing
// ';' is optional); any other name is copied through unchanged.
//
// This is an adaptation of html.UnescapeString from the Go sources.
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
	// Numeric values at or above this bound are rejected. The accumulator is
	// also saturated here so that very long digit runs cannot overflow the
	// int32 rune and wrap around into the accepted range.
	const limit = 0x10ffff

	// i starts at 1 because we already know that s[0] == '&'.
	i, s := 1, b[src:]

	if len(s) <= 1 {
		// A lone trailing '&' is copied as is.
		b[dst] = b[src]
		return dst + 1, src + 1
	}

	if s[i] == '#' {
		if len(s) <= 3 { // We need to have at least "&#.".
			b[dst] = b[src]
			return dst + 1, src + 1
		}
		i++
		c := s[i]
		base := rune(10)
		hex := false
		if c == 'x' || c == 'X' {
			hex = true
			base = 16
			i++
		}

		var x rune
		for i < len(s) {
			c = s[i]
			i++

			digit := rune(-1)
			switch {
			case '0' <= c && c <= '9':
				digit = rune(c - '0')
			case hex && 'a' <= c && c <= 'f':
				digit = rune(c-'a') + 10
			case hex && 'A' <= c && c <= 'F':
				digit = rune(c-'A') + 10
			}
			if digit >= 0 {
				// x <= limit holds before this step, so base*x+digit fits
				// in an int32; clamp afterwards to keep the invariant.
				x = base*x + digit
				if x > limit {
					x = limit
				}
				continue
			}

			// A terminating ';' is consumed; any other byte is left for
			// the caller.
			if c != ';' {
				i--
			}
			break
		}

		if i <= 3 { // Fewer than four bytes consumed, e.g. "&#x" with no hex digits.
			b[dst] = b[src]
			return dst + 1, src + 1
		}

		if x == 0 || x >= limit {
			b[dst] = b[src]
			return dst + 1, src + 1
		}

		return dst + utf8.EncodeRune(b[dst:], x), src + i
	}

	// Consume the maximum number of characters possible, with the
	// consumed characters matching one of the named references.
	for i < len(s) {
		c := s[i]
		i++
		// Lower-cased characters are more common in entities, so we check for them first.
		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
			continue
		}
		if c != ';' {
			i--
		}
		break
	}

	// tagEnd excludes the terminating ';' (if any) from the entity name.
	var x rune
	tagEnd := i
	if i > 0 && s[tagEnd-1] == ';' {
		tagEnd--
	}
	switch string(s[1:tagEnd]) {
	case "lt":
		x = '<'
	case "gt":
		x = '>'
	case "amp":
		x = '&'
	case "quot":
		x = '"'
	}

	if x != 0 {
		return dst + utf8.EncodeRune(b[dst:], x), src + i
	}

	// Unknown name: move the consumed bytes through unchanged.
	dst1, src1 = dst+i, src+i
	copy(b[dst:dst1], b[src:src1])
	return dst1, src1
}
// telegramUnescape implements Telegram BotAPI HTML unescape.
//
// The slice is rewritten in place: everything before the first '&' is left
// untouched, and from there on each '&' is handed to unescapeEntity while all
// other bytes are shifted down to the write cursor. The returned slice is the
// prefix of b that holds the result; if b contains no '&', b itself is
// returned unchanged.
func telegramUnescape(b []byte) []byte {
	// Locate the first ampersand; nothing needs rewriting without one.
	first := -1
	for idx := 0; idx < len(b); idx++ {
		if b[idx] == '&' {
			first = idx
			break
		}
	}
	if first < 0 {
		return b
	}

	// w is the write cursor, r the read cursor.
	w, r := unescapeEntity(b, first, first)
	for r < len(b) {
		ch := b[r]
		if ch != '&' {
			b[w] = ch
			w++
			r++
			continue
		}
		w, r = unescapeEntity(b, w, r)
	}
	return b[:w]
}