// Source: urls.go (forked from ampproject/amppackager).
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Utilities related to handling of URLs in AMP.
package amphtml
import (
"crypto/sha256"
"encoding/base32"
"net/url"
"strconv"
"strings"
ampurl "github.com/ampproject/amppackager/internal/url"
"github.com/pkg/errors"
"golang.org/x/net/idna"
)
// sameURLIgnoringFragment is a helper for ToAbsoluteURL. It reports whether
// the serialized form of |u| equals |base|, allowing |u| to carry one
// additional fragment component that |base| does not have.
func sameURLIgnoringFragment(base string, u *url.URL) bool {
	expected := base
	// Only append "#fragment" when there is a fragment to append; the empty
	// fragment case is compared without a trailing "#" because of
	// https://github.com/golang/go/issues/29603.
	if u.Fragment != "" {
		expected += "#" + u.Fragment
	}
	return expected == ampurl.String(u)
}
// isProtocolRelative is a mostly correct check for protocol relative inputs
// (ex: "//foo.com/"). Leading whitespace and control characters (every rune
// at or below U+0020) are skipped, and the remainder must begin with "//".
func isProtocolRelative(urlParam string) bool {
	for i, r := range urlParam {
		if r > 0x20 {
			// First significant rune found: the decision rests entirely on
			// what the string looks like from here on.
			return strings.HasPrefix(urlParam[i:], "//")
		}
	}
	// Empty, or made up solely of skippable characters.
	return false
}
// ToAbsoluteURL absolute-ifies |urlParam|, resolving it against |baseURL|
// when it is relative.
//
// The result is:
//   - "" when |urlParam| is empty;
//   - |urlParam| unchanged when it cannot be parsed, or when the resolved URL
//     is neither http nor https (data:, mailto:, ...);
//   - just "#fragment" when |urlParam| has a fragment and its absolute form
//     matches |documentURL|, so that an in-document navigation is not turned
//     into an out-of-document navigation;
//   - the serialized absolute URL otherwise.
func ToAbsoluteURL(documentURL string, baseURL *url.URL,
	urlParam string) string {
	if len(urlParam) == 0 {
		return ""
	}

	ref, parseErr := url.Parse(urlParam)
	if parseErr != nil {
		// TODO(gregable): Should we strip this URL instead (ie: return "").
		return urlParam
	}

	// The Go URL parser is strict, whereas reference resolution here should
	// be non-strict as per the algorithm in
	// https://tools.ietf.org/html/rfc3986#section-5.2.2
	// In short: a reference sharing the base's scheme has the scheme cleared
	// so that it is treated as a truly relative URL.
	if ref.Scheme == baseURL.Scheme {
		ref.Scheme = ""
	}

	// A reference with a different scheme but no authority borrows the base's
	// authority. This is not compliant with RFC 3986 Section 5, but it is
	// what the Chrome browser does. See b/124445904 .
	if ref.Host == "" && ref.Scheme != "" {
		ref.Host = baseURL.Host
	}

	resolved := baseURL.ResolveReference(ref)

	switch resolved.Scheme {
	case "http", "https":
		// Web URLs continue on to be rewritten below.
	default:
		// TODO(gregable): We should probably assemble data: / mailto: / etc
		// URLs, which will force them to be URL encoded, but this was left to
		// maintain the old behavior for now.
		return urlParam
	}

	// A protocol relative URL (ex: "//foo.com/") names the host but not the
	// protocol. For b/27292423 this is taken as a hint that the resource is
	// available over https even if the resolved scheme was http, so https is
	// always preferred in that case.
	if isProtocolRelative(urlParam) {
		resolved.Scheme = "https"
	}

	// Avoid rewriting a local fragment such as "#top" to a remote absolute
	// URL of "http://example.com/#top" if it wasn't a remote URL already.
	// Empty fragments (ex: href="#") are not supported by net/url
	// (https://github.com/golang/go/issues/29603), so they are detected
	// heuristically via the "#" prefix.
	maybeFragment := resolved.Fragment != "" || strings.HasPrefix(urlParam, "#")
	if maybeFragment && sameURLIgnoringFragment(documentURL, resolved) {
		return "#" + resolved.Fragment
	}
	return ampurl.String(resolved)
}
// SubresourceType describes the type of a subresource. GetCacheURL uses it to
// choose the path prefix of the generated AMP Cache URL.
type SubresourceType int8
const (
// ImageType is a subresource for an image. It is the zero value of
// SubresourceType.
ImageType SubresourceType = iota
// OtherType is a subresource for everything besides an image.
OtherType
)
// SubresourceOffset describes the location of a subresource URL within some
// text, as the half-open range [Start, End) used to slice that text.
// For example, if the text value is ".a {background-image:url(foo.jpg)}", then
// Start == 25 and End == 32.
type SubresourceOffset struct {
// SubType is the kind of subresource (image or other) the URL refers to.
SubType SubresourceType
// The offset position denoting the start of the substring (inclusive).
Start int
// The offset position denoting the end of the substring (exclusive).
End int
// If the type is an image, an optional width to convert the image to.
// GetCacheURL only honors it when it is greater than zero.
DesiredImageWidth int
}
// CacheURL represents an AMP Cache URL: a URL plus the publisher's subdomain
// within the cache and an optional image-candidate descriptor.
type CacheURL struct {
Subdomain string // Publisher's subdomain within the cache, e.g. "example-com".
descriptor string // Optional descriptor (used for image candidates), representing width or pixel density; String appends it after a space.
// The underlying URL. It is embedded, so its fields and methods (Path,
// Hostname, ...) are available directly on CacheURL.
*url.URL
}
// OriginDomain returns the https scheme and the host name of the publisher's
// subdomain on the AMP Cache, ignoring any path info.
func (c *CacheURL) OriginDomain() string {
	host := c.Subdomain + "." + AMPCacheHostName
	return "https://" + host
}
// String reassembles the URL into a URL string. When a descriptor is set it
// is appended to the serialized URL, separated by a single space.
func (c *CacheURL) String() string {
	rendered := ampurl.String(c.URL)
	if c.descriptor == "" {
		return rendered
	}
	return rendered + " " + c.descriptor
}
// IsCacheURL returns true if the given string is a URL whose host is the AMP
// Cache domain itself or a subdomain of it. This check is overly simplistic
// and does no actual verification that the URL resolves (doesn't 404), nor if
// the URL is of the correct format for the resource type (image, or
// otherwise).
//
// Inputs that fail to parse as a URL are reported as not being cache URLs.
func IsCacheURL(input string) bool {
	u, err := url.Parse(input)
	if err != nil {
		return false
	}
	host := u.Hostname()
	// Match on a DNS label boundary. A bare suffix test would also accept an
	// unrelated host that merely ends with the cache host name (for example
	// "evil"+AMPCacheHostName), which is not a subdomain of the cache.
	return host == AMPCacheHostName || strings.HasSuffix(host, "."+AMPCacheHostName)
}
// GetCacheURL returns an AMP Cache URL structure for the URL identified by
// the given offset (relative to 'input').
//
// The substring input[so.Start:so.End] is made absolute with ToAbsoluteURL
// (using documentURL and base) and then rewritten onto the AMP Cache:
//   - the path becomes "<prefix>[s/]<origin host name><origin path>", where
//     the prefix is "/r/" for non-image subresources, "/i/" for images and
//     "/ii/w<width>/" for images with a positive DesiredImageWidth, and "s/"
//     is inserted only for https origins;
//   - the scheme is forced to https;
//   - the host becomes "<subdomain>." + AMPCacheHostName.
//
// A URL that already points at the AMP Cache is returned as-is, with only
// Subdomain filled in.
//
// An error is returned if the offset does not lie within input, if the URL is
// empty or cannot be parsed, or if its scheme is neither http nor https.
func (so *SubresourceOffset) GetCacheURL(documentURL string, base *url.URL,
	input string) (*CacheURL, error) {
	// Reject offsets that do not describe a valid range of input: slicing
	// with them would panic, and this function already reports failures
	// through its error result.
	if so.Start < 0 || so.Start > so.End || so.End > len(input) {
		return nil, errors.New("subresource offset out of range")
	}
	urlStr := input[so.Start:so.End]
	absolute := ToAbsoluteURL(documentURL, base, urlStr)
	if len(absolute) == 0 {
		return nil, errors.New("unable to convert empty URL string")
	}
	origURL, err := url.Parse(absolute)
	if err != nil {
		return nil, errors.Wrap(err, "error parsing URL")
	}

	secureInfix := ""
	switch origURL.Scheme {
	case "https":
		// Add the secure infix
		secureInfix = "s/"
	case "http":
		// Supported
	default:
		// Unsupported scheme
		return nil, errors.New("unsupported scheme")
	}

	c := CacheURL{URL: origURL}
	// simplistic idempotent check
	if IsCacheURL(absolute) {
		c.Subdomain = strings.TrimSuffix(c.Hostname(), "."+AMPCacheHostName)
		return &c, nil
	}

	prefix := "/r/"
	if so.SubType == ImageType {
		prefix = "/i/"
		if so.DesiredImageWidth > 0 {
			wStr := strconv.Itoa(so.DesiredImageWidth)
			prefix = "/ii/w" + wStr + "/"
		}
	}
	// The origin host name must be read before Host is overwritten below,
	// both for the rewritten path and for the derived subdomain.
	c.Path = prefix + secureInfix + c.Hostname() + c.Path
	c.Scheme = "https"
	c.Subdomain = ToCacheURLSubdomain(c.Hostname())
	c.Host = c.Subdomain + "." + AMPCacheHostName
	return &c, nil
}
// ToCacheURLSubdomain converts an origin domain name to a dot-free
// human-readable string that can be combined with an AMP Cache domain to
// identify the publisher's subdomain within that cache. If problems are
// encountered, it falls back to a one-way hash of the origin host.
//
// The conversion:
//  1. Converts the origin domain from IDN (Punycode) to UTF-8.
//  2. Replaces every "-" (dash) with "--" (2 dashes).
//  3. Replaces every "." (dot) with a "-" (dash).
//  4. Converts back to IDN (Punycode).
//
// For example, if the origin is www.example.com, this returns www-example-com.
// On Google's AMP Cache, this will be prepended to the Google cache domain
// resulting in www-example-com.cdn.ampproject.org .
// See https://developers.google.com/amp/cache/overview for more info
func ToCacheURLSubdomain(originHost string) string {
	p := idna.New(idna.MapForLookup(), idna.VerifyDNSLength(true), idna.Transitional(true), idna.BidiRule())

	decoded, err := p.ToUnicode(originHost)
	if err != nil {
		return fallbackCacheURLSubdomain(originHost)
	}

	// Steps 2 and 3 in a single pass over the decoded name.
	var sb strings.Builder
	for _, r := range decoded {
		if r == '.' {
			sb.WriteRune('-')
		} else if r == '-' {
			sb.WriteString("--")
		} else {
			sb.WriteRune(r)
		}
	}
	mapped := sb.String()

	encoded, err := p.ToASCII(mapped)
	// A mapped name without any dash means the origin had neither a dot nor
	// a dash; such names, like names that fail to encode, use the hash.
	if err != nil || !strings.ContainsRune(mapped, '-') {
		return fallbackCacheURLSubdomain(originHost)
	}
	return encoded
}
// fallbackCacheURLSubdomain returns a one-way hash of originHost that is
// usable as a cache subdomain: the lower-cased, unpadded base32 encoding of
// the SHA-256 digest of the host, which is always 52 characters long.
func fallbackCacheURLSubdomain(originHost string) string {
	digest := sha256.Sum256([]byte(originHost))
	encoded := base32.StdEncoding.EncodeToString(digest[:])
	// A 32-byte digest encodes to 52 significant characters followed by
	// "====" padding; the padding is dropped because "=" is not legal in a
	// domain name.
	const unpaddedLen = 52
	return strings.ToLower(encoded[:unpaddedLen])
}