forked from richyen/walker
-
Notifications
You must be signed in to change notification settings - Fork 1
/
url.go
249 lines (221 loc) · 7.02 KB
/
url.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
package walker
import (
"bytes"
"fmt"
"net/url"
"regexp"
"strings"
"time"
"code.google.com/p/go.net/publicsuffix"
"github.com/PuerkitoBio/purell"
)
// URL is the walker URL object, which embeds *url.URL but has extra data and
// capabilities used by walker. Note that LastCrawled should not be set to its
// zero value, it should be set to NotYetCrawled.
type URL struct {
	*url.URL

	// LastCrawled is the last time we crawled this URL, for example to use a
	// Last-Modified header. It is NotYetCrawled for links that have never
	// been fetched (see ParseURL).
	LastCrawled time.Time
}
// CreateURL creates a walker URL from values usually pulled out of the
// datastore. subdomain may optionally include a trailing '.', and path may
// optionally include a prefixed '/'.
func CreateURL(domain, subdomain, path, protocol string, lastcrawled time.Time) (*URL, error) {
	// Normalize the subdomain so it always carries a trailing dot when present.
	if subdomain != "" && !strings.HasSuffix(subdomain, ".") {
		subdomain += "."
	}
	// Normalize the path so it always starts with a slash when present.
	if path != "" && !strings.HasPrefix(path, "/") {
		path = "/" + path
	}
	// Assemble the full reference and route it through ParseURL so every
	// walker URL is constructed consistently.
	u, err := ParseURL(protocol + "://" + subdomain + domain + path)
	if err != nil {
		return nil, err
	}
	u.LastCrawled = lastcrawled
	return u, nil
}
// parseURLPathStrip, when non-nil, matches ";<sid>=..." suffixes that
// Normalize strips from URL paths.
var parseURLPathStrip *regexp.Regexp

// parseURLPurgeMap holds the lowercased session-id parameter names that
// Normalize removes from query strings.
var parseURLPurgeMap map[string]bool

// setupNormalizeURL builds the session-id stripping regexp and purge map from
// Config.Fetcher.PurgeSidList. It returns an error if the regexp fails to
// compile.
func setupNormalizeURL() error {
	if len(Config.Fetcher.PurgeSidList) == 0 {
		parseURLPathStrip = nil
	} else {
		// Here we want to write a regexp that looks like
		//      \;jsessionid\=.*$|\;other\=.*$
		alternatives := make([]string, 0, len(Config.Fetcher.PurgeSidList))
		for _, sid := range Config.Fetcher.PurgeSidList {
			// QuoteMeta guards against SID names containing regexp
			// metacharacters, which would otherwise corrupt the pattern.
			alternatives = append(alternatives, `\;`+regexp.QuoteMeta(sid)+`\=.*$`)
		}
		var err error
		// (?i) makes the whole pattern case-insensitive.
		parseURLPathStrip, err = regexp.Compile("(?i)" + strings.Join(alternatives, "|"))
		if err != nil {
			return fmt.Errorf("Failed setupParseURL: %v", err)
		}
	}

	parseURLPurgeMap = make(map[string]bool, len(Config.Fetcher.PurgeSidList))
	for _, p := range Config.Fetcher.PurgeSidList {
		parseURLPurgeMap[strings.ToLower(p)] = true
	}
	return nil
}
// ParseURL is the walker.URL equivalent of url.Parse. Note, all URL's should
// be passed through this function so that we get consistency.
func ParseURL(ref string) (*URL, error) {
	parsed, err := url.Parse(ref)
	if err != nil {
		return nil, err
	}
	// Freshly parsed links start out marked as never crawled.
	return &URL{URL: parsed, LastCrawled: NotYetCrawled}, nil
}
// ParseAndNormalizeURL will walker.ParseURL the argument string,
// and then Normalize the resulting URL.
func ParseAndNormalizeURL(ref string) (*URL, error) {
	u, err := ParseURL(ref)
	if err == nil {
		u.Normalize()
	}
	return u, err
}
// Normalize will process the URL according to the current set of normalizing rules.
func (u *URL) Normalize() {
	raw := u.URL

	// Apply the standard purell normalization filters; this modifies the
	// url in place.
	purell.NormalizeURL(raw, purell.FlagsSafe|purell.FlagRemoveFragment)

	// Strip any embedded session ids from the path.
	if parseURLPathStrip != nil {
		raw.Path = parseURLPathStrip.ReplaceAllString(raw.Path, "")
	}

	// Rewrite the query string into canonical order, removing SID's as
	// needed. Nothing to do when there is no query.
	if raw.RawQuery == "" {
		return
	}
	params := raw.Query()
	for key := range params {
		if parseURLPurgeMap[strings.ToLower(key)] {
			delete(params, key)
		}
	}
	// Encode sorts keys, giving a canonical ordering.
	raw.RawQuery = params.Encode()
}
// Clone will create a copy of this walker.URL
func (u *URL) Clone() *URL {
	copied := *u.URL
	// The User field is a pointer; copy what it points at so the clone
	// does not share mutable state with the original.
	if copied.User != nil {
		user := *copied.User
		copied.User = &user
	}
	return &URL{URL: &copied, LastCrawled: u.LastCrawled}
}
// NormalizedForm returns nil if u is normalized. Otherwise, return the normalized version of u.
func (u *URL) NormalizedForm() *URL {
	// We compare the fields of url.URL below. A few notes:
	// (a) We do not compare the Opaque field, as it doesn't appear links we'll be looking at will use that field.
	// (b) We do not consider the User field (of type Userinfo). You can see where the User field comes into play by
	//     looking at this (from url.URL)
	//         scheme://[userinfo@]host/path[?query][#fragment]
	//     the userinfo information should never be changed by normalization, so it appears there is no need to compare
	//     it.
	norm := u.Clone()
	norm.Normalize()
	if norm.URL.Scheme != u.URL.Scheme ||
		norm.URL.Host != u.URL.Host ||
		norm.URL.Path != u.URL.Path ||
		norm.URL.RawQuery != u.URL.RawQuery ||
		norm.URL.Fragment != u.URL.Fragment {
		// Normalization changed something; hand back the normalized copy.
		return norm
	}
	return nil
}
// ToplevelDomainPlusOne returns the Effective Toplevel Domain of this host as
// defined by https://publicsuffix.org/, plus one extra domain component.
//
// For example the TLD of http://www.bbc.co.uk/ is 'co.uk', plus one is
// 'bbc.co.uk'. Walker uses these TLD+1 domains as the primary unit of
// grouping.
//
// The error, if any, comes straight from the publicsuffix package (e.g. when
// Host itself is a public suffix or cannot be split).
func (u *URL) ToplevelDomainPlusOne() (string, error) {
	return publicsuffix.EffectiveTLDPlusOne(u.Host)
}
// Subdomain provides the remaining subdomain after removing the
// ToplevelDomainPlusOne. For example http://www.bbc.co.uk/ will return 'www'
// as the subdomain (note that there is no trailing period). If there is no
// subdomain it will return "".
func (u *URL) Subdomain() (string, error) {
	dom, err := u.ToplevelDomainPlusOne()
	switch {
	case err != nil:
		return "", err
	case len(u.Host) == len(dom):
		// Host IS the TLD+1, so there is no subdomain part.
		return "", nil
	}
	// Drop the TLD+1 and its separating dot, leaving just the subdomain.
	return strings.TrimSuffix(u.Host, "."+dom), nil
}
// TLDPlusOneAndSubdomain is a convenience function that calls
// ToplevelDomainPlusOne and Subdomain, returning an error if we could not get
// either one.
// The first return is the TLD+1 and second is the subdomain
func (u *URL) TLDPlusOneAndSubdomain() (string, string, error) {
	dom, domErr := u.ToplevelDomainPlusOne()
	if domErr != nil {
		return "", "", domErr
	}
	subdom, subErr := u.Subdomain()
	if subErr != nil {
		return "", "", subErr
	}
	return dom, subdom, nil
}
// PrimaryKey returns the 5 tuple that is the primary key for this url in the links table. The return values
// are (with cassandra keys in parens)
//   (a) Domain (dom)
//   (b) Subdomain (subdom)
//   (c) Path part of url (path)
//   (d) Schema of url (proto)
//   (e) last update time of link (time)
//   (f) any errors that occurred
func (u *URL) PrimaryKey() (dom string, subdom string, path string, proto string, crawlTime time.Time, err error) {
	// NOTE: the time-valued result is named crawlTime (rather than the
	// cassandra key name "time") so it does not shadow the time package
	// inside this function.
	dom, subdom, err = u.TLDPlusOneAndSubdomain()
	if err != nil {
		return "", "", "", "", time.Time{}, err
	}
	return dom, subdom, u.RequestURI(), u.Scheme, u.LastCrawled, nil
}
// MakeAbsolute uses URL.ResolveReference to make this URL object an absolute
// reference (having Schema and Host), if it is not one already. It is
// resolved using `base` as the base URL.
func (u *URL) MakeAbsolute(base *URL) {
	if !u.IsAbs() {
		u.URL = base.URL.ResolveReference(u.URL)
	}
}
// Equal returns true if this link is identical to `other`.
func (u *URL) Equal(other *URL) bool {
	// Both the URL fields and the crawl timestamps must match.
	if !u.EqualIgnoreLastCrawled(other) {
		return false
	}
	return u.LastCrawled.Equal(other.LastCrawled)
}
// EqualIgnoreLastCrawled returns true if the URL portion of this link
// (excluding LastCrawled) is equal to `other`.
//
// NOTE(review): this compares the url.URL structs with ==, so the User field
// (a *url.Userinfo pointer) is compared by pointer identity — two URLs with
// distinct-but-equivalent Userinfo values would compare unequal. Confirm that
// is acceptable for the URLs walker handles.
func (u *URL) EqualIgnoreLastCrawled(other *URL) bool {
	return *u.URL == *other.URL
}