diff --git a/ctrl.go b/ctrl.go index ac537bc..bbb5ed8 100644 --- a/ctrl.go +++ b/ctrl.go @@ -8,11 +8,9 @@ import ( ) type Ticket struct { - At time.Time - Score int - RetryMax int - RetryDelay time.Duration - Ctx context.Context + At time.Time + Score int + Ctx context.Context } // Controller controls the working progress of crawler. @@ -20,8 +18,11 @@ type Controller interface { // Prepare sets options(client, headers, ...) for a http request. Prepare(req *Request) - // Handle handles a response. Handle should also extract hyperlinks - // from the response and send them to the channel. + // Handle handles a response (writing to disk/DB, ...). Handle should + // also extract hyperlinks from the response and send them to the + // channel. Note that r.NewURL may differ from r.URL if r.URL has been + // redirected, so r.NewURL should also be included if following + // redirects is expected. Handle(r *Response, ch chan<- *url.URL) // Accept determines whether a URL should be processed. It is redundant diff --git a/handle.go b/handle.go index edc6a9e..1efe902 100644 --- a/handle.go +++ b/handle.go @@ -64,16 +64,18 @@ func (h *handler) handle(r *Response) error { } ch := make(chan *url.URL, perPage) go func() { - original := r.URL.String() - // Treat the new url as one found under the original url - if r.NewURL.String() != original { - newurl := *r.NewURL - ch <- &newurl - } - if refresh := r.Refresh.URL; refresh != nil && - refresh.String() != original { - newurl := *refresh - ch <- &newurl + if h.cw.opt.FollowRedirect { + // Treat the new URL as one found under the original URL + original := r.URL.String() + if r.NewURL.String() != original { + newurl := *r.NewURL + ch <- &newurl + } + if refresh := r.Refresh.URL; refresh != nil && + refresh.String() != original { + newurl := *refresh + ch <- &newurl + } } h.cw.ctrl.Handle(r, ch) close(ch) diff --git a/option.go b/option.go index 4991834..252a26f 100644 --- a/option.go +++ b/option.go @@ -7,29 +7,21 @@ const ( ) type 
Option struct { - UserAgent string - RobotAgent string - EnableCache bool - MaxCacheSize int64 - MinDelay time.Duration - RetryDuration time.Duration - MaxRetry int - RobotoAgent string - MaxHTML int64 - NWorker struct { + UserAgent string + RobotAgent string + MinDelay time.Duration + RobotoAgent string + FollowRedirect bool + NWorker struct { Maker, Fetcher, Handler, Scheduler int } } var ( DefaultOption = &Option{ - UserAgent: browserAgant, - RobotAgent: "gocrawler", - MaxCacheSize: 1024, - MaxHTML: 1 << 20, // iMB - MinDelay: 10 * time.Second, - RetryDuration: 10 * time.Second, - MaxRetry: 4, + UserAgent: browserAgant, + RobotAgent: "gocrawler", + MinDelay: 10 * time.Second, NWorker: struct { Maker, Fetcher, Handler, Scheduler int }{ diff --git a/schedule.go b/schedule.go index 957521d..24476d2 100644 --- a/schedule.go +++ b/schedule.go @@ -208,7 +208,11 @@ func (sd *scheduler) cleanup() { if err := sd.queue.Close(); err != nil { sd.logger.Error("close wait queue", "err", err) } - close(sd.quit) + select { + case <-sd.quit: // already closed; closing again would panic + default: + close(sd.quit) + } } func (sd *scheduler) exit() { diff --git a/sim/fingerprint/fingerprint.go b/sim/fingerprint/fingerprint.go index 4934268..398bd68 100644 --- a/sim/fingerprint/fingerprint.go +++ b/sim/fingerprint/fingerprint.go @@ -1,7 +1,7 @@ package fingerprint import ( - "bytes" + "hash/fnv" "io" "github.com/mfonda/simhash" @@ -27,19 +27,35 @@ func Compute(r io.Reader, N, shingle int) uint64 { ch := make(chan uint64, 128) go func() { + // Allocate the buffers once and reuse them for every shingle s := make([][]byte, shingle) + joined := make([][]byte, 2*shingle-1) + space := []byte(" ") + var i, n int for f := range chFeature { + // Fill the window until it holds shingle features if n < shingle { s[n] = []byte(f) - n++ + if n++; n == shingle { + goto JOIN + } continue } + // Slide the window left by one to free the last slot for i = 0; i < shingle-1; i++ { s[i] = s[i+1] } s[i] = []byte(f) - ch <- simhash.NewFeature(bytes.Join(s, []byte(" "))).Sum() + + JOIN: + for i, f := range s { + 
joined[2*i] = f + if i+1 != len(s) { + joined[2*i+1] = space + } + } + ch <- hash(joined...) } close(ch) }() @@ -86,3 +102,11 @@ func genFeature(t *html.Token, ch chan<- string) { } ch <- s } + +func hash(s ...[]byte) uint64 { + h := fnv.New64() + for _, b := range s { + h.Write(b) + } + return h.Sum64() +} diff --git a/util/util.go b/util/util.go index 810e6bf..381bb52 100644 --- a/util/util.go +++ b/util/util.go @@ -105,3 +105,24 @@ func Btoi64(b []byte) int64 { i, _ := binary.Varint(b) return i } + +// func Token(z *Tokenizer) *Token { +// t := Token{Type: z.tt} +// switch z.tt { +// case TextToken, CommentToken, DoctypeToken: +// t.Data = string(z.Text()) +// case StartTagToken, SelfClosingTagToken, EndTagToken: +// name, moreAttr := z.TagName() +// for moreAttr { +// var key, val []byte +// key, val, moreAttr = z.TagAttr() +// t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)}) +// } +// if a := atom.Lookup(name); a != 0 { +// t.DataAtom, t.Data = a, a.String() +// } else { +// t.DataAtom, t.Data = 0, string(name) +// } +// } +// return t +// }