Don't enqueue redirect URL by default; reduce allocation
fanyang01 committed May 10, 2016
1 parent 8394bd9 commit 4f20060
Showing 6 changed files with 82 additions and 38 deletions.
15 changes: 8 additions & 7 deletions ctrl.go
@@ -8,20 +8,21 @@ import (
 )
 
 type Ticket struct {
-	At         time.Time
-	Score      int
-	RetryMax   int
-	RetryDelay time.Duration
-	Ctx        context.Context
+	At    time.Time
+	Score int
+	Ctx   context.Context
 }
 
 // Controller controls the working progress of crawler.
 type Controller interface {
 	// Prepare sets options(client, headers, ...) for a http request.
 	Prepare(req *Request)
 
-	// Handle handles a response. Handle should also extract hyperlinks
-	// from the response and send them to the channel.
+	// Handle handles a response(writing to disk/DB, ...). Handle should
+	// also extract hyperlinks from the response and send them to the
+	// channel. Note that r.NewURL may differ from r.URL if r.URL has been
+	// redirected, so r.NewURL should also be included if following
+	// redirects is expected.
 	Handle(r *Response, ch chan<- *url.URL)
 
 	// Accept determines whether a URL should be processed. It is redundant
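
The new doc comment on Handle changes the Controller contract: when following redirects is expected, a Handle implementation must send r.NewURL back on the channel itself, because the crawler no longer does so by default. Below is a minimal, self-contained sketch of just that step; the Response type here is a local stand-in carrying only the two fields the sketch needs, not the package's own type.

package main

import (
	"fmt"
	"net/url"
)

// Response is a stand-in for the crawler's Response type (illustration only).
type Response struct {
	URL    *url.URL // the URL that was requested
	NewURL *url.URL // the URL after redirects
}

// enqueueNewURL does what the updated comment asks of Handle: if the final
// URL differs from the requested one, send a copy of it to the channel.
func enqueueNewURL(r *Response, ch chan<- *url.URL) {
	if r.NewURL != nil && r.NewURL.String() != r.URL.String() {
		u := *r.NewURL // copy before sending, as handle.go does
		ch <- &u
	}
}

func main() {
	orig, _ := url.Parse("http://example.com/a")
	final, _ := url.Parse("http://example.com/b")
	ch := make(chan *url.URL, 1)
	enqueueNewURL(&Response{URL: orig, NewURL: final}, ch)
	fmt.Println(<-ch) // http://example.com/b
}
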
22 changes: 12 additions & 10 deletions handle.go
@@ -64,16 +64,18 @@ func (h *handler) handle(r *Response) error {
 	}
 	ch := make(chan *url.URL, perPage)
 	go func() {
-		original := r.URL.String()
-		// Treat the new url as one found under the original url
-		if r.NewURL.String() != original {
-			newurl := *r.NewURL
-			ch <- &newurl
-		}
-		if refresh := r.Refresh.URL; refresh != nil &&
-			refresh.String() != original {
-			newurl := *refresh
-			ch <- &newurl
+		if h.cw.opt.FollowRedirect {
+			// Treat the new URL as one found under the original URL
+			original := r.URL.String()
+			if r.NewURL.String() != original {
+				newurl := *r.NewURL
+				ch <- &newurl
+			}
+			if refresh := r.Refresh.URL; refresh != nil &&
+				refresh.String() != original {
+				newurl := *refresh
+				ch <- &newurl
+			}
 		}
 		h.cw.ctrl.Handle(r, ch)
 		close(ch)
26 changes: 9 additions & 17 deletions option.go
@@ -7,29 +7,21 @@ const (
 )
 
 type Option struct {
-	UserAgent     string
-	RobotAgent    string
-	EnableCache   bool
-	MaxCacheSize  int64
-	MinDelay      time.Duration
-	RetryDuration time.Duration
-	MaxRetry      int
-	RobotoAgent   string
-	MaxHTML       int64
-	NWorker       struct {
+	UserAgent      string
+	RobotAgent     string
+	MinDelay       time.Duration
+	RobotoAgent    string
+	FollowRedirect bool
+	NWorker        struct {
 		Maker, Fetcher, Handler, Scheduler int
 	}
 }
 
 var (
 	DefaultOption = &Option{
-		UserAgent:     browserAgant,
-		RobotAgent:    "gocrawler",
-		MaxCacheSize:  1024,
-		MaxHTML:       1 << 20, // iMB
-		MinDelay:      10 * time.Second,
-		RetryDuration: 10 * time.Second,
-		MaxRetry:      4,
+		UserAgent:  browserAgant,
+		RobotAgent: "gocrawler",
+		MinDelay:   10 * time.Second,
 		NWorker: struct {
 			Maker, Fetcher, Handler, Scheduler int
 		}{
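
With the cache and retry knobs removed, FollowRedirect is the one new switch: handle.go above only enqueues r.NewURL and meta-refresh targets when it is set, and DefaultOption leaves it false, which is what the commit title means by not enqueueing redirect URLs by default. A rough sketch of how a caller might opt in follows; the Option struct is mirrored locally for illustration, and how the options are handed to the crawler is not shown in this commit.

package main

import (
	"fmt"
	"time"
)

// Option mirrors the trimmed-down struct from option.go (illustration only).
type Option struct {
	UserAgent      string
	RobotAgent     string
	MinDelay       time.Duration
	RobotoAgent    string
	FollowRedirect bool
	NWorker        struct {
		Maker, Fetcher, Handler, Scheduler int
	}
}

func main() {
	opt := Option{
		UserAgent: "gocrawler-example",
		MinDelay:  10 * time.Second,
		// FollowRedirect defaults to false: redirect and refresh targets
		// are not enqueued unless the caller asks for them.
	}
	opt.FollowRedirect = true // opt in to enqueueing r.NewURL and refresh URLs
	fmt.Printf("%+v\n", opt)
}
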
6 changes: 5 additions & 1 deletion schedule.go
@@ -208,7 +208,11 @@ func (sd *scheduler) cleanup() {
 	if err := sd.queue.Close(); err != nil {
 		sd.logger.Error("close wait queue", "err", err)
 	}
-	close(sd.quit)
+	select {
+	case <-sd.quit: // closed
+	default:
+		close(sd.quit)
+	}
 }
 
 func (sd *scheduler) exit() {
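
The cleanup change guards against closing sd.quit twice: closing an already-closed channel panics, and the select with a default case turns a repeated close into a no-op. Here is a standalone sketch of the pattern, not the scheduler's actual code; note the check-then-close is only race-free when a single goroutine can reach it, otherwise sync.Once is the usual choice.

package main

import "fmt"

// closeOnce mirrors the guard added in scheduler.cleanup().
func closeOnce(quit chan struct{}) {
	select {
	case <-quit: // already closed, nothing to do
	default:
		close(quit)
	}
}

func main() {
	quit := make(chan struct{})
	closeOnce(quit)
	closeOnce(quit) // no panic: the second call is a no-op
	fmt.Println("closed exactly once")
}
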
30 changes: 27 additions & 3 deletions sim/fingerprint/fingerprint.go
@@ -1,7 +1,7 @@
 package fingerprint
 
 import (
-	"bytes"
+	"hash/fnv"
 	"io"
 
 	"github.com/mfonda/simhash"
@@ -27,19 +27,35 @@ func Compute(r io.Reader, N, shingle int) uint64 {

 	ch := make(chan uint64, 128)
 	go func() {
+		// Avoid allocation
+		s := make([][]byte, shingle)
+		joined := make([][]byte, 2*shingle-1)
+		space := []byte(" ")
+
+		var i, n int
 		for f := range chFeature {
 			// Collect enough features
 			if n < shingle {
 				s[n] = []byte(f)
-				n++
+				if n++; n == shingle {
+					goto JOIN
+				}
 				continue
 			}
 			// Shift array to produce one space
 			for i = 0; i < shingle-1; i++ {
 				s[i] = s[i+1]
 			}
 			s[i] = []byte(f)
-			ch <- simhash.NewFeature(bytes.Join(s, []byte(" "))).Sum()
+
+		JOIN:
+			for i, f := range s {
+				joined[2*i] = f
+				if i+1 != len(s) {
+					joined[2*i+1] = space
+				}
+			}
+			ch <- hash(joined...)
 		}
 		close(ch)
 	}()
@@ -86,3 +102,11 @@ func genFeature(t *html.Token, ch chan<- string) {
 	}
 	ch <- s
 }
+
+func hash(s ...[]byte) uint64 {
+	h := fnv.New64()
+	for _, b := range s {
+		h.Write(b)
+	}
+	return h.Sum64()
+}
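
The fingerprint change drops bytes.Join, which allocates a fresh buffer for every shingle, and instead reuses one joined slice whose pieces are written straight into an FNV-64 hash. A small self-contained sketch of the same idea follows; the feature strings are made up, and the hash helper repeats the shape of the one added above.

package main

import (
	"fmt"
	"hash/fnv"
)

// hash writes each byte slice straight into an FNV-64 hash, so no joined
// buffer is ever allocated (same shape as the helper in fingerprint.go).
func hash(parts ...[]byte) uint64 {
	h := fnv.New64()
	for _, p := range parts {
		h.Write(p)
	}
	return h.Sum64()
}

func main() {
	features := []string{"a", "tiny", "shingle"} // made-up features
	space := []byte(" ")

	// joined interleaves features with single spaces; the real code reuses
	// the same backing slice on every loop iteration.
	joined := make([][]byte, 2*len(features)-1)
	for i, f := range features {
		joined[2*i] = []byte(f)
		if i+1 != len(features) {
			joined[2*i+1] = space
		}
	}
	fmt.Printf("%#x\n", hash(joined...))
}
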
21 changes: 21 additions & 0 deletions util/util.go
@@ -105,3 +105,24 @@ func Btoi64(b []byte) int64 {
 	i, _ := binary.Varint(b)
 	return i
 }
+
+// func Token(z *Tokenizer) *Token {
+// 	t := Token{Type: z.tt}
+// 	switch z.tt {
+// 	case TextToken, CommentToken, DoctypeToken:
+// 		t.Data = string(z.Text())
+// 	case StartTagToken, SelfClosingTagToken, EndTagToken:
+// 		name, moreAttr := z.TagName()
+// 		for moreAttr {
+// 			var key, val []byte
+// 			key, val, moreAttr = z.TagAttr()
+// 			t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
+// 		}
+// 		if a := atom.Lookup(name); a != 0 {
+// 			t.DataAtom, t.Data = a, a.String()
+// 		} else {
+// 			t.DataAtom, t.Data = 0, string(name)
+// 		}
+// 	}
+// 	return t
+// }
