Skip to content

Commit

Permalink
Check name pattern before adding subfolders as new candidates
Browse files Browse the repository at this point in the history
  • Loading branch information
daviddengcn committed Mar 5, 2016
1 parent f2c15df commit b740964
Show file tree
Hide file tree
Showing 8 changed files with 66 additions and 42 deletions.
38 changes: 9 additions & 29 deletions crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ func (br *BlackRequest) Do(req *http.Request) (*http.Response, error) {
if err != nil {
return resp, err
}

if resp.StatusCode == 500 {
log.Printf("Put %s into 500 blacklist", u)
r := *resp
Expand All @@ -126,7 +125,6 @@ func GenHttpClient(proxy string) doc.HttpClient {
tp.Proxy = http.ProxyURL(proxyURL)
}
}

return &BlackRequest{
badUrls: make(map[string]http.Response),
client: &http.Client{
Expand All @@ -148,7 +146,6 @@ func FullProjectOfPackage(pkg string) string {
if len(parts) == 0 {
return ""
}

switch parts[0] {
case "llamaslayers.net", "bazil.org":
if len(parts) > 2 {
Expand Down Expand Up @@ -225,12 +222,12 @@ func Plusone(httpClient doc.HttpClient, url string) (int, error) {
`[{"method":"pos.plusones.get","id":"p","params":{"nolog":true,"id": "`+
url+`","source":"widget","userId":"@viewer","groupId":"@self"},"jsonrpc":"2.0","key":"p","apiVersion":"v1"}]`)))
if err != nil {
return 0, err
return 0, errorsp.WithStacksAndMessage(err, "new request for crawling g+ of %v failed", url)
}
req.Header.Set("Content-Type", "application/json")
resp, err := httpClient.Do(req)
if err != nil {
return 0, err
return 0, errorsp.WithStacksAndMessage(err, "crawling g+ of %v failed", url)
}
defer resp.Body.Close()

Expand All @@ -245,9 +242,8 @@ func Plusone(httpClient doc.HttpClient, url string) (int, error) {
}
}
if err := dec.Decode(&v); err != nil {
return 0, err
return 0, errorsp.WithStacksAndMessage(err, "decoding g+ of %v failed", url)
}

return int(0.5 + v[0].Result.Metadata.GlobalCounts.Count), nil
}

Expand All @@ -269,7 +265,6 @@ func LikeButton(httpClient doc.HttpClient, Url string) (int, error) {
if err := dec.Decode(&v); err != nil {
return 0, err
}

return v[Url].Shares, nil
}

Expand All @@ -280,11 +275,9 @@ func fuseStars(a, b int) int {
if b < 0 {
return a
}

if a > b {
a, b = b, a
}

/*
Now, a <= b
Supposing half of the stargzers are shared ones. The numbers could
Expand All @@ -294,7 +287,6 @@ func fuseStars(a, b int) int {
if a <= b/3 {
return b
}

return (a + b) * 3 / 4
}

Expand Down Expand Up @@ -349,7 +341,6 @@ func newDocGet(httpClient doc.HttpClient, pkg string, etag string) (p *doc.Packa

StarCount: -1,
}, nil
// return nil, nil
}

var GithubSpider *github.Spider
Expand Down Expand Up @@ -434,7 +425,6 @@ func CrawlPackage(httpClient doc.HttpClient, pkg string, etag string) (p *Packag
log.Printf("Panic when crawling package %s: %v", pkg, err)
}
}()

var pdoc *doc.Package

if strings.HasPrefix(pkg, "thezombie.net") {
Expand All @@ -454,7 +444,6 @@ func CrawlPackage(httpClient doc.HttpClient, pkg string, etag string) (p *Packag
if err != nil {
return nil, folders, errorsp.WithStacks(err)
}

if pdoc.StarCount < 0 {
// if starcount is not fetched, choose fusion of Plusone and
// Like Button
Expand All @@ -467,7 +456,6 @@ func CrawlPackage(httpClient doc.HttpClient, pkg string, etag string) (p *Packag
}
pdoc.StarCount = fuseStars(plus, like)
}

readmeFn, readmeData := "", ""
for fn, data := range pdoc.ReadmeFiles {
readmeFn, readmeData = strings.TrimSpace(fn),
Expand All @@ -478,16 +466,13 @@ func CrawlPackage(httpClient doc.HttpClient, pkg string, etag string) (p *Packag
readmeFn, readmeData = "", ""
}
}

// try find synopsis from readme
if pdoc.Doc == "" && pdoc.Synopsis == "" {
pdoc.Synopsis = godoc.Synopsis(ReadmeToText(readmeFn, readmeData))
}

if len(readmeData) > 100*1024 {
readmeData = readmeData[:100*1024]
}

importsSet := stringsp.NewSet(pdoc.Imports...)
importsSet.Delete(pdoc.ImportPath)
imports := importsSet.Elements()
Expand All @@ -503,7 +488,6 @@ func CrawlPackage(httpClient doc.HttpClient, pkg string, etag string) (p *Packag
for _, t := range pdoc.Types {
exported.Add(t.Name)
}

return &Package{
Package: pdoc.ImportPath,
Name: pdoc.Name,
Expand Down Expand Up @@ -612,7 +596,6 @@ func (db PackedDocDB) Put(key string, data interface{}) {
log.Printf("Put %s failed: %v", key, err)
return
}

db.MemDB.Put(key, []byte(bs))
}

Expand Down Expand Up @@ -699,12 +682,12 @@ func (nda *NewDocAction) WriteTo(w sophie.Writer) error {

func (nda *NewDocAction) ReadFrom(r sophie.Reader, l int) error {
if err := nda.Action.ReadFrom(r, -1); err != nil {
return err
return errorsp.WithStacks(err)
}
if nda.Action == NDA_DEL {
return nil
}
return nda.DocInfo.ReadFrom(r, -1)
return errorsp.WithStacks(nda.DocInfo.ReadFrom(r, -1))
}

const (
Expand All @@ -715,29 +698,26 @@ const (
func FetchAllPackagesInGodoc(httpClient doc.HttpClient) ([]string, error) {
req, err := http.NewRequest("GET", godocApiUrl, nil)
if err != nil {
return nil, err
return nil, errorsp.WithStacksAndMessage(err, "new request for %v failed", godocApiUrl)
}
resp, err := httpClient.Do(req)
if err != nil {
return nil, err
return nil, errorsp.WithStacksAndMessage(err, "fetching %v failed", godocApiUrl)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return nil, errors.New(fmt.Sprintf("StatusCode: %d", resp.StatusCode))
return nil, errorsp.NewWithStacks("StatusCode: %d", resp.StatusCode)
}

var results map[string][]map[string]string
dec := json.NewDecoder(resp.Body)

if err := dec.Decode(&results); err != nil {
return nil, err
return nil, errorsp.WithStacks(err)
}

list := make([]string, 0, len(results["results"]))
for _, res := range results["results"] {
list = append(list, res["path"])
}

return list, nil
}

Expand Down
5 changes: 1 addition & 4 deletions crawler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@ func TestReadmeToText_Panic(t *testing.T) {
func TestPlusone(t *testing.T) {
url := "http://www.google.com/"
cnt, err := Plusone(http.DefaultClient, url)
if err != nil {
t.Error(err)
return
}
assert.NoError(t, err)
t.Logf("Plusone of %s: %d", url, cnt)
if cnt <= 0 {
t.Errorf("Zero Plusone count for %s", url)
Expand Down
5 changes: 3 additions & 2 deletions data.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"unicode"

"github.com/golangplus/bytes"
"github.com/golangplus/errors"
"github.com/golangplus/strings"

"github.com/agonopol/go-stem"
Expand Down Expand Up @@ -38,13 +39,13 @@ func NewDocInfo() sophie.Sophier {
}

func (d *DocInfo) WriteTo(w sophie.Writer) error {
return gob.NewEncoder(w).Encode(d)
return errorsp.WithStacks(gob.NewEncoder(w).Encode(d))
}

func (d *DocInfo) ReadFrom(r sophie.Reader, l int) error {
// clear before decoding, otherwise some slice will be reused
*d = DocInfo{}
return gob.NewDecoder(r).Decode(d)
return errorsp.WithStacks(gob.NewDecoder(r).Decode(d))
}

// HitInfo is the information provided to frontend
Expand Down
5 changes: 4 additions & 1 deletion pipelines/crawler/package.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (

"github.com/daviddengcn/gcse"
"github.com/daviddengcn/gcse/configs"
"github.com/daviddengcn/gcse/spider"
"github.com/daviddengcn/gddo/doc"
"github.com/daviddengcn/go-easybi"
"github.com/daviddengcn/sophie"
Expand Down Expand Up @@ -116,7 +117,9 @@ func (pc *PackageCrawler) Map(key, val sophie.SophieWriter, c []sophie.Collector

p, flds, err := gcse.CrawlPackage(pc.httpClient, pkg, ent.Etag)
for _, fld := range flds {
appendPackage(pkg + "/" + fld.Path)
if spider.LikeGoSubFolder(fld.Name) {
appendPackage(pkg + "/" + fld.Path)
}
}
if err != nil && err != gcse.ErrPackageNotModifed {
log.Printf("[Part %d] Crawling pkg %s failed: %v", pc.part, pkg, err)
Expand Down
6 changes: 1 addition & 5 deletions pipelines/mergedocs/mergedocs.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,18 @@ func main() {
return &mr.MapperStruct{
NewKeyF: sophie.NewRawString,
NewValF: gcse.NewDocInfo,
MapF: func(key, val sophie.SophieWriter,
c mr.PartCollector) error {

MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error {
pkg := key.(*sophie.RawString).String()
di := val.(*gcse.DocInfo)
act := gcse.NewDocAction{
Action: gcse.NDA_ORIGINAL,
DocInfo: *di,
}

part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS)
return c.CollectTo(part, key, &act)
},
}
}

// Mapper for new docs
return &mr.MapperStruct{
NewKeyF: sophie.NewRawString,
Expand Down
2 changes: 1 addition & 1 deletion proto/spider/spider.proto
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ message RepoInfo {

// Information for non-repository folder.
message FolderInfo {
// E.g. "spider"
// E.g. "github"
string name = 1;

// E.g. "spider/github"
Expand Down
23 changes: 23 additions & 0 deletions spider/ranking.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package spider

import (
"regexp"

"github.com/golangplus/strings"
)

// nonGoSubFolders is the set of sub-folder names that usually hold
// non-Go assets (front-end files, other languages, vendored binaries)
// and therefore should not be queued as Go package candidates.
var nonGoSubFolders = stringsp.NewSet(
	"javascript", "js", "css",
	"image", "images",
	"font", "fonts",
	"script", "scripts",
	"themes", "templates",
	"vendor", "bin",
	"cpp", "python", "nodejs",
)

// nonGoSubPattern matches folder names made up entirely of digits,
// dashes and underscores (e.g. "1234", "1234-5678"); such names are
// unlikely to be Go package directories.
var nonGoSubPattern = regexp.MustCompile(`^[-_0-9]+$`)

// LikeGoSubFolder reports whether a sub-folder with the given name is
// likely to contain Go source code. It rejects well-known non-Go asset
// folder names and names consisting only of digits, dashes and
// underscores; everything else is accepted.
func LikeGoSubFolder(folder string) bool {
	return !nonGoSubFolders.Contain(folder) && !nonGoSubPattern.MatchString(folder)
}
24 changes: 24 additions & 0 deletions spider/ranking_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package spider

import (
"fmt"
"testing"

"github.com/golangplus/testing/assert"
)

// TestLikeGoSubFolder checks LikeGoSubFolder against typical Go-looking
// folder names (expected to be accepted) and known non-Go names
// (expected to be rejected).
func TestLikeGoSubFolder(t *testing.T) {
	// Go naming uses MixedCaps, not underscores (was pos_cases/neg_cases).
	posCases := []string{
		"go", "v8", "v-8",
	}
	for _, c := range posCases {
		assert.True(t, fmt.Sprintf("LikeGoSubFolder %v", c), LikeGoSubFolder(c))
	}

	negCases := []string{
		"js", "1234", "1234-5678", "1234_5678",
	}
	for _, c := range negCases {
		assert.False(t, fmt.Sprintf("LikeGoSubFolder %v", c), LikeGoSubFolder(c))
	}
}

0 comments on commit b740964

Please sign in to comment.