Permalink
Browse files

Check the name pattern before adding sub-folders as new candidates

  • Loading branch information...
1 parent f2c15df commit b74096497af7f8e97316f9354217dc502018bd27 @daviddengcn committed Mar 5, 2016
Showing with 66 additions and 42 deletions.
  1. +9 −29 crawler.go
  2. +1 −4 crawler_test.go
  3. +3 −2 data.go
  4. +4 −1 pipelines/crawler/package.go
  5. +1 −5 pipelines/mergedocs/mergedocs.go
  6. +1 −1 proto/spider/spider.proto
  7. +23 −0 spider/ranking.go
  8. +24 −0 spider/ranking_test.go
View
@@ -102,7 +102,6 @@ func (br *BlackRequest) Do(req *http.Request) (*http.Response, error) {
if err != nil {
return resp, err
}
-
if resp.StatusCode == 500 {
log.Printf("Put %s into 500 blacklist", u)
r := *resp
@@ -126,7 +125,6 @@ func GenHttpClient(proxy string) doc.HttpClient {
tp.Proxy = http.ProxyURL(proxyURL)
}
}
-
return &BlackRequest{
badUrls: make(map[string]http.Response),
client: &http.Client{
@@ -148,7 +146,6 @@ func FullProjectOfPackage(pkg string) string {
if len(parts) == 0 {
return ""
}
-
switch parts[0] {
case "llamaslayers.net", "bazil.org":
if len(parts) > 2 {
@@ -225,12 +222,12 @@ func Plusone(httpClient doc.HttpClient, url string) (int, error) {
`[{"method":"pos.plusones.get","id":"p","params":{"nolog":true,"id": "`+
url+`","source":"widget","userId":"@viewer","groupId":"@self"},"jsonrpc":"2.0","key":"p","apiVersion":"v1"}]`)))
if err != nil {
- return 0, err
+ return 0, errorsp.WithStacksAndMessage(err, "new request for crawling g+ of %v failed", url)
}
req.Header.Set("Content-Type", "application/json")
resp, err := httpClient.Do(req)
if err != nil {
- return 0, err
+ return 0, errorsp.WithStacksAndMessage(err, "crawling g+ of %v failed", url)
}
defer resp.Body.Close()
@@ -245,9 +242,8 @@ func Plusone(httpClient doc.HttpClient, url string) (int, error) {
}
}
if err := dec.Decode(&v); err != nil {
- return 0, err
+ return 0, errorsp.WithStacksAndMessage(err, "decoding g+ of %v failed", url)
}
-
return int(0.5 + v[0].Result.Metadata.GlobalCounts.Count), nil
}
@@ -269,7 +265,6 @@ func LikeButton(httpClient doc.HttpClient, Url string) (int, error) {
if err := dec.Decode(&v); err != nil {
return 0, err
}
-
return v[Url].Shares, nil
}
@@ -280,11 +275,9 @@ func fuseStars(a, b int) int {
if b < 0 {
return a
}
-
if a > b {
a, b = b, a
}
-
/*
Now, a <= b
Supposing half of the stargzers are shared ones. The numbers could
@@ -294,7 +287,6 @@ func fuseStars(a, b int) int {
if a <= b/3 {
return b
}
-
return (a + b) * 3 / 4
}
@@ -349,7 +341,6 @@ func newDocGet(httpClient doc.HttpClient, pkg string, etag string) (p *doc.Packa
StarCount: -1,
}, nil
- // return nil, nil
}
var GithubSpider *github.Spider
@@ -434,7 +425,6 @@ func CrawlPackage(httpClient doc.HttpClient, pkg string, etag string) (p *Packag
log.Printf("Panic when crawling package %s: %v", pkg, err)
}
}()
-
var pdoc *doc.Package
if strings.HasPrefix(pkg, "thezombie.net") {
@@ -454,7 +444,6 @@ func CrawlPackage(httpClient doc.HttpClient, pkg string, etag string) (p *Packag
if err != nil {
return nil, folders, errorsp.WithStacks(err)
}
-
if pdoc.StarCount < 0 {
// if starcount is not fetched, choose fusion of Plusone and
// Like Button
@@ -467,7 +456,6 @@ func CrawlPackage(httpClient doc.HttpClient, pkg string, etag string) (p *Packag
}
pdoc.StarCount = fuseStars(plus, like)
}
-
readmeFn, readmeData := "", ""
for fn, data := range pdoc.ReadmeFiles {
readmeFn, readmeData = strings.TrimSpace(fn),
@@ -478,16 +466,13 @@ func CrawlPackage(httpClient doc.HttpClient, pkg string, etag string) (p *Packag
readmeFn, readmeData = "", ""
}
}
-
// try find synopsis from readme
if pdoc.Doc == "" && pdoc.Synopsis == "" {
pdoc.Synopsis = godoc.Synopsis(ReadmeToText(readmeFn, readmeData))
}
-
if len(readmeData) > 100*1024 {
readmeData = readmeData[:100*1024]
}
-
importsSet := stringsp.NewSet(pdoc.Imports...)
importsSet.Delete(pdoc.ImportPath)
imports := importsSet.Elements()
@@ -503,7 +488,6 @@ func CrawlPackage(httpClient doc.HttpClient, pkg string, etag string) (p *Packag
for _, t := range pdoc.Types {
exported.Add(t.Name)
}
-
return &Package{
Package: pdoc.ImportPath,
Name: pdoc.Name,
@@ -612,7 +596,6 @@ func (db PackedDocDB) Put(key string, data interface{}) {
log.Printf("Put %s failed: %v", key, err)
return
}
-
db.MemDB.Put(key, []byte(bs))
}
@@ -699,12 +682,12 @@ func (nda *NewDocAction) WriteTo(w sophie.Writer) error {
func (nda *NewDocAction) ReadFrom(r sophie.Reader, l int) error {
if err := nda.Action.ReadFrom(r, -1); err != nil {
- return err
+ return errorsp.WithStacks(err)
}
if nda.Action == NDA_DEL {
return nil
}
- return nda.DocInfo.ReadFrom(r, -1)
+ return errorsp.WithStacks(nda.DocInfo.ReadFrom(r, -1))
}
const (
@@ -715,29 +698,26 @@ const (
func FetchAllPackagesInGodoc(httpClient doc.HttpClient) ([]string, error) {
req, err := http.NewRequest("GET", godocApiUrl, nil)
if err != nil {
- return nil, err
+ return nil, errorsp.WithStacksAndMessage(err, "new request for %v failed", godocApiUrl)
}
resp, err := httpClient.Do(req)
if err != nil {
- return nil, err
+ return nil, errorsp.WithStacksAndMessage(err, "fetching %v failed", godocApiUrl)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
- return nil, errors.New(fmt.Sprintf("StatusCode: %d", resp.StatusCode))
+ return nil, errorsp.NewWithStacks("StatusCode: %d", resp.StatusCode)
}
-
var results map[string][]map[string]string
dec := json.NewDecoder(resp.Body)
if err := dec.Decode(&results); err != nil {
- return nil, err
+ return nil, errorsp.WithStacks(err)
}
-
list := make([]string, 0, len(results["results"]))
for _, res := range results["results"] {
list = append(list, res["path"])
}
-
return list, nil
}
View
@@ -27,10 +27,7 @@ func TestReadmeToText_Panic(t *testing.T) {
func TestPlusone(t *testing.T) {
url := "http://www.google.com/"
cnt, err := Plusone(http.DefaultClient, url)
- if err != nil {
- t.Error(err)
- return
- }
+ assert.NoError(t, err)
t.Logf("Plusone of %s: %d", url, cnt)
if cnt <= 0 {
t.Errorf("Zero Plusone count for %s", url)
View
@@ -8,6 +8,7 @@ import (
"unicode"
"github.com/golangplus/bytes"
+ "github.com/golangplus/errors"
"github.com/golangplus/strings"
"github.com/agonopol/go-stem"
@@ -38,13 +39,13 @@ func NewDocInfo() sophie.Sophier {
}
func (d *DocInfo) WriteTo(w sophie.Writer) error {
- return gob.NewEncoder(w).Encode(d)
+ return errorsp.WithStacks(gob.NewEncoder(w).Encode(d))
}
func (d *DocInfo) ReadFrom(r sophie.Reader, l int) error {
// clear before decoding, otherwise some slice will be reused
*d = DocInfo{}
- return gob.NewDecoder(r).Decode(d)
+ return errorsp.WithStacks(gob.NewDecoder(r).Decode(d))
}
// HitInfo is the information provided to frontend
@@ -12,6 +12,7 @@ import (
"github.com/daviddengcn/gcse"
"github.com/daviddengcn/gcse/configs"
+ "github.com/daviddengcn/gcse/spider"
"github.com/daviddengcn/gddo/doc"
"github.com/daviddengcn/go-easybi"
"github.com/daviddengcn/sophie"
@@ -116,7 +117,9 @@ func (pc *PackageCrawler) Map(key, val sophie.SophieWriter, c []sophie.Collector
p, flds, err := gcse.CrawlPackage(pc.httpClient, pkg, ent.Etag)
for _, fld := range flds {
- appendPackage(pkg + "/" + fld.Path)
+ if spider.LikeGoSubFolder(fld.Name) {
+ appendPackage(pkg + "/" + fld.Path)
+ }
}
if err != nil && err != gcse.ErrPackageNotModifed {
log.Printf("[Part %d] Crawling pkg %s failed: %v", pc.part, pkg, err)
@@ -44,22 +44,18 @@ func main() {
return &mr.MapperStruct{
NewKeyF: sophie.NewRawString,
NewValF: gcse.NewDocInfo,
- MapF: func(key, val sophie.SophieWriter,
- c mr.PartCollector) error {
-
+ MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error {
pkg := key.(*sophie.RawString).String()
di := val.(*gcse.DocInfo)
act := gcse.NewDocAction{
Action: gcse.NDA_ORIGINAL,
DocInfo: *di,
}
-
part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS)
return c.CollectTo(part, key, &act)
},
}
}
-
// Mapper for new docs
return &mr.MapperStruct{
NewKeyF: sophie.NewRawString,
@@ -31,7 +31,7 @@ message RepoInfo {
// Information for non-repository folder.
message FolderInfo {
- // E.g. "spider"
+ // E.g. "github"
string name = 1;
// E.g. "spider/github"
View
@@ -0,0 +1,23 @@
+package spider
+
+import (
+ "regexp"
+
+ "github.com/golangplus/strings"
+)
+
+var nonGoSubFolders = stringsp.NewSet(
+ "javascript", "js", "css", "image", "images", "font", "fonts", "script", "scripts", "themes", "templates", "vendor", "bin", "cpp", "python", "nodejs",
+)
+
+var nonGoSubPattern = regexp.MustCompile(`^[0-9\-_]+$`)
+
+func LikeGoSubFolder(folder string) bool {
+ if nonGoSubFolders.Contain(folder) {
+ return false
+ }
+ if nonGoSubPattern.MatchString(folder) {
+ return false
+ }
+ return true
+}
@@ -0,0 +1,24 @@
+package spider
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/golangplus/testing/assert"
+)
+
+func TestLikeGoSubFolder(t *testing.T) {
+ pos_cases := []string{
+ "go", "v8", "v-8",
+ }
+ for _, c := range pos_cases {
+ assert.True(t, fmt.Sprintf("LikeGoSubFolder %v", c), LikeGoSubFolder(c))
+ }
+
+ neg_cases := []string{
+ "js", "1234", "1234-5678", "1234_5678",
+ }
+ for _, c := range neg_cases {
+ assert.False(t, fmt.Sprintf("LikeGoSubFolder %v", c), LikeGoSubFolder(c))
+ }
+}

0 comments on commit b740964

Please sign in to comment.