Permalink
Browse files

Add more non go sub folder names. Refactoring of configs package usage.

  • Loading branch information...
1 parent 59702b2 commit a6deea793673e33c35c49c7669323b3b74dafa03 @daviddengcn committed Oct 18, 2016
View
@@ -12,12 +12,13 @@ import (
"github.com/daviddengcn/go-easybi"
"github.com/daviddengcn/go-ljson-conf"
"github.com/daviddengcn/go-villa"
+ "github.com/daviddengcn/sophie"
)
const (
- FnCrawlerDB = "crawler"
+ fnCrawlerDB = "crawler"
- FnToCrawl = "tocrawl"
+ fnToCrawl = "tocrawl"
FnPackage = "package"
FnPerson = "person"
// key: RawString, value: DocInfo
@@ -94,12 +95,32 @@ func init() {
BiWebPath = conf.String("bi.web_path", BiWebPath)
}
// DataRootFsPath returns DataRoot wrapped as a sophie.FsPath, built with
// sophie.LocalFsPath from DataRoot's string form.
+func DataRootFsPath() sophie.FsPath {
+ return sophie.LocalFsPath(DataRoot.S())
+}
+
// CrawlerDBPath returns the crawler DB location as a villa.Path: DataRoot
// joined with fnCrawlerDB ("crawler").
func CrawlerDBPath() villa.Path {
- return DataRoot.Join(FnCrawlerDB)
+ return DataRoot.Join(fnCrawlerDB)
+}
+
// CrawlerDBFsPath returns the crawler DB location as a sophie.FsPath:
// DataRootFsPath() joined with fnCrawlerDB ("crawler").
+func CrawlerDBFsPath() sophie.FsPath {
+ return DataRootFsPath().Join(fnCrawlerDB)
+}
+
// DocsDBPath returns DataRoot joined with FnDocs as a plain string.
// Callers that need a villa.Path must convert the result themselves.
+func DocsDBPath() string {
+ return DataRoot.Join(FnDocs).S()
+}
+
// DocsDBFsPath returns the docs DB location as a sophie.FsPath:
// DataRootFsPath() joined with FnDocs.
+func DocsDBFsPath() sophie.FsPath {
+ return DataRootFsPath().Join(FnDocs)
+}
+
// ToCrawlPath returns DataRoot joined with fnToCrawl ("tocrawl") as a plain
// string.
+func ToCrawlPath() string {
+ return DataRoot.Join(fnToCrawl).S()
}
-func DocsDBPath() villa.Path {
- return DataRoot.Join(FnDocs)
// ToCrawlFsPath returns the to-crawl location as a sophie.FsPath:
// DataRootFsPath() joined with fnToCrawl ("tocrawl").
+func ToCrawlFsPath() sophie.FsPath {
+ return DataRootFsPath().Join(fnToCrawl)
}
func IndexPath() villa.Path {
@@ -156,12 +156,7 @@ func main() {
// Load CrawlerDB
cDB = gcse.LoadCrawlerDB()
- fpDataRoot := sophie.FsPath{
- Fs: sophie.LocalFS,
- Path: configs.DataRoot.S(),
- }
-
- fpDocs := fpDataRoot.Join(configs.FnDocs)
+ fpDocs := configs.DocsDBFsPath()
if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
log.Fatalf("loadAllDocsPkgs: %v", err)
}
@@ -170,8 +165,8 @@ func main() {
AppStopTime = time.Now().Add(configs.CrawlerDuePerRun)
//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
- fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB)
- fpToCrawl := fpDataRoot.Join(configs.FnToCrawl)
+ fpCrawler := configs.CrawlerDBFsPath()
+ fpToCrawl := configs.ToCrawlFsPath()
fpNewDocs := fpCrawler.Join(configs.FnNewDocs)
fpNewDocs.Remove()
@@ -10,7 +10,6 @@ import (
"github.com/daviddengcn/gcse/store"
"github.com/daviddengcn/gcse/utils"
"github.com/daviddengcn/go-easybi"
- "github.com/daviddengcn/sophie"
"github.com/daviddengcn/sophie/kv"
)
@@ -48,7 +47,7 @@ func doIndex() bool {
log.Printf("Indexing to %v ...", idxSegm)
- fpDocDB := sophie.LocalFsPath(configs.DocsDBPath().S())
+ fpDocDB := configs.DocsDBFsPath()
ts, err := gcse.Index(kv.DirInput(fpDocDB), string(idxSegm))
if err != nil {
log.Printf("Indexing failed: %v", err)
@@ -15,6 +15,7 @@ import (
"github.com/daviddengcn/gcse"
"github.com/daviddengcn/gcse/configs"
+ "github.com/daviddengcn/go-villa"
"github.com/daviddengcn/sophie"
"github.com/daviddengcn/sophie/kv"
"github.com/daviddengcn/sophie/mr"
@@ -31,7 +32,7 @@ func main() {
fpDataRoot := sophie.LocalFsPath(configs.DataRoot.S())
- fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB)
+ fpCrawler := configs.CrawlerDBFsPath()
outDocsUpdated := kv.DirOutput(fpDataRoot.Join("docs-updated"))
outDocsUpdated.Clean()
@@ -155,7 +156,7 @@ func main() {
log.Printf("New: %v", cntNew)
log.Printf("Unchanged: %v", cntUnchanged)
- pDocs := configs.DataRoot.Join(configs.FnDocs)
+ pDocs := villa.Path(configs.DocsDBPath())
pUpdated := configs.DataRoot.Join("docs-updated")
pTmp := configs.DataRoot.Join("docs-tmp")
@@ -20,6 +20,7 @@ import (
"github.com/daviddengcn/gcse/utils"
"github.com/daviddengcn/gddo/doc"
"github.com/daviddengcn/go-easybi"
+ "github.com/daviddengcn/go-villa"
"github.com/daviddengcn/sophie"
"github.com/daviddengcn/sophie/kv"
@@ -197,8 +198,7 @@ func main() {
cDB = gcse.LoadCrawlerDB()
// load pkgUTs
- pkgUTs, err := loadPackageUpdateTimes(
- sophie.LocalFsPath(configs.DocsDBPath().S()))
+ pkgUTs, err := loadPackageUpdateTimes(sophie.LocalFsPath(configs.DocsDBPath()))
if err != nil {
log.Fatalf("loadPackageUpdateTimes failed: %v", err)
}
@@ -236,7 +236,7 @@ func main() {
log.Printf("Package DB: %d entries", cDB.PackageDB.Count())
log.Printf("Person DB: %d entries", cDB.PersonDB.Count())
- pathToCrawl := configs.DataRoot.Join(configs.FnToCrawl)
+ pathToCrawl := villa.Path(configs.ToCrawlPath())
kvPackage := kv.DirOutput(sophie.LocalFsPath(
pathToCrawl.Join(configs.FnPackage).S()))
View
@@ -2,6 +2,7 @@ package spider
import (
"regexp"
+ "strings"
"time"
"github.com/golang/protobuf/ptypes"
@@ -19,18 +20,46 @@ const (
)
// nonGoSubFolders is the set of sub-folder names that LikeGoSubFolder rejects
// outright. Entries are lower case because LikeGoSubFolder lower-cases the
// candidate name before the lookup. They are grouped roughly by first letter.
var nonGoSubFolders = stringsp.NewSet(
- "javascript", "js", "css", "image", "images", "font", "fonts", "script", "scripts", "themes", "templates", "vendor", "bin", "cpp", "python", "nodejs",
+ "android",
+ "bin", "binary",
+ "c", "cmd", "cpp", "css",
+ "doc", "dll",
+ "faq", "font", "fonts",
+ "gif", "django",
+ "help", "html",
+ "image", "images", "icon", "icons",
+ "java", "javascript", "js", "jpg", "jpeg",
+ "lib", "less",
+ "nodejs",
+ "pdf", "python",
+ "r", "readme",
+ "src", "script", "scripts", "static",
+ "themes", "templates", "tex",
+ "vendor",
+ "wav",
+ "xml",
+ "zip",
)
// nonGoSubPattern matches non-empty names made up solely of digits, '-' and
// '_'. LikeGoSubFolder rejects any name it matches.
var nonGoSubPattern = regexp.MustCompile(`^[0-9\-_]+$`)
// LikeGoSubFolder reports whether folder, a single path element, could
// plausibly name a Go sub-package directory. The check is case-insensitive.
// It returns false when folder
//   - is empty,
//   - is listed in nonGoSubFolders,
//   - matches nonGoSubPattern (only digits, '-' and '_'),
//   - contains a '.',
//   - does not start with an ASCII letter, or
//   - contains "nodejs" anywhere in the name.
// Every other name returns true.
func LikeGoSubFolder(folder string) bool {
+ if folder == "" {
+ // An empty element (e.g. the last part of an id ending in "/") is never
+ // a Go sub folder. Returning here also keeps the folder[0] check below
+ // from panicking with an index-out-of-range error.
+ return false
+ }
+ folder = strings.ToLower(folder)
if nonGoSubFolders.Contain(folder) {
return false
}
if nonGoSubPattern.MatchString(folder) {
return false
}
+ if strings.ContainsAny(folder, ".") {
+ return false
+ }
+ // folder is already lower case, so this accepts only names whose first
+ // byte is an ASCII letter.
+ if folder[0] < 'a' || folder[0] > 'z' {
+ return false
+ }
+ if strings.Contains(folder, "nodejs") {
+ return false
+ }
return true
}
@@ -1,21 +1,77 @@
package main
import (
+ "io"
"log"
"strings"
+ "github.com/golangplus/errors"
"github.com/golangplus/fmt"
+ "github.com/golangplus/strings"
"github.com/daviddengcn/gcse"
+ "github.com/daviddengcn/gcse/configs"
+ "github.com/daviddengcn/gcse/spider"
+ "github.com/daviddengcn/sophie"
+ "github.com/daviddengcn/sophie/kv"
)
// loadDocsPkgs reads every part of in and returns the set of all keys found.
// Each entry is decoded as a (sophie.RawString, gcse.DocInfo) pair; only the
// key is kept. The first error from PartCount, Iterator or Next is returned
// as-is, together with a nil set.
// NOTE(review): the iterator c is never closed in this function — confirm
// whether the value returned by in.Iterator needs a Close call.
+func loadDocsPkgs(in kv.DirInput) (stringsp.Set, error) {
+ var pkgs stringsp.Set
+ cnt, err := in.PartCount()
+ if err != nil {
+ return nil, err
+ }
+ for part := 0; part < cnt; part++ {
+ c, err := in.Iterator(part)
+ if err != nil {
+ return nil, err
+ }
+ for {
+ var key sophie.RawString
+ var val gcse.DocInfo
+ if err := c.Next(&key, &val); err != nil {
// An error whose cause is io.EOF marks the end of this part, not a failure.
+ if errorsp.Cause(err) == io.EOF {
+ break
+ }
+ return nil, err
+ }
+ pkgs.Add(string(key))
+ // value is ignored
+ }
+ }
+ return pkgs, nil
+}
+
func main() {
+ dryRun := false
// Load CrawlerDB
cDB := gcse.LoadCrawlerDB()
+ fpDataRoot := sophie.FsPath{
+ Fs: sophie.LocalFS,
+ Path: configs.DataRoot.S(),
+ }
+ pkgs, err := loadDocsPkgs(kv.DirInput(fpDataRoot.Join(configs.FnDocs)))
+ if err != nil {
+ log.Fatalf("loadDocsPkgs failed: %v", err)
+ }
db := cDB.PackageDB
var toDelete []string
if err := db.Iterate(func(id string, val interface{}) error {
+ if pkgs.Contain(id) {
+ // If the package is already in docs, do not touch it.
+ return nil
+ }
parts := strings.Split(id, "/")
+ if len(parts) >= 4 {
+ // Check last part.
+ // github.com/user/repo/sub
+ name := parts[len(parts)-1]
+ if !spider.LikeGoSubFolder(name) {
+ toDelete = append(toDelete, id)
+ return nil
+ }
+ }
if len(parts) < 6 || len(parts)%2 != 0 {
return nil
}
@@ -33,6 +89,9 @@ func main() {
log.Fatalf("Iterate failed: %v", err)
}
fmtp.Printfln("Total: %d", len(toDelete))
+ if dryRun {
+ return
+ }
for _, id := range toDelete {
db.Delete(id)
}

0 comments on commit a6deea7

Please sign in to comment.