forked from daviddengcn/gcse
/
importsents.go
64 lines (52 loc) · 1.37 KB
/
importsents.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
package main
import(
"fmt"
"log"
"github.com/daviddengcn/gcse"
"github.com/daviddengcn/go-villa"
)
const (
// fnDocDB is the name joined onto gcse.DataRoot (in init) to form DocDBPath.
fnDocDB = "docdb"
)
var (
// DocDBPath is the path of the document DB. It is assigned in init and
// passed to gcse.NewMemDB in main.
DocDBPath villa.Path
// CrawlerDBPath villa.Path
)
// init derives DocDBPath from the gcse data root before main runs.
func init() {
	root := gcse.DataRoot
	DocDBPath = root.Join(fnDocDB)
	// The crawler DB path remains disabled:
	// CrawlerDBPath = gcse.DataRoot.Join(fnCrawlerDB)
}
// main walks every entry of the document DB at DocDBPath and logs three
// figures: the number of documents seen, the number with non-empty readme
// data, and the number of those for which gcse.ChooseImportantSentenses
// returned at least one sentence (together with the total sentence count).
//
// Documents that have a readme but yield no sentences are dumped to
// exps/notfound.txt (name, package, readme file name and the readme text)
// so they can be inspected by hand.
func main() {
	// Separator written before each dumped document.
	const sep = "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"

	docDB := gcse.NewMemDB(DocDBPath, gcse.KindDocDB)

	countAll, countReadme, countHasSents := 0, 0, 0
	countSents := 0

	f, err := villa.Path("exps/notfound.txt").Create()
	if err != nil {
		log.Fatal(err)
	}

	log.Printf("Start processing ...")
	// NOTE(review): returning a non-nil error from the callback is assumed to
	// stop the iteration and be returned by Iterate — confirm against gcse.
	iterErr := docDB.Iterate(func(key string, val interface{}) error {
		countAll++

		// Checked assertion: report an unexpected value as an error instead
		// of panicking in the middle of the scan.
		d, ok := val.(gcse.DocInfo)
		if !ok {
			return fmt.Errorf("unexpected value type %T for key %q", val, key)
		}
		if d.ReadmeData == "" {
			return nil
		}
		countReadme++

		readme := gcse.ReadmeToText(d.ReadmeFn, d.ReadmeData)
		sents := gcse.ChooseImportantSentenses(readme, d.Name, d.Package)
		if len(sents) > 0 {
			countSents += len(sents)
			countHasSents++
			return nil
		}

		// One write per document so a failure to write the dump is noticed
		// rather than silently dropped.
		if _, err := fmt.Fprintf(f, "%s\n%s - %s - %s\n%s\n",
			sep, d.Name, d.Package, d.ReadmeFn, readme); err != nil {
			return fmt.Errorf("writing dump for %s: %v", d.Package, err)
		}
		return nil
	})

	// Close explicitly rather than with defer: log.Fatalf exits the process
	// without running deferred calls, and a failed close on a written file
	// should be reported.
	closeErr := f.Close()
	if iterErr != nil {
		log.Fatalf("docDB.Iterate failed: %v", iterErr)
	}
	if closeErr != nil {
		log.Fatalf("closing exps/notfound.txt failed: %v", closeErr)
	}

	log.Printf("%d documents processed.", countAll)
	log.Printf("%d have readme.", countReadme)
	log.Printf("%d found %d important sentenses.", countHasSents, countSents)
}