Skip to content
This repository has been archived by the owner on Nov 28, 2023. It is now read-only.

Commit

Permalink
more test; added "brass-rail" to the list of tags to clean
Browse files: browse the repository at this point in the history
  • Loading branch information
quipo committed Jan 20, 2017
1 parent f31638b commit 3fc8d86
Show file tree
Hide file tree
Showing 5 changed files with 2,644 additions and 4 deletions.
5 changes: 3 additions & 2 deletions cleaner.go
Expand Up @@ -51,8 +51,9 @@ var removeNodesRegEx = regexp.MustCompile("" +
"^banner|" +
"^bar$|" +
"blog-pager|" +
"button|" +
"brass\\-rail|" +
"breadcrumbs|" +
"button|" +
"byline|" +
"cabecalho|" +
"^caption$|" +
Expand Down Expand Up @@ -210,7 +211,7 @@ var removeNodesRegEx = regexp.MustCompile("" +
// Clean removes HTML elements around the main content and prepares the document for parsing
func (c *Cleaner) Clean(docToClean *goquery.Document) *goquery.Document {
if c.config.debug {
log.Println("Starting cleaning phase with Cleaner")
log.Println("Starting cleaning phase with Cleaner\n")
}
docToClean = c.cleanArticleTags(docToClean)
docToClean = c.cleanEMTags(docToClean)
Expand Down
21 changes: 20 additions & 1 deletion crawler_test.go
Expand Up @@ -41,7 +41,7 @@ func ValidateArticle(expected Article, removed *[]string) error {
}

if !strings.Contains(result.CleanedText, expected.CleanedText) {
//fmt.Printf("EXPECTED: %s \n\n\n\nACTUAL: %s\n\n", expected.CleanedText, result.CleanedText)
fmt.Printf("EXPECTED: %s \n\n\n\nACTUAL: %s\n\n", expected.CleanedText, result.CleanedText)
return fmt.Errorf("article cleanedText does not contain %q", expected.CleanedText)
}

Expand Down Expand Up @@ -570,6 +570,25 @@ func Test_HuffingtonPostCoUk(t *testing.T) {
}
}

// Test_HuffingtonPostJp runs ValidateArticle against the expected metadata
// for a huffingtonpost.jp article and reports any error it returns.
//
// CleanedText is deliberately left empty here, so only the metadata fields
// (title, description, keywords, canonical link, top image) carry real
// expectations. Expected links are not set for this article.
func Test_HuffingtonPostJp(t *testing.T) {
	removed := []string{"~~~REMOVED~~~"}

	want := Article{
		Domain:          "huffingtonpost.jp",
		CanonicalLink:   "http://www.huffingtonpost.jp/2015/03/03/tuna-death_n_6796602.html",
		TopImage:        "http://i.huffpost.com/gen/2678692/images/o-TUNA-DEATH-facebook.jpg",
		Title:           "クロマグロ残り2匹 葛西臨海水族園の大量死は未だに原因不明",
		MetaDescription: "クロマグロやカツオ類が大量死した問題で、葛西臨海水族園(東京都江戸川区)は3日、病理検査の結果、海の養殖魚を大量死させることで知られる2種類のウイルスが原因ではないことが確認されたと発表した。",
		MetaKeywords:    "クロマグロ残り2匹 葛西臨海水族園の大量死は未だに原因不明, japan",
		CleanedText:     "",
	}

	if err := ValidateArticle(want, &removed); err != nil {
		t.Error(err)
	}
}

func Test_IncCom(t *testing.T) {
article := Article{
Domain: "inc.com",
Expand Down
1 change: 0 additions & 1 deletion outputformatter.go
Expand Up @@ -83,7 +83,6 @@ func (formatter *outputFormatter) linksToText() []string {
}

func (formatter *outputFormatter) getOutputText() string {

out := formatter.topNode.Text()
out = normalizeWhitespaceRegexp.ReplaceAllString(out, " ")

Expand Down
2,351 changes: 2,351 additions & 0 deletions sites/huffingtonpost.jp.html

Large diffs are not rendered by default.

0 comments on commit 3fc8d86

Please sign in to comment.