ce is a Go package for multilingual web page content extraction. It extracts the content of article-type web pages, such as news stories and blog posts.
package main
import (
"encoding/json"
"flag"
"fmt"
"strings"
"github.com/crawlerclub/ce"
"github.com/crawlerclub/dl"
)
// Command-line flags for the example program.
var (
	// url is the address of the page to download and extract.
	url = flag.String("url", "http://china.huanqiu.com/article/2017-07/11034896.html", "news url")

	// debug is forwarded unchanged to ce.ParsePro.
	debug = flag.Bool("debug", false, "debug mode")
)
// main downloads the page named by the -url flag, runs ce.ParsePro on the
// downloaded text, and prints the resulting document to stdout as JSON.
// Download and marshalling errors are printed and end the program early.
func main() {
	flag.Parse()

	res := dl.DownloadUrl(*url)
	if res.Error != nil {
		fmt.Println(res.Error)
		return
	}

	// Keep only the host part of the remote address.
	// NOTE(review): assumes RemoteAddr is in "host:port" form — confirm against dl.
	// Cutting at the LAST colon (not the first) keeps a bracketed IPv6 host such
	// as "[::1]:443" in one piece; the brackets are stripped afterwards.
	ip := res.RemoteAddr
	if i := strings.LastIndex(ip, ":"); i >= 0 {
		ip = ip[:i]
	}
	ip = strings.TrimSuffix(strings.TrimPrefix(ip, "["), "]")

	doc := ce.ParsePro(*url, res.Text, ip, *debug)

	// Report a marshalling failure instead of silently printing an empty line.
	j, err := json.Marshal(doc)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(j))
}
ce can extract the following fields from raw HTML pages:
- title: the title of the article
- text: the main content of the article in plain text
- html: the main content of the article with basic HTML formatting, images included
- publish_date: the publish time of the article
- language: the language of the article
- location: the country code
- author: the author of the article
- images: the images used in the article