Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
171 lines (157 sloc) 4.66 KB
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"encoding/xml"
"strings"
"github.com/antchfx/htmlquery"
"github.com/antchfx/xmlquery"
"golang.org/x/net/html"
)
// XMLElement is the representation of a XML tag. It wraps either a
// *html.Node or a *xmlquery.Node and exposes the same accessors for both.
type XMLElement struct {
// Name is the name of the tag
Name string
// Text is the inner text of the node the element was created from,
// captured once by the constructor.
Text string
// attributes holds the attribute slice of the wrapped node. Attr reads it
// as a []html.Attribute when isHTML is true and as a []xml.Attr otherwise.
attributes interface{}
// Request is the request object of the element's HTML document
Request *Request
// Response is the Response object of the element's HTML document
Response *Response
// DOM is the DOM object of the page. DOM is relative
// to the current XMLElement and is either a *html.Node or a *xmlquery.Node
// based on how the XMLElement was created.
DOM interface{}
// isHTML records which kind of node DOM holds: true for a *html.Node,
// false for a *xmlquery.Node. The methods use it to pick the query package.
isHTML bool
}
// NewXMLElementFromHTMLNode creates a XMLElement from a html.Node.
//
// The element takes its name and attributes from s, its text from the inner
// text of s, and is linked to resp and resp.Request. s itself is kept as the
// element's DOM and the element is flagged as HTML-backed.
func NewXMLElementFromHTMLNode(resp *Response, s *html.Node) *XMLElement {
	el := new(XMLElement)
	el.Name = s.Data
	el.Request = resp.Request
	el.Response = resp
	el.Text = htmlquery.InnerText(s)
	el.DOM = s
	el.attributes = s.Attr
	el.isHTML = true
	return el
}
// NewXMLElementFromXMLNode creates a XMLElement from a xmlquery.Node.
//
// The element takes its name and attributes from s, its text from
// s.InnerText(), and is linked to resp and resp.Request. s itself is kept as
// the element's DOM and the element is flagged as XML-backed (not HTML).
func NewXMLElementFromXMLNode(resp *Response, s *xmlquery.Node) *XMLElement {
	el := new(XMLElement)
	el.Name = s.Data
	el.Request = resp.Request
	el.Response = resp
	el.Text = s.InnerText()
	el.DOM = s
	el.attributes = s.Attr
	el.isHTML = false
	return el
}
// Attr returns the value of the attribute named k on this XMLElement, or an
// empty string if the element has no such attribute.
//
// For HTML-backed elements k is compared with each attribute's Key. For
// XML-backed elements k is compared with the local part of the attribute
// name only, so the attribute's namespace is not taken into account.
func (h *XMLElement) Attr(k string) string {
	// Dispatch on the dynamic type of the stored attributes instead of
	// asserting the type implied by isHTML: an unchecked assertion panics
	// when the types disagree, e.g. on a zero-value XMLElement whose
	// attributes field is nil. Unknown types simply yield "".
	switch attrs := h.attributes.(type) {
	case []html.Attribute:
		for _, a := range attrs {
			if a.Key == k {
				return a.Val
			}
		}
	case []xml.Attr:
		for _, a := range attrs {
			if a.Name.Local == k {
				return a.Value
			}
		}
	}
	return ""
}
// ChildText returns the stripped text content of the first element that
// matches xpathQuery, searched from the element's DOM node. It returns an
// empty string when the query matches nothing.
//
// Only the single node returned by FindOne is used; the text of further
// matches is not included.
func (h *XMLElement) ChildText(xpathQuery string) string {
	if !h.isHTML {
		if match := xmlquery.FindOne(h.DOM.(*xmlquery.Node), xpathQuery); match != nil {
			return strings.TrimSpace(match.InnerText())
		}
		return ""
	}
	if match := htmlquery.FindOne(h.DOM.(*html.Node), xpathQuery); match != nil {
		return strings.TrimSpace(htmlquery.InnerText(match))
	}
	return ""
}
// ChildAttr returns the stripped value of the attribute attrName on the
// first element that matches xpathQuery. It returns an empty string when the
// query matches nothing or the matched element has no such attribute.
func (h *XMLElement) ChildAttr(xpathQuery, attrName string) string {
	if !h.isHTML {
		node := xmlquery.FindOne(h.DOM.(*xmlquery.Node), xpathQuery)
		if node == nil {
			return ""
		}
		for i := range node.Attr {
			// XML attributes are matched on the local name only.
			if node.Attr[i].Name.Local == attrName {
				return strings.TrimSpace(node.Attr[i].Value)
			}
		}
		return ""
	}

	node := htmlquery.FindOne(h.DOM.(*html.Node), xpathQuery)
	if node == nil {
		return ""
	}
	for i := range node.Attr {
		if node.Attr[i].Key == attrName {
			return strings.TrimSpace(node.Attr[i].Val)
		}
	}
	return ""
}
// ChildAttrs returns the stripped values of the attribute attrName taken
// from every element that matches xpathQuery. Matched elements without the
// attribute contribute nothing; the result is nil when no value was found.
func (h *XMLElement) ChildAttrs(xpathQuery, attrName string) []string {
	var values []string
	// collect appends value when name is the attribute being looked for.
	collect := func(name, value string) {
		if name == attrName {
			values = append(values, strings.TrimSpace(value))
		}
	}

	if !h.isHTML {
		xmlquery.FindEach(h.DOM.(*xmlquery.Node), xpathQuery, func(_ int, node *xmlquery.Node) {
			for _, a := range node.Attr {
				// XML attributes are matched on the local name only.
				collect(a.Name.Local, a.Value)
			}
		})
		return values
	}

	htmlquery.FindEach(h.DOM.(*html.Node), xpathQuery, func(_ int, node *html.Node) {
		for _, a := range node.Attr {
			collect(a.Key, a.Val)
		}
	})
	return values
}
// ChildTexts returns the stripped text content of every element that matches
// xpathQuery, one entry per matched element, in the order FindEach visits
// them. The result is an empty, non-nil slice when nothing matches.
func (h *XMLElement) ChildTexts(xpathQuery string) []string {
	collected := []string{}
	switch {
	case h.isHTML:
		root := h.DOM.(*html.Node)
		htmlquery.FindEach(root, xpathQuery, func(_ int, node *html.Node) {
			raw := htmlquery.InnerText(node)
			collected = append(collected, strings.TrimSpace(raw))
		})
	default:
		root := h.DOM.(*xmlquery.Node)
		xmlquery.FindEach(root, xpathQuery, func(_ int, node *xmlquery.Node) {
			raw := node.InnerText()
			collected = append(collected, strings.TrimSpace(raw))
		})
	}
	return collected
}