Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
172 lines (158 sloc) 4.56 KB
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"errors"
"reflect"
"strings"
"github.com/PuerkitoBio/goquery"
)
// Unmarshal fills v from the element's selection (h.DOM). It is a shorthand
// for calling colly.UnmarshalHTML with v and h.DOM, and returns whatever
// error that call reports.
func (h *HTMLElement) Unmarshal(v interface{}) error {
	dom := h.DOM
	return UnmarshalHTML(v, dom)
}
// UnmarshalHTML declaratively extracts text or attributes to a struct from
// HTML response using struct tags composed of css selectors.
// Allowed struct tags:
//   - "selector" (required): CSS (goquery) selector of the desired data.
//     The special value "-" makes the field be ignored.
//   - "attr" (optional): Selects the matching element's attribute's value.
//     Leave it blank or omit to get the text of the element.
//
// Example struct declaration:
//
//	type Nested struct {
//		String  string   `selector:"div > p"`
//		Classes []string `selector:"li" attr:"class"`
//		Struct  *Nested  `selector:"div > div"`
//	}
//
// Supported types: struct, *struct, string, []string, []struct, []*struct
//
// v must be a non-nil pointer to a struct; any other value yields an error
// instead of a reflection panic. Fields that cannot be set (for example
// unexported fields) are skipped. Processing stops at the first field that
// fails, and that error is returned.
func UnmarshalHTML(v interface{}, s *goquery.Selection) error {
	rv := reflect.ValueOf(v)
	if rv.Kind() != reflect.Ptr || rv.IsNil() {
		return errors.New("Invalid type or nil-pointer")
	}

	sv := rv.Elem()
	// NumField panics on anything but a struct, so reject e.g. *string here.
	if sv.Kind() != reflect.Struct {
		return errors.New("Invalid type or nil-pointer")
	}
	st := sv.Type()

	for i := 0; i < sv.NumField(); i++ {
		attrV := sv.Field(i)
		// CanSet already implies addressability, so a single check suffices.
		if !attrV.CanSet() {
			continue
		}
		if err := unmarshalAttr(s, attrV, st.Field(i)); err != nil {
			return err
		}
	}
	return nil
}
// unmarshalAttr populates a single struct field (attrV, described by attrT)
// from the selection s, driven by the field's "selector" and "attr" tags.
//
// A selector of "-" means the field is ignored. Otherwise the field's kind
// decides how it is filled: slices, structs and pointers are delegated to
// unmarshalSlice, unmarshalStruct and unmarshalPtr, while string fields
// receive the value that getDOMValue extracts from s.Find(selector).
// Any other kind results in an "Invalid type" error.
func unmarshalAttr(s *goquery.Selection, attrV reflect.Value, attrT reflect.StructField) error {
	selector := attrT.Tag.Get("selector")
	// A selector of "-" marks the field as one that should be ignored.
	if selector == "-" {
		return nil
	}
	htmlAttr := attrT.Tag.Get("attr")

	// TODO support more types
	switch attrV.Kind() {
	case reflect.Slice:
		return unmarshalSlice(s, selector, htmlAttr, attrV)
	case reflect.String:
		// SetString works for every field of string kind, including named
		// string types, where Set with a plain string value would panic.
		attrV.SetString(getDOMValue(s.Find(selector), htmlAttr))
		return nil
	case reflect.Struct:
		return unmarshalStruct(s, selector, attrV)
	case reflect.Ptr:
		return unmarshalPtr(s, selector, attrV)
	default:
		return errors.New("Invalid type: " + attrV.String())
	}
}
// unmarshalStruct fills the struct-kind field attrV from the selection s.
//
// When selector is non-empty the nested struct is populated from
// s.Find(selector); otherwise s itself is used. If the resulting selection
// holds no nodes, attrV is left untouched and nil is returned. The data is
// unmarshalled into a fresh value which is assigned to attrV only when
// UnmarshalHTML succeeds, so attrV is not modified on error.
func unmarshalStruct(s *goquery.Selection, selector string, attrV reflect.Value) error {
	newS := s
	if selector != "" {
		newS = newS.Find(selector)
	}
	// Test emptiness by length so that an empty but non-nil node slice is
	// treated as "no match" as well.
	if len(newS.Nodes) == 0 {
		return nil
	}

	v := reflect.New(attrV.Type())
	if err := UnmarshalHTML(v.Interface(), newS); err != nil {
		return err
	}
	attrV.Set(v.Elem())
	return nil
}
// unmarshalPtr fills the pointer-kind field attrV from the selection s.
//
// When selector is non-empty the target is populated from s.Find(selector);
// otherwise s itself is used. If the resulting selection holds no nodes,
// attrV is left untouched (a nil pointer stays nil) and nil is returned.
// Only pointers to structs are supported; any other pointee type yields an
// error. On success attrV is set to a newly allocated, populated struct.
func unmarshalPtr(s *goquery.Selection, selector string, attrV reflect.Value) error {
	newS := s
	if selector != "" {
		newS = newS.Find(selector)
	}
	// Test emptiness by length so that an empty but non-nil node slice is
	// treated as "no match" as well.
	if len(newS.Nodes) == 0 {
		return nil
	}

	e := attrV.Type().Elem()
	if e.Kind() != reflect.Struct {
		// The field is a pointer, not a slice; report it as such.
		return errors.New("Invalid pointer type")
	}

	v := reflect.New(e)
	if err := UnmarshalHTML(v.Interface(), newS); err != nil {
		return err
	}
	attrV.Set(v)
	return nil
}
// unmarshalSlice fills the slice-kind field attrV with one element per node
// matched by s.Find(selector).
//
// Supported element types are string (each element receives the value that
// getDOMValue extracts using htmlAttr), struct and pointer-to-struct (each
// element is populated by UnmarshalHTML from the matched node). Any other
// element type yields an "Invalid slice type" error.
//
// A nil slice is replaced by an empty one before appending, so the field is
// non-nil afterwards even when nothing matched. Elements are appended to
// whatever the slice already contains. If unmarshalling an element fails,
// iteration stops, the failing element is not appended, and the error is
// returned.
func unmarshalSlice(s *goquery.Selection, selector, htmlAttr string, attrV reflect.Value) error {
	if attrV.IsNil() {
		attrV.Set(reflect.MakeSlice(attrV.Type(), 0, 0))
	}

	elemT := attrV.Type().Elem()
	var firstErr error

	switch elemT.Kind() {
	case reflect.String:
		s.Find(selector).Each(func(_ int, sel *goquery.Selection) {
			// Build the element with the slice's own element type so that
			// named string types are appended without a reflection panic.
			item := reflect.New(elemT).Elem()
			item.SetString(getDOMValue(sel, htmlAttr))
			attrV.Set(reflect.Append(attrV, item))
		})
	case reflect.Ptr:
		structT := elemT.Elem()
		// UnmarshalHTML can only populate structs; reject e.g. []*string.
		if structT.Kind() != reflect.Struct {
			return errors.New("Invalid slice type")
		}
		s.Find(selector).EachWithBreak(func(_ int, innerSel *goquery.Selection) bool {
			item := reflect.New(structT)
			if firstErr = UnmarshalHTML(item.Interface(), innerSel); firstErr != nil {
				return false
			}
			attrV.Set(reflect.Append(attrV, item))
			return true
		})
	case reflect.Struct:
		s.Find(selector).EachWithBreak(func(_ int, innerSel *goquery.Selection) bool {
			item := reflect.New(elemT)
			if firstErr = UnmarshalHTML(item.Interface(), innerSel); firstErr != nil {
				return false
			}
			attrV.Set(reflect.Append(attrV, item.Elem()))
			return true
		})
	default:
		return errors.New("Invalid slice type")
	}
	return firstErr
}
// getDOMValue extracts a string from the selection s.
//
// With a non-empty attr it returns the value reported by s.Attr(attr); the
// second result of Attr is discarded. With an empty attr it returns the
// text of s.First() with surrounding whitespace trimmed.
func getDOMValue(s *goquery.Selection, attr string) string {
	if attr != "" {
		value, _ := s.Attr(attr)
		return value
	}
	text := s.First().Text()
	return strings.TrimSpace(text)
}