Skip to content

Commit

Permalink
scrape: support textarea, radio, and checkbox
Browse files Browse the repository at this point in the history
Improve form parsing to support textarea elements, and handle radio and
checkbox inputs properly (only include the value if the "checked"
attribute is set).

Also allow a nil setValues func for fetchAndSubmitForm. While I don't know
of a case (yet) where we want to do this, there's no point in panicking
when it is nil.
  • Loading branch information
willnorris committed Mar 2, 2020
1 parent d913de9 commit 5f66efb
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 5 deletions.
31 changes: 26 additions & 5 deletions scrape/forms.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"fmt"
"net/http"
"net/url"
"strings"

"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
Expand All @@ -29,8 +30,8 @@ type htmlForm struct {
}

// parseForms parses and returns all form elements beneath node. Form values
// include all nested input elements within the form (textarea is not currently
// supported).
// include all input and textarea elements within the form. The values of radio
// and checkbox inputs are included only if they are checked.
//
// In the future, we might want to allow a custom selector to be passed in to
// further restrict what forms will be returned.
Expand All @@ -47,10 +48,28 @@ func parseForms(node *html.Node) (forms []htmlForm) {

s.Find("input").Each(func(_ int, s *goquery.Selection) {
name, _ := s.Attr("name")
if name == "" {
return
}

typ, _ := s.Attr("type")
typ = strings.ToLower(typ)
_, checked := s.Attr("checked")
if (typ == "radio" || typ == "checkbox") && !checked {
return
}

value, _ := s.Attr("value")
if name != "" {
form.Values.Add(name, value)
form.Values.Add(name, value)
})
s.Find("textarea").Each(func(_ int, s *goquery.Selection) {
name, _ := s.Attr("name")
if name == "" {
return
}

value := s.Text()
form.Values.Add(name, value)
})
forms = append(forms, form)
})
Expand Down Expand Up @@ -87,7 +106,9 @@ func fetchAndSubmitForm(client *http.Client, urlStr string, setValues func(url.V
actionURL = resp.Request.URL.ResolveReference(actionURL)

// allow caller to fill out the form
setValues(form.Values)
if setValues != nil {
setValues(form.Values)
}

resp, err = client.PostForm(actionURL.String(), form.Values)
if err != nil {
Expand Down
32 changes: 32 additions & 0 deletions scrape/forms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,38 @@ func Test_ParseForms(t *testing.T) {
{Action: "a2", Method: "m2", Values: url.Values{"n2": {"v2"}}},
},
},
{
"form with radio buttons (none checked)",
`<html><form>
<input type="radio" name="n1" value="v1">
<input type="radio" name="n1" value="v2">
<input type="radio" name="n1" value="v3">
</form></html>`,
[]htmlForm{{Values: url.Values{}}},
},
{
"form with radio buttons",
`<html><form>
<input type="radio" name="n1" value="v1">
<input type="radio" name="n1" value="v2">
<input type="radio" name="n1" value="v3" checked>
</form></html>`,
[]htmlForm{{Values: url.Values{"n1": {"v3"}}}},
},
{
"form with checkboxes",
`<html><form>
<input type="checkbox" name="n1" value="v1" checked>
<input type="checkbox" name="n2" value="v2">
<input type="checkbox" name="n3" value="v3" checked>
</form></html>`,
[]htmlForm{{Values: url.Values{"n1": {"v1"}, "n3": {"v3"}}}},
},
{
"single form with textarea",
`<html><form><textarea name="n1">v1</textarea></form></html>`,
[]htmlForm{{Values: url.Values{"n1": {"v1"}}}},
},
}

for _, tt := range tests {
Expand Down

0 comments on commit 5f66efb

Please sign in to comment.