This repository has been archived by the owner on Dec 10, 2019. It is now read-only.
/
plugin.go
85 lines (72 loc) · 2.1 KB
/
plugin.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
package parahumans
import (
"context"
"errors"
"html"
"net/http"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/fortytw2/hydrocarbon"
dc "github.com/fortytw2/hydrocarbon/discollect"
"github.com/fortytw2/hydrocarbon/httpx"
)
// Plugin is the discollect plugin for the parahumans.wordpress.com serial.
// Dated post URLs (/<digits>/<digits>/<digits>/<slug>) are handled by phPage,
// and scrapes are never re-scheduled.
var Plugin = &dc.Plugin{
	Name:      "parahumans",
	Scheduler: dc.NeverSchedule,

	// Regexps a URL has to pass before ConfigCreator is consulted.
	// NOTE(review): the dots are unescaped, so each matches any character —
	// confirm whether that looseness is intended before tightening.
	Entrypoints: []string{`.*parahumans.wordpress.com.*`},

	ConfigCreator: wormConfig,

	Routes: map[string]dc.Handler{
		`https:\/\/parahumans.wordpress.com\/(\d+)\/(\d+)\/(\d+)\/(.*)`: phPage,
	},
}

// wormConfig is Plugin's ConfigCreator. The incoming URL has already passed
// the Entrypoints regexps and there is only one possible config for this
// plugin, so both arguments are ignored: the result is always a full scrape
// titled "Worm" that starts from the first chapter.
func wormConfig(_ string, _ *dc.HandlerOpts) (string, *dc.Config, error) {
	cfg := &dc.Config{
		Type:        dc.FullScrape,
		Entrypoints: []string{"https://parahumans.wordpress.com/2011/06/11/1-1/"},
	}
	return "Worm", cfg, nil
}
// phPage handles a single chapter page. It fetches t.URL, extracts the
// chapter title, publication date and body, and returns them as one
// hydrocarbon.Post together with a follow-up task for the page linked from
// the ".nav-next" element.
//
// An error response is returned when the fetch fails, the status is not 200,
// the HTML cannot be parsed, the date does not match "January _2, 2006", or
// no usable next-page link is present.
//
// ctx is currently unused: the request is issued through ho.Client.Get and is
// not bound to the context.
func phPage(ctx context.Context, ho *dc.HandlerOpts, t *dc.Task) *dc.HandlerResponse {
	resp, err := ho.Client.Get(t.URL)
	if err != nil {
		return dc.ErrorResponse(err)
	}
	defer httpx.DrainAndClose(resp.Body)

	if resp.StatusCode != http.StatusOK {
		return dc.ErrorResponse(errors.New("did not get 200 for " + t.URL))
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return dc.ErrorResponse(err)
	}

	// Text() concatenates the text of every matched element, so restrict each
	// lookup to the first match (as the next-link lookup below already does);
	// otherwise a second ".entry-date" on the page would make the date
	// unparseable.
	title := strings.TrimSpace(doc.Find(".entry-title").First().Text())
	date := strings.TrimSpace(doc.Find(".entry-date").First().Text())

	dateTs, err := time.Parse("January _2, 2006", date)
	if err != nil {
		return dc.ErrorResponse(err)
	}

	h := doc.Find(".entry-content")

	// An href that is present but blank is as useless as a missing one: it
	// would enqueue a task with an empty URL.
	nextPageURL, ok := doc.Find(".nav-next").Find("a").First().Attr("href")
	if !ok || strings.TrimSpace(nextPageURL) == "" {
		return dc.ErrorResponse(errors.New("no url for next page on " + t.URL))
	}

	// Strip links that are direct children of paragraphs, plus the post
	// metadata and sharing widgets, so only the chapter content is serialised.
	h.Find("p > a").Remove()
	h.Find(".entry-meta , #jp-post-flair , .sd-sharing").Remove()

	body, err := h.Html()
	if err != nil {
		return dc.ErrorResponse(err)
	}

	// Normalise non-breaking spaces to plain spaces before unescaping entities.
	// The character is written as an escape because the raw literal it replaces
	// is indistinguishable from an ordinary space when read.
	// NOTE(review): confirm U+00A0 was the character originally intended.
	body = strings.Replace(strings.TrimSpace(body), "\u00a0", " ", -1)

	return dc.Response([]interface{}{
		&hydrocarbon.Post{
			Author:      "wildbow",
			PostedAt:    dateTs,
			OriginalURL: t.URL,
			Title:       title,
			Body:        html.UnescapeString(body),
		},
	}, &dc.Task{
		URL: nextPageURL,
	})
}