Skip to content
Permalink
Browse files

Make URL fetching concurrent

  • Loading branch information
frioux committed Nov 29, 2019
1 parent 72293ed commit 0e30ea5ec7af47d7d0a0f06494632b14bd9a926d
Showing with 227 additions and 108 deletions.
  1. +3 −3 README.mdwn
  2. +1 −0 go.mod
  3. +1 −0 go.sum
  4. +12 −12 help_generated.go
  5. +78 −92 internal/tool/rss/rss.go
  6. +1 −1 internal/tool/rss/rss_test.go
  7. +104 −0 internal/tool/rss/state.go
  8. +27 −0 internal/tool/rss/state_test.go
@@ -310,11 +310,11 @@ a single root for all files in the zipfile.

### `rss`

`rss` is a minimalist rss client. Outputs links as markdown on STDOUT. Takes url
to feed and path to state file. Example usage:
`rss` is a minimalist rss client. Outputs links as markdown on STDOUT. Takes urls
to feeds and path to state file. Example usage:

```bash
$ rss https://blog.afoolishmanifesto.com/index.xml afm.json
$ rss -state feed.json https://blog.afoolishmanifesto.com/index.xml
[Announcing shellquote](https://blog.afoolishmanifesto.com/posts/announcing-shellquote/)
[Detecting who used the EC2 metadata server with BCC](https://blog.afoolishmanifesto.com/posts/detecting-who-used-ec2-metadata-server-bcc/)
[Centralized known_hosts for ssh](https://blog.afoolishmanifesto.com/posts/centralized-known-hosts-for-ssh/)
1 go.mod
@@ -14,6 +14,7 @@ require (
github.com/pierrec/lz4/v3 v3.1.0
github.com/stretchr/testify v1.4.0
golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297 // indirect
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f
golang.org/x/sys v0.0.0-20190904005037-43c01164e931 // indirect
golang.org/x/text v0.3.2
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
1 go.sum
@@ -59,6 +59,7 @@ golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73r
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297 h1:k7pJ2yAPLPgbskkFdhRCsA77k2fySZ1zf2zCjvQCiIM=
golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f h1:wMNYb4v58l5UBM7MYRLPG6ZhfOqbKu7X5eyFl8ZhKvA=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -311,11 +311,11 @@ a single root for all files in the zipfile.
### ` + "`" + `rss` + "`" + `
` + "`" + `rss` + "`" + ` is a minimalist rss client. Outputs links as markdown on STDOUT. Takes url
to feed and path to state file. Example usage:
` + "`" + `rss` + "`" + ` is a minimalist rss client. Outputs links as markdown on STDOUT. Takes urls
to feeds and path to state file. Example usage:
` + "`" + `` + "`" + `` + "`" + `bash
$ rss https://blog.afoolishmanifesto.com/index.xml afm.json
$ rss -state feed.json https://blog.afoolishmanifesto.com/index.xml
[Announcing shellquote](https://blog.afoolishmanifesto.com/posts/announcing-shellquote/)
[Detecting who used the EC2 metadata server with BCC](https://blog.afoolishmanifesto.com/posts/detecting-who-used-ec2-metadata-server-bcc/)
[Centralized known_hosts for ssh](https://blog.afoolishmanifesto.com/posts/centralized-known-hosts-for-ssh/)
@@ -466,22 +466,22 @@ func init() {

"replace-unzip": readme[8780:9010],

"rss": readme[9010:9758],
"rss": readme[9010:9768],

"slack-deaddrop": readme[9758:9978],
"slack-deaddrop": readme[9768:9988],

"slack-open": readme[9978:10112],
"slack-open": readme[9988:10122],

"sm-list": readme[10112:10388],
"sm-list": readme[10122:10398],

"srv": readme[10388:10571],
"srv": readme[10398:10581],

"toml2json": readme[10571:10746],
"toml2json": readme[10581:10756],

"undefer": readme[10746:11054],
"undefer": readme[10756:11064],

"uni": readme[11054:11216],
"uni": readme[11064:11226],

"yaml2json": readme[11216:11293],
"yaml2json": readme[11226:11303],
}
}
@@ -1,23 +1,24 @@
package rss

import (
"encoding/json"
"context"
"flag"
"fmt"
"io"
"net/url"
"os"
"sort"

"github.com/frioux/leatherman/internal/lmhttp"
"github.com/mmcdole/gofeed"
"golang.org/x/sync/errgroup"
)

/*
Run is a minimalist rss client. Outputs links as markdown on STDOUT. Takes url
to feed and path to state file. Example usage:
Run is a minimalist rss client. Outputs links as markdown on STDOUT. Takes urls
to feeds and path to state file. Example usage:
```bash
$ rss https://blog.afoolishmanifesto.com/index.xml afm.json
$ rss -state feed.json https://blog.afoolishmanifesto.com/index.xml
[Announcing shellquote](https://blog.afoolishmanifesto.com/posts/announcing-shellquote/)
[Detecting who used the EC2 metadata server with BCC](https://blog.afoolishmanifesto.com/posts/detecting-who-used-ec2-metadata-server-bcc/)
[Centralized known_hosts for ssh](https://blog.afoolishmanifesto.com/posts/centralized-known-hosts-for-ssh/)
@@ -28,44 +29,102 @@ $ rss https://blog.afoolishmanifesto.com/index.xml afm.json
Command: rss
*/
func Run(args []string, _ io.Reader) error {
// NOTE(review): this span is diff text with the +/- markers stripped. The
// next two lines are the removed positional-argument check; their closing
// brace is not part of this view.
if len(args) != 3 {
fmt.Fprintf(os.Stderr, "Usage: %s feedURL statefile\n", args[0])
// Parse "-state <path>"; whatever remains after flag parsing is passed to
// run as the list of feed URLs.
// NOTE(review): per the flag package docs, ExitOnError makes Parse exit the
// process on a bad flag, so the error branch below should not fire — confirm.
flags := flag.NewFlagSet("rss", flag.ExitOnError)

var statePath string

flags.StringVar(&statePath, "state", "", "location to store state")
if err := flags.Parse(args[1:]); err != nil {
return fmt.Errorf("flags.Parse: %w", err)
}

// At least one feed URL is required; otherwise print usage and exit 1.
if len(flags.Args()) == 0 {
fmt.Fprintf(os.Stderr, "Usage: %s -state rss.json <url> [<url>...]\n", args[0])
os.Exit(1)
}

// NOTE(review): the next line is the removed call from the old
// two-positional-argument interface.
return run(args[1], args[2], os.Stdout)
// -state defaults to the empty string, so it has to be supplied explicitly.
if statePath == "" {
fmt.Fprintln(os.Stderr, "-state is required")
os.Exit(1)
}

return run(statePath, flags.Args(), os.Stdout)
}

func run(urlString, statePath string, w io.Writer) error {
fp := gofeed.NewParser()
// loadFeed fetches the feed at urlString with lmhttp.Get, parses the response
// body with fp, hands the parsed items to fixItems together with the parsed
// URL, and returns them.
//
// NOTE(review): this span is diff text with the +/- markers stripped, so each
// error return appears twice (the old single-value form followed by the new
// two-value form) and the removed syncRead call is still interleaved near the
// end.
func loadFeed(fp *gofeed.Parser, urlString string) ([]*gofeed.Item, error) {
feedURL, err := url.Parse(urlString)
if err != nil {
return fmt.Errorf("Couldn't parse feed url (%s): %w", feedURL, err)
// The new form reports the raw input string rather than the parse result.
return nil, fmt.Errorf("Couldn't parse feed url (%s): %w", urlString, err)
}

// NOTE(review): resp.Body is never closed in the visible code — presumably a
// leak unless lmhttp.Get arranges for it; confirm against lmhttp.
resp, err := lmhttp.Get(urlString)
if err != nil {
return fmt.Errorf("Couldn't get feed: %w", err)
return nil, fmt.Errorf("Couldn't get feed: %w", err)
}

f, err := fp.Parse(resp.Body)
if err != nil {
// NOTE(review): this is a parse failure, although the message says "fetch".
return fmt.Errorf("Couldn't fetch feed (%s): %w", feedURL, err)
return nil, fmt.Errorf("Couldn't fetch feed (%s): %w", feedURL, err)
}
fixItems(feedURL, f.Items)

// NOTE(review): the next three lines are the removed per-feed state handling
// from the old run function.
seen, err := syncRead(statePath, f.Items)
if err != nil {
return fmt.Errorf("Couldn't sync read (%s): %w", feedURL, err)
return f.Items, nil
}

// syncFeed narrows items with newItems using the GUID set stored in state
// under urlString, records the GUID of every remaining item in that set, and
// renders the remaining items to w. The visible code always returns nil.
//
// NOTE(review): this span is diff text with the +/- markers stripped; two
// removed lines from the old run function are interleaved below.
func syncFeed(state indexedStates, items []*gofeed.Item, urlString string, w io.Writer) error {
// First time this feed is seen: create its GUID set so the writes below do
// not hit a nil map.
if state[urlString] == nil {
state[urlString] = make(map[string]bool, len(items))
}

// NOTE(review): the next line is the removed form of the call.
items := newItems(seen, f.Items)
// Presumably keeps only the items whose GUID is not already in the set —
// verify against newItems.
items = newItems(state[urlString], items)

for _, i := range items {
state[urlString][i.GUID] = true
}

renderItems(w, items)

// NOTE(review): the next line is the removed rename from the old run function.
err = os.Rename(statePath+".tmp", statePath)
return nil
}

func run(statePath string, urls []string, w io.Writer) error {
state, err := readState(statePath)
if err != nil {
return fmt.Errorf("Couldn't rename state file (%s): %w", feedURL, err)
return fmt.Errorf("couldn't read state: %w", err)
}
fp := gofeed.NewParser()

results := make([][]*gofeed.Item, len(urls))
g, _ := errgroup.WithContext(context.Background())

for i, urlString := range urls {
i, urlString := i, urlString
g.Go(func() error { // O(n) goroutines
items, err := loadFeed(fp, urlString)
if err != nil {
return err
}
results[i] = items
return nil
})
}

if err := g.Wait(); err != nil {
fmt.Fprintf(os.Stderr, "%s\n", err)
os.Exit(1)
}
for i, items := range results {
if err := syncFeed(state, items, urls[i], w); err != nil {
fmt.Fprintf(os.Stderr, "%s\n", err)
os.Exit(1)
}
}

if err := writeState(statePath, state); err != nil {
return fmt.Errorf("Couldn't save state file: %w", err)
}
if err := os.Rename(statePath+".tmp", statePath); err != nil {
return fmt.Errorf("Couldn't rename state file: %w", err)
}

return nil
@@ -108,76 +167,3 @@ func newItems(seen map[string]bool, items []*gofeed.Item) []*gofeed.Item {

return ret
}

// syncRead loads the GUIDs recorded in the state file via readState, then
// hands writeState the sorted union of those GUIDs and the GUIDs of items.
//
// It returns the set of GUIDs that were recorded before this call, so the
// caller can work out which of items are new.
func syncRead(state string, items []*gofeed.Item) (map[string]bool, error) {
	previous, err := readState(state)
	if err != nil {
		return nil, fmt.Errorf("couldn't read state: %w", err)
	}

	// One pass over the stored GUIDs fills both the "seen before" set that is
	// returned and the merged set that is persisted.
	seen := make(map[string]bool, len(items))
	merged := make(map[string]bool, len(items)+len(previous))
	for _, guid := range previous {
		seen[guid] = true
		merged[guid] = true
	}
	for _, item := range items {
		merged[item.GUID] = true
	}

	// Sort so the stored list is deterministic regardless of map order.
	all := make([]string, 0, len(merged))
	for guid := range merged {
		all = append(all, guid)
	}
	sort.Strings(all)

	if err := writeState(state, all); err != nil {
		return nil, fmt.Errorf("couldn't write state: %w", err)
	}
	return seen, nil
}

// readState loads the list of already-seen GUIDs from the JSON file at state.
//
// A missing file is not an error: it means nothing has been recorded yet, and
// a nil slice is returned. Any other failure to open the file, or contents
// that cannot be decoded as a JSON array of strings, is returned as an error.
func readState(state string) ([]string, error) {
	file, err := os.Open(state)
	if os.IsNotExist(err) {
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("couldn't open state file: %w", err)
	}
	// Release the descriptor on every return path.
	defer file.Close()

	var guids []string
	if err := json.NewDecoder(file).Decode(&guids); err != nil {
		return nil, fmt.Errorf("couldn't decode state file: %w", err)
	}

	return guids, nil
}

// writeState encodes guids as tab-indented JSON into the file state + ".tmp",
// creating or truncating it.
//
// The file at state itself is not touched; the caller is expected to rename
// the ".tmp" file into place once it is ready to commit the new state.
func writeState(state string, guids []string) error {
	tmp, err := os.Create(state + ".tmp")
	if err != nil {
		return fmt.Errorf("couldn't create state file: %w", err)
	}

	encoder := json.NewEncoder(tmp)
	encoder.SetIndent("", "\t")
	if err := encoder.Encode(guids); err != nil {
		// Do not leak the descriptor on a failed encode. The encode error is
		// the one worth reporting, so the close error is deliberately dropped.
		_ = tmp.Close()
		return fmt.Errorf("couldn't encode state file: %w", err)
	}

	// Close can surface a deferred write error, so it is checked.
	if err := tmp.Close(); err != nil {
		return fmt.Errorf("couldn't write state file: %w", err)
	}
	return nil
}
@@ -429,7 +429,7 @@ might realize.&lt;/p&gt;
}

buf := &bytes.Buffer{}
err = run(ts.URL, f.Name(), buf)
err = run(f.Name(), []string{ts.URL}, buf)
assert.NoError(t, err)
assert.Equal(t, `[Sorting Books](https://blog.afoolishmanifesto.com/posts/sorting-books/)
[Automating Email](https://blog.afoolishmanifesto.com/posts/automating-email/)

0 comments on commit 0e30ea5

Please sign in to comment.
You can’t perform that action at this time.