Commit

Added url_filter option to allow crawl restrictions
fredwu committed Aug 26, 2017
1 parent 421b0ca commit 3ec1c4b
Showing 10 changed files with 100 additions and 32 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -11,8 +11,9 @@
- [Added] `:timeout` option
- [Added] `:user_agent` option
- [Added] `:save_to` option
- [Added] `:parser` option to allow custom parsing logic
- [Added] `:assets` option
- [Added] `:url_filter` option to allow custom URL filtering logic
- [Added] `:parser` option to allow custom parsing logic
- [Improved] Renamed `:max_levels` to `:max_depths`
- [Improved] Various small fixes and improvements

31 changes: 18 additions & 13 deletions README.md
@@ -20,7 +20,7 @@ Crawler is under active development, below is a non-comprehensive list of featur
- [x] css
- [x] images
- [ ] The ability to manually stop/pause/restart the crawler.
- [ ] Restrict crawlable domains, paths or file types.
- [x] Restrict crawlable domains, paths or file types.
- [x] Limit concurrent crawlers.
- [x] Limit rate of crawling.
- [x] Set crawler's user agent.
@@ -35,22 +35,27 @@ Crawler.crawl("http://elixir-lang.org", max_depths: 2)

## Configurations

| Option | Type | Default Value | Description |
|-----------------|---------|-----------------------|-------------|
| `:max_depths` | integer | `3` | Maximum nested depth of pages to crawl.
| `:workers` | integer | `10` | Maximum number of concurrent workers for crawling.
| `:interval` | integer | `0` | Rate limit control - number of milliseconds before crawling more pages, defaults to `0` which is effectively no rate limit.
| `:timeout` | integer | `5000` | Timeout value for fetching a page, in ms.
| `:user_agent` | string | `Crawler/x.x.x (...)` | User-Agent value sent by the fetch requests.
| `:save_to` | string | `nil` | When provided, the path for saving crawled pages.
| `:assets` | list | `[]` | Whether to fetch any asset files, available options: `"css"`, `"js"`, `"images"`.
| `:parser` | module | `Crawler.Parser` | The default parser, useful when you need to handle parsing differently or to add extra functionalities.
| Option | Type | Default Value | Description |
|-----------------|---------|-----------------------------|-------------|
| `:max_depths` | integer | `3` | Maximum nested depth of pages to crawl.
| `:workers` | integer | `10` | Maximum number of concurrent workers for crawling.
| `:interval` | integer | `0` | Rate limit control - number of milliseconds before crawling more pages, defaults to `0` which is effectively no rate limit.
| `:timeout` | integer | `5000` | Timeout value for fetching a page, in ms.
| `:user_agent` | string | `Crawler/x.x.x (...)` | User-Agent value sent by the fetch requests.
| `:save_to` | string | `nil` | When provided, the path for saving crawled pages.
| `:assets` | list | `[]` | Whether to fetch any asset files, available options: `"css"`, `"js"`, `"images"`.
| `:url_filter` | module | `Crawler.Fetcher.UrlFilter` | Custom URL filter, useful when you need to restrict crawlable domains, paths or file types.
| `:parser` | module | `Crawler.Parser` | Custom parser, useful when you need to handle parsing differently or to add extra functionalities.
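
For example, both options can be passed directly to a crawl. A minimal sketch only; `MyApp.SameHostFilter` is a hypothetical module conforming to `Crawler.Fetcher.UrlFilter.Spec`, not part of this commit:

```elixir
# Sketch only: MyApp.SameHostFilter is a hypothetical filter module.
Crawler.crawl("http://elixir-lang.org",
  max_depths: 2,
  url_filter: MyApp.SameHostFilter,
  parser: Crawler.Parser
)
```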

## Custom Parser
## Custom URL Filter

It is possible to swap in your custom parsing logic by specifying the `:parser` option. Your custom parser needs to conform to the `Crawler.Parser.Spec` behaviour:
It is possible to swap in your custom URL filtering or parsing logic as shown in the configurations section. Your custom modules need to conform to their respective behaviours:

```elixir
defmodule CustomUrlFilter do
@behaviour Crawler.Fetcher.UrlFilter.Spec
end

defmodule CustomParser do
@behaviour Crawler.Parser.Spec
end
46 changes: 38 additions & 8 deletions lib/crawler/fetcher/policer.ex
@@ -12,26 +12,50 @@ defmodule Crawler.Fetcher.Policer do
@doc """
## Examples
iex> Policer.police([depth: 1, max_depths: 2, url: "http://policer/"])
{:ok, [depth: 1, max_depths: 2, url: "http://policer/"]}
iex> Policer.police([
iex> depth: 1,
iex> max_depths: 2,
iex> url: "http://policer/",
iex> url_filter: UrlFilter
iex> ])
{:ok, [depth: 1, max_depths: 2, url: "http://policer/", url_filter: UrlFilter]}
iex> Policer.police([depth: 2, max_depths: 2, html_tag: "a"])
iex> Policer.police([
iex> depth: 2,
iex> max_depths: 2,
iex> html_tag: "a"
iex> ])
{:error, "Fetch failed 'within_fetch_depth?', with opts: [depth: 2, max_depths: 2, html_tag: \\\"a\\\"]."}
iex> Policer.police([depth: 3, max_depths: 2, html_tag: "img", url: "http://policer/hi.jpg"])
{:ok, [depth: 3, max_depths: 2, html_tag: "img", url: "http://policer/hi.jpg"]}
iex> Policer.police([
iex> depth: 3,
iex> max_depths: 2,
iex> html_tag: "img",
iex> url: "http://policer/hi.jpg",
iex> url_filter: UrlFilter
iex> ])
{:ok, [depth: 3, max_depths: 2, html_tag: "img", url: "http://policer/hi.jpg", url_filter: UrlFilter]}
iex> Policer.police([depth: 1, max_depths: 2, url: "ftp://hello.world"])
iex> Policer.police([
iex> depth: 1,
iex> max_depths: 2,
iex> url: "ftp://hello.world"
iex> ])
{:error, "Fetch failed 'acceptable_uri_scheme?', with opts: [depth: 1, max_depths: 2, url: \\\"ftp://hello.world\\\"]."}
iex> Crawler.Store.add("http://policer/exist/")
iex> Policer.police([depth: 1, max_depths: 2, url: "http://policer/exist/"])
iex> Policer.police([
iex> depth: 1,
iex> max_depths: 2,
iex> url: "http://policer/exist/"
iex> ])
{:error, "Fetch failed 'not_fetched_yet?', with opts: [depth: 1, max_depths: 2, url: \\\"http://policer/exist/\\\"]."}
"""
def police(opts) do
with {_, true} <- within_fetch_depth?(opts),
{_, true} <- acceptable_uri_scheme?(opts),
{_, true} <- not_fetched_yet?(opts)
{_, true} <- not_fetched_yet?(opts),
{_, true} <- perform_url_filtering(opts)
do
{:ok, opts}
else
@@ -60,6 +84,12 @@ defmodule Crawler.Fetcher.Policer do
{:not_fetched_yet?, !Store.find(opts[:url])}
end

defp perform_url_filtering(opts) do
{:ok, pass_through?} = opts[:url_filter].filter(opts[:url])

{:perform_url_filtering, pass_through?}
end

defp police_error(fail_type, opts) do
{:error, "Fetch failed '#{fail_type}', with opts: #{Kernel.inspect(opts)}."}
end
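
With this check in place, a filter that returns `{:ok, false}` causes `police/1` to reject the fetch. A sketch, assuming the failure is reported through `police_error/2` like the other checks; `RejectAllFilter` is hypothetical:

```elixir
# Hypothetical filter that rejects every URL, for illustration only.
defmodule RejectAllFilter do
  @behaviour Crawler.Fetcher.UrlFilter.Spec

  def filter(_url), do: {:ok, false}
end

Crawler.Fetcher.Policer.police([
  depth: 1,
  max_depths: 2,
  url: "http://policer/",
  url_filter: RejectAllFilter
])
# => {:error, "Fetch failed 'perform_url_filtering', with opts: ..."}
```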
14 changes: 14 additions & 0 deletions lib/crawler/fetcher/url_filter.ex
@@ -0,0 +1,14 @@
defmodule Crawler.Fetcher.UrlFilter do
@moduledoc """
  A placeholder module that lets all URLs pass through.
"""

@behaviour __MODULE__.Spec

@doc """
Whether to pass through a given URL.
  - `true` for letting the URL through.
  - `false` for rejecting the URL.
"""
def filter(_url), do: {:ok, true}
end
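
Since the default implementation always returns `{:ok, true}`, every URL is let through:

```elixir
Crawler.Fetcher.UrlFilter.filter("http://example.com/")
# => {:ok, true}
```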
9 changes: 9 additions & 0 deletions lib/crawler/fetcher/url_filter/spec.ex
@@ -0,0 +1,9 @@
defmodule Crawler.Fetcher.UrlFilter.Spec do
@moduledoc """
  Spec for defining a URL filter.
"""

@type url :: String.t

@callback filter(url) :: {:ok, boolean} | {:error, term}
end
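
A custom filter only needs to implement `filter/1` and return `{:ok, boolean}` (or `{:error, term}`). A minimal sketch that restricts crawling to a single host; the module name and host are illustrative, not part of this commit:

```elixir
defmodule MyApp.SameHostFilter do
  @moduledoc "Illustrative filter that only lets URLs on elixir-lang.org through."

  @behaviour Crawler.Fetcher.UrlFilter.Spec

  @allowed_host "elixir-lang.org"

  # Let a URL through only when its host matches the allowed host.
  def filter(url) do
    {:ok, URI.parse(url).host == @allowed_host}
  end
end
```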
3 changes: 3 additions & 0 deletions lib/crawler/options.ex
@@ -12,6 +12,7 @@ defmodule Crawler.Options do
@user_agent "Crawler/#{Mixfile.project[:version]} (https://github.com/fredwu/crawler)"
@save_to nil
@assets []
@url_filter Crawler.Fetcher.UrlFilter
@parser Crawler.Parser

@doc """
@@ -37,6 +38,7 @@
user_agent: user_agent(),
save_to: save_to(),
assets: assets(),
url_filter: url_filter(),
parser: parser(),
}, opts)
end
@@ -61,5 +63,6 @@
defp user_agent, do: Application.get_env(:crawler, :user_agent) || @user_agent
defp save_to, do: Application.get_env(:crawler, :save_to) || @save_to
defp assets, do: Application.get_env(:crawler, :assets) || @assets
defp url_filter, do: Application.get_env(:crawler, :url_filter) || @url_filter
defp parser, do: Application.get_env(:crawler, :parser) || @parser
end
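
Because every default falls back to `Application.get_env/2`, the new `:url_filter` option can also be set application-wide. A sketch using the Mix config style of the era; `MyApp.SameHostFilter` is the illustrative filter sketched earlier:

```elixir
# config/config.exs
use Mix.Config

config :crawler,
  url_filter: MyApp.SameHostFilter,
  parser: Crawler.Parser
```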
3 changes: 1 addition & 2 deletions lib/crawler/parser.ex
@@ -5,11 +5,10 @@ defmodule Crawler.Parser do

require Logger

alias __MODULE__.Spec
alias __MODULE__.{CssParser, HtmlParser, LinkParser}
alias Crawler.{Worker, Dispatcher}

@behaviour Spec
@behaviour __MODULE__.Spec

@doc """
## Examples
2 changes: 1 addition & 1 deletion test/lib/crawler/fetcher/policer_test.exs
@@ -1,7 +1,7 @@
defmodule Crawler.Fetcher.PolicerTest do
use Crawler.TestCase, async: true

alias Crawler.Fetcher.Policer
alias Crawler.Fetcher.{Policer, UrlFilter}

doctest Policer
end
7 changes: 7 additions & 0 deletions test/lib/crawler/fetcher/url_filter_test.exs
@@ -0,0 +1,7 @@
defmodule Crawler.Fetcher.UrlFilterTest do
use Crawler.TestCase, async: true

alias Crawler.Fetcher.UrlFilter

doctest UrlFilter
end
14 changes: 7 additions & 7 deletions test/lib/crawler/fetcher_test.exs
@@ -1,7 +1,7 @@
defmodule Crawler.FetcherTest do
use Crawler.TestCase, async: true

alias Crawler.{Fetcher, Store}
alias Crawler.{Fetcher, Fetcher.UrlFilter, Store}

doctest Fetcher

@@ -12,7 +12,7 @@ defmodule Crawler.FetcherTest do
Plug.Conn.resp(conn, 200, "<html>200</html>")
end

Fetcher.fetch(url: url, depth: 0, html_tag: "a")
Fetcher.fetch(url: url, depth: 0, url_filter: UrlFilter, html_tag: "a")

page = Store.find(url)

@@ -27,7 +27,7 @@ defmodule Crawler.FetcherTest do
Plug.Conn.resp(conn, 500, "<html>500</html>")
end

fetcher = Fetcher.fetch(url: url, depth: 0, html_tag: "a")
fetcher = Fetcher.fetch(url: url, depth: 0, url_filter: UrlFilter, html_tag: "a")

assert fetcher == {:error, "Failed to fetch #{url}, status code: 500"}
refute Store.find(url).body
@@ -37,12 +37,12 @@ defmodule Crawler.FetcherTest do
url = "#{url}/fetcher/timeout"

Bypass.expect_once bypass, "GET", "/fetcher/timeout", fn (conn) ->
:timer.sleep(3)
:timer.sleep(5)
Plug.Conn.resp(conn, 200, "<html>200</html>")
end

wait fn ->
fetcher = Fetcher.fetch(url: url, depth: 0, html_tag: "a", timeout: 1)
fetcher = Fetcher.fetch(url: url, depth: 0, url_filter: UrlFilter, html_tag: "a", timeout: 2)

assert fetcher == {:error, "Failed to fetch #{url}, reason: timeout"}
refute Store.find(url).body
@@ -56,7 +56,7 @@ defmodule Crawler.FetcherTest do
Plug.Conn.resp(conn, 200, "<html>200</html>")
end

fetcher = Fetcher.fetch(url: url, depth: 0, html_tag: "a", save_to: "nope")
fetcher = Fetcher.fetch(url: url, depth: 0, url_filter: UrlFilter, html_tag: "a", save_to: "nope")

assert {:error, "Cannot write to file nope/#{path}/fetcher/fail.html, reason: enoent"} == fetcher
end
@@ -68,7 +68,7 @@ defmodule Crawler.FetcherTest do
Plug.Conn.resp(conn, 200, "<html>200</html>")
end

Fetcher.fetch(url: url, depth: 0, html_tag: "a", save_to: tmp("fetcher"))
Fetcher.fetch(url: url, depth: 0, url_filter: UrlFilter, html_tag: "a", save_to: tmp("fetcher"))

wait fn ->
assert {:ok, "<html>200</html>"} == File.read(tmp("fetcher/#{path}/fetcher", "page.html"))
