Commit

Added url_filter option to allow crawl restrictions
fredwu committed Aug 26, 2017
1 parent 421b0ca commit 3ec1c4b
Showing 10 changed files with 100 additions and 32 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -11,8 +11,9 @@
- [Added] `:timeout` option
- [Added] `:user_agent` option
- [Added] `:save_to` option
- [Added] `:parser` option to allow custom parsing logic
- [Added] `:assets` option
- [Added] `:url_filter` option to allow custom URL filtering logic
- [Added] `:parser` option to allow custom parsing logic
- [Improved] Renamed `:max_levels` to `:max_depths`
- [Improved] Various small fixes and improvements

31 changes: 18 additions & 13 deletions README.md
@@ -20,7 +20,7 @@ Crawler is under active development, below is a non-comprehensive list of featur
- [x] css
- [x] images
- [ ] The ability to manually stop/pause/restart the crawler.
- [ ] Restrict crawlable domains, paths or file types.
- [x] Restrict crawlable domains, paths or file types.
- [x] Limit concurrent crawlers.
- [x] Limit rate of crawling.
- [x] Set crawler's user agent.
@@ -35,22 +35,27 @@ Crawler.crawl("http://elixir-lang.org", max_depths: 2)

## Configurations

| Option | Type | Default Value | Description |
|-----------------|---------|-----------------------|-------------|
| `:max_depths` | integer | `3` | Maximum nested depth of pages to crawl.
| `:workers` | integer | `10` | Maximum number of concurrent workers for crawling.
| `:interval` | integer | `0` | Rate limit control - number of milliseconds before crawling more pages, defaults to `0` which is effectively no rate limit.
| `:timeout` | integer | `5000` | Timeout value for fetching a page, in ms.
| `:user_agent` | string | `Crawler/x.x.x (...)` | User-Agent value sent by the fetch requests.
| `:save_to` | string | `nil` | When provided, the path for saving crawled pages.
| `:assets` | list | `[]` | Whether to fetch any asset files, available options: `"css"`, `"js"`, `"images"`.
| `:parser` | module | `Crawler.Parser` | The default parser, useful when you need to handle parsing differently or to add extra functionalities.
| Option | Type | Default Value | Description |
|-----------------|---------|-----------------------------|-------------|
| `:max_depths` | integer | `3` | Maximum nested depth of pages to crawl.
| `:workers` | integer | `10` | Maximum number of concurrent workers for crawling.
| `:interval` | integer | `0` | Rate limit control - number of milliseconds before crawling more pages, defaults to `0` which is effectively no rate limit.
| `:timeout` | integer | `5000` | Timeout value for fetching a page, in ms.
| `:user_agent` | string | `Crawler/x.x.x (...)` | User-Agent value sent by the fetch requests.
| `:save_to` | string | `nil` | When provided, the path for saving crawled pages.
| `:assets` | list | `[]` | Whether to fetch any asset files, available options: `"css"`, `"js"`, `"images"`.
| `:url_filter` | module | `Crawler.Fetcher.UrlFilter` | Custom URL filter, useful when you need to restrict crawlable domains, paths or file types.
| `:parser` | module | `Crawler.Parser` | Custom parser, useful when you need to handle parsing differently or to add extra functionalities.
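
For example, both options can be passed directly to a crawl. A minimal sketch only; `MyApp.SameHostFilter` is a hypothetical module conforming to `Crawler.Fetcher.UrlFilter.Spec`, not part of this commit:

```elixir
# Sketch only: MyApp.SameHostFilter is a hypothetical filter module.
Crawler.crawl("http://elixir-lang.org",
  max_depths: 2,
  url_filter: MyApp.SameHostFilter,
  parser: Crawler.Parser
)
```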

## Custom Parser
## Custom URL Filter

It is possible to swap in your custom parsing logic by specifying the `:parser` option. Your custom parser needs to conform to the `Crawler.Parser.Spec` behaviour:
It is possible to swap in your custom URL filtering or parsing logic as shown in the configurations section. Your custom modules need to conform to their respective behaviours:

```elixir
defmodule CustomUrlFilter do
@behaviour Crawler.Fetcher.UrlFilter.Spec
end

defmodule CustomParser do
@behaviour Crawler.Parser.Spec
end
46 changes: 38 additions & 8 deletions lib/crawler/fetcher/policer.ex
@@ -12,26 +12,50 @@ defmodule Crawler.Fetcher.Policer do
@doc """
## Examples
iex> Policer.police([depth: 1, max_depths: 2, url: "http://policer/"])
{:ok, [depth: 1, max_depths: 2, url: "http://policer/"]}
iex> Policer.police([
iex> depth: 1,
iex> max_depths: 2,
iex> url: "http://policer/",
iex> url_filter: UrlFilter
iex> ])
{:ok, [depth: 1, max_depths: 2, url: "http://policer/", url_filter: UrlFilter]}
iex> Policer.police([depth: 2, max_depths: 2, html_tag: "a"])
iex> Policer.police([
iex> depth: 2,
iex> max_depths: 2,
iex> html_tag: "a"
iex> ])
{:error, "Fetch failed 'within_fetch_depth?', with opts: [depth: 2, max_depths: 2, html_tag: \\\"a\\\"]."}
iex> Policer.police([depth: 3, max_depths: 2, html_tag: "img", url: "http://policer/hi.jpg"])
{:ok, [depth: 3, max_depths: 2, html_tag: "img", url: "http://policer/hi.jpg"]}
iex> Policer.police([
iex> depth: 3,
iex> max_depths: 2,
iex> html_tag: "img",
iex> url: "http://policer/hi.jpg",
iex> url_filter: UrlFilter
iex> ])
{:ok, [depth: 3, max_depths: 2, html_tag: "img", url: "http://policer/hi.jpg", url_filter: UrlFilter]}
iex> Policer.police([depth: 1, max_depths: 2, url: "ftp://hello.world"])
iex> Policer.police([
iex> depth: 1,
iex> max_depths: 2,
iex> url: "ftp://hello.world"
iex> ])
{:error, "Fetch failed 'acceptable_uri_scheme?', with opts: [depth: 1, max_depths: 2, url: \\\"ftp://hello.world\\\"]."}
iex> Crawler.Store.add("http://policer/exist/")
iex> Policer.police([depth: 1, max_depths: 2, url: "http://policer/exist/"])
iex> Policer.police([
iex> depth: 1,
iex> max_depths: 2,
iex> url: "http://policer/exist/"
iex> ])
{:error, "Fetch failed 'not_fetched_yet?', with opts: [depth: 1, max_depths: 2, url: \\\"http://policer/exist/\\\"]."}
"""
def police(opts) do
with {_, true} <- within_fetch_depth?(opts),
{_, true} <- acceptable_uri_scheme?(opts),
{_, true} <- not_fetched_yet?(opts)
{_, true} <- not_fetched_yet?(opts),
{_, true} <- perform_url_filtering(opts)
do
{:ok, opts}
else
@@ -60,6 +84,12 @@ defmodule Crawler.Fetcher.Policer do
{:not_fetched_yet?, !Store.find(opts[:url])}
end

defp perform_url_filtering(opts) do
{:ok, pass_through?} = opts[:url_filter].filter(opts[:url])

{:perform_url_filtering, pass_through?}
end

defp police_error(fail_type, opts) do
{:error, "Fetch failed '#{fail_type}', with opts: #{Kernel.inspect(opts)}."}
end
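
With this check in place, a filter that returns `{:ok, false}` causes `police/1` to reject the fetch. A sketch, assuming the failure is reported through `police_error/2` like the other checks; `RejectAllFilter` is hypothetical:

```elixir
# Hypothetical filter that rejects every URL, for illustration only.
defmodule RejectAllFilter do
  @behaviour Crawler.Fetcher.UrlFilter.Spec

  def filter(_url), do: {:ok, false}
end

Crawler.Fetcher.Policer.police([
  depth: 1,
  max_depths: 2,
  url: "http://policer/",
  url_filter: RejectAllFilter
])
# => {:error, "Fetch failed 'perform_url_filtering', with opts: ..."}
```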
14 changes: 14 additions & 0 deletions lib/crawler/fetcher/url_filter.ex
@@ -0,0 +1,14 @@
defmodule Crawler.Fetcher.UrlFilter do
@moduledoc """
  A placeholder module that lets all URLs pass through.
"""

@behaviour __MODULE__.Spec

@doc """
Whether to pass through a given URL.
  - `true` for letting the URL through.
  - `false` for rejecting the URL.
"""
def filter(_url), do: {:ok, true}
end
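
Since the default implementation always returns `{:ok, true}`, every URL is let through:

```elixir
Crawler.Fetcher.UrlFilter.filter("http://example.com/")
# => {:ok, true}
```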
9 changes: 9 additions & 0 deletions lib/crawler/fetcher/url_filter/spec.ex
@@ -0,0 +1,9 @@
defmodule Crawler.Fetcher.UrlFilter.Spec do
@moduledoc """
  Spec for defining a URL filter.
"""

@type url :: String.t

@callback filter(url) :: {:ok, boolean} | {:error, term}
end
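
A custom filter only needs to implement `filter/1` and return `{:ok, boolean}` (or `{:error, term}`). A minimal sketch that restricts crawling to a single host; the module name and host are illustrative, not part of this commit:

```elixir
defmodule MyApp.SameHostFilter do
  @moduledoc "Illustrative filter that only lets URLs on elixir-lang.org through."

  @behaviour Crawler.Fetcher.UrlFilter.Spec

  @allowed_host "elixir-lang.org"

  # Let a URL through only when its host matches the allowed host.
  def filter(url) do
    {:ok, URI.parse(url).host == @allowed_host}
  end
end
```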
3 changes: 3 additions & 0 deletions lib/crawler/options.ex
@@ -12,6 +12,7 @@ defmodule Crawler.Options do
@user_agent "Crawler/#{Mixfile.project[:version]} (https://github.com/fredwu/crawler)"
@save_to nil
@assets []
@url_filter Crawler.Fetcher.UrlFilter
@parser Crawler.Parser

@doc """
@@ -37,6 +38,7 @@
user_agent: user_agent(),
save_to: save_to(),
assets: assets(),
url_filter: url_filter(),
parser: parser(),
}, opts)
end
@@ -61,5 +63,6 @@
defp user_agent, do: Application.get_env(:crawler, :user_agent) || @user_agent
defp save_to, do: Application.get_env(:crawler, :save_to) || @save_to
defp assets, do: Application.get_env(:crawler, :assets) || @assets
defp url_filter, do: Application.get_env(:crawler, :url_filter) || @url_filter
defp parser, do: Application.get_env(:crawler, :parser) || @parser
end
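
Because every default falls back to `Application.get_env/2`, the new `:url_filter` option can also be set application-wide. A sketch using the Mix config style of the era; `MyApp.SameHostFilter` is the illustrative filter sketched earlier:

```elixir
# config/config.exs
use Mix.Config

config :crawler,
  url_filter: MyApp.SameHostFilter,
  parser: Crawler.Parser
```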
3 changes: 1 addition & 2 deletions lib/crawler/parser.ex
@@ -5,11 +5,10 @@ defmodule Crawler.Parser do

require Logger

alias __MODULE__.Spec
alias __MODULE__.{CssParser, HtmlParser, LinkParser}
alias Crawler.{Worker, Dispatcher}

@behaviour Spec
@behaviour __MODULE__.Spec

@doc """
## Examples
2 changes: 1 addition & 1 deletion test/lib/crawler/fetcher/policer_test.exs
@@ -1,7 +1,7 @@
defmodule Crawler.Fetcher.PolicerTest do
use Crawler.TestCase, async: true

alias Crawler.Fetcher.Policer
alias Crawler.Fetcher.{Policer, UrlFilter}

doctest Policer
end
7 changes: 7 additions & 0 deletions test/lib/crawler/fetcher/url_filter_test.exs
@@ -0,0 +1,7 @@
defmodule Crawler.Fetcher.UrlFilterTest do
use Crawler.TestCase, async: true

alias Crawler.Fetcher.UrlFilter

doctest UrlFilter
end
14 changes: 7 additions & 7 deletions test/lib/crawler/fetcher_test.exs
@@ -1,7 +1,7 @@
defmodule Crawler.FetcherTest do
use Crawler.TestCase, async: true

alias Crawler.{Fetcher, Store}
alias Crawler.{Fetcher, Fetcher.UrlFilter, Store}

doctest Fetcher

@@ -12,7 +12,7 @@ defmodule Crawler.FetcherTest do
Plug.Conn.resp(conn, 200, "<html>200</html>")
end

Fetcher.fetch(url: url, depth: 0, html_tag: "a")
Fetcher.fetch(url: url, depth: 0, url_filter: UrlFilter, html_tag: "a")

page = Store.find(url)

@@ -27,7 +27,7 @@ defmodule Crawler.FetcherTest do
Plug.Conn.resp(conn, 500, "<html>500</html>")
end

fetcher = Fetcher.fetch(url: url, depth: 0, html_tag: "a")
fetcher = Fetcher.fetch(url: url, depth: 0, url_filter: UrlFilter, html_tag: "a")

assert fetcher == {:error, "Failed to fetch #{url}, status code: 500"}
refute Store.find(url).body
@@ -37,12 +37,12 @@ defmodule Crawler.FetcherTest do
url = "#{url}/fetcher/timeout"

Bypass.expect_once bypass, "GET", "/fetcher/timeout", fn (conn) ->
:timer.sleep(3)
:timer.sleep(5)
Plug.Conn.resp(conn, 200, "<html>200</html>")
end

wait fn ->
fetcher = Fetcher.fetch(url: url, depth: 0, html_tag: "a", timeout: 1)
fetcher = Fetcher.fetch(url: url, depth: 0, url_filter: UrlFilter, html_tag: "a", timeout: 2)

assert fetcher == {:error, "Failed to fetch #{url}, reason: timeout"}
refute Store.find(url).body
@@ -56,7 +56,7 @@ defmodule Crawler.FetcherTest do
Plug.Conn.resp(conn, 200, "<html>200</html>")
end

fetcher = Fetcher.fetch(url: url, depth: 0, html_tag: "a", save_to: "nope")
fetcher = Fetcher.fetch(url: url, depth: 0, url_filter: UrlFilter, html_tag: "a", save_to: "nope")

assert {:error, "Cannot write to file nope/#{path}/fetcher/fail.html, reason: enoent"} == fetcher
end
@@ -68,7 +68,7 @@ defmodule Crawler.FetcherTest do
Plug.Conn.resp(conn, 200, "<html>200</html>")
end

Fetcher.fetch(url: url, depth: 0, html_tag: "a", save_to: tmp("fetcher"))
Fetcher.fetch(url: url, depth: 0, url_filter: UrlFilter, html_tag: "a", save_to: tmp("fetcher"))

wait fn ->
assert {:ok, "<html>200</html>"} == File.read(tmp("fetcher/#{path}/fetcher", "page.html"))
