diff --git a/documentation/basic_concepts.md b/documentation/basic_concepts.md
index 63417760..3949afc0 100644
--- a/documentation/basic_concepts.md
+++ b/documentation/basic_concepts.md
@@ -37,6 +37,11 @@ In order to make a working web crawler, all the behaviour callbacks need to be i
 to prepare first requests on `init()`. Which might be useful if, for example, you want
 to pass a session cookie to the starting request. Note: `start_requests` are processed
 before start_urls.
+   ** This callback is going to be deprecated in favour of `init/1`. For now, backward
+   compatibility is kept with the help of a macro which always generates `init/1`.
+
+`init(options)` - same as `init/0`, but also takes options (which can be passed from the
+engine when the spider is started).
 
 `base_url()` - defines a base_url of the given Spider. This function is used in order to filter out
 all requests which are going outside of the crawled website.
diff --git a/lib/crawly.ex b/lib/crawly.ex
index 930de76a..fae97481 100644
--- a/lib/crawly.ex
+++ b/lib/crawly.ex
@@ -8,33 +8,105 @@ defmodule Crawly do
   when you need to get individual pages and parse them.
 
   The fetched URL is being converted to a request, and the request is piped
-  through the middlewares specidied in a config (with the exception of
-  `Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt` these 2 are
-  ignored)
+  through the middlewares specified in a config (with the exception of
+  `Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt`)
+
+  Provide a spider with the `:with` option to fetch a given webpage using that spider.
+
+  ### Fetching with a spider
+
+  To fetch a response from a URL with a spider, define your spider, and pass
+  the module name to the `:with` option.
+
+      iex> Crawly.fetch("https://www.example.com", with: MySpider)
+      {%HTTPoison.Response{...}, %{...}, [...], %{...}}
+
+  Using the `:with` option will return a 4-item tuple:
+
+  1. The HTTPoison response.
+  2. The result returned from the `parse_item/1` callback.
+  3. The list of items that have been processed by the declared item pipelines.
+  4. The pipeline state, included for debugging purposes.
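+
+  A rough sketch of consuming that tuple (assuming a `MySpider` module is
+  defined and its `parse_item/1` returns items):
+
+      iex> {response, parsed, items, _state} =
+      ...>   Crawly.fetch("https://www.example.com", with: MySpider)
+      iex> is_list(items)
+      true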
""" - @spec fetch(url, headers, options) :: HTTPoison.Response.t() + @type with_opt :: {:with, nil | module()} + @type request_opt :: {:request_options, list(Crawly.Request.option())} + @type headers_opt :: {:headers, list(Crawly.Request.header())} + + @type parsed_item_result :: Crawly.ParsedItem.t() + @type parsed_items :: list(any()) + @type pipeline_state :: %{optional(atom()) => any()} + + @spec fetch(url, opts) :: + HTTPoison.Response.t() + | {HTTPoison.Response.t(), parsed_item_result, parsed_items, + pipeline_state} when url: binary(), - headers: [], - options: [] - def fetch(url, headers \\ [], options \\ []) do - request0 = Crawly.Request.new(url, headers, options) + opts: [ + with_opt + | request_opt + | headers_opt + ] + def fetch(url, opts \\ []) do + opts = Enum.into(opts, %{with: nil, request_options: [], headers: []}) + + request0 = + Crawly.Request.new(url, opts[:headers], opts[:request_options]) + |> Map.put( + :middlewares, + Crawly.Utils.get_settings(:middlewares, opts[:with], []) + ) + ignored_middlewares = [ Crawly.Middlewares.DomainFilter, Crawly.Middlewares.RobotsTxt ] - middlewares = request0.middlewares -- ignored_middlewares - {request, _} = Crawly.Utils.pipe(middlewares, request0, %{}) + new_middlewares = request0.middlewares -- ignored_middlewares - {fetcher, client_options} = Application.get_env( - :crawly, - :fetcher, - {Crawly.Fetchers.HTTPoisonFetcher, []} - ) + request0 = + Map.put( + request0, + :middlewares, + new_middlewares + ) + + {%{} = request, _} = Crawly.Utils.pipe(request0.middlewares, request0, %{}) + + {fetcher, client_options} = + Crawly.Utils.get_settings( + :fetcher, + opts[:with], + {Crawly.Fetchers.HTTPoisonFetcher, []} + ) {:ok, response} = fetcher.fetch(request, client_options) - response + + case opts[:with] do + nil -> + # no spider provided, return response as is + response + + _ -> + # spider provided, send response through parse_item callback, pipe through the pipelines + with parsed_result <- parse(response, opts[:with]), + pipelines <- + Crawly.Utils.get_settings( + :pipelines, + opts[:with] + ), + items <- Map.get(parsed_result, :items, []), + {pipeline_result, pipeline_state} <- + Enum.reduce(items, {[], %{}}, fn item, {acc, state} -> + {piped, state} = Crawly.Utils.pipe(pipelines, item, state) + + if piped == false do + # dropped + {acc, state} + else + {[piped | acc], state} + end + end) do + {response, parsed_result, pipeline_result, pipeline_state} + end + end end @doc """ @@ -49,13 +121,16 @@ defmodule Crawly do case Kernel.function_exported?(spider, :parse_item, 1) do false -> {:error, :spider_not_found} + true -> spider.parse_item(response) end end @doc """ - Returns a list of known modules which implements Crawly.Spider behaviour + Returns a list of known modules which implements Crawly.Spider behaviour. + + Should not be used for spider management. Use functions defined in `Crawly.Engine` for that. """ @spec list_spiders() :: [module()] def list_spiders(), do: Crawly.Utils.list_spiders() diff --git a/lib/crawly/engine.ex b/lib/crawly/engine.ex index 160c86b7..2d27a9b6 100644 --- a/lib/crawly/engine.ex +++ b/lib/crawly/engine.ex @@ -22,12 +22,43 @@ defmodule Crawly.Engine do defstruct(started_spiders: %{}, known_spiders: []) - @spec start_spider(module(), binary()) :: - :ok - | {:error, :spider_already_started} - | {:error, :atom} - def start_spider(spider_name, crawl_id \\ UUID.uuid1()) do - GenServer.call(__MODULE__, {:start_spider, spider_name, crawl_id}) + @doc """ + Starts a spider. 
+
+  All options passed in the second argument will be passed along to the
+  spider's `init/1` callback.
+
+  ### Reserved Options
+  - `:crawl_id` (binary). Optional, automatically generated if not set.
+
+  ### Backward compatibility
+  If the second positional argument is a binary, it will be set as the
+  `:crawl_id`. This is deprecated and will be removed in the future.
+  """
+  @type crawl_id_opt :: {:crawl_id, binary()}
+  @spec start_spider(spider_name, opts) :: result
+        when spider_name: module(),
+             opts: [crawl_id_opt],
+             result:
+               :ok
+               | {:error, :spider_already_started}
+               | {:error, :atom}
+  def start_spider(spider_name, opts \\ [])
+
+  def start_spider(spider_name, crawl_id) when is_binary(crawl_id) do
+    Logger.warn(
+      "Deprecation Warning: Setting the crawl_id as second positional argument is deprecated. Please use the :crawl_id option instead. Refer to docs for more info (https://hexdocs.pm/crawly/Crawly.Engine.html#start_spider/2)"
+    )
+
+    start_spider(spider_name, crawl_id: crawl_id)
+  end
+
+  def start_spider(spider_name, opts) when is_list(opts) do
+    opts =
+      Enum.into(opts, %{})
+      |> Map.put_new_lazy(:crawl_id, &UUID.uuid1/0)
+
+    GenServer.call(
+      __MODULE__,
+      {:start_spider, spider_name, opts[:crawl_id], Map.to_list(opts)}
+    )
   end
 
   @spec get_manager(module()) ::
@@ -132,11 +163,15 @@ defmodule Crawly.Engine do
     {:reply, format_spider_info(state), state}
   end
 
-  def handle_call({:start_spider, spider_name, crawl_id}, _form, state) do
+  def handle_call(
+        {:start_spider, spider_name, crawl_id, options},
+        _form,
+        state
+      ) do
     result =
       case Map.get(state.started_spiders, spider_name) do
         nil ->
-          Crawly.EngineSup.start_spider(spider_name)
+          Crawly.EngineSup.start_spider(spider_name, options)
 
         _ ->
           {:error, :spider_already_started}
diff --git a/lib/crawly/engine_sup.ex b/lib/crawly/engine_sup.ex
index 0ed6603f..91fcf46b 100644
--- a/lib/crawly/engine_sup.ex
+++ b/lib/crawly/engine_sup.ex
@@ -11,7 +11,7 @@ defmodule Crawly.EngineSup do
     DynamicSupervisor.init(strategy: :one_for_one)
   end
 
-  def start_spider(spider_name) do
+  def start_spider(spider_name, options) do
     result =
       case Code.ensure_loaded?(spider_name) do
         true ->
@@ -19,7 +19,7 @@ defmodule Crawly.EngineSup do
           {:ok, _sup_pid} =
             DynamicSupervisor.start_child(
               __MODULE__,
-              {Crawly.ManagerSup, spider_name}
+              {Crawly.ManagerSup, [spider_name, options]}
             )
 
         false ->
diff --git a/lib/crawly/manager.ex b/lib/crawly/manager.ex
index 54f77112..da6ea5b7 100644
--- a/lib/crawly/manager.ex
+++ b/lib/crawly/manager.ex
@@ -45,15 +45,15 @@ defmodule Crawly.Manager do
     end
   end
 
-  def start_link(spider_name) do
+  def start_link([spider_name, options]) do
     Logger.debug("Starting the manager for #{spider_name}")
-    GenServer.start_link(__MODULE__, spider_name)
+    GenServer.start_link(__MODULE__, [spider_name, options])
   end
 
   @impl true
-  def init(spider_name) do
+  def init([spider_name, options]) do
     # Getting spider start urls
-    init = spider_name.init()
+    init = spider_name.init(options)
 
     # Start DataStorage worker
     {:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
diff --git a/lib/crawly/manager_sup.ex b/lib/crawly/manager_sup.ex
index 6da56281..a59ec17a 100644
--- a/lib/crawly/manager_sup.ex
+++ b/lib/crawly/manager_sup.ex
@@ -3,18 +3,18 @@ defmodule Crawly.ManagerSup do
   @moduledoc false
   use Supervisor
 
-  def start_link(spider_name) do
-    Supervisor.start_link(__MODULE__, spider_name)
+  def start_link([spider_name, options]) do
+    Supervisor.start_link(__MODULE__, [spider_name, options])
   end
 
   @impl true
-  def init(spider_name) do
+  def init([spider_name, options]) do
     children = [
       # This supervisor is used to spawn Worker processes
       {DynamicSupervisor, strategy: :one_for_one, name: spider_name},
       # Starts spider manager process
-      {Crawly.Manager, spider_name}
+      {Crawly.Manager, [spider_name, options]}
     ]
 
     Supervisor.init(children, strategy: :one_for_one)
diff --git a/lib/crawly/spider.ex b/lib/crawly/spider.ex
index 6f1d0048..fc443c6c 100644
--- a/lib/crawly/spider.ex
+++ b/lib/crawly/spider.ex
@@ -4,18 +4,22 @@ defmodule Crawly.Spider do
   A Spider is a module which is responsible for defining:
   1. `init/0` function, which must return a keyword list with start_urls/start_requests list
-  2. `base_url/0` function responsible for filtering out requests not related to
+  2. `init/1` function, same as `init/0`, but also takes a keyword list of options sent from the engine
+  3. `base_url/0` function responsible for filtering out requests not related to
      a given website
-  3. `parse_item/1` function which is responsible for parsing the downloaded
+  4. `parse_item/1` function which is responsible for parsing the downloaded
      request and converting it into items which can be stored and new requests
      which can be scheduled
-  4. `custom_settings/0` an optional callback which can be used in order to
+  5. `custom_settings/0` an optional callback which can be used in order to
      provide custom spider specific settings. Should define a list with custom
      settings and their values. These values will take precedence over the
      global settings defined in the config.
   """
+
+  @callback init() :: [start_urls: list(), start_requests: list()]
+  @callback init(options :: keyword()) :: [start_urls: list(), start_requests: list()]
 
   @callback base_url() :: binary()
 
@@ -26,11 +30,21 @@ defmodule Crawly.Spider do
   defmacro __using__(_opts) do
     quote do
+      require Logger
       @behaviour Crawly.Spider
 
       def override_settings(), do: []
 
-      defoverridable override_settings: 0
+      # These default clauses keep backward compatibility, so all spiders
+      # that implement init/0 will still work normally.
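+      # A spider's own init/1 takes precedence over these defaults because
+      # both arities are declared overridable below; spiders that only define
+      # init/0 are reached through the delegating init/1 clause.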
+      def init(_options), do: init()
+
+      def init() do
+        Logger.error("Using default spider init, without start urls")
+        %{start_urls: []}
+      end
+
+      defoverridable override_settings: 0, init: 1, init: 0
     end
   end
 end
diff --git a/test/crawly_test.exs b/test/crawly_test.exs
index b9f6b33b..7dbd86db 100644
--- a/test/crawly_test.exs
+++ b/test/crawly_test.exs
@@ -1,8 +1,45 @@
 defmodule CrawlyTest do
   use ExUnit.Case
 
-  doctest Crawly
-
-  test "greets the world" do
-    assert :test == :test
+  setup do
+    :meck.new(CrawlyTestSpider, [:non_strict])
+
+    :meck.expect(CrawlyTestSpider, :parse_item, fn _resp ->
+      %{
+        items: [%{content: "hello"}],
+        requests: [
+          Crawly.Utils.request_from_url("https://www.example.com/test")
+        ]
+      }
+    end)
+
+    :meck.expect(CrawlyTestSpider, :override_settings, fn ->
+      [pipelines: [Crawly.Pipelines.JSONEncoder]]
+    end)
+
+    on_exit(fn ->
+      :meck.unload()
+    end)
+
+    {:ok, spider_module: CrawlyTestSpider}
+  end
+
+  test "fetch/1 fetches a given URL using the global config and returns a response" do
+    assert %HTTPoison.Response{} = Crawly.fetch("https://example.com")
+  end
+
+  test "fetch/2 with :with option provided returns the response, parsed_item result, and processed ParsedItems",
+       %{spider_module: spider_module} do
+    assert {%HTTPoison.Response{}, parsed_item_res, parsed_items,
+            _pipeline_state} =
+             Crawly.fetch("http://example.com", with: spider_module)
+
+    assert %{
+             items: [_],
+             requests: _requests
+           } = parsed_item_res
+
+    assert [encoded] = parsed_items
+    assert encoded =~ "hello"
+  end
 end
diff --git a/test/manager_test.exs b/test/manager_test.exs
index 683dd06d..d0d9e9ee 100644
--- a/test/manager_test.exs
+++ b/test/manager_test.exs
@@ -140,6 +140,22 @@ defmodule ManagerTest do
     assert_receive {:performing_request, "https://www.example.com/blog.html"}
   end
+
+  test "It's possible to initialize a spider with parameters" do
+    Process.register(self(), :manager_test_initial_args_test)
+
+    urls = [
+      "https://example.com/1",
+      "https://example.com/2",
+      "https://example.com/3"
+    ]
+
+    :ok = Crawly.Engine.start_spider(Manager.InitialArgsTestSpider, urls: urls)
+
+    assert_receive recv_opts
+    assert is_binary(recv_opts[:crawl_id])
+    assert Enum.sort(recv_opts[:urls]) == Enum.sort(urls)
+  end
 end
 
 defmodule Manager.TestSpider do
@@ -212,3 +228,20 @@ defmodule Manager.StartRequestsTestSpider do
     }
   end
 end
+
+defmodule Manager.InitialArgsTestSpider do
+  use Crawly.Spider
+
+  def base_url() do
+    "https://www.example.com"
+  end
+
+  def init(opts) do
+    send(:manager_test_initial_args_test, opts)
+    [start_urls: opts[:urls]]
+  end
+
+  def parse_item(_response) do
+    %{items: [], requests: []}
+  end
+end
diff --git a/test/test_utils.ex b/test/test_utils.ex
index 77e8e253..ea735f60 100644
--- a/test/test_utils.ex
+++ b/test/test_utils.ex
@@ -47,7 +47,6 @@ defmodule TestSpider do
 end
 
 defmodule UtilsTestSpider do
-  use GenServer
   use Crawly.Spider
 
   @impl true
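Taken together, these changes let options flow from `Crawly.Engine.start_spider/2` through `Crawly.EngineSup`, `Crawly.ManagerSup`, and `Crawly.Manager` down to the spider's `init/1`. Below is a minimal sketch of a spider written against the new callback; the `BlogSpider` name and the URLs are illustrative assumptions, not part of this diff:

```elixir
defmodule BlogSpider do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://www.example.com"

  # init/1 receives the keyword list given to start_spider/2,
  # plus the engine-generated :crawl_id.
  @impl Crawly.Spider
  def init(opts) do
    [start_urls: opts[:urls] || ["https://www.example.com/blog"]]
  end

  @impl Crawly.Spider
  def parse_item(_response) do
    %{items: [], requests: []}
  end
end

# Options travel: Engine -> EngineSup -> ManagerSup -> Manager -> BlogSpider.init/1
:ok = Crawly.Engine.start_spider(BlogSpider, urls: ["https://www.example.com/blog"])
```

Spiders that still implement only `init/0` keep working: the `use Crawly.Spider` macro injects a delegating `init/1`, so `Crawly.Manager` can always call `spider.init(options)`.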