Improved Crawly.fetch/2 to accept fetching with a spider #107

Merged · 14 commits · Nov 13, 2020
documentation/basic_concepts.md (5 additions, 0 deletions)
@@ -37,6 +37,11 @@ In order to make a working web crawler, all the behaviour callbacks need to be implemented
to prepare first requests on `init()`. Which might be useful if, for example, you
want to pass a session cookie to the starting request. Note: `start_requests` are
processed before start_urls.
**Note**: this callback is going to be deprecated in favour of `init/1`. For now, backward
compatibility is kept with the help of a macro which always generates `init/1`.

`init(options)` - same as `init/0`, but also takes options (which can be passed from the engine
during spider start).

`base_url()` - defines a base_url of the given Spider. This function is used in order to filter out all requests which are going outside of the crawled website.
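
To illustrate the `init/1` callback described above, here is a minimal sketch of a spider whose first request carries a session cookie taken from the options it receives. The module name, URL, and `:session_cookie` key are illustrative assumptions, not part of this change.

```elixir
defmodule MySpider do
  use Crawly.Spider

  @impl Crawly.Spider
  def init(options) do
    # Options are forwarded by the engine when the spider is started.
    cookie = Keyword.get(options, :session_cookie, "")

    [
      start_requests: [
        Crawly.Request.new("https://www.example.com/", [{"Cookie", cookie}], [])
      ]
    ]
  end

  @impl Crawly.Spider
  def base_url(), do: "https://www.example.com"

  @impl Crawly.Spider
  def parse_item(_response), do: %{items: [], requests: []}
end
```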

lib/crawly.ex (92 additions, 17 deletions)
@@ -8,33 +8,105 @@ defmodule Crawly do
when you need to get individual pages and parse them.

The fetched URL is being converted to a request, and the request is piped
through the middlewares specidied in a config (with the exception of
`Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt` these 2 are
ignored)
through the middlewares specified in a config (with the exception of
`Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt`)

Provide a spider with the `:with` option to fetch a given webpage using that spider.

### Fetching with a spider
To fetch a response from a url with a spider, define your spider, and pass the module name to the `:with` option.

iex> Crawly.fetch("https://www.example.com", with: MySpider)
{%HTTPoison.Response{...}, %{...}, [...], %{...}}

Using the `:with` option will return a 4 item tuple:

1. The HTTPoison response
2. The result returned from the `parse_item/1` callback
3. The list of items that have been processed by the declared item pipelines.
4. The pipeline state, included for debugging purposes.
"""
@spec fetch(url, headers, options) :: HTTPoison.Response.t()
@type with_opt :: {:with, nil | module()}
@type request_opt :: {:request_options, list(Crawly.Request.option())}
@type headers_opt :: {:headers, list(Crawly.Request.header())}

@type parsed_item_result :: Crawly.ParsedItem.t()
@type parsed_items :: list(any())
@type pipeline_state :: %{optional(atom()) => any()}

@spec fetch(url, opts) ::
HTTPoison.Response.t()
| {HTTPoison.Response.t(), parsed_item_result, parsed_items,
pipeline_state}
when url: binary(),
headers: [],
options: []
def fetch(url, headers \\ [], options \\ []) do
request0 = Crawly.Request.new(url, headers, options)
opts: [
with_opt
| request_opt
| headers_opt
]
def fetch(url, opts \\ []) do
opts = Enum.into(opts, %{with: nil, request_options: [], headers: []})

request0 =
Crawly.Request.new(url, opts[:headers], opts[:request_options])
|> Map.put(
:middlewares,
Crawly.Utils.get_settings(:middlewares, opts[:with], [])
)

ignored_middlewares = [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.RobotsTxt
]
middlewares = request0.middlewares -- ignored_middlewares

{request, _} = Crawly.Utils.pipe(middlewares, request0, %{})
new_middlewares = request0.middlewares -- ignored_middlewares

{fetcher, client_options} = Application.get_env(
:crawly,
:fetcher,
{Crawly.Fetchers.HTTPoisonFetcher, []}
)
request0 =
Map.put(
request0,
:middlewares,
new_middlewares
)

{%{} = request, _} = Crawly.Utils.pipe(request0.middlewares, request0, %{})

{fetcher, client_options} =
Crawly.Utils.get_settings(
:fetcher,
opts[:with],
{Crawly.Fetchers.HTTPoisonFetcher, []}
)

{:ok, response} = fetcher.fetch(request, client_options)
response

case opts[:with] do
nil ->
# no spider provided, return response as is
response

_ ->
# spider provided, send response through parse_item callback, pipe through the pipelines
with parsed_result <- parse(response, opts[:with]),
pipelines <-
Crawly.Utils.get_settings(
:pipelines,
opts[:with]
),
items <- Map.get(parsed_result, :items, []),
{pipeline_result, pipeline_state} <-
Enum.reduce(items, {[], %{}}, fn item, {acc, state} ->
{piped, state} = Crawly.Utils.pipe(pipelines, item, state)

if piped == false do
# dropped
{acc, state}
else
{[piped | acc], state}
end
end) do
{response, parsed_result, pipeline_result, pipeline_state}
end
end
end

@doc """
@@ -49,13 +121,16 @@ defmodule Crawly do
case Kernel.function_exported?(spider, :parse_item, 1) do
false ->
{:error, :spider_not_found}

true ->
spider.parse_item(response)
end
end

@doc """
Returns a list of known modules which implements Crawly.Spider behaviour
Returns a list of known modules which implement the Crawly.Spider behaviour.

Should not be used for spider management. Use functions defined in `Crawly.Engine` for that.
"""
@spec list_spiders() :: [module()]
def list_spiders(), do: Crawly.Utils.list_spiders()
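
Taken together, the updated `fetch` can be exercised roughly as follows; this is a sketch, and `MySpider` is an assumed spider module rather than anything defined in this PR.

```elixir
# Without a spider: behaves as before and returns the bare response.
%HTTPoison.Response{} = Crawly.fetch("https://www.example.com")

# With a spider: returns the response, the parse_item/1 result, the items that
# made it through the spider's pipelines, and the final pipeline state.
{response, parsed_item_result, items, pipeline_state} =
  Crawly.fetch("https://www.example.com", with: MySpider)
```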
lib/crawly/engine.ex (43 additions, 8 deletions)
@@ -22,12 +43,43 @@ defmodule Crawly.Engine do

defstruct(started_spiders: %{}, known_spiders: [])

@spec start_spider(module(), binary()) ::
:ok
| {:error, :spider_already_started}
| {:error, :atom}
def start_spider(spider_name, crawl_id \\ UUID.uuid1()) do
GenServer.call(__MODULE__, {:start_spider, spider_name, crawl_id})
@doc """
Starts a spider. All options passed in the second argument will be passed along to the spider's `init/1` callback.

### Reserved Options
- `:crawl_id` (binary). Optional, automatically generated if not set.


### Backward compatibility
If the second positional argument is a binary, it will be set as the `:crawl_id`. This form is deprecated and will be removed in the future.
"""
@type crawl_id_opt :: {:crawl_id, binary()}
@spec start_spider(spider_name, opts) :: result
when spider_name: module(),
opts: [crawl_id_opt],
result:
:ok
| {:error, :spider_already_started}
| {:error, :atom}
def start_spider(spider_name, opts \\ [])

def start_spider(spider_name, crawl_id) when is_binary(crawl_id) do
Logger.warn(
"Deprecation Warning: Setting the crawl_id as second positional argument is deprecated. Please use the :crawl_id option instead. Refer to docs for more info (https://hexdocs.pm/crawly/Crawly.Engine.html#start_spider/2) "
)

start_spider(spider_name, crawl_id: crawl_id)
end

def start_spider(spider_name, opts) when is_list(opts) do
opts =
Enum.into(opts, %{})
|> Map.put_new_lazy(:crawl_id, &UUID.uuid1/0)

GenServer.call(
__MODULE__,
{:start_spider, spider_name, opts[:crawl_id], Map.to_list(opts)}
)
end

@spec get_manager(module()) ::
@@ -132,11 +163,15 @@
{:reply, format_spider_info(state), state}
end

def handle_call({:start_spider, spider_name, crawl_id}, _form, state) do
def handle_call(
{:start_spider, spider_name, crawl_id, options},
_form,
state
) do
result =
case Map.get(state.started_spiders, spider_name) do
nil ->
Crawly.EngineSup.start_spider(spider_name)
Crawly.EngineSup.start_spider(spider_name, options)

_ ->
{:error, :spider_already_started}
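
A sketch of how the reworked `start_spider/2` might be called; `MySpider` and the `:some_option` key are assumptions used for illustration.

```elixir
# New form: a keyword list of options; :crawl_id is generated if omitted.
Crawly.Engine.start_spider(MySpider, crawl_id: "2020-11-13-crawl")

# Any additional options are forwarded to the spider's init/1 callback.
Crawly.Engine.start_spider(MySpider, some_option: "value")

# Deprecated form: a binary second argument is still treated as the crawl_id,
# but it logs a deprecation warning.
Crawly.Engine.start_spider(MySpider, "2020-11-13-crawl")
```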
lib/crawly/engine_sup.ex (2 additions, 2 deletions)
@@ -11,15 +11,15 @@ defmodule Crawly.EngineSup do
DynamicSupervisor.init(strategy: :one_for_one)
end

def start_spider(spider_name) do
def start_spider(spider_name, options) do
result =
case Code.ensure_loaded?(spider_name) do
true ->
# Given spider module exists in the namespace, we can proceed
{:ok, _sup_pid} =
DynamicSupervisor.start_child(
__MODULE__,
{Crawly.ManagerSup, spider_name}
{Crawly.ManagerSup, [spider_name, options]}
)

false ->
lib/crawly/manager.ex (4 additions, 4 deletions)
@@ -45,15 +45,15 @@ defmodule Crawly.Manager do
end
end

def start_link(spider_name) do
def start_link([spider_name, options]) do
Logger.debug("Starting the manager for #{spider_name}")
GenServer.start_link(__MODULE__, spider_name)
GenServer.start_link(__MODULE__, [spider_name, options])
end

@impl true
def init(spider_name) do
def init([spider_name, options]) do
# Getting spider start urls
init = spider_name.init()
init = spider_name.init(options)

# Start DataStorage worker
{:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
lib/crawly/manager_sup.ex (4 additions, 4 deletions)
@@ -3,18 +3,18 @@ defmodule Crawly.ManagerSup do
@moduledoc false
use Supervisor

def start_link(spider_name) do
Supervisor.start_link(__MODULE__, spider_name)
def start_link([spider_name, options]) do
Supervisor.start_link(__MODULE__, [spider_name, options])
end

@impl true
def init(spider_name) do
def init([spider_name, options]) do
children = [
# This supervisor is used to spawn Worker processes
{DynamicSupervisor, strategy: :one_for_one, name: spider_name},

# Starts spider manager process
{Crawly.Manager, spider_name}
{Crawly.Manager, [spider_name, options]}
]

Supervisor.init(children, strategy: :one_for_one)
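
Taken together, the supervision changes above appear to thread the options from the engine down to the spider. A rough sketch of the call chain (`MySpider` and its options are hypothetical):

```elixir
# Crawly.Engine.start_spider(MySpider, crawl_id: "abc", some_option: "value")
#   -> Crawly.EngineSup.start_spider(MySpider, options)
#   -> DynamicSupervisor child {Crawly.ManagerSup, [MySpider, options]}
#   -> {Crawly.Manager, [MySpider, options]}
#   -> MySpider.init(options)   # the spider finally receives the options
```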
lib/crawly/spider.ex (18 additions, 4 deletions)
@@ -4,18 +4,22 @@ defmodule Crawly.Spider do

A Spider is a module which is responsible for defining:
1. `init/0` function, which must return a keyword list with start_urls/start_requests list
2. `base_url/0` function responsible for filtering out requests not related to
2. `init/1` - same as `init/0`, but also takes a list of options sent from the Engine
3. `base_url/0` function responsible for filtering out requests not related to
a given website
3. `parse_item/1` function which is responsible for parsing the downloaded
4. `parse_item/1` function which is responsible for parsing the downloaded
request and converting it into items which can be stored and new requests
which can be scheduled
4. `custom_settings/0` an optional callback which can be used in order to
5. `custom_settings/0` an optional callback which can be used in order to
provide custom spider specific settings. Should define a list with custom
settings and their values. These values will take precedence over the
global settings defined in the config.
"""



@callback init() :: [start_urls: list(), start_requests: list()]
@callback init(options: keyword()) :: [start_urls: list(), start_requests: list()]

@callback base_url() :: binary()

@@ -26,11 +30,21 @@

defmacro __using__(_opts) do
quote do
require Logger
@behaviour Crawly.Spider

def override_settings(), do: []

defoverridable override_settings: 0
# This line is needed to keep the backward compatibility, so all spiders
# with init/0 will still work normally.
def init(_options), do: init()

def init() do
Logger.error("Using default spider init, without start urls")
%{start_urls: []}
end

defoverridable override_settings: 0, init: 1, init: 0
end
end
end
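
For backward compatibility, a spider that only implements `init/0` keeps working: the `__using__` macro above generates an `init/1` that ignores the options and delegates to `init/0`. A minimal sketch (module name and URL are assumptions):

```elixir
defmodule LegacySpider do
  use Crawly.Spider

  @impl Crawly.Spider
  def init(), do: [start_urls: ["https://www.example.com"]]

  @impl Crawly.Spider
  def base_url(), do: "https://www.example.com"

  @impl Crawly.Spider
  def parse_item(_response), do: %{items: [], requests: []}
end

# The generated init/1 discards its argument and calls init/0:
# LegacySpider.init(crawl_id: "abc") #=> [start_urls: ["https://www.example.com"]]
```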
test/crawly_test.exs (40 additions, 3 deletions)
@@ -1,8 +1,45 @@
defmodule CrawlyTest do
use ExUnit.Case
doctest Crawly

test "greets the world" do
assert :test == :test
setup do
:meck.new(CrawlyTestSpider, [:non_strict])

:meck.expect(CrawlyTestSpider, :parse_item, fn _resp ->
%{
items: [%{content: "hello"}],
requests: [
Crawly.Utils.request_from_url("https://www.example.com/test")
]
}
end)

:meck.expect(CrawlyTestSpider, :override_settings, fn ->
[pipelines: [Crawly.Pipelines.JSONEncoder]]
end)

on_exit(fn ->
:meck.unload()
end)

{:ok, spider_module: CrawlyTestSpider}
end

test "fetch/1 is able to fetch a given url using global config, returns a response" do
assert %HTTPoison.Response{} = Crawly.fetch("https://example.com")
end

test "fetch/2 with :with option provided returns the response, parsed_item result, and processed ParsedItems",
%{spider_module: spider_module} do
assert {%HTTPoison.Response{}, parsed_item_res, parsed_items,
pipeline_state} =
Crawly.fetch("http://example.com", with: spider_module)

assert %{
items: [_],
requests: requests
} = parsed_item_res

assert [encoded] = parsed_items
assert encoded =~ "hello"
end
end