Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support of initial arguments #136

Merged
merged 2 commits into from
Nov 11, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions documentation/basic_concepts.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ In order to make a working web crawler, all the behaviour callbacks need to be i
to prepare first requests on `init()`. Which might be useful if, for example, you
want to pass a session cookie to the starting request. Note: `start_requests` are
processed before start_urls.
**Note**: this callback is going to be deprecated in favour of `init/1`. For now, backwards
compatibility is kept with the help of a macro which always generates `init/1`.

`init(options)` — the same as `init/0`, but it also takes options (which can be passed from the engine
during the spider start).

`base_url()` - defines a base_url of the given Spider. This function is used in order to filter out all requests which are going outside of the crawled website.

Expand Down
60 changes: 52 additions & 8 deletions lib/crawly/engine.ex
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,52 @@ defmodule Crawly.Engine do

defstruct(started_spiders: %{}, known_spiders: [])

@spec start_spider(module(), binary()) ::
:ok
| {:error, :spider_already_started}
| {:error, :atom}
def start_spider(spider_name, crawl_id \\ UUID.uuid1()) do
GenServer.call(__MODULE__, {:start_spider, spider_name, crawl_id})
@doc """
Starts the given spider with a freshly generated crawl id and no options.
"""
@spec start_spider(spider_name) :: result
      when spider_name: module(),
           result:
             :ok
             | {:error, :spider_already_started}
             | {:error, atom()}
def start_spider(spider_name) do
  # Delegate to start_spider/3 with a new crawl id and empty options.
  start_spider(spider_name, UUID.uuid1(), [])
end

@doc """
Starts the given spider under the supplied `crawl_id`, with no options.
"""
@spec start_spider(spider_name, crawl_id) :: result
      when spider_name: module(),
           crawl_id: binary(),
           result:
             :ok
             | {:error, :spider_already_started}
             | {:error, atom()}
def start_spider(spider_name, crawl_id) when is_binary(crawl_id) do
  start_spider(spider_name, crawl_id, [])
end

@doc """
Starts the given spider with the supplied options and a freshly generated
crawl id. The options are eventually passed to the spider's `init/1`.
"""
@spec start_spider(spider_name, options) :: result
      when spider_name: module(),
           options: list(),
           result:
             :ok
             | {:error, :spider_already_started}
             | {:error, atom()}
def start_spider(spider_name, options) when is_list(options) do
  start_spider(spider_name, UUID.uuid1(), options)
end

@doc """
Starts the given spider under the supplied `crawl_id`, forwarding `options`
to the spider process. All other `start_spider` clauses delegate here.
"""
@spec start_spider(spider_name, crawl_id, options) :: result
      when spider_name: module(),
           crawl_id: binary(),
           options: list(),
           result:
             :ok
             | {:error, :spider_already_started}
             | {:error, atom()}
def start_spider(spider_name, crawl_id, options) do
  GenServer.call(__MODULE__, {:start_spider, spider_name, crawl_id, options})
end

@spec get_manager(module()) ::
Expand Down Expand Up @@ -132,11 +172,15 @@ defmodule Crawly.Engine do
{:reply, format_spider_info(state), state}
end

def handle_call({:start_spider, spider_name, crawl_id}, _form, state) do
def handle_call(
{:start_spider, spider_name, crawl_id, options},
_form,
state
) do
result =
case Map.get(state.started_spiders, spider_name) do
nil ->
Crawly.EngineSup.start_spider(spider_name)
Crawly.EngineSup.start_spider(spider_name, options)

_ ->
{:error, :spider_already_started}
Expand Down
4 changes: 2 additions & 2 deletions lib/crawly/engine_sup.ex
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ defmodule Crawly.EngineSup do
DynamicSupervisor.init(strategy: :one_for_one)
end

def start_spider(spider_name) do
def start_spider(spider_name, options) do
result =
case Code.ensure_loaded?(spider_name) do
true ->
# Given spider module exists in the namespace, we can proceed
{:ok, _sup_pid} =
DynamicSupervisor.start_child(
__MODULE__,
{Crawly.ManagerSup, spider_name}
{Crawly.ManagerSup, [spider_name, options]}
)

false ->
Expand Down
8 changes: 4 additions & 4 deletions lib/crawly/manager.ex
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,15 @@ defmodule Crawly.Manager do
end
end

def start_link(spider_name) do
# Entry point used by the supervisor; args is a [spider_name, options] pair
# which is handed unchanged to init/1.
def start_link([spider_name, _options] = args) do
  Logger.debug("Starting the manager for #{spider_name}")
  GenServer.start_link(__MODULE__, args)
end

@impl true
def init(spider_name) do
def init([spider_name, options]) do
# Getting spider start urls
init = spider_name.init()
init = spider_name.init(options)

# Start DataStorage worker
{:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
Expand Down
8 changes: 4 additions & 4 deletions lib/crawly/manager_sup.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@ defmodule Crawly.ManagerSup do
@moduledoc false
use Supervisor

def start_link(spider_name) do
Supervisor.start_link(__MODULE__, spider_name)
# Boots the supervision tree for a single spider; args is a
# [spider_name, options] pair forwarded unchanged to init/1.
def start_link([_spider_name, _options] = args) do
  Supervisor.start_link(__MODULE__, args)
end

@impl true
def init(spider_name) do
def init([spider_name, options]) do
children = [
# This supervisor is used to spawn Worker processes
{DynamicSupervisor, strategy: :one_for_one, name: spider_name},

# Starts spider manager process
{Crawly.Manager, spider_name}
{Crawly.Manager, [spider_name, options]}
]

Supervisor.init(children, strategy: :one_for_one)
Expand Down
22 changes: 18 additions & 4 deletions lib/crawly/spider.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,22 @@ defmodule Crawly.Spider do

A Spider is a module which is responsible for defining:
1. `init/0` function, which must return a keyword list with start_urls/start_requests list
2. `base_url/0` function responsible for filtering out requests not related to
2. `init/1` — the same as `init/0`, but it also takes a list of options sent from the Engine
3. `base_url/0` function responsible for filtering out requests not related to
a given website
3. `parse_item/1` function which is responsible for parsing the downloaded
4. `parse_item/1` function which is responsible for parsing the downloaded
request and converting it into items which can be stored and new requests
which can be scheduled
4. `custom_settings/0` an optional callback which can be used in order to
5. `custom_settings/0` an optional callback which can be used in order to
provide custom spider specific settings. Should define a list with custom
settings and their values. These values will take precedence over the
global settings defined in the config.
"""



@callback init() :: [start_urls: list(), start_requests: list()]
# `options :: keyword()` names the argument for docs; the previous form
# `init(options: keyword())` wrongly typed the argument as the literal
# one-key keyword list [options: keyword()].
@callback init(options :: keyword()) :: [start_urls: list(), start_requests: list()]

@callback base_url() :: binary()

Expand All @@ -26,11 +30,21 @@ defmodule Crawly.Spider do

defmacro __using__(_opts) do
  quote do
    require Logger
    @behaviour Crawly.Spider

    def override_settings(), do: []

    # Backward compatibility: spiders that implement only init/0 get an
    # init/1 which ignores the options and delegates to init/0.
    def init(_options), do: init()

    # Default init/0 for spiders defining neither init/0 nor init/1.
    # Logs an error so a spider without start urls is noticed. Returns a
    # keyword list (not a map) to match the declared @callback return
    # type [start_urls: list(), ...].
    def init() do
      Logger.error("Using default spider init, without start urls")
      [start_urls: []]
    end

    defoverridable override_settings: 0, init: 1, init: 0
  end
end
end
27 changes: 27 additions & 0 deletions test/manager_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,16 @@ defmodule ManagerTest do

assert_receive {:performing_request, "https://www.example.com/blog.html"}
end

test "It's possible to initialize a spider with parameters" do
  # The spider under test sends its received urls to this named process.
  Process.register(self(), :manager_test_initial_args_test)

  expected_urls = [
    "https://example.com/1",
    "https://example.com/2",
    "https://example.com/3"
  ]

  :ok =
    Crawly.Engine.start_spider(
      Manager.InitialArgsTestSpider,
      urls: expected_urls
    )

  assert_receive received_urls
  assert Enum.sort(received_urls) == Enum.sort(expected_urls)
end
end

defmodule Manager.TestSpider do
Expand Down Expand Up @@ -212,3 +222,20 @@ defmodule Manager.StartRequestsTestSpider do
}
end
end

defmodule Manager.InitialArgsTestSpider do
  use Crawly.Spider

  def base_url(), do: "https://www.example.com"

  # Echoes the urls received via options back to the test process so the
  # test can assert that options reached init/1, then uses them as the
  # spider's start_urls.
  def init([urls: urls]) do
    send(:manager_test_initial_args_test, urls)
    [start_urls: urls]
  end

  def parse_item(_response), do: %{items: [], requests: []}
end
1 change: 0 additions & 1 deletion test/test_utils.ex
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ defmodule TestSpider do
end

defmodule UtilsTestSpider do
use GenServer
use Crawly.Spider

@impl true
Expand Down