Add support for start_requests in Crawly.Spider.init callback
This is needed when you want to start crawling from prepared requests instead of raw URLs.

It is a must for login functionality.
oltarasenko committed Oct 6, 2020
1 parent 5867f16 commit 8c8e3f8
Showing 5 changed files with 69 additions and 14 deletions.
5 changes: 4 additions & 1 deletion documentation/basic_concepts.md
@@ -33,7 +33,10 @@ All items are processed sequentially and are processed by Item pipelines.

In order to make a working web crawler, all the behaviour callbacks need to be implemented.

`init()` - a part of the Crawly.Spider behaviour. This function should return a KVList which contains a `start_urls` entry with a list, which defines the starting requests made by Crawly.
`init()` - a part of the Crawly.Spider behaviour. This function should return a KVList which contains a `start_urls` entry with a list, which defines the starting requests made by Crawly. Alternatively, you may provide `start_requests` if the first requests need to be
prepared in `init()`, which might be useful if, for example, you
want to pass a session cookie with the starting request. Note: `start_requests` are
processed before `start_urls`.
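
For example, a spider that must log in before crawling could prepare its first request in `init()`. A minimal sketch, assuming a hypothetical spider module, URL and cookie value, and that `Crawly.Request.new/2` accepts a list of request headers:

```elixir
defmodule LoginSpider do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://www.example.com"

  @impl Crawly.Spider
  def init() do
    # The cookie value is a placeholder; in a real spider it would come
    # from a prior login step.
    request =
      Crawly.Request.new(
        "https://www.example.com/members",
        [{"Cookie", "session_id=PLACEHOLDER"}]
      )

    [start_requests: [request]]
  end

  @impl Crawly.Spider
  def parse_item(_response) do
    %{items: [], requests: []}
  end
end
```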

`base_url()` - defines the base URL of the given Spider. This function is used to filter out all requests going outside of the crawled website.

5 changes: 3 additions & 2 deletions lib/crawly/engine.ex
@@ -41,9 +41,10 @@ defmodule Crawly.Engine do
Supervisor.which_children(pid_sup)
|> Enum.find(&({Crawly.Manager, _, :worker, [Crawly.Manager]} = &1))
|> case do
nil -> {:error, :spider_not_found}
nil ->
{:error, :spider_not_found}

{_, pid, :worker, _} ->
pid
end
end
18 changes: 14 additions & 4 deletions lib/crawly/manager.ex
@@ -53,7 +53,7 @@ defmodule Crawly.Manager do
@impl true
def init(spider_name) do
# Getting spider init options (start urls/requests)
[start_urls: urls] = spider_name.init()
init = spider_name.init()

# Start DataStorage worker
{:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
@@ -65,10 +65,20 @@

Process.link(request_storage_pid)

# Store start requests
requests = Enum.map(urls, fn url -> Crawly.Request.new(url) end)
# Store start requests
Enum.each(
Keyword.get(init, :start_requests, []),
fn request ->
Crawly.RequestsStorage.store(spider_name, request)
end
)

:ok = Crawly.RequestsStorage.store(spider_name, requests)
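# Store start urls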
Enum.each(
Keyword.get(init, :start_urls, []),
fn url ->
Crawly.RequestsStorage.store(spider_name, Crawly.Request.new(url))
end
)

# Start workers
num_workers =
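Taken together with the manager change above, a spider's `init/0` may now return both keys; the prepared `start_requests` are stored first, and each `start_urls` entry is wrapped with `Crawly.Request.new/1` afterwards. A minimal sketch, assuming a hypothetical module and URLs:

```elixir
defmodule CombinedSpider do
  use Crawly.Spider

  def base_url(), do: "https://www.example.com"

  def init() do
    [
      # Stored first, as prepared Crawly.Request structs.
      start_requests: [Crawly.Request.new("https://www.example.com/login")],
      # Stored afterwards; each URL is wrapped in a Crawly.Request.
      start_urls: ["https://www.example.com/blog.html"]
    ]
  end

  def parse_item(_response), do: %{items: [], requests: []}
end
```

Starting it with `Crawly.Engine.start_spider(CombinedSpider)` would therefore enqueue the login request before the blog URL.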
4 changes: 2 additions & 2 deletions lib/crawly/spider.ex
@@ -3,7 +3,7 @@ defmodule Crawly.Spider do
A behavior module for implementing a Crawly Spider
A Spider is a module which is responsible for defining:
1. `init/0` function, which must return a keyword list with start_urls list
1. `init/0` function, which must return a keyword list with a start_urls and/or start_requests list
2. `base_url/0` function responsible for filtering out requests not related to
a given website
3. `parse_item/1` function which is responsible for parsing the downloaded
@@ -15,7 +15,7 @@
global settings defined in the config.
"""

@callback init() :: [start_urls: list()]
@callback init() :: [start_urls: list(), start_requests: list()]

@callback base_url() :: binary()

51 changes: 46 additions & 5 deletions test/manager_test.exs
@@ -1,6 +1,8 @@
defmodule ManagerTest do
use ExUnit.Case, async: false

alias Crawly.Engine

setup do
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
Application.put_env(:crawly, :closespider_itemcount, 10)
@@ -17,7 +19,8 @@ defmodule ManagerTest do

on_exit(fn ->
:meck.unload()
Crawly.Engine.stop_spider(Manager.TestSpider)
running_spiders = Engine.running_spiders() |> Map.keys()
Enum.each(running_spiders, &Engine.stop_spider/1)
Application.put_env(:crawly, :manager_operations_timeout, 30_000)
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
Application.put_env(:crawly, :closespider_timeout, 20)
@@ -118,6 +121,25 @@ defmodule ManagerTest do

assert_receive :manual_stop
end

test "It's possible to start a spider with start_requests" do
pid = self()
:ok = Crawly.Engine.start_spider(Manager.StartRequestsTestSpider)

:meck.expect(HTTPoison, :get, fn url, _, _ ->
send(pid, {:performing_request, url})

{:ok,
%HTTPoison.Response{
status_code: 200,
body: "Some page",
headers: [],
request: %{}
}}
end)

assert_receive {:performing_request, "https://www.example.com/blog.html"}
end
end

defmodule Manager.TestSpider do
@@ -159,12 +181,31 @@ defmodule Manager.TestSpider do
]
}
end
end

defmodule Manager.StartRequestsTestSpider do
use Crawly.Spider

def spider_closed(:manual_stop) do
send(:spider_closed_callback_test, :manual_stop)
def base_url() do
"https://www.example.com"
end

def spider_closed(_) do
:ignored
def init() do
[
start_requests: [Crawly.Request.new("https://www.example.com/blog.html")]
]
end

def parse_item(_response) do
path = Enum.random(1..100)

%{
:items => [
%{title: "t_#{path}", url: "example.com", author: "Me", time: "not set"}
],
:requests => [
Crawly.Utils.request_from_url("https://www.example.com/#{path}")
]
}
end
end
