Merge db15123 into 5867f16
oltarasenko authored Oct 16, 2020
2 parents 5867f16 + db15123 commit decd838
Showing 5 changed files with 81 additions and 14 deletions.
5 changes: 4 additions & 1 deletion documentation/basic_concepts.md
@@ -33,7 +33,10 @@ All items are processed sequentially and are processed by Item pipelines.

In order to make a working web crawler, all the behaviour callbacks need to be implemented.

`init()` - a part of the Crawly.Spider behaviour. This function should return a KVList which contains a `start_urls` entry with a list, which defines the starting requests made by Crawly.
`init()` - a part of the Crawly.Spider behaviour. This function should return a keyword list which contains a `start_urls` entry with a list of URLs, which defines the starting requests made by Crawly. Alternatively, you may provide `start_requests` if the first requests need to be prepared inside `init()`, which is useful if, for example, you want to pass a session cookie with the starting request. Note: `start_requests` are processed before `start_urls`.
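Below is a minimal sketch of such a spider. The module name, URLs, and cookie value are made up for illustration, and it assumes `Crawly.Request.new/2` accepts a list of headers as its second argument:

```elixir
defmodule ExampleSpider do
  use Crawly.Spider

  def base_url(), do: "https://www.example.com"

  def init() do
    [
      # Prepared requests from start_requests are stored first...
      start_requests: [
        Crawly.Request.new(
          "https://www.example.com/account",
          [{"Cookie", "session_id=hypothetical"}]
        )
      ],
      # ...and plain URLs from start_urls are stored afterwards.
      start_urls: ["https://www.example.com/blog"]
    ]
  end

  def parse_item(_response), do: %{items: [], requests: []}
end
```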

`base_url()` - defines a base_url of the given Spider. This function is used in order to filter out all requests which are going outside of the crawled website.

5 changes: 3 additions & 2 deletions lib/crawly/engine.ex
@@ -41,9 +41,10 @@ defmodule Crawly.Engine do
Supervisor.which_children(pid_sup)
|> Enum.find(&({Crawly.Manager, _, :worker, [Crawly.Manager]} = &1))
|> case do
nil -> {:error, :spider_not_found}
nil ->
{:error, :spider_not_found}

{_, pid, :worker, _} ->
IO.inspect pid
pid
end
end
27 changes: 23 additions & 4 deletions lib/crawly/manager.ex
@@ -53,7 +53,7 @@ defmodule Crawly.Manager do
@impl true
def init(spider_name) do
# Getting spider start urls
[start_urls: urls] = spider_name.init()
init = spider_name.init()

# Start DataStorage worker
{:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
@@ -65,10 +65,29 @@

Process.link(request_storage_pid)

# Store start requests
requests = Enum.map(urls, fn url -> Crawly.Request.new(url) end)
# Store start requests
Enum.each(
Keyword.get(init, :start_requests, []),
fn
%Crawly.Request{} = request ->
Crawly.RequestsStorage.store(spider_name, request)

request ->
# We should not attempt to store something which is not a request
Logger.error(
"#{inspect(request)} does not seem to be a request. Ignoring."
)

:ignore
end
)

:ok = Crawly.RequestsStorage.store(spider_name, requests)
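# Store start urls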
Enum.each(
Keyword.get(init, :start_urls, []),
fn url ->
Crawly.RequestsStorage.store(spider_name, Crawly.Request.new(url))
end
)

# Start workers
num_workers =
4 changes: 2 additions & 2 deletions lib/crawly/spider.ex
@@ -3,7 +3,7 @@ defmodule Crawly.Spider do
A behavior module for implementing a Crawly Spider
A Spider is a module which is responsible for defining:
1. `init/0` function, which must return a keyword list with start_urls list
1. `init/0` function, which must return a keyword list with a start_urls and/or start_requests list
2. `base_url/0` function responsible for filtering out requests not related to
a given website
3. `parse_item/1` function which is responsible for parsing the downloaded
@@ -15,7 +15,7 @@ defmodule Crawly.Spider do
global settings defined in the config.
"""

@callback init() :: [start_urls: list()]
@callback init() :: [start_urls: list(), start_requests: list()]

@callback base_url() :: binary()

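As a rough illustration of the updated `init/0` spec, either key may be returned on its own; per the manager change in this commit, a missing key is treated as an empty list. The module name and URL below are hypothetical:

```elixir
defmodule BlogSpider do
  use Crawly.Spider

  def base_url(), do: "https://www.example.com"

  def init() do
    # Only start_requests is returned; start_urls is simply omitted.
    [start_requests: [Crawly.Request.new("https://www.example.com/blog.html")]]
  end

  def parse_item(_response), do: %{items: [], requests: []}
end
```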
54 changes: 49 additions & 5 deletions test/manager_test.exs
@@ -1,6 +1,8 @@
defmodule ManagerTest do
use ExUnit.Case, async: false

alias Crawly.Engine

setup do
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
Application.put_env(:crawly, :closespider_itemcount, 10)
@@ -17,7 +19,8 @@ defmodule ManagerTest do

on_exit(fn ->
:meck.unload()
Crawly.Engine.stop_spider(Manager.TestSpider)
running_spiders = Engine.running_spiders() |> Map.keys()
Enum.each(running_spiders, &Engine.stop_spider/1)
Application.put_env(:crawly, :manager_operations_timeout, 30_000)
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
Application.put_env(:crawly, :closespider_timeout, 20)
@@ -118,6 +121,25 @@ defmodule ManagerTest do

assert_receive :manual_stop
end

test "It's possible to start a spider with start_requests" do
pid = self()
:ok = Crawly.Engine.start_spider(Manager.StartRequestsTestSpider)

:meck.expect(HTTPoison, :get, fn url, _, _ ->
send(pid, {:performing_request, url})

{:ok,
%HTTPoison.Response{
status_code: 200,
body: "Some page",
headers: [],
request: %{}
}}
end)

assert_receive {:performing_request, "https://www.example.com/blog.html"}
end
end

defmodule Manager.TestSpider do
@@ -159,12 +181,34 @@ defmodule Manager.TestSpider do
      ]
    }
  end

  def spider_closed(:manual_stop) do
    send(:spider_closed_callback_test, :manual_stop)
  end

  def spider_closed(_) do
    :ignored
  end
end

defmodule Manager.StartRequestsTestSpider do
  use Crawly.Spider

  def base_url() do
    "https://www.example.com"
  end

  def init() do
    [
      start_requests: [
        Crawly.Request.new("https://www.example.com/blog.html"),
        "Incorrect request"
      ]
    ]
  end

  def parse_item(_response) do
    path = Enum.random(1..100)

    %{
      :items => [
        %{title: "t_#{path}", url: "example.com", author: "Me", time: "not set"}
      ],
      :requests => [
        Crawly.Utils.request_from_url("https://www.example.com/#{path}")
      ]
    }
  end
end
