diff --git a/documentation/basic_concepts.md b/documentation/basic_concepts.md
index 9ca53443..63417760 100644
--- a/documentation/basic_concepts.md
+++ b/documentation/basic_concepts.md
@@ -33,7 +33,10 @@
 All items are processed sequentially and are processed by Item pipelines.
 
 In order to make a working web crawler, all the behaviour callbacks need to be implemented.
 
-`init()` - a part of the Crawly.Spider behaviour. This function should return a KVList which contains a `start_urls` entry with a list, which defines the starting requests made by Crawly.
+`init()` - a part of the Crawly.Spider behaviour. This function should return a keyword list which contains a `start_urls` entry with a list of URLs that define the starting requests made by Crawly. Alternatively you may provide a `start_requests`
+entry with a list of prebuilt `Crawly.Request` structs, which is useful if the first
+requests have to be prepared in `init()`, for example to pass a session cookie along
+with the starting request. Note: `start_requests` are processed before `start_urls`.
 
 `base_url()` - defines a base_url of the given Spider. This function is used in order to filter out all requests which are going outside of the crawled website.
diff --git a/lib/crawly/engine.ex b/lib/crawly/engine.ex
index 462df0c1..160c86b7 100644
--- a/lib/crawly/engine.ex
+++ b/lib/crawly/engine.ex
@@ -41,9 +41,10 @@ defmodule Crawly.Engine do
     Supervisor.which_children(pid_sup)
     |> Enum.find(&({Crawly.Manager, _, :worker, [Crawly.Manager]} = &1))
     |> case do
-      nil -> {:error, :spider_not_found}
+      nil ->
+        {:error, :spider_not_found}
+
       {_, pid, :worker, _} ->
-        IO.inspect pid
         pid
     end
   end
diff --git a/lib/crawly/manager.ex b/lib/crawly/manager.ex
index 6f4a6313..469034c9 100644
--- a/lib/crawly/manager.ex
+++ b/lib/crawly/manager.ex
@@ -53,7 +53,7 @@ defmodule Crawly.Manager do
   @impl true
   def init(spider_name) do
     # Getting spider start urls
-    [start_urls: urls] = spider_name.init()
+    init = spider_name.init()
 
     # Start DataStorage worker
     {:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
@@ -65,10 +65,20 @@ defmodule Crawly.Manager do
     Process.link(request_storage_pid)
 
-    # Store start requests
-    requests = Enum.map(urls, fn url -> Crawly.Request.new(url) end)
+    # Store start requests and start urls
+    Enum.each(
+      Keyword.get(init, :start_requests, []),
+      fn request ->
+        Crawly.RequestsStorage.store(spider_name, request)
+      end
+    )
 
-    :ok = Crawly.RequestsStorage.store(spider_name, requests)
+    Enum.each(
+      Keyword.get(init, :start_urls, []),
+      fn url ->
+        Crawly.RequestsStorage.store(spider_name, Crawly.Request.new(url))
+      end
+    )
 
     # Start workers
     num_workers =
diff --git a/lib/crawly/spider.ex b/lib/crawly/spider.ex
index afdf1570..6f1d0048 100644
--- a/lib/crawly/spider.ex
+++ b/lib/crawly/spider.ex
@@ -3,7 +3,7 @@ defmodule Crawly.Spider do
   A behavior module for implementing a Crawly Spider
 
   A Spider is a module which is responsible for defining:
-  1. `init/0` function, which must return a keyword list with start_urls list
+  1. `init/0` function, which must return a keyword list with a start_urls and/or start_requests list
   2. `base_url/0` function responsible for filtering out requests not related
      to a given website
   3. `parse_item/1` function which is responsible for parsing the downloaded
@@ -15,7 +15,7 @@
     global settings defined in the config.
""" - @callback init() :: [start_urls: list()] + @callback init() :: [start_urls: list(), start_requests: list()] @callback base_url() :: binary() diff --git a/test/manager_test.exs b/test/manager_test.exs index 715e354b..0fb4a9f3 100644 --- a/test/manager_test.exs +++ b/test/manager_test.exs @@ -1,6 +1,8 @@ defmodule ManagerTest do use ExUnit.Case, async: false + alias Crawly.Engine + setup do Application.put_env(:crawly, :concurrent_requests_per_domain, 1) Application.put_env(:crawly, :closespider_itemcount, 10) @@ -17,7 +19,8 @@ defmodule ManagerTest do on_exit(fn -> :meck.unload() - Crawly.Engine.stop_spider(Manager.TestSpider) + running_spiders = Engine.running_spiders() |> Map.keys() + Enum.each(running_spiders, &Engine.stop_spider/1) Application.put_env(:crawly, :manager_operations_timeout, 30_000) Application.put_env(:crawly, :concurrent_requests_per_domain, 1) Application.put_env(:crawly, :closespider_timeout, 20) @@ -118,6 +121,25 @@ defmodule ManagerTest do assert_receive :manual_stop end + + test "It's possible to start a spider with start_requests" do + pid = self() + :ok = Crawly.Engine.start_spider(Manager.StartRequestsTestSpider) + + :meck.expect(HTTPoison, :get, fn url, _, _ -> + send(pid, {:performing_request, url}) + + {:ok, + %HTTPoison.Response{ + status_code: 200, + body: "Some page", + headers: [], + request: %{} + }} + end) + + assert_receive {:performing_request, "https://www.example.com/blog.html"} + end end defmodule Manager.TestSpider do @@ -159,12 +181,31 @@ defmodule Manager.TestSpider do ] } end +end + +defmodule Manager.StartRequestsTestSpider do + use Crawly.Spider - def spider_closed(:manual_stop) do - send(:spider_closed_callback_test, :manual_stop) + def base_url() do + "https://www.example.com" end - def spider_closed(_) do - :ignored + def init() do + [ + start_requests: [Crawly.Request.new("https://www.example.com/blog.html")] + ] + end + + def parse_item(_response) do + path = Enum.random(1..100) + + %{ + :items => [ + %{title: "t_#{path}", url: "example.com", author: "Me", time: "not set"} + ], + :requests => [ + Crawly.Utils.request_from_url("https://www.example.com/#{path}") + ] + } end end