Add support for start_requests in Crawly.Spider.init callback
This is needed when you want to start crawling from prepared requests instead of raw URLs.

It is a must for login functionality.
oltarasenko committed Oct 6, 2020
1 parent 5867f16 commit 8c8e3f8
Showing 5 changed files with 69 additions and 14 deletions.
5 changes: 4 additions & 1 deletion documentation/basic_concepts.md
@@ -33,7 +33,10 @@ All items are processed sequentially and are processed by Item pipelines.

In order to make a working web crawler, all the behaviour callbacks need to be implemented.

`init()` - a part of the Crawly.Spider behaviour. This function should return a KVList which contains a `start_urls` entry with a list, which defines the starting requests made by Crawly.
`init()` - a part of the Crawly.Spider behaviour. This function should return a KVList which contains a `start_urls` entry with a list, which defines the starting requests made by Crawly. Alternatively, you may provide `start_requests` if the first requests need to be
prepared in `init()`, which might be useful if, for example, you
want to pass a session cookie with the starting request. Note: `start_requests` are
processed before `start_urls`.
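
For example, a spider that must log in before crawling could prepare its first request in `init()`. A minimal sketch, assuming a hypothetical spider module, URL and cookie value, and that `Crawly.Request.new/2` accepts a list of request headers:

```elixir
defmodule LoginSpider do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://www.example.com"

  @impl Crawly.Spider
  def init() do
    # The cookie value is a placeholder; in a real spider it would come
    # from a prior login step.
    request =
      Crawly.Request.new(
        "https://www.example.com/members",
        [{"Cookie", "session_id=PLACEHOLDER"}]
      )

    [start_requests: [request]]
  end

  @impl Crawly.Spider
  def parse_item(_response) do
    %{items: [], requests: []}
  end
end
```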

`base_url()` - defines the base URL of the given Spider. This function is used to filter out all requests going outside of the crawled website.

5 changes: 3 additions & 2 deletions lib/crawly/engine.ex
@@ -41,9 +41,10 @@ defmodule Crawly.Engine do
Supervisor.which_children(pid_sup)
|> Enum.find(&({Crawly.Manager, _, :worker, [Crawly.Manager]} = &1))
|> case do
nil -> {:error, :spider_not_found}
nil ->
{:error, :spider_not_found}

{_, pid, :worker, _} ->
pid
end
end
18 changes: 14 additions & 4 deletions lib/crawly/manager.ex
@@ -53,7 +53,7 @@ defmodule Crawly.Manager do
@impl true
def init(spider_name) do
# Getting spider init options (start urls/requests)
[start_urls: urls] = spider_name.init()
init = spider_name.init()

# Start DataStorage worker
{:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
@@ -65,10 +65,20 @@

Process.link(request_storage_pid)

# Store start requests
requests = Enum.map(urls, fn url -> Crawly.Request.new(url) end)
# Store start requests
Enum.each(
Keyword.get(init, :start_requests, []),
fn request ->
Crawly.RequestsStorage.store(spider_name, request)
end
)

:ok = Crawly.RequestsStorage.store(spider_name, requests)
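# Store start urls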
Enum.each(
Keyword.get(init, :start_urls, []),
fn url ->
Crawly.RequestsStorage.store(spider_name, Crawly.Request.new(url))
end
)

# Start workers
num_workers =
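Taken together with the manager change above, a spider's `init/0` may now return both keys; the prepared `start_requests` are stored first, and each `start_urls` entry is wrapped with `Crawly.Request.new/1` afterwards. A minimal sketch, assuming a hypothetical module and URLs:

```elixir
defmodule CombinedSpider do
  use Crawly.Spider

  def base_url(), do: "https://www.example.com"

  def init() do
    [
      # Stored first, as prepared Crawly.Request structs.
      start_requests: [Crawly.Request.new("https://www.example.com/login")],
      # Stored afterwards; each URL is wrapped in a Crawly.Request.
      start_urls: ["https://www.example.com/blog.html"]
    ]
  end

  def parse_item(_response), do: %{items: [], requests: []}
end
```

Starting it with `Crawly.Engine.start_spider(CombinedSpider)` would therefore enqueue the login request before the blog URL.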
4 changes: 2 additions & 2 deletions lib/crawly/spider.ex
@@ -3,7 +3,7 @@ defmodule Crawly.Spider do
A behavior module for implementing a Crawly Spider
A Spider is a module which is responsible for defining:
1. `init/0` function, which must return a keyword list with start_urls list
1. `init/0` function, which must return a keyword list with a start_urls and/or start_requests list
2. `base_url/0` function responsible for filtering out requests not related to
a given website
3. `parse_item/1` function which is responsible for parsing the downloaded
@@ -15,7 +15,7 @@
global settings defined in the config.
"""

@callback init() :: [start_urls: list()]
@callback init() :: [start_urls: list(), start_requests: list()]

@callback base_url() :: binary()

51 changes: 46 additions & 5 deletions test/manager_test.exs
@@ -1,6 +1,8 @@
defmodule ManagerTest do
use ExUnit.Case, async: false

alias Crawly.Engine

setup do
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
Application.put_env(:crawly, :closespider_itemcount, 10)
@@ -17,7 +19,8 @@ defmodule ManagerTest do

on_exit(fn ->
:meck.unload()
Crawly.Engine.stop_spider(Manager.TestSpider)
running_spiders = Engine.running_spiders() |> Map.keys()
Enum.each(running_spiders, &Engine.stop_spider/1)
Application.put_env(:crawly, :manager_operations_timeout, 30_000)
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
Application.put_env(:crawly, :closespider_timeout, 20)
@@ -118,6 +121,25 @@ defmodule ManagerTest do

assert_receive :manual_stop
end

test "It's possible to start a spider with start_requests" do
pid = self()
:ok = Crawly.Engine.start_spider(Manager.StartRequestsTestSpider)

:meck.expect(HTTPoison, :get, fn url, _, _ ->
send(pid, {:performing_request, url})

{:ok,
%HTTPoison.Response{
status_code: 200,
body: "Some page",
headers: [],
request: %{}
}}
end)

assert_receive {:performing_request, "https://www.example.com/blog.html"}
end
end

defmodule Manager.TestSpider do
@@ -159,12 +181,31 @@ defmodule Manager.TestSpider do
]
}
end
end

defmodule Manager.StartRequestsTestSpider do
use Crawly.Spider

def spider_closed(:manual_stop) do
send(:spider_closed_callback_test, :manual_stop)
def base_url() do
"https://www.example.com"
end

def spider_closed(_) do
:ignored
def init() do
[
start_requests: [Crawly.Request.new("https://www.example.com/blog.html")]
]
end

def parse_item(_response) do
path = Enum.random(1..100)

%{
:items => [
%{title: "t_#{path}", url: "example.com", author: "Me", time: "not set"}
],
:requests => [
Crawly.Utils.request_from_url("https://www.example.com/#{path}")
]
}
end
end
