Merge db15123 into 5867f16
oltarasenko authored Oct 16, 2020
2 parents 5867f16 + db15123 commit decd838
Showing 5 changed files with 81 additions and 14 deletions.
5 changes: 4 additions & 1 deletion documentation/basic_concepts.md
@@ -33,7 +33,10 @@ All items are processed sequentially and are processed by Item pipelines.

In order to make a working web crawler, all the behaviour callbacks need to be implemented.

`init()` - a part of the Crawly.Spider behaviour. This function should return a KVList which contains a `start_urls` entry with a list, which defines the starting requests made by Crawly.
`init()` - a part of the Crawly.Spider behaviour. This function should return a keyword list which contains a `start_urls` entry with a list of URLs, which defines the starting requests made by Crawly. Alternatively, you may provide `start_requests` if the first requests need to be prepared inside `init()`, which is useful if, for example, you want to pass a session cookie with the starting request. Note: `start_requests` are processed before `start_urls`.
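Below is a minimal sketch of such a spider. The module name, URLs, and cookie value are made up for illustration, and it assumes `Crawly.Request.new/2` accepts a list of headers as its second argument:

```elixir
defmodule ExampleSpider do
  use Crawly.Spider

  def base_url(), do: "https://www.example.com"

  def init() do
    [
      # Prepared requests from start_requests are stored first...
      start_requests: [
        Crawly.Request.new(
          "https://www.example.com/account",
          [{"Cookie", "session_id=hypothetical"}]
        )
      ],
      # ...and plain URLs from start_urls are stored afterwards.
      start_urls: ["https://www.example.com/blog"]
    ]
  end

  def parse_item(_response), do: %{items: [], requests: []}
end
```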

`base_url()` - defines a base_url of the given Spider. This function is used in order to filter out all requests which are going outside of the crawled website.

5 changes: 3 additions & 2 deletions lib/crawly/engine.ex
@@ -41,9 +41,10 @@ defmodule Crawly.Engine do
Supervisor.which_children(pid_sup)
|> Enum.find(&({Crawly.Manager, _, :worker, [Crawly.Manager]} = &1))
|> case do
nil -> {:error, :spider_not_found}
nil ->
{:error, :spider_not_found}

{_, pid, :worker, _} ->
IO.inspect pid
pid
end
end
27 changes: 23 additions & 4 deletions lib/crawly/manager.ex
@@ -53,7 +53,7 @@ defmodule Crawly.Manager do
@impl true
def init(spider_name) do
# Getting spider start urls
[start_urls: urls] = spider_name.init()
init = spider_name.init()

# Start DataStorage worker
{:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
@@ -65,10 +65,29 @@

Process.link(request_storage_pid)

# Store start requests
requests = Enum.map(urls, fn url -> Crawly.Request.new(url) end)
# Store start requests
Enum.each(
Keyword.get(init, :start_requests, []),
fn
%Crawly.Request{} = request ->
Crawly.RequestsStorage.store(spider_name, request)

request ->
# We should not attempt to store something which is not a request
Logger.error(
"#{inspect(request)} does not seem to be a request. Ignoring."
)

:ignore
end
)

:ok = Crawly.RequestsStorage.store(spider_name, requests)
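# Store start urls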
Enum.each(
Keyword.get(init, :start_urls, []),
fn url ->
Crawly.RequestsStorage.store(spider_name, Crawly.Request.new(url))
end
)

# Start workers
num_workers =
4 changes: 2 additions & 2 deletions lib/crawly/spider.ex
@@ -3,7 +3,7 @@ defmodule Crawly.Spider do
A behavior module for implementing a Crawly Spider
A Spider is a module which is responsible for defining:
1. `init/0` function, which must return a keyword list with start_urls list
1. `init/0` function, which must return a keyword list with a start_urls and/or start_requests list
2. `base_url/0` function responsible for filtering out requests not related to
a given website
3. `parse_item/1` function which is responsible for parsing the downloaded
@@ -15,7 +15,7 @@ defmodule Crawly.Spider do
global settings defined in the config.
"""

@callback init() :: [start_urls: list()]
@callback init() :: [start_urls: list(), start_requests: list()]

@callback base_url() :: binary()

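As a rough illustration of the updated `init/0` spec, either key may be returned on its own; per the manager change in this commit, a missing key is treated as an empty list. The module name and URL below are hypothetical:

```elixir
defmodule BlogSpider do
  use Crawly.Spider

  def base_url(), do: "https://www.example.com"

  def init() do
    # Only start_requests is returned; start_urls is simply omitted.
    [start_requests: [Crawly.Request.new("https://www.example.com/blog.html")]]
  end

  def parse_item(_response), do: %{items: [], requests: []}
end
```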
54 changes: 49 additions & 5 deletions test/manager_test.exs
@@ -1,6 +1,8 @@
defmodule ManagerTest do
use ExUnit.Case, async: false

alias Crawly.Engine

setup do
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
Application.put_env(:crawly, :closespider_itemcount, 10)
@@ -17,7 +19,8 @@ defmodule ManagerTest do

on_exit(fn ->
:meck.unload()
Crawly.Engine.stop_spider(Manager.TestSpider)
running_spiders = Engine.running_spiders() |> Map.keys()
Enum.each(running_spiders, &Engine.stop_spider/1)
Application.put_env(:crawly, :manager_operations_timeout, 30_000)
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
Application.put_env(:crawly, :closespider_timeout, 20)
@@ -118,6 +121,25 @@ defmodule ManagerTest do

assert_receive :manual_stop
end

test "It's possible to start a spider with start_requests" do
pid = self()
:ok = Crawly.Engine.start_spider(Manager.StartRequestsTestSpider)

:meck.expect(HTTPoison, :get, fn url, _, _ ->
send(pid, {:performing_request, url})

{:ok,
%HTTPoison.Response{
status_code: 200,
body: "Some page",
headers: [],
request: %{}
}}
end)

assert_receive {:performing_request, "https://www.example.com/blog.html"}
end
end

defmodule Manager.TestSpider do
@@ -159,12 +181,34 @@ defmodule Manager.TestSpider do
      ]
    }
  end

  def spider_closed(:manual_stop) do
    send(:spider_closed_callback_test, :manual_stop)
  end

  def spider_closed(_) do
    :ignored
  end
end

defmodule Manager.StartRequestsTestSpider do
  use Crawly.Spider

  def base_url() do
    "https://www.example.com"
  end

  def init() do
    [
      start_requests: [
        Crawly.Request.new("https://www.example.com/blog.html"),
        "Incorrect request"
      ]
    ]
  end

  def parse_item(_response) do
    path = Enum.random(1..100)

    %{
      :items => [
        %{title: "t_#{path}", url: "example.com", author: "Me", time: "not set"}
      ],
      :requests => [
        Crawly.Utils.request_from_url("https://www.example.com/#{path}")
      ]
    }
  end
end
