Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support of initial arguments #136

Merged
merged 2 commits into from
Nov 11, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions documentation/basic_concepts.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ In order to make a working web crawler, all the behaviour callbacks need to be i
to prepare first requests on `init()`. Which might be useful if, for example, you
want to pass a session cookie to the starting request. Note: `start_requests` are
processed before start_urls.
**Note**: this callback is going to be deprecated in favour of `init/1`. For now, backwards
compatibility is kept with the help of a macro which always generates `init/1`.

`init(options)` — the same as `init/0`, but it also takes options (which can be passed from the engine
during the spider start).

`base_url()` - defines a base_url of the given Spider. This function is used in order to filter out all requests which are going outside of the crawled website.

Expand Down
60 changes: 52 additions & 8 deletions lib/crawly/engine.ex
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,52 @@ defmodule Crawly.Engine do

defstruct(started_spiders: %{}, known_spiders: [])

@spec start_spider(module(), binary()) ::
:ok
| {:error, :spider_already_started}
| {:error, :atom}
def start_spider(spider_name, crawl_id \\ UUID.uuid1()) do
GenServer.call(__MODULE__, {:start_spider, spider_name, crawl_id})
@doc """
Starts the given spider with a freshly generated crawl id and no options.
"""
@spec start_spider(spider_name) :: result
      when spider_name: module(),
           result:
             :ok
             | {:error, :spider_already_started}
             | {:error, atom()}
def start_spider(spider_name) do
  # Delegate to start_spider/3 with a new crawl id and empty options.
  start_spider(spider_name, UUID.uuid1(), [])
end

@doc """
Starts the given spider under the supplied `crawl_id`, with no options.
"""
@spec start_spider(spider_name, crawl_id) :: result
      when spider_name: module(),
           crawl_id: binary(),
           result:
             :ok
             | {:error, :spider_already_started}
             | {:error, atom()}
def start_spider(spider_name, crawl_id) when is_binary(crawl_id) do
  start_spider(spider_name, crawl_id, [])
end

@doc """
Starts the given spider with the supplied options and a freshly generated
crawl id. The options are eventually passed to the spider's `init/1`.
"""
@spec start_spider(spider_name, options) :: result
      when spider_name: module(),
           options: list(),
           result:
             :ok
             | {:error, :spider_already_started}
             | {:error, atom()}
def start_spider(spider_name, options) when is_list(options) do
  start_spider(spider_name, UUID.uuid1(), options)
end

@doc """
Starts the given spider under the supplied `crawl_id`, forwarding `options`
to the spider process. All other `start_spider` clauses delegate here.
"""
@spec start_spider(spider_name, crawl_id, options) :: result
      when spider_name: module(),
           crawl_id: binary(),
           options: list(),
           result:
             :ok
             | {:error, :spider_already_started}
             | {:error, atom()}
def start_spider(spider_name, crawl_id, options) do
  GenServer.call(__MODULE__, {:start_spider, spider_name, crawl_id, options})
end

@spec get_manager(module()) ::
Expand Down Expand Up @@ -132,11 +172,15 @@ defmodule Crawly.Engine do
{:reply, format_spider_info(state), state}
end

def handle_call({:start_spider, spider_name, crawl_id}, _form, state) do
def handle_call(
{:start_spider, spider_name, crawl_id, options},
_form,
state
) do
result =
case Map.get(state.started_spiders, spider_name) do
nil ->
Crawly.EngineSup.start_spider(spider_name)
Crawly.EngineSup.start_spider(spider_name, options)

_ ->
{:error, :spider_already_started}
Expand Down
4 changes: 2 additions & 2 deletions lib/crawly/engine_sup.ex
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ defmodule Crawly.EngineSup do
DynamicSupervisor.init(strategy: :one_for_one)
end

def start_spider(spider_name) do
def start_spider(spider_name, options) do
result =
case Code.ensure_loaded?(spider_name) do
true ->
# Given spider module exists in the namespace, we can proceed
{:ok, _sup_pid} =
DynamicSupervisor.start_child(
__MODULE__,
{Crawly.ManagerSup, spider_name}
{Crawly.ManagerSup, [spider_name, options]}
)

false ->
Expand Down
8 changes: 4 additions & 4 deletions lib/crawly/manager.ex
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,15 @@ defmodule Crawly.Manager do
end
end

def start_link(spider_name) do
# Entry point used by the supervisor; args is a [spider_name, options] pair
# which is handed unchanged to init/1.
def start_link([spider_name, _options] = args) do
  Logger.debug("Starting the manager for #{spider_name}")
  GenServer.start_link(__MODULE__, args)
end

@impl true
def init(spider_name) do
def init([spider_name, options]) do
# Getting spider start urls
init = spider_name.init()
init = spider_name.init(options)

# Start DataStorage worker
{:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
Expand Down
8 changes: 4 additions & 4 deletions lib/crawly/manager_sup.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@ defmodule Crawly.ManagerSup do
@moduledoc false
use Supervisor

def start_link(spider_name) do
Supervisor.start_link(__MODULE__, spider_name)
# Boots the supervision tree for a single spider; args is a
# [spider_name, options] pair forwarded unchanged to init/1.
def start_link([_spider_name, _options] = args) do
  Supervisor.start_link(__MODULE__, args)
end

@impl true
def init(spider_name) do
def init([spider_name, options]) do
children = [
# This supervisor is used to spawn Worker processes
{DynamicSupervisor, strategy: :one_for_one, name: spider_name},

# Starts spider manager process
{Crawly.Manager, spider_name}
{Crawly.Manager, [spider_name, options]}
]

Supervisor.init(children, strategy: :one_for_one)
Expand Down
22 changes: 18 additions & 4 deletions lib/crawly/spider.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,22 @@ defmodule Crawly.Spider do

A Spider is a module which is responsible for defining:
1. `init/0` function, which must return a keyword list with start_urls/start_requests list
2. `base_url/0` function responsible for filtering out requests not related to
2. `init/1` — the same as `init/0`, but it also takes a list of options sent from the Engine
3. `base_url/0` function responsible for filtering out requests not related to
a given website
3. `parse_item/1` function which is responsible for parsing the downloaded
4. `parse_item/1` function which is responsible for parsing the downloaded
request and converting it into items which can be stored and new requests
which can be scheduled
4. `custom_settings/0` an optional callback which can be used in order to
5. `custom_settings/0` an optional callback which can be used in order to
provide custom spider specific settings. Should define a list with custom
settings and their values. These values will take precedence over the
global settings defined in the config.
"""



@callback init() :: [start_urls: list(), start_requests: list()]
# `options :: keyword()` names the argument for docs; the previous form
# `init(options: keyword())` wrongly typed the argument as the literal
# one-key keyword list [options: keyword()].
@callback init(options :: keyword()) :: [start_urls: list(), start_requests: list()]

@callback base_url() :: binary()

Expand All @@ -26,11 +30,21 @@ defmodule Crawly.Spider do

defmacro __using__(_opts) do
  quote do
    require Logger
    @behaviour Crawly.Spider

    def override_settings(), do: []

    # Backward compatibility: spiders that implement only init/0 get an
    # init/1 which ignores the options and delegates to init/0.
    def init(_options), do: init()

    # Default init/0 for spiders defining neither init/0 nor init/1.
    # Logs an error so a spider without start urls is noticed. Returns a
    # keyword list (not a map) to match the declared @callback return
    # type [start_urls: list(), ...].
    def init() do
      Logger.error("Using default spider init, without start urls")
      [start_urls: []]
    end

    defoverridable override_settings: 0, init: 1, init: 0
  end
end
end
27 changes: 27 additions & 0 deletions test/manager_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,16 @@ defmodule ManagerTest do

assert_receive {:performing_request, "https://www.example.com/blog.html"}
end

test "It's possible to initialize a spider with parameters" do
  # The spider under test sends its received urls to this named process.
  Process.register(self(), :manager_test_initial_args_test)

  expected_urls = [
    "https://example.com/1",
    "https://example.com/2",
    "https://example.com/3"
  ]

  :ok =
    Crawly.Engine.start_spider(
      Manager.InitialArgsTestSpider,
      urls: expected_urls
    )

  assert_receive received_urls
  assert Enum.sort(received_urls) == Enum.sort(expected_urls)
end
end

defmodule Manager.TestSpider do
Expand Down Expand Up @@ -212,3 +222,20 @@ defmodule Manager.StartRequestsTestSpider do
}
end
end

defmodule Manager.InitialArgsTestSpider do
  use Crawly.Spider

  def base_url(), do: "https://www.example.com"

  # Echoes the urls received via options back to the test process so the
  # test can assert that options reached init/1, then uses them as the
  # spider's start_urls.
  def init([urls: urls]) do
    send(:manager_test_initial_args_test, urls)
    [start_urls: urls]
  end

  def parse_item(_response), do: %{items: [], requests: []}
end
1 change: 0 additions & 1 deletion test/test_utils.ex
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ defmodule TestSpider do
end

defmodule UtilsTestSpider do
use GenServer
use Crawly.Spider

@impl true
Expand Down