Allow passing spider options to start_spider
Allows overriding some important spider options on startup (see the sketch after the list):
1. closespider_itemcount
2. closespider_timeout
3. concurrent_requests_per_domain (number of started workers)
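
For illustration, starting a spider with these overrides could look like the sketch below (MySpider is a hypothetical module implementing the Crawly.Spider behaviour; the values are arbitrary):

# Override the close conditions and the worker count for this run only;
# anything not passed here still falls back to config/spider settings.
Crawly.Engine.start_spider(MySpider,
  closespider_itemcount: 500,
  closespider_timeout: 10,
  concurrent_requests_per_domain: 8
)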

Also, I have rewritten manager_tests one more time. I am still unhappy with it,
but it now looks slightly better, IMO.
oltarasenko committed Dec 24, 2020
1 parent af21439 commit 5d3eff4
Showing 5 changed files with 95 additions and 180 deletions.
11 changes: 8 additions & 3 deletions lib/crawly/engine.ex
@@ -27,9 +27,14 @@ defmodule Crawly.Engine do
### Reserved Options
- `:crawl_id` (binary). Optional, automatically generated if not set.
### Backward compatability
- `:closespider_itemcount` (integer | disabled). Optional, overrides the close
spider item count on startup.
- `:closespider_timeout` (integer | disabled). Optional, overrides the close
spider timeout on startup.
- `:concurrent_requests_per_domain` (integer). Optional, overrides the number of
workers for a given spider
### Backward compatibility
If the 2nd positional argument is a binary, it will be set as the `:crawl_id`. Deprecated, will be removed in the future.
"""
@type crawl_id_opt :: {:crawl_id, binary()}
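
For clarity, the two call shapes described by the docs above would look roughly like this (MySpider is a hypothetical spider module; values are illustrative):

# Current form: keyword options, including the overrides added in this commit.
Crawly.Engine.start_spider(MySpider, crawl_id: "my-crawl", closespider_timeout: 20)

# Deprecated form: a bare binary as the second argument is still accepted
# and treated as the :crawl_id.
Crawly.Engine.start_spider(MySpider, "my-crawl")
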
45 changes: 32 additions & 13 deletions lib/crawly/manager.ex
@@ -55,6 +55,21 @@ defmodule Crawly.Manager do
def init([spider_name, options]) do
crawl_id = Keyword.get(options, :crawl_id)
Logger.metadata(spider_name: spider_name, crawl_id: crawl_id)

itemcount_limit =
Keyword.get(
options,
:closespider_itemcount,
get_default_limit(:closespider_itemcount, spider_name)
)

closespider_timeout_limit =
Keyword.get(
options,
:closespider_timeout,
get_default_limit(:closespider_timeout, spider_name)
)

# Start DataStorage worker
{:ok, data_storage_pid} =
Crawly.DataStorage.start_worker(spider_name, crawl_id)
@@ -69,7 +84,11 @@ defmodule Crawly.Manager do

# Start workers
num_workers =
Utils.get_settings(:concurrent_requests_per_domain, spider_name, 4)
Keyword.get(
options,
:concurrent_requests_per_domain,
Utils.get_settings(:concurrent_requests_per_domain, spider_name, 4)
)

worker_pids =
Enum.map(1..num_workers, fn _x ->
@@ -93,6 +112,8 @@ defmodule Crawly.Manager do
%{
name: spider_name,
crawl_id: crawl_id,
itemcount_limit: itemcount_limit,
closespider_timeout_limit: closespider_timeout_limit,
tref: tref,
prev_scraped_cnt: 0,
workers: worker_pids
@@ -152,27 +173,17 @@ defmodule Crawly.Manager do

Logger.info("Current crawl speed is: #{delta} items/min")

itemcount_limit =
:closespider_itemcount
|> Utils.get_settings(state.name)
|> maybe_convert_to_integer()

maybe_stop_spider_by_itemcount_limit(
state.name,
items_count,
itemcount_limit
state.itemcount_limit
)

# Close spider in case if it's not scraping items fast enough
closespider_timeout_limit =
:closespider_timeout
|> Utils.get_settings(state.name)
|> maybe_convert_to_integer()

maybe_stop_spider_by_timeout(
state.name,
delta,
closespider_timeout_limit
state.closespider_timeout_limit
)

tref =
@@ -224,4 +235,12 @@ defmodule Crawly.Manager do
end
)
end

# Get the default closespider_itemcount or closespider_timeout limit from the
# config or the spider's settings.
defp get_default_limit(limit_name, spider_name) do
limit_name
|> Utils.get_settings(spider_name)
|> maybe_convert_to_integer()
end
end
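
In short, each limit is now resolved as: the option passed to start_spider, then the value from Utils.get_settings (spider-level or global config), converted to an integer where needed. A simplified illustration of that precedence (MySpider is hypothetical):

options = [closespider_itemcount: 500]

# The option passed at startup wins; otherwise the configured value is used.
Keyword.get(
  options,
  :closespider_itemcount,
  Crawly.Utils.get_settings(:closespider_itemcount, MySpider)
)
# => 500
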
2 changes: 1 addition & 1 deletion lib/crawly/requests_storage/requests_storage_worker.ex
@@ -105,7 +105,7 @@ defmodule Crawly.RequestsStorage.Worker do
GenServer.call(pid, command)
catch
error, reason ->
Logger.debug("Could not fetch a request: #{inspect(reason)}")
Logger.debug("Could not get response: #{inspect(reason)}")
Logger.debug(Exception.format(:error, error, __STACKTRACE__))
end
end
3 changes: 2 additions & 1 deletion lib/crawly/worker.ex
@@ -9,6 +9,7 @@ defmodule Crawly.Worker do

# define the default worker fetch interval.
@default_backoff 10_000
@start_timeout 1000

defstruct backoff: @default_backoff, spider_name: nil, crawl_id: nil

@@ -21,7 +22,7 @@

def init(spider_name: spider_name, crawl_id: crawl_id) do
Logger.metadata(crawl_id: crawl_id, spider_name: spider_name)
Crawly.Utils.send_after(self(), :work, 0)
Crawly.Utils.send_after(self(), :work, @start_timeout)

{:ok,
%Crawly.Worker{
