Skip to content

Commit

Permalink
Another set of codereview improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
oltarasenko committed Apr 2, 2020
1 parent 1aa520a commit 4457b43
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 34 deletions.
2 changes: 1 addition & 1 deletion documentation/configuration.md
Expand Up @@ -109,7 +109,7 @@ Allows to specify a custom port to start the application. That is important when
## Overriding global settings on spider level

It's possible to override most of the settings at the spider level. In order to do that,
it's required to define a custom callback for Crawly.Spider behaviour.
it is required to define the `override_settings/0` callback in your spider.

For example:
```elixir
Expand Down
83 changes: 50 additions & 33 deletions lib/crawly/manager.ex
Expand Up @@ -74,12 +74,10 @@ defmodule Crawly.Manager do
)

# Schedule basic service operations for given spider manager
tref =
Process.send_after(
self(),
:operations,
Utils.get_settings(:manager_operations_timeout, spider_name, @timeout)
)
timeout =
Utils.get_settings(:manager_operations_timeout, spider_name, @timeout)

tref = Process.send_after(self(), :operations, timeout)

{:ok,
%{name: spider_name, tref: tref, prev_scraped_cnt: 0, workers: worker_pids}}
Expand All @@ -94,36 +92,28 @@ defmodule Crawly.Manager do
delta = items_count - state.prev_scraped_cnt
Logger.info("Current crawl speed is: #{delta} items/min")

case Utils.get_settings(:closespider_itemcount, state.name, :disabled) do
:disabled ->
:ignored

cnt when cnt < items_count ->
Logger.info(
"Stopping #{inspect(state.name)}, closespider_itemcount achieved"
)

Crawly.Engine.stop_spider(state.name)
itemcount_limit =
:closespider_itemcount
|> Utils.get_settings(state.name)
|> maybe_convert_to_integer()

_ ->
:ignoring
end
maybe_stop_spider_by_itemcount_limit(
state.name,
items_count,
itemcount_limit
)

# Close spider in case if it's not scraping items fast enough
case Utils.get_settings(:closespider_timeout, state.name, :disabled) do
:undefined ->
:ignoring

cnt when cnt > delta ->
Logger.info(
"Stopping #{inspect(state.name)}, itemcount timeout achieved"
)

Crawly.Engine.stop_spider(state.name)

_ ->
:ignoring
end
closespider_timeout_limit =
:closespider_timeout
|> Utils.get_settings(state.name)
|> maybe_convert_to_integer()

maybe_stop_spider_by_timeout(
state.name,
items_count,
closespider_timeout_limit
)

tref =
Process.send_after(
Expand All @@ -134,4 +124,31 @@ defmodule Crawly.Manager do

{:noreply, %{state | tref: tref, prev_scraped_cnt: items_count}}
end

# Stops the spider once the scraped-item count exceeds the configured
# `:closespider_itemcount` limit.
#
# The `is_integer(limit)` guard makes the "limit disabled" case (an atom
# such as `:disabled`) explicit, instead of relying on the accident of
# Erlang term ordering (numbers always compare less than atoms, so
# `current > :disabled` happened to be false).
defp maybe_stop_spider_by_itemcount_limit(spider_name, current, limit)
     when is_integer(limit) and current > limit do
  Logger.info(
    "Stopping #{inspect(spider_name)}, closespider_itemcount achieved"
  )

  Crawly.Engine.stop_spider(spider_name)
end

# No-op: limit disabled (non-integer) or not yet reached.
defp maybe_stop_spider_by_itemcount_limit(_, _, _), do: :ok

# Stops the spider when the crawl speed drops below the configured
# `:closespider_timeout` threshold.
#
# Bug fix: the previous guard was only `current < limit`. When the setting
# is disabled, `limit` is an atom (`:disabled`/`nil`), and in Erlang term
# ordering every number compares less than every atom — so the guard fired
# on every tick and killed the spider even with the feature turned off.
# Requiring `is_integer(limit)` restricts the check to real thresholds.
# The log message also named the wrong setting ("itemcount timeout").
defp maybe_stop_spider_by_timeout(spider_name, current, limit)
     when is_integer(limit) and current < limit do
  Logger.info("Stopping #{inspect(spider_name)}, closespider_timeout achieved")

  Crawly.Engine.stop_spider(spider_name)
end

# No-op: threshold disabled (non-integer) or crawl speed is acceptable.
defp maybe_stop_spider_by_timeout(_, _, _), do: :ok

# Normalizes a setting value: binaries are parsed with
# `String.to_integer/1`; atoms (e.g. `:disabled`) and integers pass
# through unchanged.
defp maybe_convert_to_integer(value) when is_binary(value) do
  String.to_integer(value)
end

defp maybe_convert_to_integer(value) when is_atom(value) or is_integer(value) do
  value
end
end

0 comments on commit 4457b43

Please sign in to comment.