From 4457b43c70c6f7e0d5432f5b092894e0320c2bb3 Mon Sep 17 00:00:00 2001 From: Oleg Tarasenko Date: Thu, 2 Apr 2020 17:51:11 +0200 Subject: [PATCH] Another set of codereview improvements --- documentation/configuration.md | 2 +- lib/crawly/manager.ex | 83 ++++++++++++++++++++-------------- 2 files changed, 51 insertions(+), 34 deletions(-) diff --git a/documentation/configuration.md b/documentation/configuration.md index d3ec617f..7b6e16db 100644 --- a/documentation/configuration.md +++ b/documentation/configuration.md @@ -109,7 +109,7 @@ Allows to specify a custom port to start the application. That is important when ## Overriding global settings on spider level It's possible to override most of the setting on a spider level. In order to do that, -it's required to define a custom callback for Crawly.Spider behaviour. +it is required to define the `override_settings/0` callback in your spider. For example: ```elixir diff --git a/lib/crawly/manager.ex b/lib/crawly/manager.ex index dcb5a0db..def2cc2a 100644 --- a/lib/crawly/manager.ex +++ b/lib/crawly/manager.ex @@ -74,12 +74,10 @@ defmodule Crawly.Manager do ) # Schedule basic service operations for given spider manager - tref = - Process.send_after( - self(), - :operations, - Utils.get_settings(:manager_operations_timeout, spider_name, @timeout) - ) + timeout = + Utils.get_settings(:manager_operations_timeout, spider_name, @timeout) + + tref = Process.send_after(self(), :operations, timeout) {:ok, %{name: spider_name, tref: tref, prev_scraped_cnt: 0, workers: worker_pids}} @@ -94,36 +92,28 @@ defmodule Crawly.Manager do delta = items_count - state.prev_scraped_cnt Logger.info("Current crawl speed is: #{delta} items/min") - case Utils.get_settings(:closespider_itemcount, state.name, :disabled) do - :disabled -> - :ignored - - cnt when cnt < items_count -> - Logger.info( - "Stopping #{inspect(state.name)}, closespider_itemcount achieved" - ) - - Crawly.Engine.stop_spider(state.name) + itemcount_limit = + 
# NOTE(review): helpers backing the limit checks in handle_info/2.
#
# Erlang term ordering sorts every number before every atom, so the
# original guards misbehaved whenever a limit setting was an atom such
# as :disabled / :undefined / nil (which maybe_convert_to_integer/1
# passes through untouched): `current < :disabled` is ALWAYS true, so
# the timeout check stopped the spider exactly when it was meant to be
# disabled. Guarding on is_integer/1 makes every non-integer limit fall
# through to the no-op clause for both checks.

# Stops `spider_name` once the scraped item count exceeds the
# configured limit. No-op when the limit is not an integer
# (i.e. the setting is disabled or unset).
defp maybe_stop_spider_by_itemcount_limit(spider_name, current, limit)
     when is_integer(limit) and current > limit do
  Logger.info(
    "Stopping #{inspect(spider_name)}, closespider_itemcount achieved"
  )

  Crawly.Engine.stop_spider(spider_name)
end

defp maybe_stop_spider_by_itemcount_limit(_spider_name, _current, _limit),
  do: :ok

# Stops `spider_name` when the crawl rate (`current`) drops below the
# configured minimum. No-op when the limit is not an integer.
# NOTE(review): the call site in handle_info/2 passes the absolute
# `items_count` here, while the pre-refactor code compared the
# per-minute `delta` — confirm which value is intended.
defp maybe_stop_spider_by_timeout(spider_name, current, limit)
     when is_integer(limit) and current < limit do
  Logger.info("Stopping #{inspect(spider_name)}, itemcount timeout achieved")

  Crawly.Engine.stop_spider(spider_name)
end

defp maybe_stop_spider_by_timeout(_spider_name, _current, _limit), do: :ok

# Normalizes a limit setting read from config: binaries are parsed with
# String.to_integer/1 (raises on non-numeric input), integers pass
# through, and atoms (:disabled, :undefined, nil) are kept as-is so the
# guards above treat them as "no limit".
defp maybe_convert_to_integer(value) when is_atom(value), do: value

defp maybe_convert_to_integer(value) when is_binary(value),
  do: String.to_integer(value)

defp maybe_convert_to_integer(value) when is_integer(value), do: value