diff --git a/README.md b/README.md index 4904e141..8942614d 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,6 @@ historical archival. config :crawly, closespider_timeout: 10, concurrent_requests_per_domain: 8, - closespider_itemcount: 1000, middlewares: [ Crawly.Middlewares.DomainFilter, Crawly.Middlewares.UniqueRequest, diff --git a/documentation/configuration.md b/documentation/configuration.md index 0ec1f7cd..b98064b4 100644 --- a/documentation/configuration.md +++ b/documentation/configuration.md @@ -93,11 +93,11 @@ config :crawly, Defines a list of middlewares responsible for pre-processing requests. If any of the requests from the `Crawly.Spider` is not passing the middleware, it's dropped. -### closespider_itemcount :: pos_integer() +### closespider_itemcount :: pos_integer() | :disabled -default: 5000 +default: :disabled -An integer which specifies a number of items. If the spider scrapes more than that amount and those items are passed by the item pipeline, the spider will be closed. If set to nil the spider will not be stopped. +An integer which specifies the maximum number of items. If the spider scrapes more than that amount and those items are passed by the item pipeline, the spider will be closed. If set to `:disabled` (the default), the spider will not be stopped based on the item count. 
### closespider_timeout :: pos_integer() diff --git a/documentation/quickstart.md b/documentation/quickstart.md index 6ffa1609..2557e0fd 100644 --- a/documentation/quickstart.md +++ b/documentation/quickstart.md @@ -58,7 +58,6 @@ Goals: config :crawly, closespider_timeout: 10, concurrent_requests_per_domain: 8, - closespider_itemcount: 1000, middlewares: [ Crawly.Middlewares.DomainFilter, {Crawly.Middlewares.RequestOptions, [timeout: 30_000]}, diff --git a/lib/crawly/manager.ex b/lib/crawly/manager.ex index 066d313b..37b25b0f 100644 --- a/lib/crawly/manager.ex +++ b/lib/crawly/manager.ex @@ -85,7 +85,10 @@ defmodule Crawly.Manager do delta = items_count - state.prev_scraped_cnt Logger.info("Current crawl speed is: #{delta} items/min") - case Application.get_env(:crawly, :closespider_itemcount, 1000) do + case Application.get_env(:crawly, :closespider_itemcount, :disabled) do + :disabled -> + :ignored + cnt when cnt < items_count -> Logger.info( "Stopping #{inspect(state.name)}, closespider_itemcount achieved"