diff --git a/config/test.exs b/config/test.exs
index ad9c1d78..388e986f 100644
--- a/config/test.exs
+++ b/config/test.exs
@@ -1,36 +1,37 @@
 use Mix.Config

 config :crawly,
-  manager_operations_timeout: 30_000,
+  manager_operations_timeout: 30_000,

-  # Stop spider after scraping certain amount of items
-  closespider_itemcount: 100,
-  # Stop spider if it does crawl fast enough
-  closespider_timeout: 20,
-  concurrent_requests_per_domain: 5,
-  follow_redirect: true,
+  # Stop spider after scraping certain amount of items
+  closespider_itemcount: 100,
+  # Stop spider if it does crawl fast enough
+  closespider_timeout: 20,
+  concurrent_requests_per_domain: 5,
+  follow_redirect: true,

-  # Request middlewares
-  # User agents which are going to be used with requests
-  user_agents: [
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
-  ],
-  middlewares: [
-    Crawly.Middlewares.DomainFilter,
-    Crawly.Middlewares.UniqueRequest,
-    Crawly.Middlewares.RobotsTxt,
-    {Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}
-  ],
-  pipelines: [
-    {Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
-    {Crawly.Pipelines.DuplicatesFilter, item_id: :title},
-    Crawly.Pipelines.JSONEncoder
-  ],
-  retry: [
-    retry_codes: [500, 404],
-    max_retries: 2,
-    ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
-  ]
+  # Request middlewares
+  # User agents which are going to be used with requests
+  user_agents: [
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
+  ],
+  middlewares: [
+    Crawly.Middlewares.DomainFilter,
+    Crawly.Middlewares.UniqueRequest,
+    Crawly.Middlewares.RobotsTxt,
+    {Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}
+  ],
+  pipelines: [
+    {Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
+    {Crawly.Pipelines.DuplicatesFilter, item_id: :title},
+    Crawly.Pipelines.JSONEncoder
+  ],
+  retry: [
+    retry_codes: [500, 404],
+    max_retries: 2,
+    ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
+  ]
+config :logger, level: :info
diff --git a/documentation/configuration.md b/documentation/configuration.md
index ec3aea68..0b2a760b 100644
--- a/documentation/configuration.md
+++ b/documentation/configuration.md
@@ -69,6 +69,10 @@ default: 4
 The maximum number of concurrent (ie. simultaneous) requests that will be
 performed by the Crawly workers.

+NOTE: A worker's speed is often limited by the speed of the actual HTTP client and
+network bandwidth. Crawly itself will not allow one worker to send more than
+4 requests per minute.
+
 ### retry :: Keyword list

 Allows to configure the retry logic. Accepts the following configuration options:
diff --git a/lib/crawly/worker.ex b/lib/crawly/worker.ex
index addc542c..f6d824b7 100644
--- a/lib/crawly/worker.ex
+++ b/lib/crawly/worker.ex
@@ -8,7 +8,7 @@ defmodule Crawly.Worker do
   require Logger

   # define the default worker fetch interval.
-  @default_backoff 300
+  @default_backoff 25_000

   defstruct backoff: @default_backoff, spider_name: nil

@@ -17,7 +17,7 @@ defmodule Crawly.Worker do
   end

   def init([spider_name]) do
-    Crawly.Utils.send_after(self(), :work, @default_backoff)
+    Crawly.Utils.send_after(self(), :work, 0)
     {:ok, %Crawly.Worker{spider_name: spider_name, backoff: @default_backoff}}
   end
diff --git a/test/manager_test.exs b/test/manager_test.exs
index e1b724ad..658e4d25 100644
--- a/test/manager_test.exs
+++ b/test/manager_test.exs
@@ -1,9 +1,10 @@
 defmodule ManagerTest do
-  use ExUnit.Case
+  use ExUnit.Case, async: false

   setup do
     Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
     Application.put_env(:crawly, :closespider_itemcount, 10)
+    Application.put_env(:crawly, :concurrent_requests_per_domain, 1)

     :meck.expect(HTTPoison, :get, fn _, _, _ ->
       {:ok,
@@ -16,7 +17,8 @@ defmodule ManagerTest do
     end)

     on_exit(fn ->
-      :meck.unload(HTTPoison)
+      :meck.unload()
+      Crawly.Engine.stop_spider(Manager.TestSpider)
       Application.put_env(:crawly, :manager_operations_timeout, 30_000)
       Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
       Application.put_env(:crawly, :closespider_timeout, 20)
@@ -24,12 +26,12 @@ defmodule ManagerTest do
     end)
   end

-  test "test normal spider behavior" do
+  test "max request per minute is respected" do
     :ok = Crawly.Engine.start_spider(Manager.TestSpider)

     {:stored_requests, num} = Crawly.RequestsStorage.stats(Manager.TestSpider)
     assert num == 1

-    Process.sleep(5_00)
+    Process.sleep(1_00)
     {:stored_items, num} = Crawly.DataStorage.stats(Manager.TestSpider)
     assert num == 1
@@ -39,40 +41,38 @@ defmodule ManagerTest do
   end

   test "Closespider itemcount is respected" do
-    Application.put_env(:crawly, :manager_operations_timeout, 1_000)
-    Application.put_env(:crawly, :closespider_timeout, 1)
-    Application.put_env(:crawly, :concurrent_requests_per_domain, 5)
-    Application.put_env(:crawly, :closespider_itemcount, 3)
+    Process.register(self(), :spider_closed_callback_test)
+
+    Application.put_env(:crawly, :manager_operations_timeout, 50)
+    Application.put_env(:crawly, :closespider_itemcount, 1)

     :ok = Crawly.Engine.start_spider(Manager.TestSpider)

-    Process.sleep(2_000)
+    assert_receive :itemcount_timeout
+
     assert %{} == Crawly.Engine.running_spiders()
   end

   test "Closespider timeout is respected" do
-    Application.put_env(:crawly, :manager_operations_timeout, 1_000)
-    Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
-    :ok = Crawly.Engine.start_spider(Manager.TestSpider)
-    Process.sleep(2_000)
-    assert %{} == Crawly.Engine.running_spiders()
-  end
-  test "Can't start already started spider" do
-    :ok = Crawly.Engine.start_spider(Manager.TestSpider)
+    Process.register(self(), :spider_closed_callback_test)

-    assert {:error, :spider_already_started} ==
-             Crawly.Engine.start_spider(Manager.TestSpider)
+    # Ignore closespider_itemcount
+    Application.put_env(:crawly, :closespider_itemcount, :disabled)

-    :ok = Crawly.Engine.stop_spider(Manager.TestSpider)
+    Application.put_env(:crawly, :closespider_timeout, 10)
+
+    Application.put_env(:crawly, :manager_operations_timeout, 50)
+    :ok = Crawly.Engine.start_spider(Manager.TestSpider)
+
+    assert_receive :itemcount_timeout
+    assert %{} == Crawly.Engine.running_spiders()
   end

-  test "Can't stop the spider which is not started already started spider" do
+  test "Can't start already started spider" do
     :ok = Crawly.Engine.start_spider(Manager.TestSpider)

     assert {:error, :spider_already_started} ==
              Crawly.Engine.start_spider(Manager.TestSpider)
-
-    :ok = Crawly.Engine.stop_spider(Manager.TestSpider)
   end

   test "Spider closed callback is called when spider is stopped" do
diff --git a/test/worker_test.exs b/test/worker_test.exs
index 34125be7..5c809e96 100644
--- a/test/worker_test.exs
+++ b/test/worker_test.exs
@@ -32,7 +32,7 @@ defmodule WorkerTest do
   test "Backoff increased when there is no work", context do
     send(context.crawler, :work)
     state = :sys.get_state(context.crawler)
-    assert state.backoff > 300
+    assert state.backoff > 25_000
   end

   test "Backoff interval restores if requests are in the system", context do
@@ -46,7 +46,7 @@ defmodule WorkerTest do

     send(context.crawler, :work)
     state = :sys.get_state(context.crawler)
-    assert state.backoff == 300
+    assert state.backoff == 25_000
   end
 end
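
The heart of this change is the worker's fetch loop: each cycle is driven by a delayed :work message, so raising @default_backoff from 300 ms to 25_000 ms is what throttles a single worker to roughly 60_000 / 25_000, about 2.4 fetch cycles per minute, which stays within the "4 requests per minute" ceiling stated in the documentation note, while scheduling the first :work with a delay of 0 keeps spider startup immediate. Below is a minimal sketch of that send_after-driven pattern, assuming a hypothetical RateLimitedWorker module and fetch_one/0 helper; it is illustrative only, not Crawly's actual implementation.

defmodule RateLimitedWorker do
  @moduledoc false
  use GenServer

  # With a 25_000 ms pause between fetch cycles a single worker performs at
  # most 60_000 / 25_000, i.e. about 2.4 fetches per minute while work exists.
  @default_backoff 25_000

  defstruct backoff: @default_backoff

  def start_link(opts \\ []), do: GenServer.start_link(__MODULE__, opts)

  @impl true
  def init(_opts) do
    # Schedule the first :work immediately so a fresh worker does not sit
    # idle for a full backoff interval before its first request.
    Process.send_after(self(), :work, 0)
    {:ok, %__MODULE__{}}
  end

  @impl true
  def handle_info(:work, state) do
    new_state =
      case fetch_one() do
        # A request was fetched: restore the base interval.
        {:ok, _response} -> %{state | backoff: @default_backoff}
        # Nothing to fetch: back off further so an idle worker polls less often.
        :no_work -> %{state | backoff: state.backoff * 2}
      end

    # Re-arm the loop; the backoff interval is what rate-limits the worker.
    Process.send_after(self(), :work, new_state.backoff)
    {:noreply, new_state}
  end

  # Hypothetical placeholder for "pop one request from storage and fetch it".
  defp fetch_one do
    if :rand.uniform() > 0.5, do: {:ok, :fake_response}, else: :no_work
  end
end

The grow-on-idle, reset-on-work behaviour sketched here is what the updated worker_test assertions describe: backoff climbs past 25_000 when there is no work and returns to 25_000 once requests are available again.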