Request throttling improvements (#111)
* Request throttling improvements

Previously a worker was able to send up to 200 requests per minute;
we decided to limit this much more aggressively. Now one worker can make
only 4 requests per minute.
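
Roughly, the pattern is that each worker schedules its own fetches on a
fixed interval instead of looping as fast as it can. A minimal sketch of
that idea (not the actual Crawly.Worker code; the module name, the 15_000 ms
interval and the do_fetch/1 helper are illustrative):

defmodule ThrottledWorker do
  use GenServer

  # With a 15_000 ms interval a single worker tops out at ~4 requests per minute.
  @interval 15_000

  def start_link(args), do: GenServer.start_link(__MODULE__, args)

  def init(args) do
    # Fire the first fetch immediately; every later fetch is spaced by @interval.
    send(self(), :work)
    {:ok, args}
  end

  def handle_info(:work, state) do
    # do_fetch/1 stands in for whatever performs the actual HTTP request.
    do_fetch(state)
    Process.send_after(self(), :work, @interval)
    {:noreply, state}
  end

  defp do_fetch(_state), do: :ok
end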

Also improved tests (removed ugly sleeps)

* Switch log level to :info for tests

* Implement code review suggestion
oltarasenko committed May 23, 2020
1 parent f21b21a commit b40fced
Showing 5 changed files with 62 additions and 57 deletions.
61 changes: 31 additions & 30 deletions config/test.exs
@@ -1,36 +1,37 @@
use Mix.Config

config :crawly,
manager_operations_timeout: 30_000,

# Stop spider after scraping certain amount of items
closespider_itemcount: 100,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,
follow_redirect: true,

# Request middlewares
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
{Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}
],
pipelines: [
{Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder
],
retry: [
retry_codes: [500, 404],
max_retries: 2,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
]

config :logger, level: :info
4 changes: 4 additions & 0 deletions documentation/configuration.md
@@ -69,6 +69,10 @@ default: 4

The maximum number of concurrent (i.e. simultaneous) requests that will be performed by the Crawly workers.

NOTE: A worker's speed is often limited by the speed of the actual HTTP client and
network bandwidth. Crawly itself will not allow one worker to send more than
4 requests per minute.
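
For example, raising the per-domain limit in your own config looks like this
(the value 8 below is only illustrative):

config :crawly,
  concurrent_requests_per_domain: 8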

### retry :: Keyword list

Allows configuring the retry logic. Accepts the following configuration options:
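
As a concrete example, the retry settings exercised by config/test.exs in this
commit look like this:

config :crawly,
  retry: [
    retry_codes: [500, 404],
    max_retries: 2,
    ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
  ]
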
4 changes: 2 additions & 2 deletions lib/crawly/worker.ex
@@ -8,7 +8,7 @@ defmodule Crawly.Worker do
require Logger

# define the default worker fetch interval.
@default_backoff 300
@default_backoff 25_000

defstruct backoff: @default_backoff, spider_name: nil

@@ -17,7 +17,7 @@
end

def init([spider_name]) do
Crawly.Utils.send_after(self(), :work, @default_backoff)
Crawly.Utils.send_after(self(), :work, 0)

{:ok, %Crawly.Worker{spider_name: spider_name, backoff: @default_backoff}}
end
46 changes: 23 additions & 23 deletions test/manager_test.exs
@@ -1,9 +1,10 @@
defmodule ManagerTest do
use ExUnit.Case
use ExUnit.Case, async: false

setup do
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
Application.put_env(:crawly, :closespider_itemcount, 10)
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)

:meck.expect(HTTPoison, :get, fn _, _, _ ->
{:ok,
@@ -16,20 +17,21 @@ defmodule ManagerTest do
end)

on_exit(fn ->
:meck.unload(HTTPoison)
:meck.unload()
Crawly.Engine.stop_spider(Manager.TestSpider)
Application.put_env(:crawly, :manager_operations_timeout, 30_000)
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
Application.put_env(:crawly, :closespider_timeout, 20)
Application.put_env(:crawly, :closespider_itemcount, 100)
end)
end

test "test normal spider behavior" do
test "max request per minute is respected" do
:ok = Crawly.Engine.start_spider(Manager.TestSpider)

{:stored_requests, num} = Crawly.RequestsStorage.stats(Manager.TestSpider)
assert num == 1
Process.sleep(5_00)
Process.sleep(1_00)

{:stored_items, num} = Crawly.DataStorage.stats(Manager.TestSpider)
assert num == 1
@@ -39,40 +41,38 @@
end

test "Closespider itemcount is respected" do
Application.put_env(:crawly, :manager_operations_timeout, 1_000)
Application.put_env(:crawly, :closespider_timeout, 1)
Application.put_env(:crawly, :concurrent_requests_per_domain, 5)
Application.put_env(:crawly, :closespider_itemcount, 3)
Process.register(self(), :spider_closed_callback_test)

Application.put_env(:crawly, :manager_operations_timeout, 50)
Application.put_env(:crawly, :closespider_itemcount, 1)
:ok = Crawly.Engine.start_spider(Manager.TestSpider)

Process.sleep(2_000)
assert_receive :itemcount_timeout

assert %{} == Crawly.Engine.running_spiders()
end

test "Closespider timeout is respected" do
Application.put_env(:crawly, :manager_operations_timeout, 1_000)
Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
:ok = Crawly.Engine.start_spider(Manager.TestSpider)
Process.sleep(2_000)
assert %{} == Crawly.Engine.running_spiders()
end

test "Can't start already started spider" do
:ok = Crawly.Engine.start_spider(Manager.TestSpider)
Process.register(self(), :spider_closed_callback_test)

assert {:error, :spider_already_started} ==
Crawly.Engine.start_spider(Manager.TestSpider)
# Ignore closespider_itemcount
Application.put_env(:crawly, :closespider_itemcount, :disabled)

:ok = Crawly.Engine.stop_spider(Manager.TestSpider)
Application.put_env(:crawly, :closespider_timeout, 10)

Application.put_env(:crawly, :manager_operations_timeout, 50)
:ok = Crawly.Engine.start_spider(Manager.TestSpider)

assert_receive :itemcount_timeout
assert %{} == Crawly.Engine.running_spiders()
end

test "Can't stop the spider which is not started already started spider" do
test "Can't start already started spider" do
:ok = Crawly.Engine.start_spider(Manager.TestSpider)

assert {:error, :spider_already_started} ==
Crawly.Engine.start_spider(Manager.TestSpider)

:ok = Crawly.Engine.stop_spider(Manager.TestSpider)
end

test "Spider closed callback is called when spider is stopped" do
4 changes: 2 additions & 2 deletions test/worker_test.exs
@@ -32,7 +32,7 @@ defmodule WorkerTest do
test "Backoff increased when there is no work", context do
send(context.crawler, :work)
state = :sys.get_state(context.crawler)
assert state.backoff > 300
assert state.backoff > 25_000
end

test "Backoff interval restores if requests are in the system", context do
@@ -46,7 +46,7 @@

send(context.crawler, :work)
state = :sys.get_state(context.crawler)
assert state.backoff == 300
assert state.backoff == 25_000
end
end

