
Try to fetch unsuccessful requests more than once. The amount
of attempts is controlled by the max retries setting.
oltarasenko committed Jan 13, 2020
1 parent 3186fad commit 6d0f911
Showing 12 changed files with 386 additions and 146 deletions.
6 changes: 6 additions & 0 deletions config/config.exs
@@ -33,6 +33,12 @@ config :crawly, Crawly.Worker, client: HTTPoison

config :crawly,
fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
retry:
[
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
],

# User agents which are going to be used with requests
user_agents: [
61 changes: 30 additions & 31 deletions config/test.exs
@@ -1,35 +1,34 @@
use Mix.Config

config :crawly,
manager_operations_timeout: 30_000,
manager_operations_timeout: 30_000,
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
# Stop spider after scraping a certain amount of items
closespider_itemcount: 100,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,
follow_redirects: true,
# Request middlewares
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
Crawly.Pipelines.JSONEncoder
],
retry: [
retry_codes: [500, 404],
max_retries: 2,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
]

# The path where items are stored
base_store_path: "/tmp/",
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
# Item definition
item: [:title, :author, :time, :url],
# Identifier which is used to filter out duplicates
item_id: :title,
# Stop spider after scraping a certain amount of items
closespider_itemcount: 100,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,
follow_redirects: true,
# Request middlewares
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
Crawly.Pipelines.JSONEncoder
]
24 changes: 24 additions & 0 deletions documentation/configuration.md
@@ -126,6 +126,30 @@ config :crawly,
proxy: "<proxy_host>:<proxy_port>",
```

### retry :: Keyword list

Allows configuring the retry logic. It accepts the following configuration options:
1) *retry_codes*: Allows specifying a list of HTTP status codes which are treated
as failed responses. (Default: [])

2) *max_retries*: Allows specifying the number of retry attempts before the request
is abandoned. (Default: 0)

3) *ignored_middlewares*: Allows modifying the list of middlewares applied to a
request when it is retried. (This is needed to avoid clashes with the
Crawly.Middlewares.UniqueRequest middleware, which would otherwise drop the
retried request as a duplicate.)

Example:
```
retry:
[
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
]
```
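
For illustration, the worker reads these options at retry time roughly like
this (a simplified sketch of the option lookup only, not the full retry logic):

```
retry_settings = Application.get_env(:crawly, :retry, [])

retry_codes = Keyword.get(retry_settings, :retry_codes, [])
max_retries = Keyword.get(retry_settings, :max_retries, 0)
ignored_middlewares = Keyword.get(retry_settings, :ignored_middlewares, [])
```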

### fetcher :: atom()

default: Crawly.Fetchers.HTTPoisonFetcher
1 change: 1 addition & 0 deletions lib/crawly/fetchers/httpoison_fetcher.ex
@@ -7,6 +7,7 @@ defmodule Crawly.Fetchers.HTTPoisonFetcher do
require Logger

def fetch(request, _client_options) do
# TODO: This should return Crawly.Response.
HTTPoison.get(request.url, request.headers, request.options)
end
end
2 changes: 1 addition & 1 deletion lib/crawly/manager.ex
@@ -51,7 +51,7 @@ defmodule Crawly.Manager do
Process.link(request_storage_pid)

# Store start requests
requests = Enum.map(urls, fn url -> %Crawly.Request{url: url} end)
requests = Enum.map(urls, fn url -> Crawly.Request.new(url) end)

:ok = Crawly.RequestsStorage.store(spider_name, requests)

68 changes: 62 additions & 6 deletions lib/crawly/request.ex
@@ -4,7 +4,15 @@ defmodule Crawly.Request do
Defines Crawly request structure.
"""
defstruct url: nil, headers: [], prev_response: nil, options: []
###===========================================================================
### Type definitions
###===========================================================================
defstruct url: nil,
headers: [],
prev_response: nil,
options: [],
middlewares: [],
retries: 0

@type header() :: {key(), value()}
@type url() :: binary()
@@ -15,9 +23,57 @@
@type option :: {atom(), binary()}

@type t :: %__MODULE__{
url: url(),
headers: [header()],
prev_response: %{},
options: [option()]
}
url: url(),
headers: [header()],
prev_response: %{},
options: [option()],
middlewares: [atom()],
retries: non_neg_integer()
}

###===========================================================================
### API functions
###===========================================================================
@doc """
Creates a new Crawly.Request from a url, headers and options
"""
@spec new(url, headers, options) :: request
when url: binary(),
headers: [term()],
options: [term()],
request: Crawly.Request.t()

def new(url, headers \\ [], options \\ []) do
# Define a list of middlewares which are used by default to process
# incoming requests
default_middlewares = [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt
]

middlewares =
Application.get_env(:crawly, :middlewares, default_middlewares)

new(url, headers, options, middlewares)
end

@doc """
Same as Crawly.Request.new/3, but allows specifying middlewares as the 4th
parameter.
"""
@spec new(url, headers, options, middlewares) :: request
when url: binary(),
headers: [term()],
options: [term()],
middlewares: [term()], # TODO: improve typespec here
request: Crawly.Request.t()
def new(url, headers, options, middlewares) do
%Crawly.Request{
url: url,
headers: headers,
options: options,
middlewares: middlewares
}
end
end
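
A usage sketch of the new constructors (the URL and header values below are
placeholders, not part of the commit): new/1 picks up the configured or default
middlewares, while new/4 lets the caller supply an explicit middleware list.

```
# new/1: middlewares come from the :middlewares config or the defaults above
request = Crawly.Request.new("https://example.com")

# new/4: explicit headers, options and middleware list
custom_request =
  Crawly.Request.new(
    "https://example.com",
    [{"User-Agent", "my-agent"}],
    [],
    [Crawly.Middlewares.DomainFilter, Crawly.Middlewares.RobotsTxt]
  )
```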
11 changes: 1 addition & 10 deletions lib/crawly/requests_storage/requests_storage_worker.ex
@@ -63,16 +63,7 @@ defmodule Crawly.RequestsStorage.Worker do

# Store the given request
def handle_call({:store, request}, _from, state) do
# Define a list of middlewares which are used by default to process incoming
# requests
default_middlewares = [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt
]

middlewares =
Application.get_env(:crawly, :middlewares, default_middlewares)
middlewares = request.middlewares

new_state =
case Crawly.Utils.pipe(middlewares, request, state) do
13 changes: 11 additions & 2 deletions lib/crawly/utils.ex
@@ -8,7 +8,7 @@ defmodule Crawly.Utils do
A helper function which returns a Request structure for the given URL
"""
@spec request_from_url(binary()) :: Crawly.Request.t()
def request_from_url(url), do: %Crawly.Request{url: url, headers: []}
def request_from_url(url), do: Crawly.Request.new(url)

@doc """
A helper function which converts a list of URLS into a requests list.
@@ -60,7 +60,6 @@ defmodule Crawly.Utils do
{new_item, new_state} = Crawly.Utils.pipe(pipelines, item, state)
```
"""
@spec pipe(pipelines, item, state) :: result
when pipelines: [Crawly.Pipeline.t()],
@@ -104,4 +103,14 @@ defmodule Crawly.Utils do

pipe(pipelines, new_item, new_state)
end

@doc """
A wrapper over Process.send_after/3.
This wrapper should be used instead of Process.send_after, so the latter can
be mocked in tests to avoid race conditions when testing workers.
"""
@spec send_after(pid(), term(), pos_integer()) :: reference()
def send_after(pid, message, timeout) do
Process.send_after(pid, message, timeout)
end
end
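
Because the docstring above says the wrapper exists so Process.send_after can be
mocked, a worker test might stub it roughly like this (a sketch assuming the
:meck library already used elsewhere in the test suite; the synchronous delivery
is illustrative):

```
# Sketch: replace the backoff timer with immediate delivery during a test
:meck.new(Crawly.Utils, [:passthrough])

:meck.expect(Crawly.Utils, :send_after, fn pid, message, _timeout ->
  send(pid, message)
  # keep the reference() return type promised by the @spec
  make_ref()
end)

# ... exercise the worker here ...

:meck.unload(Crawly.Utils)
```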
56 changes: 48 additions & 8 deletions lib/crawly/worker.ex
@@ -17,7 +17,7 @@ defmodule Crawly.Worker do
end

def init([spider_name]) do
Process.send_after(self(), :work, @default_backoff)
Crawly.Utils.send_after(self(), :work, @default_backoff)

{:ok, %Crawly.Worker{spider_name: spider_name, backoff: @default_backoff}}
end
@@ -59,7 +59,7 @@ defmodule Crawly.Worker do
end
end

Process.send_after(self(), :work, new_backoff)
Crawly.Utils.send_after(self(), :work, new_backoff)

{:noreply, %{state | backoff: new_backoff}}
end
@@ -68,7 +68,7 @@
when request: Crawly.Request.t(),
spider_name: atom(),
response: HTTPoison.Response.t(),
result: {:ok, response, spider_name} | {:error, term()}
result: {:ok, {response, spider_name}} | {:error, term()}
defp get_response({request, spider_name}) do
# check if spider-level fetcher is set. Overrides the globally configured fetcher.
# if not set, log warning for explicit config preferred,
@@ -79,13 +79,25 @@
{Crawly.Fetchers.HTTPoisonFetcher, []}
)

retry_options = Application.get_env(:crawly, :retry, [])
retry_codes = Keyword.get(retry_options, :retry_codes, [])
case fetcher.fetch(request, options) do
{:ok, response} ->
{:ok, {response, spider_name}}

{:error, _reason} = response ->
response
{:error, _reason} = err ->
:ok = maybe_retry_request(spider_name, request)
err

{:ok, %HTTPoison.Response{status_code: code} = response} ->
# Send the request back for a retry if the response status code is one of
# the configured retry codes.
case code in retry_codes do
true ->
:ok = maybe_retry_request(spider_name, request)
{:error, :retry}
false ->
{:ok, {response, spider_name}}
end
end

end

@spec parse_item({response, spider_name}) :: result
@@ -158,4 +170,32 @@

{:ok, :done}
end

## Retry a request if the max_retries setting allows it
defp maybe_retry_request(spider, request) do
retries = request.retries
retry_settings = Application.get_env(:crawly, :retry, Keyword.new())

ignored_middlewares = Keyword.get(retry_settings, :ignored_middlewares, [])
max_retries = Keyword.get(retry_settings, :max_retries, 0)

case retries <= max_retries do
true ->
Logger.info("Request to #{request.url}, is scheduled for retry")

middlewares = request.middlewares -- ignored_middlewares

request = %Crawly.Request{
request |
middlewares: middlewares,
retries: retries + 1
}

:ok = Crawly.RequestsStorage.store(spider, request)
false ->
Logger.info("Dropping request to #{request.url}, (max retries)")
:ok
end

end
end
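
Putting the worker changes together: when a fetch returns an error or one of the
configured retry codes, the request is stored again with its retry counter
incremented and the ignored middlewares removed, so Crawly.Middlewares.UniqueRequest
will not drop the repeated URL. An illustrative shape of such a request after one
retry (values are made up; assumes the hard-coded defaults from Crawly.Request.new/3
and a retry config that ignores UniqueRequest):

```
%Crawly.Request{
  url: "https://example.com/page",
  retries: 1,
  # UniqueRequest stripped via ignored_middlewares
  middlewares: [
    Crawly.Middlewares.DomainFilter,
    Crawly.Middlewares.RobotsTxt
  ]
}
```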
18 changes: 3 additions & 15 deletions test/request_storage_test.exs
@@ -64,11 +64,7 @@ defmodule RequestStorageTest do
end

test "Duplicated requests are filtered out", context do
request = %Crawly.Request{
url: "http://example.com",
headers: [],
options: []
}
request = Crawly.Utils.request_from_url("http://example.com")

:ok = Crawly.RequestsStorage.store(context.crawler, request)
:ok = Crawly.RequestsStorage.store(context.crawler, request)
@@ -91,23 +87,15 @@
end

test "Outbound requests are filtered out", context do
request = %Crawly.Request{
url: "http://otherdomain.com",
headers: [],
options: []
}
request = Crawly.Utils.request_from_url("http://otherdomain.com")

:ok = Crawly.RequestsStorage.store(context.crawler, request)
{:stored_requests, num} = Crawly.RequestsStorage.stats(context.crawler)
assert 0 == num
end

test "Robots.txt is respected", context do
request = %Crawly.Request{
url: "http://example.com/filter",
headers: [],
options: []
}
request = Crawly.Utils.request_from_url("http://example.com/filter")

:meck.expect(Gollum, :crawlable?, fn _, "http://example.com/filter" ->
:uncrawlable
