Merge pull request #47 from oltarasenko/new_re_tries
Retry support
Ziinc committed Jan 13, 2020
2 parents 3186fad + 6d0f911 commit 8d78f06
Showing 12 changed files with 386 additions and 146 deletions.
6 changes: 6 additions & 0 deletions config/config.exs
@@ -33,6 +33,12 @@ config :crawly, Crawly.Worker, client: HTTPoison

config :crawly,
fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
retry:
[
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
],

# User agents which are going to be used with requests
user_agents: [
61 changes: 30 additions & 31 deletions config/test.exs
@@ -1,35 +1,34 @@
use Mix.Config

config :crawly,
manager_operations_timeout: 30_000,
manager_operations_timeout: 30_000,
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
# Stop spider after scraping certain amount of items
closespider_itemcount: 100,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,
follow_redirects: true,
# Request middlewares
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
Crawly.Pipelines.JSONEncoder
],
retry: [
retry_codes: [500, 404],
max_retries: 2,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
]

# The path where items are stored
base_store_path: "/tmp/",
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
# Item definition
item: [:title, :author, :time, :url],
# Identifier which is used to filter out duplicates
item_id: :title,
# Stop spider after scraping certain amount of items
closespider_itemcount: 100,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,
follow_redirects: true,
# Request middlewares
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
Crawly.Pipelines.JSONEncoder
]
24 changes: 24 additions & 0 deletions documentation/configuration.md
@@ -126,6 +126,30 @@ config :crawly,
proxy: "<proxy_host>:<proxy_port>",
```

### retry :: Keyword list

Allows you to configure the retry logic. Accepts the following configuration options:
1) *retry_codes*: a list of HTTP status codes which are treated as
failed responses. (Default: [])

2) *max_retries*: the number of retry attempts before the request is
abandoned. (Default: 0)

3) *ignored_middlewares*: middlewares to remove from a request's processing
list when it is retried. (Required to avoid clashes with the
Crawly.Middlewares.UniqueRequest middleware, which would otherwise drop the
re-scheduled request.)

Example:
```
retry:
[
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
]
```

### fetcher :: atom()

default: Crawly.Fetchers.HTTPoisonFetcher
1 change: 1 addition & 0 deletions lib/crawly/fetchers/httpoison_fetcher.ex
@@ -7,6 +7,7 @@ defmodule Crawly.Fetchers.HTTPoisonFetcher do
require Logger

def fetch(request, _client_options) do
# TODO: This should return Crawly.Response.
HTTPoison.get(request.url, request.headers, request.options)
end
end
2 changes: 1 addition & 1 deletion lib/crawly/manager.ex
@@ -51,7 +51,7 @@ defmodule Crawly.Manager do
Process.link(request_storage_pid)

# Store start requests
requests = Enum.map(urls, fn url -> %Crawly.Request{url: url} end)
requests = Enum.map(urls, fn url -> Crawly.Request.new(url) end)

:ok = Crawly.RequestsStorage.store(spider_name, requests)

68 changes: 62 additions & 6 deletions lib/crawly/request.ex
@@ -4,7 +4,15 @@ defmodule Crawly.Request do
Defines Crawly request structure.
"""
defstruct url: nil, headers: [], prev_response: nil, options: []
###===========================================================================
### Type definitions
###===========================================================================
defstruct url: nil,
headers: [],
prev_response: nil,
options: [],
middlewares: [],
retries: 0

@type header() :: {key(), value()}
@type url() :: binary()
@@ -15,9 +15,23 @@
@type option :: {atom(), binary()}

@type t :: %__MODULE__{
url: url(),
headers: [header()],
prev_response: %{},
options: [option()]
}
url: url(),
headers: [header()],
prev_response: %{},
options: [option()],
middlewares: [atom()],
retries: non_neg_integer()
}

###===========================================================================
### API functions
###===========================================================================
@doc """
Creates a new Crawly.Request from a url, headers and options
"""
@spec new(url, headers, options) :: request
when url: binary(),
headers: [term()],
options: [term()],
request: Crawly.Request.t()

def new(url, headers \\ [], options \\ []) do
# Define a list of middlewares which are used by default to process
# incoming requests
default_middlewares = [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt
]

middlewares =
Application.get_env(:crawly, :middlewares, default_middlewares)

new(url, headers, options, middlewares)
end

@doc """
Same as Crawly.Request.new/3 but allows specifying middlewares as the 4th
parameter.
"""
@spec new(url, headers, options, middlewares) :: request
when url: binary(),
headers: [term()],
options: [term()],
middlewares: [term()], # TODO: improve typespec here
request: Crawly.Request.t()
def new(url, headers, options, middlewares) do
%Crawly.Request{
url: url,
headers: headers,
options: options,
middlewares: middlewares
}
end
end
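
For orientation, a minimal usage sketch of the new constructors (illustrative only, not part of the diff; the URL and header values below are made up):
```
# new/3 picks up the middleware list from the :middlewares config key,
# falling back to the defaults listed above, and starts retries at 0.
request = Crawly.Request.new("https://example.com/page/1")

# new/4 sets the middleware list explicitly, e.g. to leave out UniqueRequest
# when re-scheduling a request (as the retry logic in Crawly.Worker does).
retry_request =
  Crawly.Request.new(
    "https://example.com/page/1",
    [{"User-Agent", "my-bot"}],
    [],
    [Crawly.Middlewares.DomainFilter, Crawly.Middlewares.RobotsTxt]
  )
```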
11 changes: 1 addition & 10 deletions lib/crawly/requests_storage/requests_storage_worker.ex
@@ -63,16 +63,7 @@ defmodule Crawly.RequestsStorage.Worker do

# Store the given request
def handle_call({:store, request}, _from, state) do
# Define a list of middlewares which are used by default to process incoming
# requests
default_middlewares = [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt
]

middlewares =
Application.get_env(:crawly, :middlewares, default_middlewares)
middlewares = request.middlewares

new_state =
case Crawly.Utils.pipe(middlewares, request, state) do
13 changes: 11 additions & 2 deletions lib/crawly/utils.ex
@@ -8,7 +8,7 @@ defmodule Crawly.Utils do
A helper function which returns a Request structure for the given URL
"""
@spec request_from_url(binary()) :: Crawly.Request.t()
def request_from_url(url), do: %Crawly.Request{url: url, headers: []}
def request_from_url(url), do: Crawly.Request.new(url)

@doc """
A helper function which converts a list of URLs into a requests list.
@@ -60,7 +60,6 @@
{new_item, new_state} = Crawly.Utils.pipe(pipelines, item, state)
```
"""
@spec pipe(pipelines, item, state) :: result
when pipelines: [Crawly.Pipeline.t()],
@@ -104,4 +103,14 @@

pipe(pipelines, new_item, new_state)
end

@doc """
A wrapper over Process.send_after/3.
This wrapper should be used instead of Process.send_after directly, so that it
can be mocked in tests and race conditions in worker tests are avoided.
"""
@spec send_after(pid(), term(), pos_integer()) :: reference()
def send_after(pid, message, timeout) do
Process.send_after(pid, message, timeout)
end
end
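
The wrapper exists so the scheduling call can be stubbed in worker tests; a sketch of how that could look, assuming :meck as used elsewhere in this test suite (the stub itself is hypothetical, not part of the diff):
```
# Hypothetical test stub: make a worker under test never re-schedule itself.
:meck.new(Crawly.Utils, [:passthrough])

:meck.expect(Crawly.Utils, :send_after, fn _pid, _message, _timeout ->
  # make_ref/0 satisfies the reference() return type declared in the spec
  make_ref()
end)
```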
56 changes: 48 additions & 8 deletions lib/crawly/worker.ex
@@ -17,7 +17,7 @@ defmodule Crawly.Worker do
end

def init([spider_name]) do
Process.send_after(self(), :work, @default_backoff)
Crawly.Utils.send_after(self(), :work, @default_backoff)

{:ok, %Crawly.Worker{spider_name: spider_name, backoff: @default_backoff}}
end
@@ -59,7 +59,7 @@
end
end

Process.send_after(self(), :work, new_backoff)
Crawly.Utils.send_after(self(), :work, new_backoff)

{:noreply, %{state | backoff: new_backoff}}
end
@@ -68,7 +68,7 @@
when request: Crawly.Request.t(),
spider_name: atom(),
response: HTTPoison.Response.t(),
result: {:ok, response, spider_name} | {:error, term()}
result: {:ok, {response, spider_name}} | {:error, term()}
defp get_response({request, spider_name}) do
# check if spider-level fetcher is set. Overrides the globally configured fetcher.
# if not set, log warning for explicit config preferred,
@@ -79,13 +79,25 @@
{Crawly.Fetchers.HTTPoisonFetcher, []}
)

retry_options = Application.get_env(:crawly, :retry, [])
retry_codes = Keyword.get(retry_options, :retry_codes, [])
case fetcher.fetch(request, options) do
{:ok, response} ->
{:ok, {response, spider_name}}

{:error, _reason} = response ->
response
{:error, _reason} = err ->
:ok = maybe_retry_request(spider_name, request)
err

{:ok, %HTTPoison.Response{status_code: code} = response} ->
# Send the request back for a retry if the status code is one of the
# configured retry codes.
case code in retry_codes do
true ->
:ok = maybe_retry_request(spider_name, request)
{:error, :retry}
false ->
{:ok, {response, spider_name}}
end
end

end

@spec parse_item({response, spider_name}) :: result
@@ -158,4 +170,32 @@

{:ok, :done}
end

## Retry a request if the max_retries setting allows it
defp maybe_retry_request(spider, request) do
retries = request.retries
retry_settings = Application.get_env(:crawly, :retry, Keyword.new())

ignored_middlewares = Keyword.get(retry_settings, :ignored_middlewares, [])
max_retries = Keyword.get(retry_settings, :max_retries, 0)

case retries <= max_retries do
true ->
Logger.info("Request to #{request.url}, is scheduled for retry")

middlewares = request.middlewares -- ignored_middlewares

request = %Crawly.Request{
request |
middlewares: middlewares,
retries: retries + 1
}

:ok = Crawly.RequestsStorage.store(spider, request)
false ->
Logger.info("Dropping request to #{request.url}, (max retries)")
:ok
end

end
end
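
To make the retry budget concrete, an illustrative sketch (not part of the diff) that mirrors the `retries <= max_retries` check above: with `max_retries: 3`, as in config/config.exs, a persistently failing request is fetched five times in total, the original attempt plus one retry for each `retries` value from 0 through 3.
```
# Illustrative only: counts the total fetch attempts a request gets under the
# maybe_retry_request/2 check, for a given max_retries setting.
defmodule RetryBudget do
  def total_attempts(max_retries) do
    # A retry is scheduled for every failed attempt whose current retries
    # counter is still <= max_retries; the counter starts at 0 and is
    # incremented each time the request is re-stored.
    retries_scheduled =
      0
      |> Stream.iterate(&(&1 + 1))
      |> Enum.take_while(&(&1 <= max_retries))
      |> length()

    # the original attempt plus every scheduled retry
    1 + retries_scheduled
  end
end

# RetryBudget.total_attempts(3) #=> 5
```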
18 changes: 3 additions & 15 deletions test/request_storage_test.exs
@@ -64,11 +64,7 @@ defmodule RequestStorageTest do
end

test "Duplicated requests are filtered out", context do
request = %Crawly.Request{
url: "http://example.com",
headers: [],
options: []
}
request = Crawly.Utils.request_from_url("http://example.com")

:ok = Crawly.RequestsStorage.store(context.crawler, request)
:ok = Crawly.RequestsStorage.store(context.crawler, request)
@@ -91,23 +87,15 @@
end

test "Outbound requests are filtered out", context do
request = %Crawly.Request{
url: "http://otherdomain.com",
headers: [],
options: []
}
request = Crawly.Utils.request_from_url("http://otherdomain.com")

:ok = Crawly.RequestsStorage.store(context.crawler, request)
{:stored_requests, num} = Crawly.RequestsStorage.stats(context.crawler)
assert 0 == num
end

test "Robots.txt is respected", context do
request = %Crawly.Request{
url: "http://example.com/filter",
headers: [],
options: []
}
request = Crawly.Utils.request_from_url("http://example.com/filter")

:meck.expect(Gollum, :crawlable?, fn _, "http://example.com/filter" ->
:uncrawlable