Move default middlewares to Crawly.Request.new/3
This allows a request to be used as a router, which defines exactly how
the request is supposed to be processed.
oltarasenko committed Jan 3, 2020
1 parent c423cad commit d098b0c
Showing 8 changed files with 121 additions and 104 deletions.
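
To make the "request as router" idea concrete, here is a minimal usage sketch
(not part of the diff; MySpider is a hypothetical spider module, and the call
shapes follow the new/4 constructor and RequestsStorage.store/2 used in the
changes below):

    # Build a request that carries its own middleware "route": only
    # DomainFilter runs for it; UniqueRequest and RobotsTxt are skipped.
    request =
      Crawly.Request.new(
        "https://example.com/products",
        [],                                # headers
        [],                                # options
        [Crawly.Middlewares.DomainFilter]  # middlewares for this request only
      )

    # The storage worker processes the request with exactly that list.
    :ok = Crawly.RequestsStorage.store(MySpider, request)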
13 changes: 8 additions & 5 deletions config/config.exs
@@ -33,7 +33,9 @@ config :crawly, Crawly.Worker, client: HTTPoison

config :crawly,
fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
max_retries: 3,
retry: [
{:max_retries, 3},
{:ignored_middlewares, [Crawly.Middlewares.UniqueRequest]}],
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
@@ -63,8 +65,9 @@ config :crawly,
Crawly.Pipelines.JSONEncoder
]

config :crawly, Crawly.Pipelines.WriteToFile,
folder: "/tmp",
extension: "jl"
config :crawly,
Crawly.Pipelines.WriteToFile,
folder: "/tmp",
extension: "jl"

import_config "#{Mix.env}.exs"
import_config "#{Mix.env}.exs"
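
A hedged note on usage: an application running this version could tune the new
retry settings in its own config. Keyword syntax is equivalent to the explicit
{key, value} tuples above; the values below are only illustrative.

    config :crawly,
      retry: [
        max_retries: 5,
        ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
      ]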
6 changes: 5 additions & 1 deletion config/test.exs
@@ -15,7 +15,11 @@ config :crawly,
item: [:title, :author, :time, :url],
# Identifier which is used to filter out duplicates
item_id: :title,
max_retries: 2,

retry: [
{:max_retries, 2},
{:ignored_middlewares, [Crawly.Middlewares.UniqueRequest]}
],
# Stop spider after scraping certain amount of items
closespider_itemcount: 100,
# Stop spider if it does not crawl fast enough
51 changes: 41 additions & 10 deletions lib/crawly/request.ex
@@ -23,13 +23,13 @@ defmodule Crawly.Request do
@type retries :: non_neg_integer()
@type option :: {atom(), binary()}
@opaque t :: %__MODULE__{
url: binary(),
headers: [header()],
prev_response: %{},
retries: 0,
middlewares: [],
options: [option()]
}
url: binary(),
headers: [header()],
prev_response: %{},
retries: 0,
middlewares: [],
options: [option()]
}
###===========================================================================
### API functions
###===========================================================================
@@ -41,8 +41,39 @@
headers: [term()],
options: [term()],
request: Crawly.Request.t()

def new(url, headers \\ [], options \\ []) do
%Crawly.Request{url: url, headers: headers, options: options}
# Define a list of middlewares which are used by default to process
# incoming requests
default_middlewares = [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt
]

middlewares =
Application.get_env(:crawly, :middlewares, default_middlewares)

new(url, headers, options, middlewares)
end

@doc """
Same as Crawly.Request.new/3 but allows specifying middlewares as the 4th
parameter.
"""
@spec new(url, headers, options, middlewares) :: request
when url: binary(),
headers: [term()],
options: [term()],
middlewares: [term()], # TODO: improve typespec here
request: Crawly.Request.t()
def new(url, headers, options, middlewares) do
%Crawly.Request{
url: url,
headers: headers,
options: options,
middlewares: middlewares
}
end

@doc """
@@ -115,14 +146,14 @@ defmodule Crawly.Request do
options_field: binary()
def options(request, new_options),
do: %Crawly.Request{request | options: new_options}

@doc """
Access retries field from Crawly.Request
"""
@spec retries(request) :: retries_field
when request: Crawly.Request.t(),
retries_field: [term()]
def retries(request), do: request.headers
def retries(request), do: request.retries

@doc """
Set retries field in Crawly.Request
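
To make the difference between the two constructors concrete, a hedged usage
sketch (the #=> results assume no :middlewares override in the application
config; middlewares/1 is the accessor the storage worker below relies on):

    # new/3: middlewares come from the :middlewares config key, falling back
    # to the built-in defaults.
    default_req = Crawly.Request.new("https://example.com")
    Crawly.Request.middlewares(default_req)
    #=> [Crawly.Middlewares.DomainFilter, Crawly.Middlewares.UniqueRequest,
    #    Crawly.Middlewares.RobotsTxt]

    # new/4: the caller decides how this particular request is routed.
    custom_req =
      Crawly.Request.new("https://example.com", [], [], [Crawly.Middlewares.DomainFilter])
    Crawly.Request.middlewares(custom_req)
    #=> [Crawly.Middlewares.DomainFilter]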
11 changes: 1 addition & 10 deletions lib/crawly/requests_storage/requests_storage_worker.ex
@@ -65,16 +65,7 @@ defmodule Crawly.RequestsStorage.Worker do

# Store the given request
def handle_call({:store, request}, _from, state) do
# Define a list of middlewares which are used by default to process incoming
# requests
default_middlewares = [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt
]

middlewares =
Application.get_env(:crawly, :middlewares, default_middlewares)
middlewares = Crawly.Request.middlewares(request)

new_state =
case Crawly.Utils.pipe(middlewares, request, state) do
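
Because the middleware list now travels with the request, a per-request
middleware can be attached without touching the global config. A hedged sketch,
assuming the run/2 contract followed by the built-in middlewares (return
{request, state} to keep the request, {false, state} to drop it); the module
name is hypothetical:

    defmodule MyApp.Middlewares.HttpsOnly do
      @moduledoc "Illustrative middleware: drop plain-HTTP requests."
      require Logger

      def run(request, state) do
        if String.starts_with?(request.url, "https://") do
          {request, state}
        else
          Logger.debug("Dropping non-HTTPS request: #{request.url}")
          {false, state}
        end
      end
    end

    # Attach it to a single request; the worker above pipes the request
    # through exactly this list.
    request =
      Crawly.Request.new("https://example.com", [], [], [MyApp.Middlewares.HttpsOnly])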
95 changes: 51 additions & 44 deletions lib/crawly/worker.ex
@@ -30,42 +30,34 @@ defmodule Crawly.Worker do

defp do_work(spider_name, backoff) do
# Get a request from requests storage.
new_backoff =
case Crawly.RequestsStorage.pop(spider_name) do
nil ->
# Slow down a bit when there are no new URLs
backoff * 2

request ->
# Process the request using following group of functions
functions = [
{:get_response, &get_response/1},
{:parse_item, &parse_item/1},
{:process_parsed_item, &process_parsed_item/1}
]

case :epipe.run(functions, {request, spider_name}) do
{:error, _step, reason, _step_state} ->
# TODO: Add retry logic
Logger.error(
fn ->
"Crawly worker could not process the request to #{
inspect(request.url)
}
case Crawly.RequestsStorage.pop(spider_name) do
nil ->
# Slow down a bit when there are no new URLs
backoff * 2

request ->
# Process the request using following group of functions
functions = [
{:get_response, &get_response/1},
{:parse_item, &parse_item/1},
{:process_parsed_item, &process_parsed_item/1}
]

case :epipe.run(functions, {request, spider_name}) do
{:error, _step, reason, _step_state} ->
Logger.error(
fn ->
"Crawly worker could not process the request to #{
inspect(request.url)
}
reason: #{inspect(reason)}"
end
)

@default_backoff

{:ok, _result} ->
@default_backoff
end
end

Process.send_after(self(), :work, new_backoff)

{:noreply, %{state | backoff: new_backoff}}
end
)
@default_backoff
{:ok, _result} ->
@default_backoff
end
end
end

@spec get_response({request, spider_name}) :: result
@@ -165,19 +157,34 @@
end

## Retry a request if max retries allows to do so
defp maybe_retry_request(spider, %Crawly.Request{retries: retries} = request) do
max_retires = Application.get_env(:crawly, :max_retries, 3)
defp maybe_retry_request(spider, request) do
retries = Crawly.Request.retries(request)
url = Crawly.Request.url(request)

retry_settings = Application.get_env(:crawly, :retry, [])

case retries <= max_retires do
max_retries = Keyword.get(retry_settings, :max_retries, 3)
ignored_middlewares = Keyword.get(
retry_settings,
:ignored_middlewares,
[Crawly.Middlewares.UniqueRequest]
)

case retries <= max_retries do
true ->
Logger.info("Request to #{request.url}, is scheduled for retry")
:ok = Crawly.RequestsStorage.store(
spider,
%Crawly.Request{request | retries: retries + 1}
)
Logger.info("Request to #{url}, is scheduled for retry")

middlewares = Crawly.Request.middlewares(request) -- ignored_middlewares

request = request
|> Crawly.Request.middlewares(middlewares)
|> Crawly.Request.retries(retries + 1)

:ok = Crawly.RequestsStorage.store(spider, request)
false ->
Logger.info("Dropping request to #{request.url}, (max retries)")
Logger.info("Dropping request to #{url}, (max retries)")
:ok
end

end
end
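
A worked example of the retry path above, using hypothetical values. Stripping
Crawly.Middlewares.UniqueRequest before re-storing is presumably what lets an
already-seen URL pass the duplicate filter on the second attempt.

    # Defaults assumed: max_retries = 3, request built via new/3.
    request = Crawly.Request.new("https://example.com/page")
    request = Crawly.Request.retries(request, 1)   # it already failed once

    ignored = [Crawly.Middlewares.UniqueRequest]

    # 1 <= 3, so the request is re-stored with UniqueRequest removed:
    Crawly.Request.middlewares(request) -- ignored
    #=> [Crawly.Middlewares.DomainFilter, Crawly.Middlewares.RobotsTxt]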
32 changes: 5 additions & 27 deletions test/request_storage_test.exs
@@ -24,24 +24,15 @@ defmodule RequestStorageTest do
end

test "Request storage can store requests", context do
request = %Crawly.Request{
url: "http://example.com",
headers: [],
options: []
}
request = Crawly.Request.new("http://example.com")

:ok = Crawly.RequestsStorage.store(context.crawler, request)
{:stored_requests, num} = Crawly.RequestsStorage.stats(context.crawler)
assert 1 == num
end

test "Request storage returns request for given spider", context do
request = %Crawly.Request{
url: "http://example.com",
headers: [],
options: []
}

request = Crawly.Request.new("http://example.com")
:ok = Crawly.RequestsStorage.store(context.crawler, request)

returned_request = Crawly.RequestsStorage.pop(context.crawler)
@@ -64,12 +55,7 @@ end
end

test "Duplicated requests are filtered out", context do
request = %Crawly.Request{
url: "http://example.com",
headers: [],
options: []
}

request = Crawly.Request.new("http://example.com")
:ok = Crawly.RequestsStorage.store(context.crawler, request)
:ok = Crawly.RequestsStorage.store(context.crawler, request)

@@ -91,23 +77,15 @@ end
end

test "Outbound requests are filtered out", context do
request = %Crawly.Request{
url: "http://otherdomain.com",
headers: [],
options: []
}
request = Crawly.Request.new("http://otherdomain.com")

:ok = Crawly.RequestsStorage.store(context.crawler, request)
{:stored_requests, num} = Crawly.RequestsStorage.stats(context.crawler)
assert 0 == num
end

test "Robots.txt is respected", context do
request = %Crawly.Request{
url: "http://example.com/filter",
headers: [],
options: []
}
request = Crawly.Request.new("http://example.com/filter")

:meck.expect(Gollum, :crawlable?, fn _, "http://example.com/filter" ->
:uncrawlable
16 changes: 9 additions & 7 deletions test/utils_test.exs
@@ -9,19 +9,21 @@ defmodule UtilsTest do

test "Request from url" do
requests = Crawly.Utils.request_from_url("https://test.com")
assert requests == %Crawly.Request{url: "https://test.com", headers: []}
assert requests == Crawly.Request.new("https://test.com")
end

test "Requests from urls" do
requests =
Crawly.Utils.requests_from_urls([
"https://test.com",
"https://example.com"
])
Crawly.Utils.requests_from_urls(
[
"https://test.com",
"https://example.com"
]
)

assert requests == [
%Crawly.Request{url: "https://test.com", headers: []},
%Crawly.Request{url: "https://example.com", headers: []}
Crawly.Request.new("https://test.com"),
Crawly.Request.new("https://example.com")
]
end

1 change: 1 addition & 0 deletions test/worker_test.exs
@@ -101,6 +101,7 @@ defmodule WorkerTest do
send(context.crawler, :work)

response = receive_mocked_response()

assert response != false
assert response.retries == 1

