Skip to content

Commit

Permalink
Merge abee884 into b5ba74c
Browse files Browse the repository at this point in the history
  • Loading branch information
Ziinc committed Dec 30, 2019
2 parents b5ba74c + abee884 commit a600c2a
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 5 deletions.
2 changes: 2 additions & 0 deletions config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ use Mix.Config
config :crawly, Crawly.Worker, client: HTTPoison

config :crawly,
fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},

# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
Expand Down
7 changes: 7 additions & 0 deletions documentation/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,10 @@ It's possible to set proxy using the proxy value of Crawly config, for example:
config :crawly,
proxy: "<proxy_host>:<proxy_port>",
```

### fetcher :: {module(), list()}

default: {Crawly.Fetchers.HTTPoisonFetcher, []}

Allows you to specify a custom HTTP client (fetcher) that will perform requests
to the crawl target.
14 changes: 14 additions & 0 deletions lib/crawly/fetchers/fetcher.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
defmodule Crawly.Fetchers.Fetcher do
  @moduledoc """
  A behaviour module for defining Crawly fetchers.

  A fetcher is expected to implement a `fetch/2` callback, which takes a
  `Crawly.Request` and HTTP client options, performs the request, and
  returns a `Crawly.Response` (or the underlying client's error).
  """

  # `options` is the second element of the configured `{fetcher, options}`
  # tuple (see the `:crawly, :fetcher` application env); the default config
  # supplies a keyword list (`[]`), so the spec uses keyword() rather than
  # map().
  @callback fetch(request, options) :: {:ok, response} | {:error, reason}
            when request: Crawly.Request.t(),
                 response: Crawly.Response.t(),
                 options: keyword(),
                 reason: term()
end
12 changes: 12 additions & 0 deletions lib/crawly/fetchers/httpoison_fetcher.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
defmodule Crawly.Fetchers.HTTPoisonFetcher do
  @moduledoc """
  Implements the `Crawly.Fetchers.Fetcher` behaviour using the HTTPoison
  HTTP client.
  """
  @behaviour Crawly.Fetchers.Fetcher

  @doc """
  Performs an HTTP GET for the given `Crawly.Request`.

  Returns HTTPoison's result as-is: `{:ok, response}` on success or
  `{:error, reason}` on failure. The `_client_options` argument (the second
  element of the configured `{fetcher, options}` tuple) is currently
  ignored; per-request options are taken from `request.options` instead.
  """
  @impl Crawly.Fetchers.Fetcher
  def fetch(request, _client_options) do
    HTTPoison.get(request.url, request.headers, request.options)
  end
end
4 changes: 3 additions & 1 deletion lib/crawly/request.ex
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ defmodule Crawly.Request do
defstruct url: nil, headers: [], prev_response: nil, options: []

@type header() :: {key(), value()}
@type url() :: binary()

@typep key :: binary()
@typep value :: binary()

@type option :: {atom(), binary()}

@type t :: %__MODULE__{
url: binary(),
url: url(),
headers: [header()],
prev_response: %{},
options: [option()]
Expand Down
19 changes: 19 additions & 0 deletions lib/crawly/response.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
defmodule Crawly.Response do
  @moduledoc """
  Defines the Crawly response struct, wrapping the result of a fetch.

  Fields:
    * `:body` - the response body; its shape depends on the fetcher in use
    * `:headers` - response headers
    * `:request` - the `Crawly.Request` that produced this response
    * `:request_url` - the URL that was requested
    * `:status_code` - the HTTP status code of the response
  """

  defstruct body: nil,
            headers: [],
            request: nil,
            request_url: nil,
            status_code: nil

  @type t :: %__MODULE__{
          body: term(),
          headers: list(),
          request: Crawly.Request.t(),
          request_url: Crawly.Request.url(),
          status_code: integer()
        }
end
21 changes: 17 additions & 4 deletions lib/crawly/worker.ex
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,18 @@ defmodule Crawly.Worker do
# TODO: Add retry logic
Logger.error(
fn ->
"Crawly worker could not process the request to #{inspect(request.url)}
"Crawly worker could not process the request to #{
inspect(request.url)
}
reason: #{inspect(reason)}"
end)
end
)

@default_backoff

{:ok, _result} ->
@default_backoff
end

end

Process.send_after(self(), :work, new_backoff)
Expand All @@ -66,7 +70,16 @@ defmodule Crawly.Worker do
response: HTTPoison.Response.t(),
result: {:ok, response, spider_name} | {:error, term()}
defp get_response({request, spider_name}) do
case HTTPoison.get(request.url, request.headers, request.options) do
# check if spider-level fetcher is set. Overrides the globally configured fetcher.
# if not set, log warning for explicit config preferred,
# get the globally-configured fetcher. Defaults to HTTPoisonFetcher
{fetcher, options} = Application.get_env(
:crawly,
:fetcher,
{Crawly.Fetchers.HTTPoisonFetcher, []}
)

case fetcher.fetch(request, options) do
{:ok, response} ->
{:ok, {response, spider_name}}

Expand Down

0 comments on commit a600c2a

Please sign in to comment.