Skip to content

Commit

Permalink
Pluggable fetchers improvement, closes #43 (#180)
Browse files Browse the repository at this point in the history
* Make :logger_file_backend an opt-in dependency

* Update documentation

* Warn user if LoggerFileBackend module isn't loaded

* Update documentation

* Fix file formatting

* Add default case

* Update documentation

* Fix file formatting

* Create wrapper for Code.ensure_loaded?

* test availability of LoggerFileBackend module

* Fix file formatting

* Use worker functions

* Pluggable fetchers improvement

* Fix formatting

* Disallow module declaration kind

* Add guards to case

* Put unwrap logic in utils

* Put unwrap logic in utils

* Raise error for invalid form

* Test for invalid format

* Fix formatting

* update readme with example of usage (#173)

* Update crawly version to 0.13.0

* Change elixir UUID name (#175)

* change uuid -> elixir_uuid

* install

Co-authored-by: rootkc <info@kennethchristensen.dk>
Co-authored-by: oltarasenko <oltarasenko@gmail.com>

* fix some test error & compile time warning (#177)

* fix: test error

* style: mix format

* Fix formatting

* Disallow module declaration kind

* Add guards to case

* Put unwrap logic in utils

* Put unwrap logic in utils

* Raise error for invalid form

* Test for invalid format

* Fix formatting

* Add article on creating webdriver based fetcher

* Make function perform single responsibility

* Use util function to unwrap setting

* Add typespec

* Update tests

* Fix exception message

* Add doc

* Return should be {module, list}

Co-authored-by: Maksym Verbovyi <vermaxik@gmail.com>
Co-authored-by: Oleg Tarasenko <oltarasenko@gmail.com>
Co-authored-by: rootkc <rootkc@protonmail.com>
Co-authored-by: rootkc <info@kennethchristensen.dk>
Co-authored-by: Kevin Pan <feng_19@foxmail.com>
  • Loading branch information
6 people committed Jun 3, 2021
1 parent c02e408 commit b6e833f
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 12 deletions.
13 changes: 3 additions & 10 deletions lib/crawly.ex
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,7 @@ defmodule Crawly do
)

{%{} = request, _} = Crawly.Utils.pipe(request0.middlewares, request0, %{})

{fetcher, client_options} =
Crawly.Utils.get_settings(
:fetcher,
opts[:with],
{Crawly.Fetchers.HTTPoisonFetcher, []}
)

{:ok, response} = fetcher.fetch(request, client_options)
{:ok, {response, _}} = Crawly.Worker.get_response({request, opts[:with]})

case opts[:with] do
nil ->
Expand All @@ -87,7 +79,8 @@ defmodule Crawly do

_ ->
# spider provided, send response through parse_item callback, pipe through the pipelines
with parsed_result <- parse(response, opts[:with]),
with {:ok, {parsed_result, _, _}} <-
Crawly.Worker.parse_item({response, opts[:with]}),
pipelines <-
Crawly.Utils.get_settings(
:pipelines,
Expand Down
20 changes: 20 additions & 0 deletions lib/crawly/utils.ex
Original file line number Diff line number Diff line change
Expand Up @@ -224,4 +224,24 @@ defmodule Crawly.Utils do
@doc """
Wrapper around `Code.ensure_loaded?/1`.

Checks whether the given module is loaded (or loadable). Kept as a
public function so callers inside the project can be mocked in tests
(e.g. via `:meck`) without stubbing `Code` itself.
"""
@spec ensure_loaded?(module()) :: boolean()
def ensure_loaded?(module) do
  Code.ensure_loaded?(module)
end

@doc """
Function to get setting module in proper data structure
"""
@spec unwrap_module_and_options(term) ::
{atom, maybe_improper_list}
def unwrap_module_and_options(setting) do
case setting do
{module, args} when is_list(args) and is_atom(module) ->
{module, args}

module when is_atom(module) ->
{module, []}

x ->
raise "Invalid format: A #{setting} setting cannot be defined in the form `{#{
inspect(x)
}}`. Only the forms `{module, options}` and `module` are valid"
end
end
end
8 changes: 6 additions & 2 deletions lib/crawly/worker.ex
Original file line number Diff line number Diff line change
Expand Up @@ -71,21 +71,24 @@ defmodule Crawly.Worker do
{:noreply, %{state | backoff: new_backoff}}
end

@doc false
@spec get_response({request, spider_name}) :: result
when request: Crawly.Request.t(),
spider_name: atom(),
response: HTTPoison.Response.t(),
result: {:ok, {response, spider_name}} | {:error, term()}
defp get_response({request, spider_name}) do
def get_response({request, spider_name}) do
# check if spider-level fetcher is set. Overrides the globally configured fetcher.
# if not set, log warning for explicit config preferred,
# get the globally-configured fetcher. Defaults to HTTPoisonFetcher

{fetcher, options} =
Crawly.Utils.get_settings(
:fetcher,
spider_name,
{Crawly.Fetchers.HTTPoisonFetcher, []}
)
|> Crawly.Utils.unwrap_module_and_options()

retry_options = Crawly.Utils.get_settings(:retry, spider_name, [])
retry_codes = Keyword.get(retry_options, :retry_codes, [])
Expand All @@ -109,14 +112,15 @@ defmodule Crawly.Worker do
end
end

@doc false
@spec parse_item({response, spider_name}) :: result
when response: HTTPoison.Response.t(),
spider_name: atom(),
response: HTTPoison.Response.t(),
parsed_item: Crawly.ParsedItem.t(),
next: {parsed_item, response, spider_name},
result: {:ok, next} | {:error, term()}
defp parse_item({response, spider_name}) do
def parse_item({response, spider_name}) do
try do
# get parsers
parsers = Crawly.Utils.get_settings(:parsers, spider_name, nil)
Expand Down
15 changes: 15 additions & 0 deletions test/utils_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,21 @@ defmodule UtilsTest do
)
end

test "Invalid module options format" do
:meck.expect(
Crawly.Utils,
:get_settings,
fn :fetcher, nil, nil ->
{Crawly.Fetchers.HTTPoisonFetcher}
end
)

assert catch_error(
Crawly.Utils.get_settings(:fetcher, nil, nil)
|> Crawly.Utils.unwrap_module_and_options()
)
end

defp expected_request(url) do
%Crawly.Request{
url: url,
Expand Down

0 comments on commit b6e833f

Please sign in to comment.