diff --git a/documentation/basic_concepts.md b/documentation/basic_concepts.md
index 20d67f69..1f6a011c 100644
--- a/documentation/basic_concepts.md
+++ b/documentation/basic_concepts.md
@@ -113,6 +113,13 @@ Built-in middlewares:
 2. `Crawly.Middlewares.RobotsTxt` - this middleware ensures that Crawly respects the robots.txt defined by the target website.
 3. `Crawly.Middlewares.UniqueRequest` - this middleware ensures that crawly would not schedule the same URL(request) multiple times.
 4. `Crawly.Middlewares.UserAgent` - this middleware is used to set a User Agent HTTP header. Allows to rotate UserAgents, if the last one is defined as a list.
+5. `Crawly.Middlewares.RequestOptions` - allows setting additional request options, for example a timeout or a proxy string.
+
+
+   Example:
+   ```elixir
+   {Crawly.Middlewares.RequestOptions, [timeout: 30_000, recv_timeout: 15_000]}
+   ```
 
 ### Item Pipelines
diff --git a/documentation/configuration.md b/documentation/configuration.md
index 2e330391..e747ed38 100644
--- a/documentation/configuration.md
+++ b/documentation/configuration.md
@@ -8,7 +8,7 @@ A basic example:
 config :crawly,
   pipelines: [
     # my pipelines
-  ]
+  ],
   middlewares: [
     # my middlewares
   ]
@@ -79,14 +79,15 @@
 ### middlewares :: [module()]
 
-```elixir
 The default middlewares are as follows:
+```elixir
 config :crawly,
   middlewares: [
     Crawly.Middlewares.DomainFilter,
     Crawly.Middlewares.UniqueRequest,
     Crawly.Middlewares.RobotsTxt,
-    {Crawly.Middlewares.UserAgent, user_agents: ["My Bot"] }
+    {Crawly.Middlewares.UserAgent, user_agents: ["My Bot"]},
+    {Crawly.Middlewares.RequestOptions, [timeout: 30_000, recv_timeout: 15_000]}
   ]
 ```
diff --git a/documentation/quickstart.md b/documentation/quickstart.md
index 1c6845dc..bbd2de31 100644
--- a/documentation/quickstart.md
+++ b/documentation/quickstart.md
@@ -56,10 +56,10 @@ Goals:
 config :crawly,
   closespider_timeout: 10,
   concurrent_requests_per_domain: 8,
-  follow_redirect: true,
   closespider_itemcount: 1000,
   middlewares: [
     Crawly.Middlewares.DomainFilter,
+    {Crawly.Middlewares.RequestOptions, [timeout: 30_000]},
     Crawly.Middlewares.UniqueRequest,
     Crawly.Middlewares.UserAgent
   ],
diff --git a/lib/crawly.ex b/lib/crawly.ex
index d029b6e5..288f2d1f 100644
--- a/lib/crawly.ex
+++ b/lib/crawly.ex
@@ -12,19 +12,8 @@ defmodule Crawly do
       headers: [], options: []
   """
   def fetch(url, headers \\ [], options \\ []) do
-    options = [follow_redirect: Application.get_env(:crawly, :follow_redirect, false)] ++ options
-
-    options =
-      case Application.get_env(:crawly, :proxy, false) do
-        false ->
-          options
-
-        proxy ->
-          options ++ [{:proxy, proxy}]
-      end
     request = Crawly.Request.new(url, headers, options)
-
     {fetcher, client_options} =
       Application.get_env(
         :crawly,
         :fetcher,
         {Crawly.Fetchers.HTTPoisonFetcher, []}
       )
 
     fetcher.fetch(request, client_options)
-
   end
 end
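The hunk above removes the implicit `follow_redirect`/`proxy` wiring from `Crawly.fetch/3`: such options are now passed explicitly by the caller and forwarded untouched to the configured fetcher. A minimal sketch of such a call, assuming the default HTTPoison-based fetcher (the option names `timeout`, `recv_timeout` and `follow_redirect` are HTTPoison's, not something this patch defines):

```elixir
# Illustrative only: options are forwarded as-is to the HTTP client,
# so anything the configured fetcher understands can be passed here.
response =
  Crawly.fetch(
    "https://www.example.com",
    [],
    timeout: 30_000,
    recv_timeout: 15_000,
    follow_redirect: true
  )
```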
diff --git a/lib/crawly/middlewares/request_options.ex b/lib/crawly/middlewares/request_options.ex
new file mode 100644
index 00000000..da3bd41d
--- /dev/null
+++ b/lib/crawly/middlewares/request_options.ex
@@ -0,0 +1,21 @@
+defmodule Crawly.Middlewares.RequestOptions do
+  @moduledoc """
+  Request settings middleware.
+
+  Allows specifying HTTP request settings such as `follow_redirect` or a
+  request timeout.
+
+  ### Example Declaration
+  ```elixir
+  middlewares: [
+    {Crawly.Middlewares.RequestOptions, [timeout: 30_000, recv_timeout: 15_000]}
+  ]
+  ```
+  """
+  @behaviour Crawly.Pipeline
+
+  # Copy the configured options onto the request; the fetcher forwards them to the HTTP client.
+  def run(request, state, options \\ []) do
+    {%Crawly.Request{request | options: options}, state}
+  end
+end
diff --git a/lib/crawly/request.ex b/lib/crawly/request.ex
index 3252e1be..cb5ff543 100644
--- a/lib/crawly/request.ex
+++ b/lib/crawly/request.ex
@@ -48,6 +48,7 @@ defmodule Crawly.Request do
   # incoming requests
   default_middlewares = [
     Crawly.Middlewares.DomainFilter,
+    {Crawly.Middlewares.RequestOptions, []},
     Crawly.Middlewares.UniqueRequest,
     Crawly.Middlewares.RobotsTxt
   ]
diff --git a/lib/crawly/worker.ex b/lib/crawly/worker.ex
index b1d0666a..91238814 100644
--- a/lib/crawly/worker.ex
+++ b/lib/crawly/worker.ex
@@ -135,30 +135,13 @@ defmodule Crawly.Worker do
     requests = Map.get(parsed_item, :requests, [])
     items = Map.get(parsed_item, :items, [])
 
-    # Reading HTTP client options
-    options = [follow_redirect: Application.get_env(:crawly, :follow_redirect, false)]
-
-    options =
-      case Application.get_env(:crawly, :proxy, false) do
-        false ->
-          options
-
-        proxy ->
-          options ++ [{:proxy, proxy}]
-      end
-
     # Process all requests one by one
     Enum.each(
       requests,
       fn request ->
-        request =
-          request
-          |> Map.put(:prev_response, response)
-          |> Map.put(:options, options)
-
+        request = Map.put(request, :prev_response, response)
         Crawly.RequestsStorage.store(spider_name, request)
-      end
-    )
+      end)
 
     # Process all items one by one
     Enum.each(
diff --git a/test/middlewares/request_options_test.exs b/test/middlewares/request_options_test.exs
new file mode 100644
index 00000000..ac1a38cb
--- /dev/null
+++ b/test/middlewares/request_options_test.exs
@@ -0,0 +1,17 @@
+defmodule Middlewares.RequestOptionsTest do
+  use ExUnit.Case, async: false
+
+  test "Options are added to request settings" do
+    req = Crawly.Request.new("http://example.com")
+
+    middlewares = [
+      {Crawly.Middlewares.RequestOptions, [timeout: 30_000, recv_timeout: 15_000]}
+    ]
+
+    state = %{spider_name: :test_spider}
+
+    {new_request, _state} = Crawly.Utils.pipe(middlewares, req, state)
+
+    assert [timeout: 30_000, recv_timeout: 15_000] == new_request.options
+  end
+end
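With the global `:proxy` and `:follow_redirect` application settings gone, both would now travel through `Crawly.Middlewares.RequestOptions` as well. A minimal configuration sketch, assuming the default HTTPoison fetcher (the exact `proxy` value format is whatever the configured HTTP client accepts; the URL below is a placeholder):

```elixir
# Illustrative only: the middleware attaches these options to every
# scheduled request, and the worker's fetcher hands them to the HTTP
# client unchanged.
config :crawly,
  middlewares: [
    Crawly.Middlewares.DomainFilter,
    {Crawly.Middlewares.RequestOptions,
     [timeout: 30_000, follow_redirect: true, proxy: "http://127.0.0.1:8080"]},
    Crawly.Middlewares.UniqueRequest,
    Crawly.Middlewares.RobotsTxt
  ]
```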