Skip to content

Commit

Permalink
Merge 91f8b2c into a89b54b
Browse files Browse the repository at this point in the history
  • Loading branch information
oltarasenko committed Feb 15, 2020
2 parents a89b54b + 91f8b2c commit 8d290bb
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 35 deletions.
7 changes: 7 additions & 0 deletions documentation/basic_concepts.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ Built-in middlewares:
2. `Crawly.Middlewares.RobotsTxt` - this middleware ensures that Crawly respects the robots.txt defined by the target website.
3. `Crawly.Middlewares.UniqueRequest` - this middleware ensures that crawly would not schedule the same URL(request) multiple times.
4. `Crawly.Middlewares.UserAgent` - this middleware is used to set a User Agent HTTP header. Allows to rotate UserAgents, if the last one is defined as a list.
5. `Crawly.Middlewares.RequestOptions` - allows setting additional request options, for example a timeout or a proxy string.


Example:
```elixir
{Crawly.Middlewares.RequestOptions, [timeout: 30_000, recv_timeout: 15000]}
```

### Item Pipelines

Expand Down
7 changes: 4 additions & 3 deletions documentation/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ A basic example:
config :crawly,
pipelines: [
# my pipelines
]
],
middlewares: [
# my middlewares
]
Expand Down Expand Up @@ -79,14 +79,15 @@ config :crawly,

### middlewares :: [module()]

```elixir
The default middlewares are as follows:
```elixir
config :crawly,
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
{Crawly.Middlewares.UserAgent, user_agents: ["My Bot"] }
{Crawly.Middlewares.UserAgent, user_agents: ["My Bot"] },
{Crawly.Middlewares.RequestOptions, [timeout: 30_000, recv_timeout: 15000]}
]
```

Expand Down
2 changes: 1 addition & 1 deletion documentation/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ Goals:
config :crawly,
closespider_timeout: 10,
concurrent_requests_per_domain: 8,
follow_redirect: true,
closespider_itemcount: 1000,
middlewares: [
Crawly.Middlewares.DomainFilter,
{Crawly.Middlewares.RequestOptions, [timeout: 30_000]},
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.UserAgent
],
Expand Down
12 changes: 0 additions & 12 deletions lib/crawly.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,14 @@ defmodule Crawly do
headers: [],
options: []
def fetch(url, headers \\ [], options \\ []) do
# Fetches `url` through the configured fetcher module and returns whatever
# that fetcher's `fetch/2` returns. `headers` and `options` both default to [].

# Put the :follow_redirect value from the :crawly application env (false when
# unset) in front of the caller-supplied options.
options = [follow_redirect: Application.get_env(:crawly, :follow_redirect, false)] ++ options

# Append a :proxy entry only when one is configured in the :crawly
# application env; otherwise leave the options unchanged.
options =
case Application.get_env(:crawly, :proxy, false) do
false ->
options

proxy ->
options ++ [{:proxy, proxy}]
end
# Build the request from the url, headers and the assembled options.
request = Crawly.Request.new(url, headers, options)


# The :fetcher setting is a {module, client_options} tuple; it falls back to
# Crawly.Fetchers.HTTPoisonFetcher with no client options.
{fetcher, client_options} = Application.get_env(
:crawly,
:fetcher,
{Crawly.Fetchers.HTTPoisonFetcher, []}
)

# Hand the request over to the selected fetcher.
fetcher.fetch(request, client_options)

end
end
20 changes: 20 additions & 0 deletions lib/crawly/middlewares/request_options.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
defmodule Crawly.Middlewares.RequestOptions do
  @moduledoc """
  Middleware that attaches request options to every request it processes.

  The keyword list given in the middleware declaration (for example
  `timeout`, `recv_timeout` or `follow_redirect`) is written into the
  request's `options` field, replacing whatever value was stored there.
  When the middleware is declared without arguments the options are set to
  an empty list.

  ### Example Declaration
  ```
  middlewares: [
    {Crawly.Middlewares.RequestOptions, [timeout: 30_000, recv_timeout: 15000]}
  ]
  ```
  """
  @behaviour Crawly.Pipeline

  # Returns `{request, state}` where the request carries `options` as its
  # `:options` field; `state` is passed through untouched.
  def run(request, state, options \\ []) do
    request_with_options = %Crawly.Request{request | options: options}

    {request_with_options, state}
  end
end
1 change: 1 addition & 0 deletions lib/crawly/request.ex
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ defmodule Crawly.Request do
# incoming requests
default_middlewares = [
Crawly.Middlewares.DomainFilter,
{Crawly.Middlewares.RequestOptions, []},
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt
]
Expand Down
21 changes: 2 additions & 19 deletions lib/crawly/worker.ex
Original file line number Diff line number Diff line change
Expand Up @@ -135,30 +135,13 @@ defmodule Crawly.Worker do
requests = Map.get(parsed_item, :requests, [])
items = Map.get(parsed_item, :items, [])

# Reading HTTP client options
options = [follow_redirect: Application.get_env(:crawly, :follow_redirect, false)]

options =
case Application.get_env(:crawly, :proxy, false) do
false ->
options

proxy ->
options ++ [{:proxy, proxy}]
end

# Process all requests one by one
Enum.each(
requests,
fn request ->
request =
request
|> Map.put(:prev_response, response)
|> Map.put(:options, options)

request = Map.put(request, :prev_response, response)
Crawly.RequestsStorage.store(spider_name, request)
end
)
end)

# Process all items one by one
Enum.each(
Expand Down
18 changes: 18 additions & 0 deletions test/middlewares/request_options_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
defmodule Middlewares.OptionsTest do
  use ExUnit.Case, async: false

  # Runs a request through a pipeline containing only the RequestOptions
  # middleware and checks that the declared options end up on the request.
  test "Options are added to request settings" do
    expected_options = [timeout: 30_000, recv_timeout: 15_000]

    initial_request = Crawly.Request.new("http://example.com")
    pipeline = [{Crawly.Middlewares.RequestOptions, expected_options}]

    {updated_request, _state} =
      Crawly.Utils.pipe(pipeline, initial_request, %{spider_name: :test_spider})

    assert expected_options == updated_request.options
  end
end

0 comments on commit 8d290bb

Please sign in to comment.