Skip to content

Commit

Permalink
Merge c5b9b30 into 8c8b365
Browse files Browse the repository at this point in the history
  • Loading branch information
oltarasenko committed Nov 12, 2020
2 parents 8c8b365 + c5b9b30 commit aef6509
Show file tree
Hide file tree
Showing 24 changed files with 106 additions and 100 deletions.
3 changes: 2 additions & 1 deletion .formatter.exs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
".credo.exs",
".formatter.exs",
"mix.exs",
"{config,lib,priv,rel,test}/**/*.{ex,exs}"],
"{config,lib,priv,rel,test}/**/*.{ex,exs}"
],
line_length: 80
]
1 change: 0 additions & 1 deletion config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,4 @@ config :crawly,
Crawly.Pipelines.JSONEncoder
]


import_config "#{Mix.env()}.exs"
10 changes: 4 additions & 6 deletions lib/crawly/application.ex
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,12 @@ defmodule Crawly.Application do
{Crawly.DataStorage, []},
{Crawly.RequestsStorage, []},
{DynamicSupervisor,
strategy: :one_for_one,
name: Crawly.RequestsStorage.WorkersSup},
strategy: :one_for_one, name: Crawly.RequestsStorage.WorkersSup},
{DynamicSupervisor,
strategy: :one_for_one,
name: Crawly.DataStorage.WorkersSup},
strategy: :one_for_one, name: Crawly.DataStorage.WorkersSup},
{Plug.Cowboy,
scheme: :http,
plug: Crawly.API.Router,
scheme: :http,
plug: Crawly.API.Router,
options: [port: Application.get_env(:crawly, :port, 4001)]}
]

Expand Down
16 changes: 9 additions & 7 deletions lib/crawly/fetchers/splash.ex
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,17 @@ defmodule Crawly.Fetchers.Splash do
nil ->
throw(
"The base_url is not set. Splash fetcher can't be used! " <>
"Please set :base_url in fetcher options to continue. " <>
"For example: " <>
"fetcher: {Crawly.Fetchers.Splash, [base_url: <url>]}"
"Please set :base_url in fetcher options to continue. " <>
"For example: " <>
"fetcher: {Crawly.Fetchers.Splash, [base_url: <url>]}"
)

{base_url, other_options} ->
{base_url, other_options}
end

query_parameters = URI.encode_query(Keyword.put(other_options, :url, request.url))
query_parameters =
URI.encode_query(Keyword.put(other_options, :url, request.url))

url =
URI.merge(base_url, "?" <> query_parameters)
Expand All @@ -53,10 +54,11 @@ defmodule Crawly.Fetchers.Splash do
new_request = %HTTPoison.Request{response.request | url: request.url}

new_response = %HTTPoison.Response{
response |
request: new_request,
request_url: request.url
response
| request: new_request,
request_url: request.url
}

{:ok, new_response}

error ->
Expand Down
1 change: 1 addition & 0 deletions lib/crawly/middlewares/auto_cookies_manager.ex
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ defmodule Crawly.Middlewares.AutoCookiesManager do

new_state =
Map.put(state, :cookies_manager_seen_cookies, new_known_cookies)

{new_request, new_state}
end
end
Expand Down
8 changes: 4 additions & 4 deletions lib/crawly/middlewares/request_options.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ defmodule Crawly.Middlewares.RequestOptions do
Allows to specify HTTP request settings like follow_redirect, or request
timeout.
If using `HTTPoisonFetcher` (the default), please refer to the [HTTPoison Request documentation](https://hexdocs.pm/httpoison/HTTPoison.Request.html#content) for full list of `:options`.
## Example Usage
### Example Declaration
```
Expand All @@ -20,11 +20,11 @@ defmodule Crawly.Middlewares.RequestOptions do
{Crawly.Middlewares.RequestOptions, [proxy: {"https://my_host.com", 3000}, proxy_auth: {"my_user", "my_password"}]}
]
```
"""
@behaviour Crawly.Pipeline

def run(request, state, options \\ []) do
{%Crawly.Request{request| options: options}, state}
{%Crawly.Request{request | options: options}, state}
end
end
3 changes: 1 addition & 2 deletions lib/crawly/middlewares/user_agent.ex
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ defmodule Crawly.Middlewares.UserAgent do

new_headers = List.keydelete(request.headers, "User-Agent", 0)

user_agents =
Map.get(opts, :user_agents, ["Crawly Bot 1.0"])
user_agents = Map.get(opts, :user_agents, ["Crawly Bot 1.0"])

useragent = Enum.random(user_agents)

Expand Down
6 changes: 3 additions & 3 deletions lib/crawly/parsed_item.ex
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ defmodule Crawly.ParsedItem do

@type item() :: map()
@type t :: %__MODULE__{
items: [item()],
requests: [Crawly.Request.t()]
}
items: [item()],
requests: [Crawly.Request.t()]
}
end
1 change: 1 addition & 0 deletions lib/crawly/pipelines/csv_encoder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ defmodule Crawly.Pipelines.CSVEncoder do
{false, state :: map} | {csv_line :: String.t(), state :: map}
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{fields: nil})

case opts[:fields] do
fields when fields in [nil, []] ->
Logger.error(
Expand Down
25 changes: 13 additions & 12 deletions lib/crawly/request.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ defmodule Crawly.Request do
Defines Crawly request structure.
"""
###===========================================================================
### ===========================================================================
### Type definitions
###===========================================================================
### ===========================================================================
defstruct url: nil,
headers: [],
prev_response: nil,
Expand All @@ -23,17 +23,17 @@ defmodule Crawly.Request do
@type option :: {atom(), binary()}

@type t :: %__MODULE__{
url: url(),
headers: [header()],
prev_response: %{},
options: [option()],
middlewares: [atom()],
retries: non_neg_integer()
}
url: url(),
headers: [header()],
prev_response: %{},
options: [option()],
middlewares: [atom()],
retries: non_neg_integer()
}

###===========================================================================
### ===========================================================================
### API functions
###===========================================================================
### ===========================================================================
@doc """
Create new Crawly.Request from url, headers and options
"""
Expand Down Expand Up @@ -64,10 +64,11 @@ defmodule Crawly.Request do
parameter.
"""
@spec new(url, headers, options, middlewares) :: request
# TODO: improve typespec here
when url: binary(),
headers: [term()],
options: [term()],
middlewares: [term()], # TODO: improve typespec here
middlewares: [term()],
request: Crawly.Request.t()
def new(url, headers, options, middlewares) do
%Crawly.Request{
Expand Down
3 changes: 2 additions & 1 deletion lib/crawly/requests_storage/requests_storage.ex
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,8 @@ defmodule Crawly.RequestsStorage do
%{
id: :undefined,
restart: :temporary,
start: {Crawly.RequestsStorage.Worker, :start_link, [spider_name]}
start:
{Crawly.RequestsStorage.Worker, :start_link, [spider_name]}
}
)

Expand Down
12 changes: 6 additions & 6 deletions lib/crawly/response.ex
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ defmodule Crawly.Response do
status_code: nil

@type t :: %__MODULE__{
body: term(),
headers: list(),
request: Crawly.Request.t(),
request_url: Crawly.Request.url(),
status_code: integer()
}
body: term(),
headers: list(),
request: Crawly.Request.t(),
request_url: Crawly.Request.url(),
status_code: integer()
}
end
7 changes: 4 additions & 3 deletions lib/crawly/spider.ex
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@ defmodule Crawly.Spider do
global settings defined in the config.
"""



@callback init() :: [start_urls: list(), start_requests: list()]
@callback init(options: keyword()) :: [start_urls: list(), start_requests: list()]
@callback init(options: keyword()) :: [
start_urls: list(),
start_requests: list()
]

@callback base_url() :: binary()

Expand Down
1 change: 1 addition & 0 deletions lib/crawly/worker.ex
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ defmodule Crawly.Worker do
inspect(reason)
}"
)

Logger.debug(Exception.format(:error, error, __STACKTRACE__))

{:error, reason}
Expand Down
1 change: 0 additions & 1 deletion test/data_storage_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ defmodule DataStorageTest do
{:stored_items, 0} = Crawly.DataStorage.stats(context.crawler)
end


test "Starting child worker twice", context do
result = Crawly.DataStorage.start_worker(context.crawler)
assert result == {:error, :already_started}
Expand Down
3 changes: 1 addition & 2 deletions test/engine_tests.exs
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@ defmodule ManagerTest do
assert Crawly.Engine.get_crawl_id(TestSpider)
end


test "Engine will use a tag from external system if set" do
tag = "custom_crawl_tag"
:ok = Crawly.Engine.start_spider(TestSpider, tag)
assert {:ok, tag} == Crawly.Engine.get_crawl_id(TestSpider)
end
end
end
5 changes: 3 additions & 2 deletions test/middlewares/request_options_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@ defmodule Middlewares.RequestOptionsTest do

test "Options are added to request settings" do
req = Crawly.Request.new("http://example.com")

middlewares = [
{
Crawly.Middlewares.RequestOptions,
[timeout: 30_000, recv_timeout: 15000]
}
]

{new_request, _state} = Crawly.Utils.pipe(middlewares, req, %{})

assert [timeout: 30000, recv_timeout: 15000] == new_request.options
assert [timeout: 30000, recv_timeout: 15000] == new_request.options
end
end
5 changes: 4 additions & 1 deletion test/middlewares/user_agent_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ defmodule Middlewares.UserAgentTest do
use ExUnit.Case, async: false

test "Adds a user agent to request header with global config" do
middlewares = [{Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}]
middlewares = [
{Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}
]

req = %Crawly.Request{}
state = %{}

Expand Down
7 changes: 5 additions & 2 deletions test/pipelines/experimental/send_to_ui_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ defmodule Pipelines.Experimental.SendToUITest do
use ExUnit.Case, async: false

@item %{title: "Title", author: "Me"}

test "job tag is not re-generated if pipeline was re-executed" do
pipelines = [{Crawly.Pipelines.Experimental.SendToUI, ui_node: :'ui@127.0.0.1'}]
pipelines = [
{Crawly.Pipelines.Experimental.SendToUI, ui_node: :"ui@127.0.0.1"}
]

state = %{spider_name: PipelineTestSpider}
{@item, state} = Crawly.Utils.pipe(pipelines, @item, state)

Expand Down
8 changes: 6 additions & 2 deletions test/settings_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,18 @@ defmodule SettingsTest do
assert 5 ==
Crawly.Utils.get_settings(
:concurrent_requests_per_domain,
TestSpiderSettingsOverride, 1)
TestSpiderSettingsOverride,
1
)
end

test "incomplete spider overrides do not break global settings" do
assert 10 ==
Crawly.Utils.get_settings(
:closespider_itemcount,
TestSpiderSettingsOverride, 1)
TestSpiderSettingsOverride,
1
)
end
end

Expand Down
7 changes: 3 additions & 4 deletions test/test_utils.ex
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
defmodule TestUtils do

def stop_process(pid) do
:erlang.exit(pid, :shutdown)
wait_pid(pid)
Expand All @@ -18,7 +17,6 @@ defmodule TestUtils do

result
end

end

defmodule TestSpider do
Expand All @@ -36,12 +34,14 @@ defmodule TestSpider do

def parse_item(_response) do
path = Enum.random(1..100)

%Crawly.ParsedItem{
:items => [
%{title: "t_#{path}", url: "example.com", author: "Me", time: "not set"}
],
:requests => [
Crawly.Utils.request_from_url("https://www.example.com/#{path}")]
Crawly.Utils.request_from_url("https://www.example.com/#{path}")
]
}
end
end
Expand Down Expand Up @@ -71,4 +71,3 @@ defmodule UtilsTestSpider do
{[], []}
end
end

2 changes: 0 additions & 2 deletions test/test_utils.exs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
defmodule TestUtils do

def stop_process(pid) do
:erlang.exit(pid, :shutdown)
wait_pid(pid)
Expand All @@ -18,5 +17,4 @@ defmodule TestUtils do

result
end

end

0 comments on commit aef6509

Please sign in to comment.