Crawly custom settings #72

Merged · 4 commits · Apr 6, 2020
40 changes: 16 additions & 24 deletions config/config.exs
@@ -29,48 +29,40 @@ use Mix.Config
#
# import_config "#{Mix.env}.exs"

config :crawly, Crawly.Worker, client: HTTPoison

config :crawly,
fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
retry:
[
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
retry: [
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
],

# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
# Item definition
item: [:title, :author, :time, :url],
# Identifier which is used to filter out duplicates
item_id: :title,
# Stop spider after scraping a certain amount of items
closespider_itemcount: 500,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,

# TODO: this looks outdated
follow_redirect: true,

# Request middlewares
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
{Crawly.Middlewares.UserAgent,
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
]}
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
{Crawly.Pipelines.Validate, fields: [:title, :author, :time, :url]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder
]

config :crawly, Crawly.Pipelines.WriteToFile,
folder: "/tmp",
extension: "jl"

import_config "#{Mix.env}.exs"
import_config "#{Mix.env()}.exs"
22 changes: 12 additions & 10 deletions config/test.exs
@@ -2,28 +2,30 @@ use Mix.Config

config :crawly,
manager_operations_timeout: 30_000,
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],

# Stop spider after scraping a certain amount of items
closespider_itemcount: 100,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,
follow_redirect: true,
# Request middlewares

# Request middlewares
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
{Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
{Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder
],
retry: [
75 changes: 26 additions & 49 deletions documentation/configuration.md
@@ -16,49 +16,6 @@ config :crawly,

## Options

### base_store_path :: binary() [DEPRECATED in 0.6.0]

default: "/tmp"

Defines the path where items are stored in the filesystem. This setting
is used by the Crawly.DataStorageWorker process.

> **Deprecated**: This has been deprecated in favour of having pipelines to handle data storage, as of `0.6.0`
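
For illustration, a hedged sketch of the pipeline-based replacement; the `folder` and `extension` values match the example config in this PR, and the rest is an assumption about typical usage:

```elixir
# Illustrative only: storage handled by pipelines instead of base_store_path.
# JSONEncoder turns items into strings; WriteToFile appends them to a file in /tmp
config :crawly,
  pipelines: [
    Crawly.Pipelines.JSONEncoder,
    {Crawly.Pipelines.WriteToFile, folder: "/tmp", extension: "jl"}
  ]
```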

### `user_agents` :: list()

default: ["Crawly Bot 1.0"]

Defines a user agent string for Crawly requests. This setting is used
by the `Crawly.Middlewares.UserAgent` middleware. When the list has more than one
item, all requests will be executed, each with a user agent string chosen
randomly from the supplied list.

> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Middlewares.UserAgent` module documentation for correct usage.
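
A hedged sketch of that tuple-based form, mirroring the middleware list used elsewhere in this PR (the bot string is just a placeholder):

```elixir
# Illustrative only: user agents are now passed as middleware options
config :crawly,
  middlewares: [
    Crawly.Middlewares.DomainFilter,
    Crawly.Middlewares.UniqueRequest,
    Crawly.Middlewares.RobotsTxt,
    {Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}
  ]
```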

### `item` :: [atom()]

default: []

Defines a list of required fields for the item. If any of the required
fields is missing from a scraped item (or if its value is "" or nil),
the item will be dropped. This setting is used by the
`Crawly.Pipelines.Validate` pipeline.

> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Pipelines.Validate` module documentation for correct usage.

### `item_id` :: atom()

default: nil

Defines a field which will be used to identify whether an item is
a duplicate or not. On most e-commerce websites the desired id
field is the SKU. This setting is used by
the `Crawly.Pipelines.DuplicatesFilter` pipeline. If unset, the related
pipeline is effectively disabled.

> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Pipelines.DuplicatesFilter` module documentation for correct usage.

### `pipelines` :: [module()]

default: []
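
A hedged example of a pipeline list using the tuple-based options introduced by this PR; the field names and `item_id` are taken from the config.exs changes above:

```elixir
# Illustrative only: per-pipeline options are passed as the second tuple element
config :crawly,
  pipelines: [
    {Crawly.Pipelines.Validate, fields: [:title, :author, :time, :url]},
    {Crawly.Pipelines.DuplicatesFilter, item_id: :title},
    Crawly.Pipelines.JSONEncoder
  ]
```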
@@ -99,17 +56,12 @@ default: :disabled

An integer which specifies a number of items. If the spider scrapes more than that amount, and those items pass through the item pipeline, the spider will be closed. If set to :disabled, the spider will not be stopped.

### closespider_timeout :: pos_integer()
### closespider_timeout :: pos_integer() | :disabled

default: nil

Defines the minimal number of items which need to be scraped by the spider within the given timeframe (30s). If the spider does not reach this limit, it will be stopped.
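
As a rough illustration (values are arbitrary), both close conditions can be combined:

```elixir
# Illustrative only: stop after 500 items in total,
# or whenever fewer than 20 items are scraped within the 30s check window
config :crawly,
  closespider_itemcount: 500,
  closespider_timeout: 20
```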

### follow_redirect :: boolean()

default: false

Defines whether the Crawly spider is supposed to follow HTTP redirects or not.

### concurrent_requests_per_domain :: pos_integer()

@@ -152,3 +104,28 @@ Allows to specify a custom HTTP client which will be performing request to the c
default: 4001

Allows specifying a custom port on which to start the application. This is important when running more than one application on a single machine, in which case they must not use the same port as the others.


## Overriding global settings on the spider level

It's possible to override most of the settings at the spider level. To do that,
define the `override_settings/0` callback of the `Crawly.Spider` behaviour
(a fuller spider sketch follows the settings list below).

For example:
```elixir
def override_settings() do
[
concurrent_requests_per_domain: 5,
closespider_timeout: 6
]
end
```

The full list of overridable settings:

> **Review comment (Collaborator):** is it necessary to list this? all settings are overridable, from my understanding.

- closespider_itemcount,
- closespider_timeout,
- concurrent_requests_per_domain,
- fetcher,
- retry,
- middlewares,
- pipelines
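
To show where `override_settings/0` lives, here is a minimal spider sketch; the module name, URLs, and empty parse logic are hypothetical, and the remaining callbacks follow the usual `Crawly.Spider` shape:

```elixir
defmodule MyApp.ExampleSpider do
  use Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://example.com"

  @impl Crawly.Spider
  def init(), do: [start_urls: ["https://example.com/news"]]

  @impl Crawly.Spider
  def parse_item(_response) do
    # Hypothetical: extract nothing; a real spider would return items and follow-up requests
    %Crawly.ParsedItem{items: [], requests: []}
  end

  # Per-spider overrides of the global settings listed above
  def override_settings() do
    [concurrent_requests_per_domain: 5, closespider_timeout: 6]
  end
end
```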
2 changes: 1 addition & 1 deletion lib/crawly/data_storage/data_storage_worker.ex
@@ -33,7 +33,7 @@ defmodule Crawly.DataStorage.Worker do
end

def handle_cast({:store, item}, state) do
pipelines = Application.get_env(:crawly, :pipelines, [])
pipelines = Crawly.Utils.get_settings(:pipelines, state.spider_name, [])

state =
case Crawly.Utils.pipe(pipelines, item, state) do
2 changes: 2 additions & 0 deletions lib/crawly/fetchers/fetcher.ex
@@ -6,6 +6,8 @@ defmodule Crawly.Fetchers.Fetcher do
Crawly.Request, HTTP client options and return Crawly.Response.
"""

@type t :: {module(), list()}

@callback fetch(request, options) :: {:ok, response} | {:error, reason}
when request: Crawly.Request.t(),
response: Crawly.Response.t(),
32 changes: 21 additions & 11 deletions lib/crawly/manager.ex
@@ -31,6 +31,8 @@ defmodule Crawly.Manager do

use GenServer

alias Crawly.Utils

def start_link(spider_name) do
Logger.debug("Starting the manager for #{spider_name}")
GenServer.start_link(__MODULE__, spider_name)
@@ -57,7 +59,7 @@

# Start workers
num_workers =
Application.get_env(:crawly, :concurrent_requests_per_domain, 4)
Utils.get_settings(:concurrent_requests_per_domain, spider_name, 4)

worker_pids =
Enum.map(1..num_workers, fn _x ->
@@ -72,8 +74,15 @@
)

# Schedule basic service operations for given spider manager
tref = Process.send_after(self(), :operations, get_timeout())
{:ok, %{name: spider_name, tref: tref, prev_scraped_cnt: 0}}
tref =
Process.send_after(
self(),
:operations,
Utils.get_settings(:manager_operations_timeout, spider_name, @timeout)
)

{:ok,
%{name: spider_name, tref: tref, prev_scraped_cnt: 0, workers: worker_pids}}
end

def handle_info(:operations, state) do
@@ -85,7 +94,7 @@
delta = items_count - state.prev_scraped_cnt
Logger.info("Current crawl speed is: #{delta} items/min")

case Application.get_env(:crawly, :closespider_itemcount, :disabled) do
case Utils.get_settings(:closespider_itemcount, state.name, :disabled) do
:disabled ->
:ignored

@@ -100,8 +109,8 @@
:ignoring
end

# Close spider in case if it's not scraping itms fast enough
case Application.get_env(:crawly, :closespider_timeout) do
# Close spider in case if it's not scraping items fast enough
case Utils.get_settings(:closespider_timeout, state.name, :disabled) do
:undefined ->
:ignoring

@@ -116,12 +125,13 @@
:ignoring
end

tref = Process.send_after(self(), :operations, get_timeout())
tref =
Process.send_after(
self(),
:operations,
Utils.get_settings(:manager_operations_timeout, state.name, @timeout)
)

{:noreply, %{state | tref: tref, prev_scraped_cnt: items_count}}
end

defp get_timeout() do
Application.get_env(:crawly, :manager_operations_timeout, @timeout)
end
end
5 changes: 2 additions & 3 deletions lib/crawly/middlewares/user_agent.ex
@@ -12,7 +12,7 @@ defmodule Crawly.Middlewares.UserAgent do
### Example Declaration
```
middlewares: [
{UserAgent, user_agents: ["My Custom Bot] }
{UserAgent, user_agents: ["My Custom Bot"] }
]
```
"""
@@ -24,8 +24,7 @@
new_headers = List.keydelete(request.headers, "User-Agent", 0)

user_agents =
Map.get(opts, :user_agents) ||
Application.get_env(:crawly, :user_agents, ["Crawly Bot 1.0"])
Map.get(opts, :user_agents, ["Crawly Bot 1.0"])

useragent = Enum.random(user_agents)

2 changes: 1 addition & 1 deletion lib/crawly/pipelines/csv_encoder.ex
@@ -18,7 +18,7 @@ defmodule Crawly.Pipelines.CSVEncoder do
{false, state :: map} | {csv_line :: String.t(), state :: map}
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{fields: nil})
fields = Map.get(opts, :fields) || Application.get_env(:crawly, :item)
fields = Map.get(opts, :fields, [])

case fields do
:undefined ->
2 changes: 1 addition & 1 deletion lib/crawly/pipelines/duplicates_filter.ex
@@ -32,7 +32,7 @@ defmodule Crawly.Pipelines.DuplicatesFilter do
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{item_id: nil})

item_id = Map.get(opts, :item_id) || Application.get_env(:crawly, :item_id)
item_id = Map.get(opts, :item_id)

item_id = Map.get(item, item_id)

2 changes: 1 addition & 1 deletion lib/crawly/pipelines/validate.ex
@@ -27,7 +27,7 @@ defmodule Crawly.Pipelines.Validate do
@impl Crawly.Pipeline
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{fields: nil})
fields = Map.get(opts, :fields) || Application.get_env(:crawly, :item, [])
fields = Map.get(opts, :fields, [])

validation_result =
fields
13 changes: 2 additions & 11 deletions lib/crawly/pipelines/write_to_file.ex
@@ -59,20 +59,11 @@ defmodule Crawly.Pipelines.WriteToFile do
def run(item, state, opts) do
opts = Enum.into(opts, %{folder: nil, extension: nil})

global_config =
Application.get_env(
:crawly,
Crawly.Pipelines.WriteToFile,
Keyword.new()
)

folder =
Map.get(opts, :folder) ||
Keyword.get(global_config, :folder, System.tmp_dir!())
Map.get(opts, :folder, "./")

extension =
Map.get(opts, :extension) ||
Keyword.get(global_config, :extension, "jl")
Map.get(opts, :extension, "jl")

fd = open_fd(state.spider_name, folder, extension)
:ok = write(fd, item)