Commit

Merge 905b3a5 into e4ce56a

oltarasenko committed Apr 1, 2020
2 parents e4ce56a + 905b3a5 commit 330573c
Showing 24 changed files with 241 additions and 211 deletions.
40 changes: 16 additions & 24 deletions config/config.exs
@@ -29,48 +29,40 @@ use Mix.Config
#
# import_config "#{Mix.env}.exs"

config :crawly, Crawly.Worker, client: HTTPoison

config :crawly,
fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
retry:
[
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
retry: [
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
],

# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
# Item definition
item: [:title, :author, :time, :url],
# Identifier which is used to filter out duplicates
item_id: :title,
# Stop spider after scraping certain amount of items
closespider_itemcount: 500,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,

# TODO: this looks outdated
follow_redirect: true,

# Request middlewares
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
{Crawly.Middlewares.UserAgent,
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
]}
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
{Crawly.Pipelines.Validate, fields: [:title, :author, :time, :url]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder
]

config :crawly, Crawly.Pipelines.WriteToFile,
folder: "/tmp",
extension: "jl"

import_config "#{Mix.env}.exs"
import_config "#{Mix.env()}.exs"
22 changes: 12 additions & 10 deletions config/test.exs
@@ -2,28 +2,30 @@ use Mix.Config

config :crawly,
manager_operations_timeout: 30_000,
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],

# Stop spider after scraping certain amount of items
closespider_itemcount: 100,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,
follow_redirect: true,
# Request middlewares

# Request middlewares
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
{Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
{Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder
],
retry: [
50 changes: 1 addition & 49 deletions documentation/configuration.md
@@ -16,49 +16,6 @@ config :crawly,

## Options

### base_store_path :: binary() [DEPRECATED in 0.6.0]

default: "/tmp"

Defines the path where items are stored in the filesystem. This setting
is used by the Crawly.DataStorageWorker process.

> **Deprecated**: This has been deprecated in favour of having pipelines to handle data storage, as of `0.6.0`
### `user_agents` :: list()

default: ["Crawly Bot 1.0"]

Defines the list of user agent strings used for Crawly requests. This setting
is used by the `Crawly.Middlewares.UserAgent` middleware. When the list contains
more than one item, each request is executed with a user agent string chosen
randomly from the supplied list.

> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Middlewares.UserAgent` module documentation for correct usage.
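
A minimal sketch of the tuple-based middleware configuration this commit switches to, mirroring the `config/config.exs` change above (the user agent string here is just a placeholder):

```elixir
config :crawly,
  middlewares: [
    Crawly.Middlewares.DomainFilter,
    Crawly.Middlewares.UniqueRequest,
    Crawly.Middlewares.RobotsTxt,
    {Crawly.Middlewares.UserAgent, user_agents: ["Crawly Bot 1.0"]}
  ]
```
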
### `item` :: [atom()]

default: []

Defines a list of required fields for the item. If any of the required
fields is missing from the scraped item (or if the value of a required
field is "" or nil), the item will be dropped. This setting is used by
the `Crawly.Pipelines.Validate` pipeline.

> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Pipelines.Validate` module documentation for correct usage.
### `item_id` :: atom()

default: nil

Defines a field which is used to identify whether an item is a duplicate.
On most e-commerce websites the natural id field is the SKU. This setting
is used by the `Crawly.Pipelines.DuplicatesFilter` pipeline. If unset, the
pipeline is effectively disabled.

> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Pipelines.DuplicatesFilter` module documentation for correct usage.
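
A minimal sketch of the corresponding tuple-based pipeline configuration, mirroring the `config/config.exs` change in this commit (field names and the `item_id` value are illustrative):

```elixir
config :crawly,
  pipelines: [
    {Crawly.Pipelines.Validate, fields: [:title, :author, :time, :url]},
    {Crawly.Pipelines.DuplicatesFilter, item_id: :title},
    Crawly.Pipelines.JSONEncoder
  ]
```
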
### `pipelines` :: [module()]

default: []
@@ -99,17 +56,12 @@ default: :disabled

An integer which specifies a maximum number of items. If the spider scrapes more than that amount and those items are passed through the item pipeline, the spider will be closed. If set to :disabled, the spider will not be stopped.

### closespider_timeout :: pos_integer()
### closespider_timeout :: pos_integer() | :disabled

default: nil

Defines the minimum number of items which need to be scraped by the spider within the given timeframe (30s). If the spider does not reach this limit, it will be stopped.

### follow_redirect :: boolean()

default: false

Defines whether the Crawly spider should follow HTTP redirects.

### concurrent_requests_per_domain :: pos_integer()

2 changes: 1 addition & 1 deletion lib/crawly/data_storage/data_storage_worker.ex
@@ -33,7 +33,7 @@ defmodule Crawly.DataStorage.Worker do
end

def handle_cast({:store, item}, state) do
pipelines = Application.get_env(:crawly, :pipelines, [])
pipelines = Crawly.Utils.get_settings(:pipelines, state.spider_name, [])

state =
case Crawly.Utils.pipe(pipelines, item, state) do
2 changes: 2 additions & 0 deletions lib/crawly/fetchers/fetcher.ex
@@ -6,6 +6,8 @@ defmodule Crawly.Fetchers.Fetcher do
Crawly.Request, HTTP client options and return Crawly.Response.
"""

@type t :: {module(), list()}

@callback fetch(request, options) :: {:ok, response} | {:error, reason}
when request: Crawly.Request.t(),
response: Crawly.Response.t(),
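
The new `t()` type makes explicit that a fetcher is configured as a `{module, options}` tuple. A hypothetical custom fetcher sketch is shown below; the `Crawly.Request` fields it reads (`url`, `headers`, `options`) and the acceptance of an HTTPoison response as the return value are assumptions, not documented API:

```elixir
defmodule MyApp.SimpleFetcher do
  # Hypothetical fetcher sketch: delegates the request to HTTPoison and
  # returns its result as-is. The Crawly.Request fields used here
  # (url, headers, options) are assumed, not documented API.
  @behaviour Crawly.Fetchers.Fetcher

  @impl Crawly.Fetchers.Fetcher
  def fetch(request, _client_options) do
    HTTPoison.get(request.url, request.headers, request.options)
  end
end
```

It would then be wired in as `fetcher: {MyApp.SimpleFetcher, []}`, matching the new `{module(), list()}` type.
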
32 changes: 21 additions & 11 deletions lib/crawly/manager.ex
@@ -31,6 +31,8 @@ defmodule Crawly.Manager do

use GenServer

alias Crawly.Utils

def start_link(spider_name) do
Logger.debug("Starting the manager for #{spider_name}")
GenServer.start_link(__MODULE__, spider_name)
@@ -57,7 +59,7 @@

# Start workers
num_workers =
Application.get_env(:crawly, :concurrent_requests_per_domain, 4)
Utils.get_settings(:concurrent_requests_per_domain, spider_name, 4)

worker_pids =
Enum.map(1..num_workers, fn _x ->
@@ -72,8 +74,15 @@
)

# Schedule basic service operations for given spider manager
tref = Process.send_after(self(), :operations, get_timeout())
{:ok, %{name: spider_name, tref: tref, prev_scraped_cnt: 0}}
tref =
Process.send_after(
self(),
:operations,
Utils.get_settings(:manager_operations_timeout, spider_name, @timeout)
)

{:ok,
%{name: spider_name, tref: tref, prev_scraped_cnt: 0, workers: worker_pids}}
end

def handle_info(:operations, state) do
Expand All @@ -85,7 +94,7 @@ defmodule Crawly.Manager do
delta = items_count - state.prev_scraped_cnt
Logger.info("Current crawl speed is: #{delta} items/min")

case Application.get_env(:crawly, :closespider_itemcount, :disabled) do
case Utils.get_settings(:closespider_itemcount, state.name, :disabled) do
:disabled ->
:ignored

@@ -100,8 +109,8 @@
:ignoring
end

# Close spider in case if it's not scraping itms fast enough
case Application.get_env(:crawly, :closespider_timeout) do
# Close spider in case if it's not scraping items fast enough
case Utils.get_settings(:closespider_timeout, state.name, :disabled) do
:undefined ->
:ignoring

@@ -116,12 +125,13 @@
:ignoring
end

tref = Process.send_after(self(), :operations, get_timeout())
tref =
Process.send_after(
self(),
:operations,
Utils.get_settings(:manager_operations_timeout, state.name, @timeout)
)

{:noreply, %{state | tref: tref, prev_scraped_cnt: items_count}}
end

defp get_timeout() do
Application.get_env(:crawly, :manager_operations_timeout, @timeout)
end
end
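
The recurring change in this file — `Application.get_env/3` replaced by `Utils.get_settings/3` taking a spider name — is what enables per-spider settings. The implementation is not shown in this diff; a plausible sketch of the lookup order (spider-level override first, then the global `:crawly` config, then the default), with the `override_settings/0` callback name being an assumption:

```elixir
defmodule MyApp.SettingsLookupSketch do
  # Hypothetical sketch of the lookup order behind Crawly.Utils.get_settings/3 —
  # not the library's actual implementation. It assumes a spider module may
  # export an optional override_settings/0 callback returning a keyword list.
  def get_settings(setting_name, spider_name \\ nil, default \\ nil) do
    global_value = Application.get_env(:crawly, setting_name, default)

    if spider_name != nil and function_exported?(spider_name, :override_settings, 0) do
      Keyword.get(spider_name.override_settings(), setting_name, global_value)
    else
      global_value
    end
  end
end
```
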
5 changes: 2 additions & 3 deletions lib/crawly/middlewares/user_agent.ex
@@ -12,7 +12,7 @@ defmodule Crawly.Middlewares.UserAgent do
### Example Declaration
```
middlewares: [
{UserAgent, user_agents: ["My Custom Bot] }
{UserAgent, user_agents: ["My Custom Bot"] }
]
```
"""
@@ -24,8 +24,7 @@
new_headers = List.keydelete(request.headers, "User-Agent", 0)

user_agents =
Map.get(opts, :user_agents) ||
Application.get_env(:crawly, :user_agents, ["Crawly Bot 1.0"])
Map.get(opts, :user_agents, ["Crawly Bot 1.0"])

useragent = Enum.random(user_agents)

2 changes: 1 addition & 1 deletion lib/crawly/pipelines/csv_encoder.ex
@@ -18,7 +18,7 @@ defmodule Crawly.Pipelines.CSVEncoder do
{false, state :: map} | {csv_line :: String.t(), state :: map}
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{fields: nil})
fields = Map.get(opts, :fields) || Application.get_env(:crawly, :item)
fields = Map.get(opts, :fields, [])

case fields do
:undefined ->
2 changes: 1 addition & 1 deletion lib/crawly/pipelines/duplicates_filter.ex
@@ -32,7 +32,7 @@ defmodule Crawly.Pipelines.DuplicatesFilter do
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{item_id: nil})

item_id = Map.get(opts, :item_id) || Application.get_env(:crawly, :item_id)
item_id = Map.get(opts, :item_id)

item_id = Map.get(item, item_id)

2 changes: 1 addition & 1 deletion lib/crawly/pipelines/validate.ex
@@ -27,7 +27,7 @@ defmodule Crawly.Pipelines.Validate do
@impl Crawly.Pipeline
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{fields: nil})
fields = Map.get(opts, :fields) || Application.get_env(:crawly, :item, [])
fields = Map.get(opts, :fields, [])

validation_result =
fields
13 changes: 2 additions & 11 deletions lib/crawly/pipelines/write_to_file.ex
@@ -59,20 +59,11 @@ defmodule Crawly.Pipelines.WriteToFile do
def run(item, state, opts) do
opts = Enum.into(opts, %{folder: nil, extension: nil})

global_config =
Application.get_env(
:crawly,
Crawly.Pipelines.WriteToFile,
Keyword.new()
)

folder =
Map.get(opts, :folder) ||
Keyword.get(global_config, :folder, System.tmp_dir!())
Map.get(opts, :folder, "./")

extension =
Map.get(opts, :extension) ||
Keyword.get(global_config, :extension, "jl")
Map.get(opts, :extension, "jl")

fd = open_fd(state.spider_name, folder, extension)
:ok = write(fd, item)
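
With the global `config :crawly, Crawly.Pipelines.WriteToFile, ...` lookup removed here, folder and extension are expected as tuple options on the pipeline entry. A minimal sketch of the new style (the "/tmp" folder is just an example value):

```elixir
config :crawly,
  pipelines: [
    Crawly.Pipelines.JSONEncoder,
    {Crawly.Pipelines.WriteToFile, folder: "/tmp", extension: "jl"}
  ]
```
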
48 changes: 48 additions & 0 deletions lib/crawly/settings.ex
@@ -0,0 +1,48 @@
defmodule Crawly.Settings do
@moduledoc """
Define Crawly setting types
"""

@type numeric_setting() :: pos_integer() | :disabled
@type retry() :: [
retry_codes: [pos_integer()],
max_retries: pos_integer(),
ignored_middlewares: [module()]
]

@type middleware() ::
Crawly.Middlewares.DomainFilter
| Crawly.Middlewares.UniqueRequest
| Crawly.Middlewares.RobotsTxt
| Crawly.Middlewares.AutoCookiesManager
| {Crawly.Middlewares.UserAgent, user_agents: [binary()]}

@type pipeline() ::
Crawly.Pipelines.JSONEncoder
| {Crawly.Pipelines.DuplicatesFilter, item_id: atom()}
| {Crawly.Pipelines.Validate, fields: [atom()]}
| {Crawly.Pipelines.CSVEncoder, fields: [atom()]}
| {Crawly.Pipelines.WriteToFile,
folder: binary(), extension: binary()}

@type t() :: [
# Allows to stop spider after a given number of scraped items
# :disabled by default.
closespider_itemcount: numeric_setting(),

# Allows to stop spider if it extracts less than a given amount of
# items per minute.
closespider_timeout: pos_integer(),

# Allows to control how many workers are started for a given domain
concurrent_requests_per_domain: pos_integer(),

# Allows to define a fetcher to perform HTTP requests
fetcher: Crawly.Fetchers.Fetcher.t(),

# Defines retries
retry: retry(),
middlewares: [middleware()],
pipelines: [pipeline()]
]
end
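
Put together, a keyword list conforming to the new `Crawly.Settings.t()` type looks much like the application config at the top of this diff; a condensed, illustrative sketch:

```elixir
# Condensed sketch of a keyword list conforming to Crawly.Settings.t();
# values mirror config/config.exs above and are illustrative only.
settings = [
  closespider_itemcount: 500,
  closespider_timeout: 20,
  concurrent_requests_per_domain: 5,
  fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
  retry: [
    retry_codes: [400],
    max_retries: 3,
    ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
  ],
  middlewares: [
    Crawly.Middlewares.DomainFilter,
    {Crawly.Middlewares.UserAgent, user_agents: ["Crawly Bot 1.0"]}
  ],
  pipelines: [
    {Crawly.Pipelines.Validate, fields: [:title, :url]},
    Crawly.Pipelines.JSONEncoder
  ]
]
```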
