Commit d6a8e8b: Merge 04f640a into e4ce56a
oltarasenko committed Apr 1, 2020
2 parents e4ce56a + 04f640a commit d6a8e8b
Showing 25 changed files with 249 additions and 169 deletions.
40 changes: 16 additions & 24 deletions config/config.exs
@@ -29,48 +29,40 @@ use Mix.Config
#
# import_config "#{Mix.env}.exs"

config :crawly, Crawly.Worker, client: HTTPoison

config :crawly,
fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
retry:
[
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
retry: [
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
],

# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
# Item definition
item: [:title, :author, :time, :url],
# Identifier which is used to filter out duplicates
item_id: :title,
# Stop spider after scraping certain amount of items
closespider_itemcount: 500,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,

# TODO: this looks outdated
follow_redirect: true,

# Request middlewares
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
{Crawly.Middlewares.UserAgent,
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
]}
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
{Crawly.Pipelines.Validate, fields: [:title, :author, :time, :url]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder
]

config :crawly, Crawly.Pipelines.WriteToFile,
folder: "/tmp",
extension: "jl"

import_config "#{Mix.env}.exs"
import_config "#{Mix.env()}.exs"
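
Taken together, the changes to config.exs move each option next to the middleware or pipeline that consumes it. A condensed sketch of the resulting style, assuming the same modules as above; the option values are illustrative rather than part of this commit:

```elixir
use Mix.Config

config :crawly,
  fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
  closespider_itemcount: 500,
  closespider_timeout: 20,
  concurrent_requests_per_domain: 5,
  middlewares: [
    Crawly.Middlewares.DomainFilter,
    Crawly.Middlewares.UniqueRequest,
    Crawly.Middlewares.RobotsTxt,
    # user_agents now travels with the middleware instead of a global key
    {Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot 1.0"]}
  ],
  pipelines: [
    # the global :item / :item_id keys are replaced by per-pipeline options
    {Crawly.Pipelines.Validate, fields: [:title, :author, :time, :url]},
    {Crawly.Pipelines.DuplicatesFilter, item_id: :title},
    Crawly.Pipelines.JSONEncoder
  ]

import_config "#{Mix.env()}.exs"
```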
22 changes: 12 additions & 10 deletions config/test.exs
@@ -2,28 +2,30 @@ use Mix.Config

config :crawly,
manager_operations_timeout: 30_000,
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],

# Stop spider after scraping certain amount of items
closespider_itemcount: 100,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,
follow_redirect: true,
# Request middlewares

# Request middlewares
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
{Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
{Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder
],
retry: [
6 changes: 3 additions & 3 deletions documentation/configuration.md
@@ -25,7 +25,7 @@ is used by the Crawly.DataStorageWorker process.

> **Deprecated**: This has been deprecated in favour of having pipelines to handle data storage, as of `0.6.0`
### `user_agents` :: list()
### `user_agents` :: list() [DEPRECATED in 0.9.0, use middleware-based configuration]

default: ["Crawly Bot 1.0"]

@@ -99,13 +99,13 @@ default: :disabled

An integer which specifies a number of items. If the spider scrapes more than that amount and those items are passed by the item pipeline, the spider will be closed. If set to :disabled the spider will not be stopped.

### closespider_timeout :: pos_integer()
### closespider_timeout :: pos_integer() | :disabled

default: nil

Defines the minimal number of items which need to be scraped by the spider within the given timeframe (30s). If the spider does not reach this limit, it will be stopped.

### follow_redirect :: boolean()
### follow_redirect :: boolean() [Deprecated, use fetcher settings instead]

default: false

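
Both deprecations above point at the options' new homes. A minimal sketch, assuming `Crawly.Fetchers.HTTPoisonFetcher` forwards its option list to the underlying HTTP client; the exact option names accepted by the fetcher are not shown in this diff:

```elixir
config :crawly,
  # replaces the deprecated top-level user_agents key
  middlewares: [
    {Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot 1.0"]}
  ],
  # replaces the deprecated follow_redirect key (option name assumed)
  fetcher: {Crawly.Fetchers.HTTPoisonFetcher, [follow_redirect: true]}
```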
8 changes: 4 additions & 4 deletions documentation/tutorial.md
@@ -54,10 +54,10 @@ file with the following code:

```elixir
def deps do
[
{:crawly, "~> 0.8.0"},
{:floki, "~> 0.26.0"}
]
[
{:crawly, "~> 0.8.0"},
{:floki, "~> 0.26.0"}
]
end
```

2 changes: 1 addition & 1 deletion lib/crawly/data_storage/data_storage_worker.ex
@@ -33,7 +33,7 @@ defmodule Crawly.DataStorage.Worker do
end

def handle_cast({:store, item}, state) do
pipelines = Application.get_env(:crawly, :pipelines, [])
pipelines = Crawly.Utils.get_settings(:pipelines, state.spider_name, [])

state =
case Crawly.Utils.pipe(pipelines, item, state) do
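
The worker now resolves pipelines through `Crawly.Utils.get_settings/3`, which makes the lookup spider-aware. A rough sketch of the resolution order this implies: the spider's `settings_override/0` wins over the global `:crawly` config, which wins over the default. This illustrates the idea only and is not the library's actual implementation:

```elixir
defmodule SettingsLookupSketch do
  # Approximates: spider override -> application config -> default
  def get(setting_name, spider_name, default) do
    overrides =
      if function_exported?(spider_name, :settings_override, 0),
        do: spider_name.settings_override(),
        else: %{}

    Map.get_lazy(overrides, setting_name, fn ->
      Application.get_env(:crawly, setting_name, default)
    end)
  end
end

# e.g. SettingsLookupSketch.get(:pipelines, MySpider, [])
```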
2 changes: 2 additions & 0 deletions lib/crawly/fetchers/fetcher.ex
@@ -6,6 +6,8 @@ defmodule Crawly.Fetchers.Fetcher do
Crawly.Request, HTTP client options and return Crawly.Response.
"""

@type t :: {module(), list()}

@callback fetch(request, options) :: {:ok, response} | {:error, reason}
when request: Crawly.Request.t(),
response: Crawly.Response.t(),
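
The new `t :: {module(), list()}` type spells out that a fetcher is configured as a `{module, options}` tuple whose module implements this behaviour. A placeholder implementation under that contract; the module name and return value are illustrative only:

```elixir
defmodule MyApp.NoopFetcher do
  @behaviour Crawly.Fetchers.Fetcher

  @impl true
  def fetch(_request, _client_options) do
    # A real fetcher would perform the HTTP request here and return
    # {:ok, %Crawly.Response{}}; this stub only satisfies the callback shape.
    {:error, :not_implemented}
  end
end
```

It would then be wired in as `fetcher: {MyApp.NoopFetcher, []}`, matching the type.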
32 changes: 21 additions & 11 deletions lib/crawly/manager.ex
@@ -31,6 +31,8 @@ defmodule Crawly.Manager do

use GenServer

alias Crawly.Utils

def start_link(spider_name) do
Logger.debug("Starting the manager for #{spider_name}")
GenServer.start_link(__MODULE__, spider_name)
@@ -57,7 +59,7 @@

# Start workers
num_workers =
Application.get_env(:crawly, :concurrent_requests_per_domain, 4)
Utils.get_settings(:concurrent_requests_per_domain, spider_name, 4)

worker_pids =
Enum.map(1..num_workers, fn _x ->
@@ -72,8 +74,15 @@
)

# Schedule basic service operations for given spider manager
tref = Process.send_after(self(), :operations, get_timeout())
{:ok, %{name: spider_name, tref: tref, prev_scraped_cnt: 0}}
tref =
Process.send_after(
self(),
:operations,
Utils.get_settings(:manager_operations_timeout, spider_name, @timeout)
)

{:ok,
%{name: spider_name, tref: tref, prev_scraped_cnt: 0, workers: worker_pids}}
end

def handle_info(:operations, state) do
@@ -85,7 +94,7 @@
delta = items_count - state.prev_scraped_cnt
Logger.info("Current crawl speed is: #{delta} items/min")

case Application.get_env(:crawly, :closespider_itemcount, :disabled) do
case Utils.get_settings(:closespider_itemcount, state.name, :disabled) do
:disabled ->
:ignored

@@ -100,8 +109,8 @@
:ignoring
end

# Close spider in case if it's not scraping itms fast enough
case Application.get_env(:crawly, :closespider_timeout) do
# Close spider in case if it's not scraping items fast enough
case Utils.get_settings(:closespider_timeout, state.name, :disabled) do
:undefined ->
:ignoring

@@ -116,12 +125,13 @@
:ignoring
end

tref = Process.send_after(self(), :operations, get_timeout())
tref =
Process.send_after(
self(),
:operations,
Utils.get_settings(:manager_operations_timeout, state.name, @timeout)
)

{:noreply, %{state | tref: tref, prev_scraped_cnt: items_count}}
end

defp get_timeout() do
Application.get_env(:crawly, :manager_operations_timeout, @timeout)
end
end
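
After this change the manager reads each of its tuning knobs through `Utils.get_settings/3`, so any of them can also come from a spider's `settings_override/0`. The knobs touched here, configured globally; the values are illustrative, and `manager_operations_timeout` is the interval in milliseconds between the scheduled `:operations` ticks:

```elixir
config :crawly,
  concurrent_requests_per_domain: 5,
  # stop once the pipelines have accepted this many items (:disabled turns it off)
  closespider_itemcount: 500,
  # stop when fewer items than this were scraped since the previous tick (:disabled turns it off)
  closespider_timeout: 20,
  # how often the manager evaluates the two rules above, in ms
  manager_operations_timeout: 60_000
```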
5 changes: 2 additions & 3 deletions lib/crawly/middlewares/user_agent.ex
@@ -12,7 +12,7 @@ defmodule Crawly.Middlewares.UserAgent do
### Example Declaration
```
middlewares: [
{UserAgent, user_agents: ["My Custom Bot] }
{UserAgent, user_agents: ["My Custom Bot"] }
]
```
"""
@@ -24,8 +24,7 @@
new_headers = List.keydelete(request.headers, "User-Agent", 0)

user_agents =
Map.get(opts, :user_agents) ||
Application.get_env(:crawly, :user_agents, ["Crawly Bot 1.0"])
Map.get(opts, :user_agents, ["Crawly Bot 1.0"])

useragent = Enum.random(user_agents)

2 changes: 1 addition & 1 deletion lib/crawly/pipelines/csv_encoder.ex
@@ -18,7 +18,7 @@ defmodule Crawly.Pipelines.CSVEncoder do
{false, state :: map} | {csv_line :: String.t(), state :: map}
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{fields: nil})
fields = Map.get(opts, :fields) || Application.get_env(:crawly, :item)
fields = Map.get(opts, :fields, [])

case fields do
:undefined ->
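
`fields` now comes from the pipeline tuple options instead of the global `:item` setting. Given the `@spec` above, `run/3` returns either `{csv_line, state}` or `{false, state}`, so the new option can be exercised directly; the item below is hypothetical:

```elixir
item = %{title: "Crawly", url: "https://example.com", author: "someone"}

# Only the fields listed in the options should end up in the CSV line
{csv_line, _state} =
  Crawly.Pipelines.CSVEncoder.run(item, %{}, fields: [:title, :url])
```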
2 changes: 1 addition & 1 deletion lib/crawly/pipelines/duplicates_filter.ex
@@ -32,7 +32,7 @@ defmodule Crawly.Pipelines.DuplicatesFilter do
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{item_id: nil})

item_id = Map.get(opts, :item_id) || Application.get_env(:crawly, :item_id)
item_id = Map.get(opts, :item_id)

item_id = Map.get(item, item_id)

2 changes: 1 addition & 1 deletion lib/crawly/pipelines/validate.ex
@@ -27,7 +27,7 @@ defmodule Crawly.Pipelines.Validate do
@impl Crawly.Pipeline
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{fields: nil})
fields = Map.get(opts, :fields) || Application.get_env(:crawly, :item, [])
fields = Map.get(opts, :fields, [])

validation_result =
fields
13 changes: 2 additions & 11 deletions lib/crawly/pipelines/write_to_file.ex
@@ -59,20 +59,11 @@ defmodule Crawly.Pipelines.WriteToFile do
def run(item, state, opts) do
opts = Enum.into(opts, %{folder: nil, extension: nil})

global_config =
Application.get_env(
:crawly,
Crawly.Pipelines.WriteToFile,
Keyword.new()
)

folder =
Map.get(opts, :folder) ||
Keyword.get(global_config, :folder, System.tmp_dir!())
Map.get(opts, :folder, "./")

extension =
Map.get(opts, :extension) ||
Keyword.get(global_config, :extension, "jl")
Map.get(opts, :extension, "jl")

fd = open_fd(state.spider_name, folder, extension)
:ok = write(fd, item)
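
WriteToFile no longer consults the `config :crawly, Crawly.Pipelines.WriteToFile, ...` block (also removed from config.exs above); `folder` and `extension` now ride on the pipeline tuple, with "./" and "jl" as the fallbacks. A minimal declaration in that style:

```elixir
config :crawly,
  pipelines: [
    Crawly.Pipelines.JSONEncoder,
    # without these options the output lands in "./" with a "jl" extension
    {Crawly.Pipelines.WriteToFile, folder: "/tmp", extension: "jl"}
  ]
```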
48 changes: 48 additions & 0 deletions lib/crawly/settings.ex
@@ -0,0 +1,48 @@
defmodule Crawly.Settings do
@moduledoc """
Define Crawly setting types
"""

@type numeric_setting() :: pos_integer() | :disabled
@type retry() :: [
retry_codes: [pos_integer()],
max_retries: pos_integer(),
ignored_middlewares: [module()]
]

@type middleware() ::
Crawly.Middlewares.DomainFilter
| Crawly.Middlewares.UniqueRequest
| Crawly.Middlewares.RobotsTxt
| Crawly.Middlewares.AutoCookiesManager
| {Crawly.Middlewares.UserAgent, user_agents: [binary()]}

@type pipeline() ::
Crawly.Pipelines.JSONEncoder
| {Crawly.Pipelines.DuplicatesFilter, item_id: atom()}
| {Crawly.Pipelines.Validate, fields: [atom()]}
| {Crawly.Pipelines.CSVEncoder, fields: [atom()]}
| {Crawly.Pipelines.WriteToFile,
folder: binary(), extension: binary()}

@type t() :: %{
# Allows stopping the spider after a given number of scraped items.
# :disabled by default.
optional(:closespider_itemcount) => numeric_setting(),

# Allows stopping the spider if it extracts fewer than a given number of
# items per minute.
optional(:closespider_timeout) => pos_integer(),

# Controls how many workers are started for a given domain
optional(:concurrent_requests_per_domain) => pos_integer(),

# Defines the fetcher used to perform HTTP requests
optional(:fetcher) => Crawly.Fetchers.Fetcher.t(),

# Defines retries
optional(:retry) => retry(),
optional(:middlewares) => [middleware()],
optional(:pipelines) => [pipeline()]
}
end
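
A map that satisfies `Crawly.Settings.t()` as defined above; every key is optional, so a spider can override only what it needs. The values are illustrative:

```elixir
settings = %{
  closespider_itemcount: :disabled,
  closespider_timeout: 10,
  concurrent_requests_per_domain: 2,
  fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
  retry: [retry_codes: [500], max_retries: 3, ignored_middlewares: []],
  middlewares: [
    Crawly.Middlewares.DomainFilter,
    {Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot 1.0"]}
  ],
  pipelines: [
    {Crawly.Pipelines.Validate, fields: [:title, :url]},
    Crawly.Pipelines.JSONEncoder
  ]
}
```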
9 changes: 8 additions & 1 deletion lib/crawly/spider.ex
@@ -9,13 +9,20 @@ defmodule Crawly.Spider do
3. `parse_item/1` function which is responsible for parsing the downloaded
request and converting it into items which can be stored and new requests
which can be scheduled
4. `settings_override/0` an optional callback which can be used to provide
custom spider-specific settings. Should define a map with custom settings
and their values. These values will take precedence over the global
settings defined in the config.
"""

@callback init() :: [start_urls: list()]

@callback base_url() :: binary()

@callback parse_item(response :: HTTPoison.Response.t()) ::
Crawly.ParsedItem.t()
Crawly.ParsedItem.t()

@callback settings_override() :: Crawly.Settings.t()

@optional_callbacks settings_override: 0
end
