Commit 89f6f0f: Merge 83a7245 into b5ba74c

oltarasenko committed Dec 30, 2019
2 parents: b5ba74c + 83a7245
Showing 15 changed files with 441 additions and 168 deletions.
69 changes: 37 additions & 32 deletions config/config.exs
@@ -32,37 +32,42 @@ use Mix.Config
config :crawly, Crawly.Worker, client: HTTPoison

config :crawly,
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
# Item definition
item: [:title, :author, :time, :url],
# Identifier which is used to filter out duplicates
item_id: :title,
# Stop spider after scraping a certain number of items
closespider_itemcount: 500,
# Stop spider if it doesn't crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,
follow_redirects: true,
# Request middlewares
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
Crawly.Pipelines.JSONEncoder
]
retry: [
{:max_retries, 3},
{:ignored_middlewares, [Crawly.Middlewares.UniqueRequest]}
],
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
# Item definition
item: [:title, :author, :time, :url],
# Identifier which is used to filter out duplicates
item_id: :title,
# Stop spider after scraping a certain number of items
closespider_itemcount: 500,
# Stop spider if it doesn't crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,
follow_redirects: true,
# Request middlewares
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
Crawly.Pipelines.JSONEncoder
]

config :crawly, Crawly.Pipelines.WriteToFile,
folder: "/tmp",
extension: "jl"
config :crawly,
Crawly.Pipelines.WriteToFile,
folder: "/tmp",
extension: "jl"

import_config "#{Mix.env}.exs"
import_config "#{Mix.env}.exs"
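
The worker-side code that consumes the new retry options is not shown on this page. As a rough sketch only, under the assumption that the options are read straight from the application environment (nothing below is taken from the actual Crawly.Worker implementation), the lookup could look like this:

```
# Hypothetical sketch, not the actual Crawly implementation: read the retry
# options added above and fall back to "no retries" when they are absent.
retry_opts = Application.get_env(:crawly, :retry, [])
max_retries = Keyword.get(retry_opts, :max_retries, 0)
ignored_middlewares = Keyword.get(retry_opts, :ignored_middlewares, [])
```
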
5 changes: 5 additions & 0 deletions config/test.exs
@@ -15,6 +15,11 @@ config :crawly,
item: [:title, :author, :time, :url],
# Identifier which is used to filter out duplicates
item_id: :title,

retry: [
{:max_retries, 2},
{:ignored_middlewares, [Crawly.Middlewares.UniqueRequest]}
],
# Stop spider after scraping a certain number of items
closespider_itemcount: 100,
# Stop spider if it doesn't crawl fast enough
10 changes: 9 additions & 1 deletion documentation/configuration.md
@@ -116,11 +116,19 @@ default: 4

The maximum number of concurrent (i.e. simultaneous) requests that will be performed by the Crawly workers.


### max_retries :: pos_integer()

default: 3

Controls the number of retries Crawly makes in order to fetch a request
successfully (i.e. one that returns an HTTP 200 response).
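
For reference, the retry options added to config/config.exs in this commit pair this setting with the middlewares a retried request may bypass; a minimal config sketch using the option names from the diff above:

```
config :crawly,
  retry: [
    {:max_retries, 3},
    {:ignored_middlewares, [Crawly.Middlewares.UniqueRequest]}
  ]
```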

### proxy :: binary()

Requests can be directed through a proxy. Setting the proxy value in the Crawly
config sets the proxy option on every outgoing request, for example:

```
config :crawly,
proxy: "<proxy_host>:<proxy_port>",
2 changes: 1 addition & 1 deletion lib/crawly/manager.ex
@@ -51,7 +51,7 @@ defmodule Crawly.Manager do
Process.link(request_storage_pid)

# Store start requests
requests = Enum.map(urls, fn url -> %Crawly.Request{url: url} end)
requests = Enum.map(urls, fn url -> Crawly.Request.new(url) end)

:ok = Crawly.RequestsStorage.store(spider_name, requests)

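
The hunk above swaps the bare %Crawly.Request{} struct literal for Crawly.Request.new/1. A minimal usage sketch, assuming new/1 takes only a URL (the URLs and spider name below are placeholders for illustration):

```
# Illustrative only: example.com URLs and MySpider are placeholders;
# Crawly.Request.new/1 is the constructor called in the hunk above.
urls = ["https://example.com/page1", "https://example.com/page2"]
requests = Enum.map(urls, &Crawly.Request.new/1)
:ok = Crawly.RequestsStorage.store(MySpider, requests)
```
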
5 changes: 3 additions & 2 deletions lib/crawly/middlewares/domain_filter.ex
@@ -20,10 +20,11 @@ defmodule Crawly.Middlewares.DomainFilter do
def run(request, state, _opts \\ []) do
base_url = state.spider_name.base_url()

case String.contains?(request.url, base_url) do
url = Crawly.Request.url(request)
case String.contains?(url, base_url) do
false ->
Logger.debug(
"Dropping request: #{inspect(request.url)} (domain filter)"
"Dropping request: #{inspect(url)} (domain filter)"
)

{false, state}
5 changes: 3 additions & 2 deletions lib/crawly/middlewares/robotstxt.ex
@@ -21,9 +21,10 @@ defmodule Crawly.Middlewares.RobotsTxt do
require Logger

def run(request, state, _opts \\ []) do
case Gollum.crawlable?("Crawly", request.url) do
url = Crawly.Request.url(request)
case Gollum.crawlable?("Crawly", url) do
:uncrawlable ->
Logger.debug("Dropping request: #{request.url} (robots.txt filter)")
Logger.debug("Dropping request: #{url} (robots.txt filter)")

{false, state}

11 changes: 7 additions & 4 deletions lib/crawly/middlewares/unique_request.ex
@@ -1,17 +1,19 @@
defmodule Crawly.Middlewares.UniqueRequest do
@moduledoc """
Avoid scheduling requests for the same pages.
Avoid scheduling requests for the same pages. However, if a retry is requested,
the request is still allowed.
"""
require Logger

def run(request, state) do
unique_request_seen_requests =
Map.get(state, :unique_request_seen_requests, %{})

case Map.get(unique_request_seen_requests, request.url) do
url = Crawly.Request.url(request)
case Map.get(unique_request_seen_requests, url) do
nil ->
unique_request_seen_requests =
Map.put(unique_request_seen_requests, request.url, true)
Map.put(unique_request_seen_requests, url, true)

new_state =
Map.put(
@@ -24,10 +26,11 @@ defmodule Crawly.Middlewares.UniqueRequest do

_ ->
Logger.debug(
"Dropping request: #{request.url}, as it's already processed"
"Dropping request: #{url}, as it's already processed"
)

{false, state}
end
end

end
18 changes: 12 additions & 6 deletions lib/crawly/middlewares/user_agent.ex
@@ -18,19 +18,25 @@ defmodule Crawly.Middlewares.UserAgent do
"""
require Logger

def run(request, state, opts \\ []) do
def run(request, state, opts \\ %{}) do
opts = Enum.into(opts, %{user_agents: nil})
headers = Crawly.Request.headers(request)

new_headers = List.keydelete(request.headers, "User-Agent", 0)

# Getting user agent from a list defined by a middleware
user_agents =
Map.get(opts, :user_agents) ||
Application.get_env(:crawly, :user_agents, ["Crawly Bot 1.0"])
ua = {"User-Agent", Enum.random(user_agents)}

useragent = Enum.random(user_agents)
new_headers =
case List.keyfind(headers, "User-Agent", 0, nil) do
nil ->
[ua | headers]
_ ->
List.keyreplace(headers, "User-Agent", 0, {"User-Agent", ua})
end

new_request =
Map.put(request, :headers, [{"User-Agent", useragent} | new_headers])
new_request = Crawly.Request.headers(request, new_headers)

{new_request, state}
end
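
Because the hunk above interleaves removed and added lines without +/- markers, here is a clean sketch of what the new header handling amounts to, assuming Crawly.Request.headers/1 reads a request's headers and Crawly.Request.headers/2 returns the request with its headers replaced:

```
# Sketch assembled from the hunk above; the Crawly.Request.headers arities
# are assumptions based on how they are called in the diff.
headers = Crawly.Request.headers(request)

user_agents =
  Map.get(opts, :user_agents) ||
    Application.get_env(:crawly, :user_agents, ["Crawly Bot 1.0"])

ua = {"User-Agent", Enum.random(user_agents)}

new_headers =
  case List.keyfind(headers, "User-Agent", 0) do
    # No User-Agent header yet: prepend the randomly chosen one
    nil -> [ua | headers]
    # Replace the existing header; passing ua directly keeps it a flat
    # {"User-Agent", value} tuple
    _existing -> List.keyreplace(headers, "User-Agent", 0, ua)
  end

new_request = Crawly.Request.headers(request, new_headers)
```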
