Merge 0760c65 into cca29db
oltarasenko authored Nov 11, 2020
2 parents cca29db + 0760c65 commit 3bd20a0
Showing 9 changed files with 118 additions and 28 deletions.
5 changes: 5 additions & 0 deletions documentation/basic_concepts.md
@@ -37,6 +37,11 @@ In order to make a working web crawler, all the behaviour callbacks need to be implemented
to prepare first requests on `init()`, which might be useful if, for example, you
want to pass a session cookie to the starting request. Note: `start_requests` are
processed before `start_urls`.
**Note**: This callback is going to be deprecated in favour of `init/1`. For now, backward
compatibility is kept with the help of a macro which always generates `init/1`.

`init(options)` - same as `init/0`, but also takes options, which can be passed from the engine
when the spider is started.

`base_url()` - defines the base URL of the given spider. This function is used to filter out all requests that go outside of the crawled website.
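
For illustration, a minimal sketch of a spider implementing the `init/1` and `base_url/0` callbacks described above (the module name and URLs are hypothetical, not part of this commit):

```elixir
defmodule Example.BooksSpider do
  use Crawly.Spider

  def base_url(), do: "https://www.example.com"

  # `options` is the keyword list forwarded by the engine on spider start,
  # e.g. Crawly.Engine.start_spider(Example.BooksSpider, urls: [...])
  def init(options) do
    [start_urls: Keyword.get(options, :urls, ["https://www.example.com"])]
  end

  def parse_item(_response) do
    %{items: [], requests: []}
  end
end
```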

14 changes: 9 additions & 5 deletions lib/crawly.ex
@@ -19,19 +19,22 @@ defmodule Crawly do
options: []
def fetch(url, headers \\ [], options \\ []) do
request0 = Crawly.Request.new(url, headers, options)

ignored_middlewares = [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.RobotsTxt
]

middlewares = request0.middlewares -- ignored_middlewares

{request, _} = Crawly.Utils.pipe(middlewares, request0, %{})

{fetcher, client_options} = Application.get_env(
:crawly,
:fetcher,
{Crawly.Fetchers.HTTPoisonFetcher, []}
)
{fetcher, client_options} =
Application.get_env(
:crawly,
:fetcher,
{Crawly.Fetchers.HTTPoisonFetcher, []}
)

{:ok, response} = fetcher.fetch(request, client_options)
response
@@ -49,6 +52,7 @@ defmodule Crawly do
case Kernel.function_exported?(spider, :parse_item, 1) do
false ->
{:error, :spider_not_found}

true ->
spider.parse_item(response)
end
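
As a quick, hedged usage sketch of the helper touched above (URL and header are illustrative), `Crawly.fetch/3` grabs a single page through the configured fetcher while skipping the `DomainFilter` and `RobotsTxt` middlewares:

```elixir
# Ad-hoc fetch without starting a spider; with the default HTTPoison fetcher
# this returns the fetcher's response struct.
response = Crawly.fetch("https://www.example.com", [{"accept", "text/html"}])
IO.inspect(response.status_code)
```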
51 changes: 43 additions & 8 deletions lib/crawly/engine.ex
@@ -22,12 +22,43 @@ defmodule Crawly.Engine do

defstruct(started_spiders: %{}, known_spiders: [])

@spec start_spider(module(), binary()) ::
:ok
| {:error, :spider_already_started}
| {:error, :atom}
def start_spider(spider_name, crawl_id \\ UUID.uuid1()) do
GenServer.call(__MODULE__, {:start_spider, spider_name, crawl_id})
@doc """
Starts a spider. All options passed in the second argument will be passed along to the spider's `init/1` callback.
### Reserved Options
- `:crawl_id` (binary). Optional, automatically generated if not set.
  ### Backward compatibility
  If the second positional argument is a binary, it will be set as the `:crawl_id`. This form is deprecated and will be removed in the future.
"""
@type crawl_id_opt :: {:crawl_id, binary()}
@spec start_spider(spider_name, opts) :: result
when spider_name: module(),
opts: [crawl_id_opt],
result:
:ok
| {:error, :spider_already_started}
| {:error, :atom}
def start_spider(spider_name, opts \\ [])

def start_spider(spider_name, crawl_id) when is_binary(crawl_id) do
Logger.warn(
"Deprecation Warning: Setting the crawl_id as second positional argument is deprecated. Please use the :crawl_id option instead. Refer to docs for more info (https://hexdocs.pm/crawly/Crawly.Engine.html#start_spider/2) "
)

start_spider(spider_name, crawl_id: crawl_id)
end

def start_spider(spider_name, opts) when is_list(opts) do
opts =
Enum.into(opts, %{})
|> Map.put_new_lazy(:crawl_id, &UUID.uuid1/0)

GenServer.call(
__MODULE__,
{:start_spider, spider_name, opts[:crawl_id], Map.to_list(opts)}
)
end
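
A hedged usage sketch of the two `start_spider/2` call forms above (the spider module is hypothetical):

```elixir
# Preferred: options are forwarded to the spider's init/1;
# :crawl_id is generated automatically when not provided.
:ok = Crawly.Engine.start_spider(Example.BooksSpider, urls: ["https://www.example.com/1"])

# Deprecated: a binary second argument is still accepted as the crawl_id,
# but logs a deprecation warning.
:ok = Crawly.Engine.start_spider(Example.BooksSpider, "my-crawl-id")
```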

@spec get_manager(module()) ::
@@ -132,11 +163,15 @@
{:reply, format_spider_info(state), state}
end

def handle_call({:start_spider, spider_name, crawl_id}, _form, state) do
def handle_call(
{:start_spider, spider_name, crawl_id, options},
_form,
state
) do
result =
case Map.get(state.started_spiders, spider_name) do
nil ->
Crawly.EngineSup.start_spider(spider_name)
Crawly.EngineSup.start_spider(spider_name, options)

_ ->
{:error, :spider_already_started}
4 changes: 2 additions & 2 deletions lib/crawly/engine_sup.ex
@@ -11,15 +11,15 @@ defmodule Crawly.EngineSup do
DynamicSupervisor.init(strategy: :one_for_one)
end

def start_spider(spider_name) do
def start_spider(spider_name, options) do
result =
case Code.ensure_loaded?(spider_name) do
true ->
# Given spider module exists in the namespace, we can proceed
{:ok, _sup_pid} =
DynamicSupervisor.start_child(
__MODULE__,
{Crawly.ManagerSup, spider_name}
{Crawly.ManagerSup, [spider_name, options]}
)

false ->
8 changes: 4 additions & 4 deletions lib/crawly/manager.ex
@@ -45,15 +45,15 @@ defmodule Crawly.Manager do
end
end

def start_link(spider_name) do
def start_link([spider_name, options]) do
Logger.debug("Starting the manager for #{spider_name}")
GenServer.start_link(__MODULE__, spider_name)
GenServer.start_link(__MODULE__, [spider_name, options])
end

@impl true
def init(spider_name) do
def init([spider_name, options]) do
# Getting spider start urls
init = spider_name.init()
init = spider_name.init(options)

# Start DataStorage worker
{:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
8 changes: 4 additions & 4 deletions lib/crawly/manager_sup.ex
@@ -3,18 +3,18 @@ defmodule Crawly.ManagerSup do
@moduledoc false
use Supervisor

def start_link(spider_name) do
Supervisor.start_link(__MODULE__, spider_name)
def start_link([spider_name, options]) do
Supervisor.start_link(__MODULE__, [spider_name, options])
end

@impl true
def init(spider_name) do
def init([spider_name, options]) do
children = [
# This supervisor is used to spawn Worker processes
{DynamicSupervisor, strategy: :one_for_one, name: spider_name},

# Starts spider manager process
{Crawly.Manager, spider_name}
{Crawly.Manager, [spider_name, options]}
]

Supervisor.init(children, strategy: :one_for_one)
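
Taken together, the supervision changes above thread the start options from `Crawly.Engine.start_spider/2` through `Crawly.EngineSup` and `Crawly.ManagerSup` down to `Crawly.Manager`, whose `init/1` finally calls `spider_name.init(options)`, so every spider can receive per-crawl parameters.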
22 changes: 18 additions & 4 deletions lib/crawly/spider.ex
@@ -4,18 +4,22 @@ defmodule Crawly.Spider do
A Spider is a module which is responsible for defining:
1. `init/0` function, which must return a keyword list with start_urls/start_requests list
2. `base_url/0` function responsible for filtering out requests not related to
2. `init/1` function, same as `init/0` but also takes a list of options sent from the Engine
3. `base_url/0` function responsible for filtering out requests not related to
a given website
3. `parse_item/1` function which is responsible for parsing the downloaded
4. `parse_item/1` function which is responsible for parsing the downloaded
request and converting it into items which can be stored and new requests
which can be scheduled
4. `custom_settings/0` an optional callback which can be used in order to
5. `custom_settings/0` an optional callback which can be used in order to
provide custom spider specific settings. Should define a list with custom
settings and their values. These values will take precedence over the
global settings defined in the config.
"""



@callback init() :: [start_urls: list(), start_requests: list()]
@callback init(options: keyword()) :: [start_urls: list(), start_requests: list()]

@callback base_url() :: binary()

@@ -26,11 +30,21 @@

defmacro __using__(_opts) do
quote do
require Logger
@behaviour Crawly.Spider

def override_settings(), do: []

defoverridable override_settings: 0
      # This clause is needed to keep backward compatibility, so all spiders
      # that only define init/0 will still work normally.
def init(_options), do: init()

def init() do
Logger.error("Using default spider init, without start urls")
%{start_urls: []}
end

defoverridable override_settings: 0, init: 1, init: 0
end
end
end
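
A brief sketch of the backward compatibility the macro provides (module name hypothetical): a spider that only defines `init/0` still satisfies the new `init/1` callback, because the generated `init/1` delegates to it.

```elixir
defmodule Example.LegacySpider do
  use Crawly.Spider

  def base_url(), do: "https://www.example.com"

  # Only init/0 is defined; the macro-generated init/1 delegates here, so
  # Crawly.Engine.start_spider(Example.LegacySpider, crawl_id: "abc") keeps working.
  def init(), do: [start_urls: ["https://www.example.com"]]

  def parse_item(_response), do: %{items: [], requests: []}
end
```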
33 changes: 33 additions & 0 deletions test/manager_test.exs
@@ -140,6 +140,22 @@ defmodule ManagerTest do

assert_receive {:performing_request, "https://www.example.com/blog.html"}
end

test "It's possible to initialize a spider with parameters" do
Process.register(self(), :manager_test_initial_args_test)

urls = [
"https://example.com/1",
"https://example.com/2",
"https://example.com/3"
]

:ok = Crawly.Engine.start_spider(Manager.InitialArgsTestSpider, urls: urls)

assert_receive recv_opts
assert is_binary(recv_opts[:crawl_id])
assert Enum.sort(recv_opts[:urls]) == Enum.sort(urls)
end
end

defmodule Manager.TestSpider do
@@ -212,3 +228,20 @@ defmodule Manager.StartRequestsTestSpider do
}
end
end

defmodule Manager.InitialArgsTestSpider do
use Crawly.Spider

def base_url() do
"https://www.example.com"
end

def init(opts) do
send(:manager_test_initial_args_test, opts)
[start_urls: opts[:urls]]
end

def parse_item(_response) do
%{items: [], requests: []}
end
end
1 change: 0 additions & 1 deletion test/test_utils.ex
@@ -47,7 +47,6 @@ defmodule TestSpider do
end

defmodule UtilsTestSpider do
use GenServer
use Crawly.Spider

@impl true
