Improved Crawly.fetch/2 to accept fetching with a spider #107

Merged · 14 commits · Nov 13, 2020
documentation/basic_concepts.md (5 additions, 0 deletions)
@@ -37,6 +37,11 @@ In order to make a working web crawler, all the behaviour callbacks need to be implemented
to prepare first requests on `init()`. Which might be useful if, for example, you
want to pass a session cookie to the starting request. Note: `start_requests` are
processed before start_urls.
**Note**: this callback is going to be deprecated in favour of `init/1`. For now, backward
compatibility is kept with the help of a macro which always generates `init/1`.

`init(options)` - same as `init/0`, but also takes options (which can be passed from the engine
during spider start).

`base_url()` - defines a base_url of the given Spider. This function is used in order to filter out all requests which are going outside of the crawled website.
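
To illustrate the `init/1` callback described above, here is a minimal sketch of a spider whose first request carries a session cookie taken from the options it receives. The module name, URL, and `:session_cookie` key are illustrative assumptions, not part of this change.

```elixir
defmodule MySpider do
  use Crawly.Spider

  @impl Crawly.Spider
  def init(options) do
    # Options are forwarded by the engine when the spider is started.
    cookie = Keyword.get(options, :session_cookie, "")

    [
      start_requests: [
        Crawly.Request.new("https://www.example.com/", [{"Cookie", cookie}], [])
      ]
    ]
  end

  @impl Crawly.Spider
  def base_url(), do: "https://www.example.com"

  @impl Crawly.Spider
  def parse_item(_response), do: %{items: [], requests: []}
end
```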

lib/crawly.ex (92 additions, 17 deletions)
@@ -8,33 +8,105 @@ defmodule Crawly do
when you need to get individual pages and parse them.

The fetched URL is being converted to a request, and the request is piped
through the middlewares specidied in a config (with the exception of
`Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt` these 2 are
ignored)
through the middlewares specified in a config (with the exception of
`Crawly.Middlewares.DomainFilter`, `Crawly.Middlewares.RobotsTxt`)

Provide a spider with the `:with` option to fetch a given webpage using that spider.

### Fetching with a spider
To fetch a response from a url with a spider, define your spider, and pass the module name to the `:with` option.

iex> Crawly.fetch("https://www.example.com", with: MySpider)
{%HTTPoison.Response{...}, %{...}, [...], %{...}}

Using the `:with` option will return a 4 item tuple:

1. The HTTPoison response
2. The result returned from the `parse_item/1` callback
3. The list of items that have been processed by the declared item pipelines.
4. The pipeline state, included for debugging purposes.
"""
@spec fetch(url, headers, options) :: HTTPoison.Response.t()
@type with_opt :: {:with, nil | module()}
@type request_opt :: {:request_options, list(Crawly.Request.option())}
@type headers_opt :: {:headers, list(Crawly.Request.header())}

@type parsed_item_result :: Crawly.ParsedItem.t()
@type parsed_items :: list(any())
@type pipeline_state :: %{optional(atom()) => any()}

@spec fetch(url, opts) ::
HTTPoison.Response.t()
| {HTTPoison.Response.t(), parsed_item_result, parsed_items,
pipeline_state}
when url: binary(),
headers: [],
options: []
def fetch(url, headers \\ [], options \\ []) do
request0 = Crawly.Request.new(url, headers, options)
opts: [
with_opt
| request_opt
| headers_opt
]
def fetch(url, opts \\ []) do
opts = Enum.into(opts, %{with: nil, request_options: [], headers: []})

request0 =
Crawly.Request.new(url, opts[:headers], opts[:request_options])
|> Map.put(
:middlewares,
Crawly.Utils.get_settings(:middlewares, opts[:with], [])
)

ignored_middlewares = [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.RobotsTxt
]
middlewares = request0.middlewares -- ignored_middlewares

{request, _} = Crawly.Utils.pipe(middlewares, request0, %{})
new_middlewares = request0.middlewares -- ignored_middlewares

{fetcher, client_options} = Application.get_env(
:crawly,
:fetcher,
{Crawly.Fetchers.HTTPoisonFetcher, []}
)
request0 =
Map.put(
request0,
:middlewares,
new_middlewares
)

{%{} = request, _} = Crawly.Utils.pipe(request0.middlewares, request0, %{})

{fetcher, client_options} =
Crawly.Utils.get_settings(
:fetcher,
opts[:with],
{Crawly.Fetchers.HTTPoisonFetcher, []}
)

{:ok, response} = fetcher.fetch(request, client_options)
response

case opts[:with] do
nil ->
# no spider provided, return response as is
response

_ ->
# spider provided, send response through parse_item callback, pipe through the pipelines
with parsed_result <- parse(response, opts[:with]),
pipelines <-
Crawly.Utils.get_settings(
:pipelines,
opts[:with]
),
items <- Map.get(parsed_result, :items, []),
{pipeline_result, pipeline_state} <-
Enum.reduce(items, {[], %{}}, fn item, {acc, state} ->
{piped, state} = Crawly.Utils.pipe(pipelines, item, state)

if piped == false do
# dropped
{acc, state}
else
{[piped | acc], state}
end
end) do
{response, parsed_result, pipeline_result, pipeline_state}
end
end
end

@doc """
@@ -49,13 +121,16 @@ defmodule Crawly do
case Kernel.function_exported?(spider, :parse_item, 1) do
false ->
{:error, :spider_not_found}

true ->
spider.parse_item(response)
end
end

@doc """
Returns a list of known modules which implements Crawly.Spider behaviour
Returns a list of known modules which implement the Crawly.Spider behaviour.

Should not be used for spider management. Use functions defined in `Crawly.Engine` for that.
"""
@spec list_spiders() :: [module()]
def list_spiders(), do: Crawly.Utils.list_spiders()
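
Taken together, the updated `fetch` can be exercised roughly as follows; this is a sketch, and `MySpider` is an assumed spider module rather than anything defined in this PR.

```elixir
# Without a spider: behaves as before and returns the bare response.
%HTTPoison.Response{} = Crawly.fetch("https://www.example.com")

# With a spider: returns the response, the parse_item/1 result, the items that
# made it through the spider's pipelines, and the final pipeline state.
{response, parsed_item_result, items, pipeline_state} =
  Crawly.fetch("https://www.example.com", with: MySpider)
```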
lib/crawly/engine.ex (43 additions, 8 deletions)
@@ -22,12 +43,43 @@ defmodule Crawly.Engine do

defstruct(started_spiders: %{}, known_spiders: [])

@spec start_spider(module(), binary()) ::
:ok
| {:error, :spider_already_started}
| {:error, :atom}
def start_spider(spider_name, crawl_id \\ UUID.uuid1()) do
GenServer.call(__MODULE__, {:start_spider, spider_name, crawl_id})
@doc """
Starts a spider. All options passed in the second argument will be passed along to the spider's `init/1` callback.

### Reserved Options
- `:crawl_id` (binary). Optional, automatically generated if not set.


### Backward compatibility
If the second positional argument is a binary, it will be set as the `:crawl_id`. This form is deprecated and will be removed in the future.
"""
@type crawl_id_opt :: {:crawl_id, binary()}
@spec start_spider(spider_name, opts) :: result
when spider_name: module(),
opts: [crawl_id_opt],
result:
:ok
| {:error, :spider_already_started}
| {:error, :atom}
def start_spider(spider_name, opts \\ [])

def start_spider(spider_name, crawl_id) when is_binary(crawl_id) do
Logger.warn(
"Deprecation Warning: Setting the crawl_id as second positional argument is deprecated. Please use the :crawl_id option instead. Refer to docs for more info (https://hexdocs.pm/crawly/Crawly.Engine.html#start_spider/2) "
)

start_spider(spider_name, crawl_id: crawl_id)
end

def start_spider(spider_name, opts) when is_list(opts) do
opts =
Enum.into(opts, %{})
|> Map.put_new_lazy(:crawl_id, &UUID.uuid1/0)

GenServer.call(
__MODULE__,
{:start_spider, spider_name, opts[:crawl_id], Map.to_list(opts)}
)
end

@spec get_manager(module()) ::
@@ -132,11 +163,15 @@
{:reply, format_spider_info(state), state}
end

def handle_call({:start_spider, spider_name, crawl_id}, _form, state) do
def handle_call(
{:start_spider, spider_name, crawl_id, options},
_form,
state
) do
result =
case Map.get(state.started_spiders, spider_name) do
nil ->
Crawly.EngineSup.start_spider(spider_name)
Crawly.EngineSup.start_spider(spider_name, options)

_ ->
{:error, :spider_already_started}
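
A sketch of how the reworked `start_spider/2` might be called; `MySpider` and the `:some_option` key are assumptions used for illustration.

```elixir
# New form: a keyword list of options; :crawl_id is generated if omitted.
Crawly.Engine.start_spider(MySpider, crawl_id: "2020-11-13-crawl")

# Any additional options are forwarded to the spider's init/1 callback.
Crawly.Engine.start_spider(MySpider, some_option: "value")

# Deprecated form: a binary second argument is still treated as the crawl_id,
# but it logs a deprecation warning.
Crawly.Engine.start_spider(MySpider, "2020-11-13-crawl")
```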
lib/crawly/engine_sup.ex (2 additions, 2 deletions)
@@ -11,15 +11,15 @@ defmodule Crawly.EngineSup do
DynamicSupervisor.init(strategy: :one_for_one)
end

def start_spider(spider_name) do
def start_spider(spider_name, options) do
result =
case Code.ensure_loaded?(spider_name) do
true ->
# Given spider module exists in the namespace, we can proceed
{:ok, _sup_pid} =
DynamicSupervisor.start_child(
__MODULE__,
{Crawly.ManagerSup, spider_name}
{Crawly.ManagerSup, [spider_name, options]}
)

false ->
lib/crawly/manager.ex (4 additions, 4 deletions)
@@ -45,15 +45,15 @@ defmodule Crawly.Manager do
end
end

def start_link(spider_name) do
def start_link([spider_name, options]) do
Logger.debug("Starting the manager for #{spider_name}")
GenServer.start_link(__MODULE__, spider_name)
GenServer.start_link(__MODULE__, [spider_name, options])
end

@impl true
def init(spider_name) do
def init([spider_name, options]) do
# Getting spider start urls
init = spider_name.init()
init = spider_name.init(options)

# Start DataStorage worker
{:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
lib/crawly/manager_sup.ex (4 additions, 4 deletions)
@@ -3,18 +3,18 @@ defmodule Crawly.ManagerSup do
@moduledoc false
use Supervisor

def start_link(spider_name) do
Supervisor.start_link(__MODULE__, spider_name)
def start_link([spider_name, options]) do
Supervisor.start_link(__MODULE__, [spider_name, options])
end

@impl true
def init(spider_name) do
def init([spider_name, options]) do
children = [
# This supervisor is used to spawn Worker processes
{DynamicSupervisor, strategy: :one_for_one, name: spider_name},

# Starts spider manager process
{Crawly.Manager, spider_name}
{Crawly.Manager, [spider_name, options]}
]

Supervisor.init(children, strategy: :one_for_one)
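
Taken together, the supervision changes above appear to thread the options from the engine down to the spider. A rough sketch of the call chain (`MySpider` and its options are hypothetical):

```elixir
# Crawly.Engine.start_spider(MySpider, crawl_id: "abc", some_option: "value")
#   -> Crawly.EngineSup.start_spider(MySpider, options)
#   -> DynamicSupervisor child {Crawly.ManagerSup, [MySpider, options]}
#   -> {Crawly.Manager, [MySpider, options]}
#   -> MySpider.init(options)   # the spider finally receives the options
```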
lib/crawly/spider.ex (18 additions, 4 deletions)
@@ -4,18 +4,22 @@ defmodule Crawly.Spider do

A Spider is a module which is responsible for defining:
1. `init/0` function, which must return a keyword list with start_urls/start_requests list
2. `base_url/0` function responsible for filtering out requests not related to
2. `init/1` - same as `init/0`, but also takes a list of options sent from the Engine
3. `base_url/0` function responsible for filtering out requests not related to
a given website
3. `parse_item/1` function which is responsible for parsing the downloaded
4. `parse_item/1` function which is responsible for parsing the downloaded
request and converting it into items which can be stored and new requests
which can be scheduled
4. `custom_settings/0` an optional callback which can be used in order to
5. `custom_settings/0` an optional callback which can be used in order to
provide custom spider specific settings. Should define a list with custom
settings and their values. These values will take precedence over the
global settings defined in the config.
"""



@callback init() :: [start_urls: list(), start_requests: list()]
@callback init(options: keyword()) :: [start_urls: list(), start_requests: list()]

@callback base_url() :: binary()

@@ -26,11 +30,21 @@

defmacro __using__(_opts) do
quote do
require Logger
@behaviour Crawly.Spider

def override_settings(), do: []

defoverridable override_settings: 0
# This line is needed to keep the backward compatibility, so all spiders
# with init/0 will still work normally.
def init(_options), do: init()

def init() do
Logger.error("Using default spider init, without start urls")
%{start_urls: []}
end

defoverridable override_settings: 0, init: 1, init: 0
end
end
end
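
For backward compatibility, a spider that only implements `init/0` keeps working: the `__using__` macro above generates an `init/1` that ignores the options and delegates to `init/0`. A minimal sketch (module name and URL are assumptions):

```elixir
defmodule LegacySpider do
  use Crawly.Spider

  @impl Crawly.Spider
  def init(), do: [start_urls: ["https://www.example.com"]]

  @impl Crawly.Spider
  def base_url(), do: "https://www.example.com"

  @impl Crawly.Spider
  def parse_item(_response), do: %{items: [], requests: []}
end

# The generated init/1 discards its argument and calls init/0:
# LegacySpider.init(crawl_id: "abc") #=> [start_urls: ["https://www.example.com"]]
```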
test/crawly_test.exs (40 additions, 3 deletions)
@@ -1,8 +1,45 @@
defmodule CrawlyTest do
use ExUnit.Case
doctest Crawly

test "greets the world" do
assert :test == :test
setup do
:meck.new(CrawlyTestSpider, [:non_strict])

:meck.expect(CrawlyTestSpider, :parse_item, fn _resp ->
%{
items: [%{content: "hello"}],
requests: [
Crawly.Utils.request_from_url("https://www.example.com/test")
]
}
end)

:meck.expect(CrawlyTestSpider, :override_settings, fn ->
[pipelines: [Crawly.Pipelines.JSONEncoder]]
end)

on_exit(fn ->
:meck.unload()
end)

{:ok, spider_module: CrawlyTestSpider}
end

test "fetch/1 is able to fetch a given url using global config, returns a response" do
assert %HTTPoison.Response{} = Crawly.fetch("https://example.com")
end

test "fetch/2 with :with option provided returns the response, parsed_item result, and processed ParsedItems",
%{spider_module: spider_module} do
assert {%HTTPoison.Response{}, parsed_item_res, parsed_items,
pipeline_state} =
Crawly.fetch("http://example.com", with: spider_module)

assert %{
items: [_],
requests: requests
} = parsed_item_res

assert [encoded] = parsed_items
assert encoded =~ "hello"
end
end