Merge branch 'master' into 37-spider-management-start-all-spiders
Ziinc committed Nov 2, 2020
2 parents cefb9c1 + a96b1e6 commit 81b2871
Showing 17 changed files with 260 additions and 92 deletions.
2 changes: 2 additions & 0 deletions .github/FUNDING.yml
@@ -0,0 +1,2 @@
+# These are supported funding model platforms
+github: oltarasenko
5 changes: 4 additions & 1 deletion README.md
@@ -24,7 +24,7 @@ historical archival.
 # mix.exs
 defp deps do
   [
-    {:crawly, "~> 0.10.0"},
+    {:crawly, "~> 0.11.0"},
     {:floki, "~> 0.26.0"}
   ]
 end
@@ -101,6 +101,9 @@ You can read more here:
 
 The CrawlyUI project is an add-on that aims to provide an interface for managing and rapidly developing spiders.
 
+Check out the code on [GitHub](https://github.com/oltarasenko/crawly_ui)
+or try it online at [CrawlyUIDemo](http://crawlyui.com).
+
 ![](documentation/assets/main_page.png?raw=true)
 ![](documentation/assets/items_page.png?raw=true)
 ![](documentation/assets/item_with_filters.png?raw=true)
2 changes: 1 addition & 1 deletion docs/README.md
@@ -5,4 +5,4 @@ extracting structured data which can be used for a wide range of
 useful applications, like data mining, information processing or
 historical archival.
 
-**[Documentation has moved to HexDocs](https://hexdocs.pm/crawly)**
+**[Documentation has moved to HexDocs](https://hexdocs.pm/crawly/readme.html)**
5 changes: 4 additions & 1 deletion documentation/basic_concepts.md
@@ -33,7 +33,10 @@ All items are processed sequentially and are processed by Item pipelines.
 
 In order to make a working web crawler, all the behaviour callbacks need to be implemented.
 
-`init()` - a part of the Crawly.Spider behaviour. This function should return a KVList which contains a `start_urls` entry with a list, which defines the starting requests made by Crawly.
+`init()` - a part of the Crawly.Spider behaviour. This function should return a keyword list which contains a `start_urls` entry with a list, which defines the starting requests made by Crawly. Alternatively, you may provide `start_requests` if the first requests
+need to be prepared in `init()`, which might be useful if, for example, you
+want to pass a session cookie with the starting request. Note: `start_requests` are
+processed before `start_urls`.
 
 `base_url()` - defines a base_url of the given Spider. This function is used in order to filter out all requests which are going outside of the crawled website.
 
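For illustration, a spider using the new `start_requests` entry might look like the minimal sketch below. The module name, URLs, and cookie value are hypothetical, and `Crawly.Request.new/2` is assumed to accept a list of headers as its second argument:

```elixir
defmodule ExampleSpider do
  @behaviour Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://example.com"

  @impl Crawly.Spider
  def init() do
    # A prepared request can carry e.g. a session cookie;
    # such requests are processed before any start_urls.
    request =
      Crawly.Request.new(
        "https://example.com/account",
        [{"Cookie", "session=abc123"}]
      )

    [start_requests: [request], start_urls: ["https://example.com/public"]]
  end

  @impl Crawly.Spider
  def parse_item(_response) do
    %Crawly.ParsedItem{items: [], requests: []}
  end
end
```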
2 changes: 1 addition & 1 deletion documentation/tutorial.md
@@ -53,7 +53,7 @@ file with the following code:
 ```elixir
 defp deps do
   [
-    {:crawly, "~> 0.10.0"},
+    {:crawly, "~> 0.11.0"},
     {:floki, "~> 0.26.0"}
   ]
 end
108 changes: 80 additions & 28 deletions lib/crawly/engine.ex
@@ -8,20 +8,29 @@ defmodule Crawly.Engine do
 
   use GenServer
 
-  @type t :: %__MODULE__{started_spiders: started_spiders()}
+  @type t :: %__MODULE__{
+          started_spiders: started_spiders(),
+          known_spiders: [module()]
+        }
   @type started_spiders() :: %{optional(module()) => identifier()}
   @type list_spiders() :: [
           %{name: module(), state: :stopped | :started, pid: identifier()}
         ]
 
-  defstruct started_spiders: %{}
+  @type spider_info() :: %{
+          name: module(),
+          status: :stopped | :started,
+          pid: identifier() | nil
+        }
 
-  @spec start_spider(module()) ::
+  defstruct(started_spiders: %{}, known_spiders: [])
+
+  @spec start_spider(module(), binary()) ::
           :ok
           | {:error, :spider_already_started}
           | {:error, :atom}
-  def start_spider(spider_name) do
-    GenServer.call(__MODULE__, {:start_spider, spider_name})
+  def start_spider(spider_name, crawl_id \\ UUID.uuid1()) do
+    GenServer.call(__MODULE__, {:start_spider, spider_name, crawl_id})
   end
 
   @spec start_all_spiders() :: :ok
@@ -35,19 +44,23 @@ defmodule Crawly.Engine do
       :error ->
         {:error, :spider_not_found}
 
-      {:ok, pid_sup} ->
+      {:ok, {pid_sup, _job_tag}} ->
         Supervisor.which_children(pid_sup)
         |> Enum.find(&({Crawly.Manager, _, :worker, [Crawly.Manager]} = &1))
         |> case do
-          nil -> {:error, :spider_not_found}
-          {_, pid, :worker, _} -> pid
+          nil ->
+            {:error, :spider_not_found}
+
+          {_, pid, :worker, _} ->
+            pid
         end
     end
   end
 
   @spec stop_spider(module(), reason) :: result
         when reason: :itemcount_limit | :itemcount_timeout | atom(),
-             result: :ok | {:error, :spider_not_running}
+             result:
+               :ok | {:error, :spider_not_running} | {:error, :spider_not_found}
   def stop_spider(spider_name, reason \\ :ignore) do
     case Crawly.Utils.get_settings(:on_spider_closed_callback, spider_name) do
       nil -> :ignore
@@ -57,23 +70,39 @@ defmodule Crawly.Engine do
     GenServer.call(__MODULE__, {:stop_spider, spider_name})
   end
 
-  @spec list_spiders() :: list_spiders()
-  def list_spiders() do
-    GenServer.call(__MODULE__, :list_spiders)
+  @spec list_known_spiders() :: [spider_info()]
+  def list_known_spiders() do
+    GenServer.call(__MODULE__, :list_known_spiders)
   end
 
   @spec running_spiders() :: started_spiders()
   def running_spiders() do
     GenServer.call(__MODULE__, :running_spiders)
   end
 
+  @spec get_spider_info(module()) :: spider_info()
+  def get_spider_info(name) do
+    GenServer.call(__MODULE__, {:get_spider, name})
+  end
+
+  def refresh_spider_list() do
+    GenServer.cast(__MODULE__, :refresh_spider_list)
+  end
+
   def start_link() do
     GenServer.start_link(__MODULE__, [], name: __MODULE__)
   end
 
+  @spec get_crawl_id(atom()) :: {:error, :spider_not_running} | {:ok, binary()}
+  def get_crawl_id(spider_name) do
+    GenServer.call(__MODULE__, {:get_crawl_id, spider_name})
+  end
+
   @spec init(any) :: {:ok, __MODULE__.t()}
   def init(_args) do
-    {:ok, %Crawly.Engine{}}
+    spiders = get_updated_known_spider_list()
+
+    {:ok, %Crawly.Engine{known_spiders: spiders}}
   end
 
   def handle_call({:get_manager, spider_name}, _, state) do
@@ -89,15 +118,28 @@ defmodule Crawly.Engine do
     {:reply, pid, state}
   end
 
+  def handle_call({:get_crawl_id, spider_name}, _from, state) do
+    msg =
+      case Map.get(state.started_spiders, spider_name) do
+        nil ->
+          {:error, :spider_not_running}
+
+        {_pid, crawl_id} ->
+          {:ok, crawl_id}
+      end
+
+    {:reply, msg, state}
+  end
+
   def handle_call(:running_spiders, _from, state) do
     {:reply, state.started_spiders, state}
   end
 
-  def handle_call(:list_spiders, _from, state) do
-    {:reply, list_all_spider_status(state.started_spiders), state}
+  def handle_call(:list_known_spiders, _from, state) do
+    {:reply, format_spider_info(state), state}
   end
 
-  def handle_call({:start_spider, spider_name}, _form, state) do
+  def handle_call({:start_spider, spider_name, crawl_id}, _form, state) do
     result =
       case Map.get(state.started_spiders, spider_name) do
         nil ->
@@ -110,7 +152,7 @@ defmodule Crawly.Engine do
     {msg, new_started_spiders} =
       case result do
         {:ok, pid} ->
-          {:ok, Map.put(state.started_spiders, spider_name, pid)}
+          {:ok, Map.put(state.started_spiders, spider_name, {pid, crawl_id})}
 
         {:error, _} = err ->
           {err, state.started_spiders}
@@ -147,7 +189,7 @@ defmodule Crawly.Engine do
       {nil, _} ->
         {{:error, :spider_not_running}, state.started_spiders}
 
-      {pid, new_started_spiders} ->
+      {{pid, _crawl_id}, new_started_spiders} ->
         Crawly.EngineSup.stop_spider(pid)
 
         {:ok, new_started_spiders}
@@ -156,18 +198,28 @@ defmodule Crawly.Engine do
     {:reply, msg, %Crawly.Engine{state | started_spiders: new_started_spiders}}
   end
 
-  defp list_all_spider_status(started_spiders) do
-    Crawly.Utils.list_spiders()
-    |> Enum.map(fn name ->
+  def handle_cast(:refresh_spider_list, state) do
+    updated = get_updated_known_spider_list(state.known_spiders)
+    {:noreply, %Crawly.Engine{state | known_spiders: updated}}
+  end
+
+  # this function generates a spider_info map for each spider known
+  defp format_spider_info(state) do
+    Enum.map(state.known_spiders, fn s ->
+      pid = Map.get(state.started_spiders, s)
+
       %{
-        name: name,
-        state:
-          case Map.has_key?(started_spiders, name) do
-            true -> :started
-            false -> :stopped
-          end,
-        pid: Map.get(started_spiders, name)
+        name: s,
+        status: if(is_nil(pid), do: :stopped, else: :started),
+        pid: pid
       }
     end)
   end
+
+  defp get_updated_known_spider_list(known \\ []) do
+    new = Crawly.Utils.list_spiders()
+
+    (known ++ new)
+    |> Enum.dedup_by(& &1)
+  end
 end
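Taken together, the engine now tags every run with a crawl id. A rough IEx sketch of the new surface (the spider module and the crawl id string are hypothetical; `start_spider/2` falls back to a generated UUID when the id is omitted):

```elixir
# Start a spider under an explicit crawl id.
:ok = Crawly.Engine.start_spider(ExampleSpider, "crawl-2020-11-02")

# The id can be read back while the spider is running.
{:ok, "crawl-2020-11-02"} = Crawly.Engine.get_crawl_id(ExampleSpider)

# Known spiders are reported with their status.
Crawly.Engine.list_known_spiders()
#=> [%{name: ExampleSpider, status: :started, pid: ...}, ...]

:ok = Crawly.Engine.stop_spider(ExampleSpider)
```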
29 changes: 24 additions & 5 deletions lib/crawly/manager.ex
@@ -53,7 +53,7 @@ defmodule Crawly.Manager do
   @impl true
   def init(spider_name) do
     # Getting spider start urls
-    [start_urls: urls] = spider_name.init()
+    init = spider_name.init()
 
     # Start DataStorage worker
     {:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
@@ -65,10 +65,29 @@ defmodule Crawly.Manager do
 
     Process.link(request_storage_pid)
 
-    # Store start requests
-    requests = Enum.map(urls, fn url -> Crawly.Request.new(url) end)
+    # Store start urls
+    Enum.each(
+      Keyword.get(init, :start_requests, []),
+      fn
+        %Crawly.Request{} = request ->
+          Crawly.RequestsStorage.store(spider_name, request)
+
+        request ->
+          # We should not attempt to store something which is not a request
+          Logger.error(
+            "#{inspect(request)} does not seem to be a request. Ignoring."
+          )
+
+          :ignore
+      end
+    )
 
-    :ok = Crawly.RequestsStorage.store(spider_name, requests)
+    Enum.each(
+      Keyword.get(init, :start_urls, []),
+      fn url ->
+        Crawly.RequestsStorage.store(spider_name, Crawly.Request.new(url))
+      end
+    )
 
     # Start workers
     num_workers =
@@ -162,7 +181,7 @@ defmodule Crawly.Manager do
   defp maybe_stop_spider_by_itemcount_limit(_, _, _), do: :ok
 
   defp maybe_stop_spider_by_timeout(spider_name, current, limit)
-       when current < limit do
+       when current < limit and is_integer(limit) do
     Logger.info("Stopping #{inspect(spider_name)}, itemcount timeout achieved")
 
     Crawly.Engine.stop_spider(spider_name, :itemcount_timeout)
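The strengthened guard above matters because of Erlang term ordering: every number compares less than every atom, so a non-integer limit (for example, assuming the setting can be an atom such as `:disabled`) would otherwise satisfy `current < limit` and stop the spider spuriously. A minimal illustration of the assumption:

```elixir
# Numbers sort before atoms in Erlang/Elixir term ordering, so without
# is_integer/1 the clause would match even for a non-integer limit:
1000 < :disabled
#=> true
```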
9 changes: 7 additions & 2 deletions lib/crawly/pipelines/experimental/send_to_ui.ex
@@ -7,7 +7,12 @@ defmodule Crawly.Pipelines.Experimental.SendToUI do
 
   @impl Crawly.Pipeline
   def run(item, state, opts \\ []) do
-    job_tag = Map.get(state, :job_tag, UUID.uuid1())
+    job_tag =
+      Map.get_lazy(state, :job_tag, fn ->
+        {:ok, job_tag} = Crawly.Engine.get_crawl_id(state.spider_name)
+        job_tag
+      end)
+
     spider_name = state.spider_name |> Atom.to_string()
 
     case Keyword.get(opts, :ui_node) do
@@ -20,7 +25,7 @@ defmodule Crawly.Pipelines.Experimental.SendToUI do
       ui_node ->
         :rpc.cast(ui_node, CrawlyUI, :store_item, [
           spider_name,
-          item,
+          item,
           job_tag,
           Node.self() |> to_string()
         ])
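For context, the pipeline now falls back to the engine's crawl id when the pipeline state carries no `:job_tag`. It is enabled through the pipelines setting; a hedged config sketch (the node name is hypothetical):

```elixir
# config/config.exs
config :crawly,
  pipelines: [
    {Crawly.Pipelines.Experimental.SendToUI, ui_node: :"ui@127.0.0.1"}
  ]
```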
4 changes: 2 additions & 2 deletions lib/crawly/spider.ex
@@ -3,7 +3,7 @@ defmodule Crawly.Spider do
   A behavior module for implementing a Crawly Spider
   A Spider is a module which is responsible for defining:
-  1. `init/0` function, which must return a keyword list with start_urls list
+  1. `init/0` function, which must return a keyword list with start_urls/start_requests list
   2. `base_url/0` function responsible for filtering out requests not related to
      a given website
   3. `parse_item/1` function which is responsible for parsing the downloaded
@@ -15,7 +15,7 @@ defmodule Crawly.Spider do
   global settings defined in the config.
   """
 
-  @callback init() :: [start_urls: list()]
+  @callback init() :: [start_urls: list(), start_requests: list()]
 
   @callback base_url() :: binary()
 
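Both keys in the updated callback appear to be optional, so either return shape below should satisfy the behaviour (module names and URLs are placeholders):

```elixir
defmodule UrlsOnlySpider do
  # URL-only form of init/0:
  def init(), do: [start_urls: ["https://example.com/"]]
end

defmodule PreparedRequestsSpider do
  # Prepared-request form; these are processed before any start_urls:
  def init(), do: [start_requests: [Crawly.Request.new("https://example.com/")]]
end
```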
4 changes: 2 additions & 2 deletions lib/crawly/utils.ex
@@ -166,8 +166,8 @@ defmodule Crawly.Utils do
         acc
       end
     rescue
-      error ->
-        Logger.debug("Could not get behaviour information for: #{inspect(error)}")
+      _error ->
+        # Just ignore the case, as probably the given module is not a Spider
         acc
     end
   end
2 changes: 1 addition & 1 deletion mix.exs
@@ -1,7 +1,7 @@
 defmodule Crawly.Mixfile do
   use Mix.Project
 
-  @version "0.10.0"
+  @version "0.11.0"
 
   def project do
     [