diff --git a/README.md b/README.md
index 45ac207d..0f4c8167 100644
--- a/README.md
+++ b/README.md
@@ -146,6 +146,19 @@ of asynchronous elements (for example parts loaded by AJAX).
You can read more here:
- [Browser Rendering](https://hexdocs.pm/crawly/basic_concepts.html#browser-rendering)
+## Simple management UI (New in 0.15.0)
+Crawly provides a simple management UI, served by default at `localhost:4001`
+
+It allows you to:
+ - Start spiders
+ - Stop spiders
+ - Preview scheduled requests
+ - Preview items extracted so far (requires adding the
+   `Crawly.Pipelines.Experimental.Preview` item pipeline to enable the preview)
+
+![Crawly Management UI](docs/crawly_ui.gif)
+
+
## Experimental UI
The CrawlyUI project is an add-on that aims to provide an interface for managing and rapidly developing spiders.
diff --git a/config/test.exs b/config/test.exs
index d1d679e9..5cef0c0d 100644
--- a/config/test.exs
+++ b/config/test.exs
@@ -16,6 +16,7 @@ config :crawly,
pipelines: [
{Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
+ Crawly.Pipelines.Experimental.Preview,
Crawly.Pipelines.JSONEncoder
],
retry: [
diff --git a/docs/crawly_ui.gif b/docs/crawly_ui.gif
new file mode 100644
index 00000000..aff362ff
Binary files /dev/null and b/docs/crawly_ui.gif differ
diff --git a/lib/crawly/api.ex b/lib/crawly/api.ex
index 39610ab5..5359c651 100644
--- a/lib/crawly/api.ex
+++ b/lib/crawly/api.ex
@@ -8,6 +8,52 @@ defmodule Crawly.API.Router do
plug(:match)
plug(:dispatch)
+ # Simple UI for crawly management
+ get "/" do
+ running_spiders = Crawly.Engine.running_spiders()
+
+ spiders_list =
+ Enum.map(
+ Crawly.list_spiders(),
+ fn spider ->
+ state =
+ case Map.get(running_spiders, spider) do
+ {_pid, _job_id} -> :running
+ nil -> :idle
+ end
+
+ spider_name =
+ spider
+ |> Atom.to_string()
+ |> String.replace_leading("Elixir.", "")
+
+ {scraped, scheduled} =
+ case state == :running do
+ false ->
+ {" - ", " - "}
+
+ true ->
+ {:stored_items, num} = Crawly.DataStorage.stats(spider)
+
+ {:stored_requests, scheduled} =
+ Crawly.RequestsStorage.stats(spider)
+
+ {num, scheduled}
+ end
+
+ %{
+ name: spider_name,
+ scheduled: scheduled,
+ scraped: scraped,
+ state: state
+ }
+ end
+ )
+
+ response = render_template("list.html.eex", data: spiders_list)
+ send_resp(conn, 200, response)
+ end
+
get "/spiders" do
msg =
case Crawly.Engine.running_spiders() do
@@ -21,6 +67,73 @@ defmodule Crawly.API.Router do
send_resp(conn, 200, msg)
end
+ get "/spiders/:spider_name/requests" do
+ spider_name = String.to_atom("Elixir.#{spider_name}")
+
+ result =
+ case Crawly.RequestsStorage.requests(spider_name) do
+ {:requests, result} ->
+ Enum.map(result, fn req ->
+ %{url: req.url, headers: inspect(req.headers)}
+ end)
+
+ {:error, _} ->
+ []
+ end
+
+ response =
+ render_template("requests_list.html.eex",
+ requests: result,
+ spider_name: spider_name
+ )
+
+ send_resp(conn, 200, response)
+ end
+
+ get "/spiders/:spider_name/items" do
+ pipelines = Application.get_env(:crawly, :pipelines)
+
+ preview_enabled? =
+ Enum.any?(
+ pipelines,
+ fn
+ Crawly.Pipelines.Experimental.Preview -> true
+ {Crawly.Pipelines.Experimental.Preview, _} -> true
+ _ -> false
+ end
+ )
+
+ spider_name = String.to_atom("Elixir.#{spider_name}")
+
+    # The Preview item pipeline stores items under the module-named field below;
+    # we read them back here via Crawly.DataStorage.inspect/2
+ items_preview_field = :"Elixir.Crawly.Pipelines.Experimental.Preview"
+
+ result =
+ case Crawly.DataStorage.inspect(spider_name, items_preview_field) do
+ {:inspect, nil} ->
+ []
+
+ {:inspect, result} ->
+ result
+
+ {:error, _} ->
+ []
+
+ nil ->
+ []
+ end
+
+ response =
+ render_template("items_list.html.eex",
+ items: result,
+ preview_enabled?: preview_enabled?,
+ spider_name: spider_name
+ )
+
+ send_resp(conn, 200, response)
+ end
+
get "/spiders/:spider_name/schedule" do
spider_name = String.to_atom("Elixir.#{spider_name}")
result = Crawly.Engine.start_spider(spider_name)
@@ -78,4 +191,13 @@ defmodule Crawly.API.Router do
match _ do
send_resp(conn, 404, "Oops! Page not found!")
end
+
+ defp render_template(template_name, assigns) do
+ base_dir = :code.priv_dir(:crawly)
+ template = Path.join(base_dir, template_name)
+ rendered_template = EEx.eval_file(template, assigns)
+
+ base_template = Path.join(base_dir, "index.html.eex")
+ EEx.eval_file(base_template, rendered_template: rendered_template)
+ end
end
diff --git a/lib/crawly/data_storage/data_storage.ex b/lib/crawly/data_storage/data_storage.ex
index 383e38e4..cf0f7785 100644
--- a/lib/crawly/data_storage/data_storage.ex
+++ b/lib/crawly/data_storage/data_storage.ex
@@ -42,6 +42,12 @@ defmodule Crawly.DataStorage do
GenServer.call(__MODULE__, {:stats, spider})
end
+ @spec inspect(atom(), term()) ::
+ {:error, :data_storage_worker_not_running} | term()
+ def inspect(spider, field) do
+ GenServer.call(__MODULE__, {:inspect, spider, field})
+ end
+
def start_link([]) do
Logger.debug("Starting data storage")
@@ -104,6 +110,19 @@ defmodule Crawly.DataStorage do
{:reply, msg, state}
end
+ def handle_call({:inspect, spider_name, field}, _from, state) do
+ msg =
+ case Map.get(state.workers, spider_name) do
+ nil ->
+ {:error, :data_storage_worker_not_running}
+
+ pid ->
+ Crawly.DataStorage.Worker.inspect(pid, field)
+ end
+
+ {:reply, msg, state}
+ end
+
# Clean up worker
def handle_info({:DOWN, _ref, :process, pid, _}, state) do
spider_name = Map.get(state.pid_spiders, pid)
diff --git a/lib/crawly/data_storage/data_storage_worker.ex b/lib/crawly/data_storage/data_storage_worker.ex
index 665e8a50..c3426c1f 100644
--- a/lib/crawly/data_storage/data_storage_worker.ex
+++ b/lib/crawly/data_storage/data_storage_worker.ex
@@ -31,6 +31,14 @@ defmodule Crawly.DataStorage.Worker do
GenServer.cast(pid, {:store, item})
end
+ @doc """
+ Inspect the inner state of the given data worker
+ """
+ @spec inspect(pid, atom()) :: term()
+ def inspect(pid, field) do
+ GenServer.call(pid, {:inspect, field})
+ end
+
def init(spider_name: spider_name, crawl_id: crawl_id) do
Logger.metadata(spider_name: spider_name, crawl_id: crawl_id)
{:ok, %Worker{spider_name: spider_name, crawl_id: crawl_id}}
@@ -52,6 +60,11 @@ defmodule Crawly.DataStorage.Worker do
{:noreply, state}
end
+ def handle_call({:inspect, field}, _from, state) do
+ msg = {:inspect, Map.get(state, field, nil)}
+ {:reply, msg, state}
+ end
+
def handle_call(:stats, _from, state) do
{:reply, {:stored_items, state.stored_items}, state}
end
diff --git a/lib/crawly/pipelines/experimental/preview.ex b/lib/crawly/pipelines/experimental/preview.ex
new file mode 100644
index 00000000..495958cc
--- /dev/null
+++ b/lib/crawly/pipelines/experimental/preview.ex
@@ -0,0 +1,45 @@
+defmodule Crawly.Pipelines.Experimental.Preview do
+ @moduledoc """
+  Allows previewing items extracted by the spider so far
+
+ Stores previewable items under 'Elixir.Crawly.Pipelines.Experimental.Preview'
+
+ ### Options
+  - `limit`, (optional, if not provided 20 is used) - restricts the number of items visible in preview
+
+  It's better to place this pipeline before the CSV/JSON converters.
+
+ ### Example usage in Crawly config
+ ```
+ pipelines: [
+ {Crawly.Pipelines.Experimental.Preview, limit: 10},
+
+  # As you can see, we're using data transformers afterwards
+ Crawly.Pipelines.JSONEncoder,
+ {Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"}
+ ]
+ ```
+ """
+ @behaviour Crawly.Pipeline
+
+ # Restrict the number of items stored in state of the worker
+ @limit 20
+
+ require Logger
+
+ @impl Crawly.Pipeline
+ def run(item, state, opts \\ []) do
+ preview = Map.get(state, __MODULE__, [])
+ limit = Keyword.get(opts, :limit, @limit)
+
+ case Enum.count(preview) >= limit do
+ true ->
+ {item, state}
+
+ false ->
+ new_preview = [item | preview]
+ new_state = Map.put(state, __MODULE__, new_preview)
+ {item, new_state}
+ end
+ end
+end
diff --git a/lib/crawly/requests_storage/requests_storage.ex b/lib/crawly/requests_storage/requests_storage.ex
index ce99a91e..9026929f 100644
--- a/lib/crawly/requests_storage/requests_storage.ex
+++ b/lib/crawly/requests_storage/requests_storage.ex
@@ -72,6 +72,12 @@ defmodule Crawly.RequestsStorage do
GenServer.call(__MODULE__, {:stats, spider_name})
end
+ @spec requests(atom()) ::
+ {:requests, [Crawly.Request.t()]} | {:error, :spider_not_running}
+ def requests(spider_name) do
+ GenServer.call(__MODULE__, {:requests, spider_name})
+ end
+
@doc """
Starts a worker for a given spider
"""
@@ -130,6 +136,19 @@ defmodule Crawly.RequestsStorage do
{:reply, msg, state}
end
+ def handle_call({:requests, spider_name}, _from, state) do
+ msg =
+ case Map.get(state.workers, spider_name) do
+ nil ->
+ {:error, :storage_worker_not_running}
+
+ pid ->
+ Crawly.RequestsStorage.Worker.requests(pid)
+ end
+
+ {:reply, msg, state}
+ end
+
def handle_call({:start_worker, spider_name, crawl_id}, _from, state) do
{msg, new_state} =
case Map.get(state.workers, spider_name) do
diff --git a/lib/crawly/requests_storage/requests_storage_worker.ex b/lib/crawly/requests_storage/requests_storage_worker.ex
index ba1ddab7..8a793d5a 100644
--- a/lib/crawly/requests_storage/requests_storage_worker.ex
+++ b/lib/crawly/requests_storage/requests_storage_worker.ex
@@ -43,6 +43,12 @@ defmodule Crawly.RequestsStorage.Worker do
do_call(pid, :stats)
end
+ @doc """
+  Returns all scheduled requests (used for the requests preview in the management UI)
+ """
+ @spec requests(pid()) :: {:requests, [Crawly.Request.t()]}
+ def requests(pid), do: do_call(pid, :requests)
+
def start_link(spider_name, crawl_id) do
GenServer.start_link(__MODULE__, [spider_name, crawl_id])
end
@@ -81,6 +87,10 @@ defmodule Crawly.RequestsStorage.Worker do
{:reply, {:stored_requests, state.count}, state}
end
+ def handle_call(:requests, _from, state) do
+ {:reply, {:requests, state.requests}, state}
+ end
+
defp do_call(pid, command) do
GenServer.call(pid, command)
catch
diff --git a/priv/index.html.eex b/priv/index.html.eex
new file mode 100644
index 00000000..a9f0b7e8
--- /dev/null
+++ b/priv/index.html.eex
@@ -0,0 +1,112 @@
+
+
+
+
+
+
+
+
+
+
+ <%= rendered_template %>
+
+
diff --git a/priv/items_list.html.eex b/priv/items_list.html.eex
new file mode 100644
index 00000000..24ad178f
--- /dev/null
+++ b/priv/items_list.html.eex
@@ -0,0 +1,38 @@
+
+
+
+ <%= if not preview_enabled? do %>
+ Items previewer requires the Crawly.Pipelines.Experimental.Preview
+ item pipeline in your config
+ <% end %>
+
+
+
+
+
+
+ Extracted items: <%= spider_name %>
+ Back
+
+
+
+ item |
+
+
+ <%= for item <- items do %>
+
+
+
+ <%= for {key, value} <- item do %>
+ - <%= key %>: <%= value %>
+ <% end %>
+
+ |
+
+
+ <% end %>
+
+
+
+
+
diff --git a/priv/list.html.eex b/priv/list.html.eex
new file mode 100644
index 00000000..0946a294
--- /dev/null
+++ b/priv/list.html.eex
@@ -0,0 +1,33 @@
+
diff --git a/priv/requests_list.html.eex b/priv/requests_list.html.eex
new file mode 100644
index 00000000..a1375277
--- /dev/null
+++ b/priv/requests_list.html.eex
@@ -0,0 +1,25 @@
+
+
+
+
+
+ List of scheduled requests for: <%= spider_name %>
+ Back
+
+
+
+ url |
+ headers |
+
+ <%= for req <- requests do %>
+
+ <%= req.url %> |
+ <%= req.headers %> |
+
+ <% end %>
+
+
+
+
+
+
diff --git a/test/api_test.exs b/test/api_test.exs
index acb8d223..c77f50ec 100644
--- a/test/api_test.exs
+++ b/test/api_test.exs
@@ -4,6 +4,14 @@ defmodule APITest do
@opts Crawly.API.Router.init([])
+ setup do
+ on_exit(fn ->
+ :get
+ |> conn("/spiders/TestSpider/stop", "")
+ |> Crawly.API.Router.call(@opts)
+ end)
+ end
+
test "returns welcome" do
conn =
:get
@@ -30,4 +38,36 @@ defmodule APITest do
assert conn.resp_body == "Stopped!"
end
+
+ test "It's possible to get preview page" do
+ conn =
+ :get
+ |> conn("/spiders/TestSpider/schedule", "")
+ |> Crawly.API.Router.call(@opts)
+
+ assert conn.resp_body == "Started!"
+
+ conn =
+ :get
+ |> conn("/spiders/TestSpider/items", "")
+ |> Crawly.API.Router.call(@opts)
+
+ assert conn.status == 200
+ end
+
+ test "It's possible to get requests preview page" do
+ conn =
+ :get
+ |> conn("/spiders/TestSpider/schedule", "")
+ |> Crawly.API.Router.call(@opts)
+
+ assert conn.resp_body == "Started!"
+
+ conn =
+ :get
+ |> conn("/spiders/TestSpider/requests", "")
+ |> Crawly.API.Router.call(@opts)
+
+ assert conn.status == 200
+ end
end
diff --git a/test/data_storage_worker_test.exs b/test/data_storage_worker_test.exs
index 5197fe1c..0d0cdbfd 100644
--- a/test/data_storage_worker_test.exs
+++ b/test/data_storage_worker_test.exs
@@ -53,4 +53,9 @@ defmodule DataStorageWorkerTest do
result = Crawly.DataStorage.stats(:unknown)
assert result == {:error, :data_storage_worker_not_running}
end
+
+ test "Can inspect data storage worker state", context do
+ result = Crawly.DataStorage.inspect(context.crawler, :crawl_id)
+ assert {:inspect, "123"} == result
+ end
end
diff --git a/test/pipelines/experimental/preview_test.exs b/test/pipelines/experimental/preview_test.exs
new file mode 100644
index 00000000..01f38941
--- /dev/null
+++ b/test/pipelines/experimental/preview_test.exs
@@ -0,0 +1,29 @@
+defmodule Pipelines.PreviewTest do
+ use ExUnit.Case, async: false
+
+ @item %{first: "some", second: "data"}
+ @state %{spider_name: Test, crawl_id: "test"}
+
+ test "Preview items are stored in state" do
+ pipelines = [{Crawly.Pipelines.Experimental.Preview}]
+
+ {item, state} = Crawly.Utils.pipe(pipelines, @item, @state)
+
+ assert assert item == @item
+
+ preview = Map.get(state, :"Elixir.Crawly.Pipelines.Experimental.Preview")
+ assert [@item] == preview
+ end
+
+ test "It's possible to resrtict number of stored items" do
+ pipelines = [{Crawly.Pipelines.Experimental.Preview, limit: 2}]
+
+ # Checking what happens if we try to store 3 items
+ {_item, state0} = Crawly.Utils.pipe(pipelines, @item, @state)
+ {_item, state1} = Crawly.Utils.pipe(pipelines, @item, state0)
+ {_item, state2} = Crawly.Utils.pipe(pipelines, @item, state1)
+
+ preview = Map.get(state2, :"Elixir.Crawly.Pipelines.Experimental.Preview")
+ assert Enum.count(preview) == 2
+ end
+end
diff --git a/test/request_storage_test.exs b/test/request_storage_test.exs
index b8422311..4891a2eb 100644
--- a/test/request_storage_test.exs
+++ b/test/request_storage_test.exs
@@ -109,4 +109,26 @@ defmodule RequestStorageTest do
{:stored_requests, num} = Crawly.RequestsStorage.stats(context.crawler)
assert 0 == num
end
+
+ test "Can get requests list from the requests storage", context do
+ request = %Crawly.Request{
+ url: "http://example.com",
+ headers: [],
+ options: []
+ }
+
+ :ok = Crawly.RequestsStorage.store(context.crawler, request)
+
+ {:requests, [stored_request]} =
+ Crawly.RequestsStorage.requests(context.crawler)
+
+ assert request == stored_request
+ end
+
+ test "Getting requests list from the requests storage if nothing is there",
+ context do
+ {:requests, req_lists} = Crawly.RequestsStorage.requests(context.crawler)
+
+ assert req_lists == []
+ end
end