diff --git a/README.md b/README.md
index 45ac207d..0f4c8167 100644
--- a/README.md
+++ b/README.md
@@ -146,6 +146,19 @@
 of asynchronous elements (for example parts loaded by AJAX). You can
 read more here:
 - [Browser Rendering](https://hexdocs.pm/crawly/basic_concepts.html#browser-rendering)
+## Simple management UI (New in 0.15.0)
+Crawly provides a simple management UI, available by default on `localhost:4001`.
+
+It allows you to:
+ - Start spiders
+ - Stop spiders
+ - Preview scheduled requests
+ - Preview items extracted so far (the
+   `Crawly.Pipelines.Experimental.Preview` item pipeline must be added to
+   enable the items preview)
+
+![Crawly Management UI](docs/crawly_ui.gif)
+
+
 ## Experimental UI
 The CrawlyUI project is an add-on that aims to provide an interface
 for managing and rapidly developing spiders.
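The items preview called out above is opt-in: the preview pipeline has to be present in the `:pipelines` list. A minimal config sketch (the file path and the second pipeline are illustrative, mirroring the pipeline's own docs):

```elixir
# config/config.exs (illustrative) -- keep the preview pipeline before
# encoders so raw items, not encoded strings, are stored for the UI.
config :crawly,
  pipelines: [
    {Crawly.Pipelines.Experimental.Preview, limit: 10},
    Crawly.Pipelines.JSONEncoder
  ]
```

The test configuration in the next file does the same, using the pipeline without options.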
diff --git a/config/test.exs b/config/test.exs
index d1d679e9..5cef0c0d 100644
--- a/config/test.exs
+++ b/config/test.exs
@@ -16,6 +16,7 @@ config :crawly,
   pipelines: [
     {Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
     {Crawly.Pipelines.DuplicatesFilter, item_id: :title},
+    Crawly.Pipelines.Experimental.Preview,
     Crawly.Pipelines.JSONEncoder
   ],
   retry: [
diff --git a/docs/crawly_ui.gif b/docs/crawly_ui.gif
new file mode 100644
index 00000000..aff362ff
Binary files /dev/null and b/docs/crawly_ui.gif differ
diff --git a/lib/crawly/api.ex b/lib/crawly/api.ex
index 39610ab5..5359c651 100644
--- a/lib/crawly/api.ex
+++ b/lib/crawly/api.ex
@@ -8,6 +8,52 @@ defmodule Crawly.API.Router do
   plug(:match)
   plug(:dispatch)
 
+  # Simple UI for Crawly management
+  get "/" do
+    running_spiders = Crawly.Engine.running_spiders()
+
+    spiders_list =
+      Enum.map(
+        Crawly.list_spiders(),
+        fn spider ->
+          state =
+            case Map.get(running_spiders, spider) do
+              {_pid, _job_id} -> :running
+              nil -> :idle
+            end
+
+          spider_name =
+            spider
+            |> Atom.to_string()
+            |> String.replace_leading("Elixir.", "")
+
+          {scraped, scheduled} =
+            case state == :running do
+              false ->
+                {" - ", " - "}
+
+              true ->
+                {:stored_items, num} = Crawly.DataStorage.stats(spider)
+
+                {:stored_requests, scheduled} =
+                  Crawly.RequestsStorage.stats(spider)
+
+                {num, scheduled}
+            end
+
+          %{
+            name: spider_name,
+            scheduled: scheduled,
+            scraped: scraped,
+            state: state
+          }
+        end
+      )
+
+    response = render_template("list.html.eex", data: spiders_list)
+    send_resp(conn, 200, response)
+  end
+
   get "/spiders" do
     msg =
       case Crawly.Engine.running_spiders() do
@@ -21,6 +67,73 @@ defmodule Crawly.API.Router do
     send_resp(conn, 200, msg)
   end
 
+  get "/spiders/:spider_name/requests" do
+    spider_name = String.to_atom("Elixir.#{spider_name}")
+
+    result =
+      case Crawly.RequestsStorage.requests(spider_name) do
+        {:requests, result} ->
+          Enum.map(result, fn req ->
+            %{url: req.url, headers: inspect(req.headers)}
+          end)
+
+        {:error, _} ->
+          []
+      end
+
+    response =
+      render_template("requests_list.html.eex",
+        requests: result,
+        spider_name: spider_name
+      )
+
+    send_resp(conn, 200, response)
+  end
+
+  get "/spiders/:spider_name/items" do
+    pipelines = Application.get_env(:crawly, :pipelines)
+
+    preview_enabled? =
+      Enum.any?(
+        pipelines,
+        fn
+          Crawly.Pipelines.Experimental.Preview -> true
+          {Crawly.Pipelines.Experimental.Preview, _} -> true
+          _ -> false
+        end
+      )
+
+    spider_name = String.to_atom("Elixir.#{spider_name}")
+
+    # The preview item pipeline stores items under the field below;
+    # the inspect function is used to fetch them
+    items_preview_field = :"Elixir.Crawly.Pipelines.Experimental.Preview"
+
+    result =
+      case Crawly.DataStorage.inspect(spider_name, items_preview_field) do
+        {:inspect, nil} ->
+          []
+
+        {:inspect, result} ->
+          result
+
+        {:error, _} ->
+          []
+
+        nil ->
+          []
+      end
+
+    response =
+      render_template("items_list.html.eex",
+        items: result,
+        preview_enabled?: preview_enabled?,
+        spider_name: spider_name
+      )
+
+    send_resp(conn, 200, response)
+  end
+
   get "/spiders/:spider_name/schedule" do
     spider_name = String.to_atom("Elixir.#{spider_name}")
     result = Crawly.Engine.start_spider(spider_name)
@@ -78,4 +191,13 @@ defmodule Crawly.API.Router do
   match _ do
     send_resp(conn, 404, "Oops! Page not found!")
   end
+
+  defp render_template(template_name, assigns) do
+    base_dir = :code.priv_dir(:crawly)
+    template = Path.join(base_dir, template_name)
+    rendered_template = EEx.eval_file(template, assigns)
+
+    base_template = Path.join(base_dir, "index.html.eex")
+    EEx.eval_file(base_template, rendered_template: rendered_template)
+  end
 end
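Because the management UI is served by the same Plug router as the existing HTTP API, its pages can be exercised through `Plug.Test`, exactly as the API tests near the end of this diff do. A minimal sketch, assuming a hypothetical compiled spider module `MySpider`:

```elixir
# Drive the new management routes through the router directly.
import Plug.Test

opts = Crawly.API.Router.init([])

# Dashboard listing all known spiders with their state
conn = :get |> conn("/") |> Crawly.API.Router.call(opts)
200 = conn.status

# Per-spider previews of scheduled requests and extracted items
:get |> conn("/spiders/MySpider/requests") |> Crawly.API.Router.call(opts)
:get |> conn("/spiders/MySpider/items") |> Crawly.API.Router.call(opts)
```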
diff --git a/lib/crawly/data_storage/data_storage.ex b/lib/crawly/data_storage/data_storage.ex
index 383e38e4..cf0f7785 100644
--- a/lib/crawly/data_storage/data_storage.ex
+++ b/lib/crawly/data_storage/data_storage.ex
@@ -42,6 +42,12 @@ defmodule Crawly.DataStorage do
     GenServer.call(__MODULE__, {:stats, spider})
   end
 
+  @spec inspect(atom(), term()) ::
+          {:error, :data_storage_worker_not_running} | term()
+  def inspect(spider, field) do
+    GenServer.call(__MODULE__, {:inspect, spider, field})
+  end
+
   def start_link([]) do
     Logger.debug("Starting data storage")
 
@@ -104,6 +110,19 @@ defmodule Crawly.DataStorage do
     {:reply, msg, state}
   end
 
+  def handle_call({:inspect, spider_name, field}, _from, state) do
+    msg =
+      case Map.get(state.workers, spider_name) do
+        nil ->
+          {:error, :data_storage_worker_not_running}
+
+        pid ->
+          Crawly.DataStorage.Worker.inspect(pid, field)
+      end
+
+    {:reply, msg, state}
+  end
+
   # Clean up worker
   def handle_info({:DOWN, _ref, :process, pid, _}, state) do
     spider_name = Map.get(state.pid_spiders, pid)
diff --git a/lib/crawly/data_storage/data_storage_worker.ex b/lib/crawly/data_storage/data_storage_worker.ex
index 665e8a50..c3426c1f 100644
--- a/lib/crawly/data_storage/data_storage_worker.ex
+++ b/lib/crawly/data_storage/data_storage_worker.ex
@@ -31,6 +31,14 @@ defmodule Crawly.DataStorage.Worker do
     GenServer.cast(pid, {:store, item})
   end
 
+  @doc """
+  Inspects a given field of the inner state of the data storage worker
+  """
+  @spec inspect(pid, atom()) :: term()
+  def inspect(pid, field) do
+    GenServer.call(pid, {:inspect, field})
+  end
+
   def init(spider_name: spider_name, crawl_id: crawl_id) do
     Logger.metadata(spider_name: spider_name, crawl_id: crawl_id)
     {:ok, %Worker{spider_name: spider_name, crawl_id: crawl_id}}
@@ -52,6 +60,11 @@ defmodule Crawly.DataStorage.Worker do
     {:noreply, state}
   end
 
+  def handle_call({:inspect, field}, _from, state) do
+    msg = {:inspect, Map.get(state, field, nil)}
+    {:reply, msg, state}
+  end
+
   def handle_call(:stats, _from, state) do
     {:reply, {:stored_items, state.stored_items}, state}
   end
diff --git a/lib/crawly/pipelines/experimental/preview.ex b/lib/crawly/pipelines/experimental/preview.ex
new file mode 100644
index 00000000..495958cc
--- /dev/null
+++ b/lib/crawly/pipelines/experimental/preview.ex
@@ -0,0 +1,45 @@
+defmodule Crawly.Pipelines.Experimental.Preview do
+  @moduledoc """
+  Allows previewing the items extracted by a spider so far
+
+  Stores previewable items under 'Elixir.Crawly.Pipelines.Experimental.Preview'
+
+  ### Options
+  - `limit` (optional, defaults to 20 when not provided) - restricts the
+    number of items visible in the preview
+
+  It's best placed before the CSV/JSON converters, so that raw items are
+  stored rather than encoded strings.
+
+  ### Example usage in Crawly config
+  ```
+  pipelines: [
+    {Crawly.Pipelines.Experimental.Preview, limit: 10},
+
+    # Note that the data transformers come afterwards
+    Crawly.Pipelines.JSONEncoder,
+    {Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"}
+  ]
+  ```
+  """
+  @behaviour Crawly.Pipeline
+
+  # Default limit for the number of items kept in the worker's state
+  @limit 20
+
+  require Logger
+
+  @impl Crawly.Pipeline
+  def run(item, state, opts \\ []) do
+    preview = Map.get(state, __MODULE__, [])
+    limit = Keyword.get(opts, :limit, @limit)
+
+    case Enum.count(preview) >= limit do
+      true ->
+        {item, state}
+
+      false ->
+        new_preview = [item | preview]
+        new_state = Map.put(state, __MODULE__, new_preview)
+        {item, new_state}
+    end
+  end
+end
diff --git a/lib/crawly/requests_storage/requests_storage.ex b/lib/crawly/requests_storage/requests_storage.ex
index ce99a91e..9026929f 100644
--- a/lib/crawly/requests_storage/requests_storage.ex
+++ b/lib/crawly/requests_storage/requests_storage.ex
@@ -72,6 +72,12 @@ defmodule Crawly.RequestsStorage do
     GenServer.call(__MODULE__, {:stats, spider_name})
   end
 
+  @spec requests(atom()) ::
+          {:requests, [Crawly.Request.t()]}
+          | {:error, :storage_worker_not_running}
+  def requests(spider_name) do
+    GenServer.call(__MODULE__, {:requests, spider_name})
+  end
+
   @doc """
   Starts a worker for a given spider
   """
@@ -130,6 +136,19 @@
     {:reply, msg, state}
   end
 
+  def handle_call({:requests, spider_name}, _from, state) do
+    msg =
+      case Map.get(state.workers, spider_name) do
+        nil ->
+          {:error, :storage_worker_not_running}
+
+        pid ->
+          Crawly.RequestsStorage.Worker.requests(pid)
+      end
+
+    {:reply, msg, state}
+  end
+
   def handle_call({:start_worker, spider_name, crawl_id}, _from, state) do
     {msg, new_state} =
       case Map.get(state.workers, spider_name) do
diff --git a/lib/crawly/requests_storage/requests_storage_worker.ex b/lib/crawly/requests_storage/requests_storage_worker.ex
index ba1ddab7..8a793d5a 100644
--- a/lib/crawly/requests_storage/requests_storage_worker.ex
+++ b/lib/crawly/requests_storage/requests_storage_worker.ex
@@ -43,6 +43,12 @@ defmodule Crawly.RequestsStorage.Worker do
     do_call(pid, :stats)
   end
 
+  @doc """
+  Returns all scheduled requests (used by the requests preview)
+  """
+  @spec requests(pid()) :: {:requests, [Crawly.Request.t()]}
+  def requests(pid), do: do_call(pid, :requests)
+
   def start_link(spider_name, crawl_id) do
     GenServer.start_link(__MODULE__, [spider_name, crawl_id])
   end
@@ -81,6 +87,10 @@
     {:reply, {:stored_requests, state.count}, state}
   end
 
+  def handle_call(:requests, _from, state) do
+    {:reply, {:requests, state.requests}, state}
+  end
+
   defp do_call(pid, command) do
     GenServer.call(pid, command)
   catch
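These storage additions are what the router calls under the hood: `Crawly.RequestsStorage.requests/1` feeds the requests preview, and `Crawly.DataStorage.inspect/2` fetches the field that the preview pipeline writes. A short iex sketch, again assuming a hypothetical running spider `MySpider`:

```elixir
# Both calls only succeed while the spider is running; otherwise the
# storages reply with {:error, ...} tuples.
{:requests, requests} = Crawly.RequestsStorage.requests(MySpider)

# The preview pipeline stores items under its own module name, so the
# module atom doubles as the field key:
{:inspect, items} =
  Crawly.DataStorage.inspect(MySpider, Crawly.Pipelines.Experimental.Preview)
```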
diff --git a/priv/index.html.eex b/priv/index.html.eex
new file mode 100644
index 00000000..a9f0b7e8
--- /dev/null
+++ b/priv/index.html.eex
@@ -0,0 +1,112 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <!-- head markup (meta tags, styles) not recoverable from this diff -->
+  </head>
+  <body>
+    <header>
+      Crawly Management Tool
+    </header>
+    <%= rendered_template %>
+  </body>
+</html>
diff --git a/priv/items_list.html.eex b/priv/items_list.html.eex
new file mode 100644
index 00000000..24ad178f
--- /dev/null
+++ b/priv/items_list.html.eex
@@ -0,0 +1,38 @@
+<div>
+  <%= if not preview_enabled? do %>
+    <p>
+      Items previewer requires the Crawly.Pipelines.Experimental.Preview
+      item pipeline in your config
+    </p>
+  <% end %>
+
+  <h2>
+    Extracted items: <%= spider_name %>
+    <a href="/">Back</a>
+  </h2>
+
+  <table>
+    <tr>
+      <th>item</th>
+    </tr>
+    <%= for item <- items do %>
+      <tr>
+        <td>
+          <%= for {key, value} <- item do %>
+            &nbsp;• <%= key %>: <%= value %><br/>
+          <% end %>
+        </td>
+      </tr>
+    <% end %>
+  </table>
+</div>
diff --git a/priv/list.html.eex b/priv/list.html.eex
new file mode 100644
index 00000000..0946a294
--- /dev/null
+++ b/priv/list.html.eex
@@ -0,0 +1,33 @@
+<div>
+  <h2>Spiders</h2>
+  <table>
+    <tr>
+      <th>Spider name</th>
+      <th>State</th>
+      <th>Items scraped</th>
+      <th>Scheduled Requests</th>
+      <th>Command</th>
+    </tr>
+    <%= for spider <- data do %>
+      <tr>
+        <td><%= spider.name %></td>
+        <td><%= spider.state %></td>
+        <td><%= spider.scraped %></td>
+        <td><%= spider.scheduled %></td>
+        <%= if spider.state == :idle do %>
+          <td><a href="/spiders/<%= spider.name %>/schedule">Schedule</a></td>
+        <% else %>
+          <td><a href="/spiders/<%= spider.name %>/stop">Stop</a></td>
+        <% end %>
+      </tr>
+    <% end %>
+  </table>
+</div>
diff --git a/priv/requests_list.html.eex b/priv/requests_list.html.eex
new file mode 100644
index 00000000..a1375277
--- /dev/null
+++ b/priv/requests_list.html.eex
@@ -0,0 +1,25 @@
+<div>
+  <h2>
+    List of scheduled requests for: <%= spider_name %>
+    <a href="/">Back</a>
+  </h2>
+  <table>
+    <tr>
+      <th>url</th>
+      <th>headers</th>
+    </tr>
+    <%= for req <- requests do %>
+      <tr>
+        <td><%= req.url %></td>
+        <td><%= req.headers %></td>
+      </tr>
+    <% end %>
+  </table>
+</div>
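The templates above compose in two levels: `render_template/2` in the router evaluates a page template first, then injects the result into `index.html.eex` through the `rendered_template` binding. A standalone sketch of the same mechanism:

```elixir
# Two-level EEx composition, as performed by render_template/2.
base_dir = :code.priv_dir(:crawly)

# Render the inner page first (an empty spider list, for illustration)...
page =
  base_dir
  |> Path.join("list.html.eex")
  |> EEx.eval_file(data: [])

# ...then wrap it in the base layout.
base_dir
|> Path.join("index.html.eex")
|> EEx.eval_file(rendered_template: page)
```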
diff --git a/test/api_test.exs b/test/api_test.exs
index acb8d223..c77f50ec 100644
--- a/test/api_test.exs
+++ b/test/api_test.exs
@@ -4,6 +4,14 @@ defmodule APITest do
 
   @opts Crawly.API.Router.init([])
 
+  setup do
+    on_exit(fn ->
+      :get
+      |> conn("/spiders/TestSpider/stop", "")
+      |> Crawly.API.Router.call(@opts)
+    end)
+  end
+
   test "returns welcome" do
     conn =
       :get
@@ -30,4 +38,36 @@
 
     assert conn.resp_body == "Stopped!"
   end
+
+  test "It's possible to get the items preview page" do
+    conn =
+      :get
+      |> conn("/spiders/TestSpider/schedule", "")
+      |> Crawly.API.Router.call(@opts)
+
+    assert conn.resp_body == "Started!"
+
+    conn =
+      :get
+      |> conn("/spiders/TestSpider/items", "")
+      |> Crawly.API.Router.call(@opts)
+
+    assert conn.status == 200
+  end
+
+  test "It's possible to get the requests preview page" do
+    conn =
+      :get
+      |> conn("/spiders/TestSpider/schedule", "")
+      |> Crawly.API.Router.call(@opts)
+
+    assert conn.resp_body == "Started!"
+
+    conn =
+      :get
+      |> conn("/spiders/TestSpider/requests", "")
+      |> Crawly.API.Router.call(@opts)
+
+    assert conn.status == 200
+  end
 end
diff --git a/test/data_storage_worker_test.exs b/test/data_storage_worker_test.exs
index 5197fe1c..0d0cdbfd 100644
--- a/test/data_storage_worker_test.exs
+++ b/test/data_storage_worker_test.exs
@@ -53,4 +53,9 @@ defmodule DataStorageWorkerTest do
     result = Crawly.DataStorage.stats(:unknown)
     assert result == {:error, :data_storage_worker_not_running}
   end
+
+  test "Can inspect data storage worker state", context do
+    result = Crawly.DataStorage.inspect(context.crawler, :crawl_id)
+    assert {:inspect, "123"} == result
+  end
 end
diff --git a/test/pipelines/experimental/preview_test.exs b/test/pipelines/experimental/preview_test.exs
new file mode 100644
index 00000000..01f38941
--- /dev/null
+++ b/test/pipelines/experimental/preview_test.exs
@@ -0,0 +1,29 @@
+defmodule Pipelines.PreviewTest do
+  use ExUnit.Case, async: false
+
+  @item %{first: "some", second: "data"}
+  @state %{spider_name: Test, crawl_id: "test"}
+
+  test "Preview items are stored in state" do
+    pipelines = [{Crawly.Pipelines.Experimental.Preview}]
+
+    {item, state} = Crawly.Utils.pipe(pipelines, @item, @state)
+
+    assert item == @item
+
+    preview = Map.get(state, :"Elixir.Crawly.Pipelines.Experimental.Preview")
+    assert [@item] == preview
+  end
+
+  test "It's possible to restrict the number of stored items" do
+    pipelines = [{Crawly.Pipelines.Experimental.Preview, limit: 2}]
+
+    # Checking what happens if we try to store 3 items
+    {_item, state0} = Crawly.Utils.pipe(pipelines, @item, @state)
+    {_item, state1} = Crawly.Utils.pipe(pipelines, @item, state0)
+    {_item, state2} = Crawly.Utils.pipe(pipelines, @item, state1)
+
+    preview = Map.get(state2, :"Elixir.Crawly.Pipelines.Experimental.Preview")
+    assert Enum.count(preview) == 2
+  end
+end
diff --git a/test/request_storage_test.exs b/test/request_storage_test.exs
index b8422311..4891a2eb 100644
--- a/test/request_storage_test.exs
+++ b/test/request_storage_test.exs
@@ -109,4 +109,26 @@ defmodule RequestStorageTest do
     {:stored_requests, num} = Crawly.RequestsStorage.stats(context.crawler)
     assert 0 == num
   end
+
+  test "Can get requests list from the requests storage", context do
+    request = %Crawly.Request{
+      url: "http://example.com",
+      headers: [],
+      options: []
+    }
+
+    :ok = Crawly.RequestsStorage.store(context.crawler, request)
+
+    {:requests, [stored_request]} =
+      Crawly.RequestsStorage.requests(context.crawler)
+
+    assert request == stored_request
+  end
+
+  test "Getting requests list from the requests storage returns an empty list if nothing is stored",
+       context do
+    {:requests, req_lists} = Crawly.RequestsStorage.requests(context.crawler)
+
+    assert req_lists == []
+  end
 end