Simple management UI #241

Merged: 5 commits, Mar 16, 2023
13 changes: 13 additions & 0 deletions README.md
@@ -146,6 +146,19 @@ of asynchronous elements (for example parts loaded by AJAX).
You can read more here:
- [Browser Rendering](https://hexdocs.pm/crawly/basic_concepts.html#browser-rendering)

## Simple management UI (New in 0.15.0)
Crawly provides a simple management UI, available by default at `localhost:4001`.

It allows you to:
- Start spiders
- Stop spiders
- Preview scheduled requests
- Preview items extracted so far (the `Crawly.Pipelines.Experimental.Preview` item
  pipeline must be added to enable the items preview; see the config sketch below)

![Crawly Management UI](docs/crawly_ui.gif)
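
A minimal config sketch for enabling the items preview. The `limit` value and the `JSONEncoder` entry are illustrative; `config/test.exs` in this diff adds the pipeline without options.

```elixir
config :crawly,
  pipelines: [
    # Keep the preview pipeline before encoders so raw item maps are stored.
    {Crawly.Pipelines.Experimental.Preview, limit: 10},
    Crawly.Pipelines.JSONEncoder
  ]
```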


## Experimental UI

The CrawlyUI project is an add-on that aims to provide an interface for managing and rapidly developing spiders.
1 change: 1 addition & 0 deletions config/test.exs
@@ -16,6 +16,7 @@ config :crawly,
pipelines: [
{Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.Experimental.Preview,
Crawly.Pipelines.JSONEncoder
],
retry: [
Binary file added docs/crawly_ui.gif
122 changes: 122 additions & 0 deletions lib/crawly/api.ex
@@ -8,6 +8,52 @@ defmodule Crawly.API.Router do
plug(:match)
plug(:dispatch)

# Simple UI for Crawly management
get "/" do
running_spiders = Crawly.Engine.running_spiders()

spiders_list =
Enum.map(
Crawly.list_spiders(),
fn spider ->
state =
case Map.get(running_spiders, spider) do
{_pid, _job_id} -> :running
nil -> :idle
end

spider_name =
spider
|> Atom.to_string()
|> String.replace_leading("Elixir.", "")

{scraped, scheduled} =
case state == :running do
false ->
{" - ", " - "}

true ->
{:stored_items, num} = Crawly.DataStorage.stats(spider)

{:stored_requests, scheduled} =
Crawly.RequestsStorage.stats(spider)

{num, scheduled}
end

%{
name: spider_name,
scheduled: scheduled,
scraped: scraped,
state: state
}
end
)

response = render_template("list.html.eex", data: spiders_list)
send_resp(conn, 200, response)
end

get "/spiders" do
msg =
case Crawly.Engine.running_spiders() do
@@ -21,6 +67,73 @@ defmodule Crawly.API.Router do
send_resp(conn, 200, msg)
end

get "/spiders/:spider_name/requests" do
spider_name = String.to_atom("Elixir.#{spider_name}")

result =
case Crawly.RequestsStorage.requests(spider_name) do
{:requests, result} ->
Enum.map(result, fn req ->
%{url: req.url, headers: inspect(req.headers)}
end)

{:error, _} ->
[]
end

response =
render_template("requests_list.html.eex",
requests: result,
spider_name: spider_name
)

send_resp(conn, 200, response)
end

get "/spiders/:spider_name/items" do
pipelines = Application.get_env(:crawly, :pipelines)

preview_enabled? =
Enum.any?(
pipelines,
fn
Crawly.Pipelines.Experimental.Preview -> true
{Crawly.Pipelines.Experimental.Preview, _} -> true
_ -> false
end
)

spider_name = String.to_atom("Elixir.#{spider_name}")

# The Preview item pipeline stores items under the field below;
# use the inspect function to fetch them here
items_preview_field = :"Elixir.Crawly.Pipelines.Experimental.Preview"

result =
case Crawly.DataStorage.inspect(spider_name, items_preview_field) do
{:inspect, nil} ->
[]

{:inspect, result} ->
result

{:error, _} ->
[]

nil ->
[]
end

response =
render_template("items_list.html.eex",
items: result,
preview_enabled?: preview_enabled?,
spider_name: spider_name
)

send_resp(conn, 200, response)
end

get "/spiders/:spider_name/schedule" do
spider_name = String.to_atom("Elixir.#{spider_name}")
result = Crawly.Engine.start_spider(spider_name)
@@ -78,4 +191,13 @@ defmodule Crawly.API.Router do
match _ do
send_resp(conn, 404, "Oops! Page not found!")
end

defp render_template(template_name, assigns) do
base_dir = :code.priv_dir(:crawly)
template = Path.join(base_dir, template_name)
rendered_template = EEx.eval_file(template, assigns)

base_template = Path.join(base_dir, "index.html.eex")
EEx.eval_file(base_template, rendered_template: rendered_template)
end
end
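
A rough smoke-test sketch for the routes added above, assuming the application is running locally on the default port 4001 and that a spider module named `MySpider` exists (both names are assumptions for illustration):

```elixir
# Uses OTP's built-in :httpc client, so no extra dependencies are needed.
:inets.start()

base = 'http://localhost:4001'

# Render the management UI (the "/" route).
{:ok, {{_, 200, _}, _headers, _body}} = :httpc.request(:get, {base ++ '/', []}, [], [])

# Schedule a spider, then preview its scheduled requests and extracted items.
:httpc.request(:get, {base ++ '/spiders/MySpider/schedule', []}, [], [])
:httpc.request(:get, {base ++ '/spiders/MySpider/requests', []}, [], [])
:httpc.request(:get, {base ++ '/spiders/MySpider/items', []}, [], [])
```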
19 changes: 19 additions & 0 deletions lib/crawly/data_storage/data_storage.ex
@@ -42,6 +42,12 @@ defmodule Crawly.DataStorage do
GenServer.call(__MODULE__, {:stats, spider})
end

@doc """
Inspects a given field of the data storage worker state for a given spider
"""
@spec inspect(atom(), term()) ::
{:error, :data_storage_worker_not_running} | term()
def inspect(spider, field) do
GenServer.call(__MODULE__, {:inspect, spider, field})
end

def start_link([]) do
Logger.debug("Starting data storage")

@@ -104,6 +110,19 @@
{:reply, msg, state}
end

def handle_call({:inspect, spider_name, field}, _from, state) do
msg =
case Map.get(state.workers, spider_name) do
nil ->
{:error, :data_storage_worker_not_running}

pid ->
Crawly.DataStorage.Worker.inspect(pid, field)
end

{:reply, msg, state}
end

# Clean up worker
def handle_info({:DOWN, _ref, :process, pid, _}, state) do
spider_name = Map.get(state.pid_spiders, pid)
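
A hypothetical call sketch for the new `inspect/2` API. The `MySpider` module and the returned items are made up; it assumes the spider is running with the Preview pipeline enabled.

```elixir
# The Preview pipeline stores items under its module name, so the same atom is used as the field.
Crawly.DataStorage.inspect(MySpider, Crawly.Pipelines.Experimental.Preview)
# => {:inspect, [%{title: "Example", url: "https://example.com"}]}

# When no data storage worker is running for the spider:
# => {:error, :data_storage_worker_not_running}
```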
13 changes: 13 additions & 0 deletions lib/crawly/data_storage/data_storage_worker.ex
@@ -31,6 +31,14 @@ defmodule Crawly.DataStorage.Worker do
GenServer.cast(pid, {:store, item})
end

@doc """
Inspect the inner state of the given data worker
"""
@spec inspect(pid, atom()) :: term()
def inspect(pid, field) do
GenServer.call(pid, {:inspect, field})
end

def init(spider_name: spider_name, crawl_id: crawl_id) do
Logger.metadata(spider_name: spider_name, crawl_id: crawl_id)
{:ok, %Worker{spider_name: spider_name, crawl_id: crawl_id}}
@@ -52,6 +60,11 @@
{:noreply, state}
end

def handle_call({:inspect, field}, _from, state) do
msg = {:inspect, Map.get(state, field, nil)}
{:reply, msg, state}
end

def handle_call(:stats, _from, state) do
{:reply, {:stored_items, state.stored_items}, state}
end
45 changes: 45 additions & 0 deletions lib/crawly/pipelines/experimental/preview.ex
@@ -0,0 +1,45 @@
defmodule Crawly.Pipelines.Experimental.Preview do
@moduledoc """
Allows previewing the items extracted by the spider so far.

Stores previewable items in the worker state under the 'Elixir.Crawly.Pipelines.Experimental.Preview' key.

### Options
- `limit` (optional, defaults to 20) - restricts the number of items visible in the preview

It is best placed before the CSV/JSON converters, so the preview stores raw item maps.

### Example usage in Crawly config
```
pipelines: [
{Crawly.Pipelines.Experimental.Preview, limit: 10},

# Data transformation pipelines follow afterwards
Crawly.Pipelines.JSONEncoder,
{Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"}
]
```
"""
@behaviour Crawly.Pipeline

# Default limit on the number of items stored in the worker state
@limit 20

require Logger

@impl Crawly.Pipeline
def run(item, state, opts \\ []) do
preview = Map.get(state, __MODULE__, [])
limit = Keyword.get(opts, :limit, @limit)

case Enum.count(preview) >= limit do
true ->
{item, state}

false ->
new_preview = [item | preview]
new_state = Map.put(state, __MODULE__, new_preview)
{item, new_state}
end
end
end
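
A small sketch of how `run/3` accumulates previews in the worker state (the item maps are made up for illustration):

```elixir
alias Crawly.Pipelines.Experimental.Preview

{_item, state} = Preview.run(%{title: "First"}, %{}, limit: 2)
# state now holds [%{title: "First"}] under the Crawly.Pipelines.Experimental.Preview key.

{_item, state} = Preview.run(%{title: "Second"}, state, limit: 2)
{_item, state} = Preview.run(%{title: "Third"}, state, limit: 2)
# Once the limit of 2 is reached, further items pass through unchanged and are not
# added to the preview, so the state still holds only the first two items.
```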
19 changes: 19 additions & 0 deletions lib/crawly/requests_storage/requests_storage.ex
@@ -72,6 +72,12 @@ defmodule Crawly.RequestsStorage do
GenServer.call(__MODULE__, {:stats, spider_name})
end

@doc """
Returns all scheduled requests for a given spider (used for the requests preview)
"""
@spec requests(atom()) ::
{:requests, [Crawly.Request.t()]} | {:error, :storage_worker_not_running}
def requests(spider_name) do
GenServer.call(__MODULE__, {:requests, spider_name})
end

@doc """
Starts a worker for a given spider
"""
Expand Down Expand Up @@ -130,6 +136,19 @@ defmodule Crawly.RequestsStorage do
{:reply, msg, state}
end

def handle_call({:requests, spider_name}, _from, state) do
msg =
case Map.get(state.workers, spider_name) do
nil ->
{:error, :storage_worker_not_running}

pid ->
Crawly.RequestsStorage.Worker.requests(pid)
end

{:reply, msg, state}
end

def handle_call({:start_worker, spider_name, crawl_id}, _from, state) do
{msg, new_state} =
case Map.get(state.workers, spider_name) do
10 changes: 10 additions & 0 deletions lib/crawly/requests_storage/requests_storage_worker.ex
@@ -43,6 +43,12 @@ defmodule Crawly.RequestsStorage.Worker do
do_call(pid, :stats)
end

@doc """
Returns all scheduled requests (used for the requests preview in the management UI)
"""
@spec requests(pid()) :: {:requests, [Crawly.Request.t()]}
def requests(pid), do: do_call(pid, :requests)

def start_link(spider_name, crawl_id) do
GenServer.start_link(__MODULE__, [spider_name, crawl_id])
end
@@ -81,6 +87,10 @@
{:reply, {:stored_requests, state.count}, state}
end

def handle_call(:requests, _from, state) do
{:reply, {:requests, state.requests}, state}
end

defp do_call(pid, command) do
GenServer.call(pid, command)
catch