Allow previewing items
1. Extend DataStorage with an inspect function that allows inspecting worker state
2. Add a Preview item pipeline that stores extracted items as part of the
   DataStorage state
3. Update the HTML UI so it's possible to navigate to the preview
oltarasenko committed Mar 15, 2023
1 parent a5d34e7 commit 17c89fc
Showing 9 changed files with 188 additions and 2 deletions.
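
With this change, item preview is enabled by adding the experimental pipeline to the Crawly config. A minimal sketch, based on the example from the new module's docs below (the `limit` value and file paths are illustrative):

```
# config/config.exs
config :crawly,
  pipelines: [
    # Keeps a copy of the first 10 extracted items in the worker state
    {Crawly.Pipelines.Experimental.Preview, limit: 10},
    # Data transformers come afterwards
    Crawly.Pipelines.JSONEncoder,
    {Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"}
  ]
```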
38 changes: 38 additions & 0 deletions lib/crawly/api.ex
@@ -90,6 +90,44 @@ defmodule Crawly.API.Router do
send_resp(conn, 200, response)
end

get "/spiders/:spider_name/items" do
pipelines = Application.get_env(:crawly, :pipelines)

preview_enabled? =
Enum.any?(
pipelines,
fn
Crawly.Pipelines.Experimental.Preview -> true
{Crawly.Pipelines.Experimental.Preview, _} -> true
_ -> false
end
)

spider_name = String.to_atom("Elixir.#{spider_name}")

# The Preview item pipeline stores items under the field below;
# use the inspect function to fetch them here
items_preview_field = :"Elixir.Crawly.Pipelines.Experimental.Preview"

result =
case Crawly.DataStorage.inspect(spider_name, items_preview_field) do
{:inspect, result} ->
result

{:error, _} ->
[]
end

response =
render_template("items_list.html.eex",
items: result,
preview_enabled?: preview_enabled?,
spider_name: spider_name
)

send_resp(conn, 200, response)
end

get "/spiders/:spider_name/schedule" do
spider_name = String.to_atom("Elixir.#{spider_name}")
result = Crawly.Engine.start_spider(spider_name)
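
The new route can be exercised against a running Crawly node. A hypothetical sketch using Erlang's built-in `:httpc` (the spider module name `MySpider` and the management port 4001 are assumptions):

```
# The route calls String.to_atom("Elixir.#{spider_name}"), so the URL
# carries the spider module name without the Elixir. prefix
:inets.start()

{:ok, {{_, 200, _}, _headers, body}} =
  :httpc.request(:get, {~c"http://localhost:4001/spiders/MySpider/items", []}, [], [])
```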
19 changes: 19 additions & 0 deletions lib/crawly/data_storage/data_storage.ex
@@ -42,6 +42,12 @@ defmodule Crawly.DataStorage do
GenServer.call(__MODULE__, {:stats, spider})
end

@spec inspect(atom(), term()) ::
{:error, :data_storage_worker_not_running} | term()
def inspect(spider, field) do
GenServer.call(__MODULE__, {:inspect, spider, field})
end

def start_link([]) do
Logger.debug("Starting data storage")

@@ -104,6 +110,19 @@
{:reply, msg, state}
end

def handle_call({:inspect, spider_name, field}, _from, state) do
msg =
case Map.get(state.workers, spider_name) do
nil ->
{:error, :data_storage_worker_not_running}

pid ->
Crawly.DataStorage.Worker.inspect(pid, field)
end

{:reply, msg, state}
end

# Clean up worker
def handle_info({:DOWN, _ref, :process, pid, _}, state) do
spider_name = Map.get(state.pid_spiders, pid)
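
Taken together, the new call has two return shapes, sketched here as a hypothetical IEx session (the spider module names are assumptions; the preview field is the pipeline module atom itself):

```
# Worker running for the spider: returns the requested state field
iex> Crawly.DataStorage.inspect(MySpider, Crawly.Pipelines.Experimental.Preview)
{:inspect, [%{title: "..."}]}

# No worker for the spider: returns an error tuple
iex> Crawly.DataStorage.inspect(UnknownSpider, Crawly.Pipelines.Experimental.Preview)
{:error, :data_storage_worker_not_running}
```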
13 changes: 13 additions & 0 deletions lib/crawly/data_storage/data_storage_worker.ex
@@ -31,6 +31,14 @@ defmodule Crawly.DataStorage.Worker do
GenServer.cast(pid, {:store, item})
end

@doc """
Inspects the inner state of the given data storage worker
"""
@spec inspect(pid, atom()) :: term()
def inspect(pid, field) do
GenServer.call(pid, {:inspect, field})
end

def init(spider_name: spider_name, crawl_id: crawl_id) do
Logger.metadata(spider_name: spider_name, crawl_id: crawl_id)
{:ok, %Worker{spider_name: spider_name, crawl_id: crawl_id}}
@@ -52,6 +60,11 @@
{:noreply, state}
end

def handle_call({:inspect, field}, _from, state) do
msg = {:inspect, Map.get(state, field)}
{:reply, msg, state}
end

def handle_call(:stats, _from, state) do
{:reply, {:stored_items, state.stored_items}, state}
end
45 changes: 45 additions & 0 deletions lib/crawly/pipelines/experimental/preview.ex
@@ -0,0 +1,45 @@
defmodule Crawly.Pipelines.Experimental.Preview do
@moduledoc """
Allows previewing items extracted by the spider so far.

Stores previewable items under the `Crawly.Pipelines.Experimental.Preview` key
of the pipeline state.

### Options
- `limit` (optional, defaults to 20) - restricts the number of items visible in the preview

It's best to place this pipeline before data transformers such as the CSV/JSON converters.

### Example usage in Crawly config
```
pipelines: [
  {Crawly.Pipelines.Experimental.Preview, limit: 10},
  # Data transformers are used afterwards
  Crawly.Pipelines.JSONEncoder,
  {Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"}
]
```
"""
@behaviour Crawly.Pipeline

# Default limit on the number of items stored in the worker's state
@limit 20

require Logger

@impl Crawly.Pipeline
def run(item, state, opts \\ []) do
preview = Map.get(state, __MODULE__, [])
limit = Keyword.get(opts, :limit, @limit)

case Enum.count(preview) >= limit do
true ->
{item, state}

false ->
new_preview = [item | preview]
new_state = Map.put(state, __MODULE__, new_preview)
{item, new_state}
end
end
end
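
Because `run/3` is a pure function over the pipeline state, its limiting behaviour can be sketched with standalone calls (outside a real worker; the item maps are illustrative):

```
state = %{}

# The first two items are stored newest-first; the third is dropped by limit: 2
{_, state} = Crawly.Pipelines.Experimental.Preview.run(%{id: 1}, state, limit: 2)
{_, state} = Crawly.Pipelines.Experimental.Preview.run(%{id: 2}, state, limit: 2)
{_, state} = Crawly.Pipelines.Experimental.Preview.run(%{id: 3}, state, limit: 2)

Map.get(state, Crawly.Pipelines.Experimental.Preview)
# => [%{id: 2}, %{id: 1}]
```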
1 change: 0 additions & 1 deletion priv/index.html.eex
@@ -1,7 +1,6 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="refresh" content="10">
<script type="text/javascript">
function get(name, link) {
var xhttp = new XMLHttpRequest();
38 changes: 38 additions & 0 deletions priv/items_list.html.eex
@@ -0,0 +1,38 @@
<div class="row">
<div id="status">
<div>
<%= if not preview_enabled? do %>
The items previewer requires the Crawly.Pipelines.Experimental.Preview
item pipeline in your config.
<% end %>
</div>

</div>
<div class="leftcolumn">
<div class="card">
<h3>
Extracted items: <%= spider_name %>
<a href="/">Back</a>
</h3>
<table>
<tr>
<th>item</th>
</tr>

<%= for item <- items do %>
<tr>
<td>
<ul>
<%= for {key, value} <- item do %>
<li><%= key %>: <%= value %></li>
<% end %>
</ul>
</td>

</tr>
<% end %>
</table>
</div>
</div>
<div class="rightcolumn">
</div>
</div>
2 changes: 1 addition & 1 deletion priv/list.html.eex
@@ -15,7 +15,7 @@
<tr>
<td><%= spider.name %></td>
<td><%= spider.state %></td>
<td> <%= spider.scraped %> </td>
<td><a href="/spiders/<%= spider.name %>/items"><%= spider.scraped %></a></td>
<td><a href="/spiders/<%= spider.name %>/requests"><%= spider.scheduled %></a></td>
<%= if spider.state == :idle do %>
<td> <input type = "button" onclick = "schedule('<%= spider.name %>')" value = "Schedule"> </td>
5 changes: 5 additions & 0 deletions test/data_storage_worker_test.exs
@@ -53,4 +53,9 @@ defmodule DataStorageWorkerTest do
result = Crawly.DataStorage.stats(:unknown)
assert result == {:error, :data_storage_worker_not_running}
end

test "Can inspect data storage worker state", context do
result = Crawly.DataStorage.inspect(context.crawler, :crawl_id)
assert {:inspect, "123"} == result
end
end
29 changes: 29 additions & 0 deletions test/pipelines/experimental/preview_test.exs
@@ -0,0 +1,29 @@
defmodule Pipelines.PreviewTest do
use ExUnit.Case, async: false

@item %{first: "some", second: "data"}
@state %{spider_name: Test, crawl_id: "test"}

test "Preview items are stored in state" do
pipelines = [{Crawly.Pipelines.Experimental.Preview}]

{item, state} = Crawly.Utils.pipe(pipelines, @item, @state)

assert item == @item

preview = Map.get(state, :"Elixir.Crawly.Pipelines.Experimental.Preview")
assert [@item] == preview
end

test "It's possible to resrtict number of stored items" do
pipelines = [{Crawly.Pipelines.Experimental.Preview, limit: 2}]

# Checking what happens if we try to store 3 items
{_item, state0} = Crawly.Utils.pipe(pipelines, @item, @state)
{_item, state1} = Crawly.Utils.pipe(pipelines, @item, state0)
{_item, state2} = Crawly.Utils.pipe(pipelines, @item, state1)

preview = Map.get(state2, :"Elixir.Crawly.Pipelines.Experimental.Preview")
assert Enum.count(preview) == 2
end
end
