
Improve crawly management homepage
1. Allow viewing the logs of a given spider (direct access to the log file)
2. Allow viewing the items of a given spider (direct access to the items file)
oltarasenko committed Mar 31, 2023
1 parent b3d1db0 commit 0c1f41f
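
The two new routes serve a crawl's raw log file and items file directly. A minimal sketch of how they might be queried from Elixir, assuming the management API is reachable on localhost:4001 and using made-up spider and crawl id values (none of these names come from this commit):

    # Fetch the per-crawl log file (hypothetical spider name and crawl id)
    {:ok, %HTTPoison.Response{status_code: 200, body: log}} =
      HTTPoison.get("http://localhost:4001/spiders/BooksSpider/logs/example_crawl_id")

    # Fetch the items file written by the WriteToFile pipeline for the same crawl
    {:ok, %HTTPoison.Response{status_code: 200, body: items}} =
      HTTPoison.get("http://localhost:4001/spiders/BooksSpider/items/example_crawl_id")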
Showing 15 changed files with 211 additions and 205 deletions.
12 changes: 9 additions & 3 deletions config/config.exs
@@ -2,7 +2,12 @@
# and its dependencies with the aid of the Mix.Config module.
import Config

config :logger,
backends: [:console, {LoggerFileBackend, :info_log}]

config :crawly,
log_dir: "/tmp/spider_logs",
log_to_file: true,
fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
retry: [
retry_codes: [400],
@@ -29,9 +34,10 @@ config :crawly,
]}
],
pipelines: [
{Crawly.Pipelines.Validate, fields: [:title, :author, :time, :url]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder
{Crawly.Pipelines.Validate, fields: ["title", "body", "url"]},
{Crawly.Pipelines.DuplicatesFilter, item_id: "title"},
Crawly.Pipelines.JSONEncoder,
{Crawly.Pipelines.WriteToFile, folder: "/tmp", extension: "jl"}
]

import_config "#{Mix.env()}.exs"
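
A named LoggerFileBackend such as :info_log above conventionally takes its path and level from a separate :logger entry; a minimal sketch under the assumption that info-level output should also go to a fixed file (the path is illustrative and not part of this commit; the per-crawl :debug backend is configured at runtime in Crawly.Engine below):

    # Hypothetical companion entry for the :info_log backend registered above
    config :logger, :info_log,
      path: "/tmp/crawly_info.log",
      level: :info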
1 change: 0 additions & 1 deletion config/test.exs
@@ -16,7 +16,6 @@ config :crawly,
pipelines: [
{Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.Experimental.Preview,
Crawly.Pipelines.JSONEncoder
],
retry: [
88 changes: 42 additions & 46 deletions lib/crawly/api.ex
@@ -56,10 +56,10 @@ defmodule Crawly.API.Router do
Enum.map(
Crawly.list_spiders(),
fn spider ->
state =
{crawl_id, state} =
case Map.get(running_spiders, spider) do
{_pid, _job_id} -> :running
nil -> :idle
{_pid, crawl_id} -> {crawl_id, :running}
nil -> {nil, :idle}
end

spider_name =
@@ -90,6 +90,7 @@

%{
name: spider_name,
crawl_id: crawl_id,
scheduled: scheduled,
scraped: scraped,
state: state,
@@ -196,67 +197,62 @@
send_resp(conn, 200, msg)
end

get "/spiders/:spider_name/requests" do
spider_name = String.to_atom("Elixir.#{spider_name}")
get "/spiders/:spider_name/logs/:crawl_id" do
spider_name = String.to_existing_atom(spider_name)
log_file_path = Crawly.Utils.spider_log_path(spider_name, crawl_id)

result =
case Crawly.RequestsStorage.requests(spider_name) do
{:requests, result} ->
Enum.map(result, fn req ->
%{url: req.url, headers: inspect(req.headers)}
end)
case File.exists?(log_file_path) do
true -> Plug.Conn.send_file(conn, 200, log_file_path)
false -> send_resp(conn, 404, "Oops! Page not found!")
end
end

get "/spiders/:spider_name/items/:crawl_id" do
folder =
Application.get_env(:crawly, :pipelines, [])
|> Keyword.get(Crawly.Pipelines.WriteToFile, [])
|> Keyword.get(:folder, "")

file_paths =
case File.ls(folder) do
{:ok, list} ->
Enum.filter(list, fn path -> String.contains?(path, crawl_id) end)

{:error, _} ->
[]
end

response =
render_template("requests_list.html.eex",
requests: result,
spider_name: spider_name
)
case file_paths do
[] ->
send_resp(conn, 404, "Oops! Page not found!")

send_resp(conn, 200, response)
end
[file_path] ->
full_path = Path.join([folder, file_path])
Plug.Conn.send_file(conn, 200, full_path)

get "/spiders/:spider_name/items" do
pipelines = Application.get_env(:crawly, :pipelines)

preview_enabled? =
Enum.any?(
pipelines,
fn
Crawly.Pipelines.Experimental.Preview -> true
{Crawly.Pipelines.Experimental.Preview, _} -> true
_ -> false
end
)
other ->
Logger.error("Could not get correct items file: #{inspect(other)}")
send_resp(conn, 500, "Unexpected error")
end
end

get "/spiders/:spider_name/requests" do
spider_name = String.to_atom("Elixir.#{spider_name}")

# According to the preview item pipeline we store items under the field below
# use inspect function to get items here
items_preview_field = :"Elixir.Crawly.Pipelines.Experimental.Preview"

result =
case Crawly.DataStorage.inspect(spider_name, items_preview_field) do
{:inspect, nil} ->
[]

{:inspect, result} ->
result
case Crawly.RequestsStorage.requests(spider_name) do
{:requests, result} ->
Enum.map(result, fn req ->
%{url: req.url, headers: inspect(req.headers)}
end)

{:error, _} ->
[]

nil ->
[]
end

response =
render_template("items_list.html.eex",
items: result,
preview_enabled?: preview_enabled?,
render_template("requests_list.html.eex",
requests: result,
spider_name: spider_name
)

20 changes: 2 additions & 18 deletions lib/crawly/engine.ex
@@ -261,27 +261,11 @@ defmodule Crawly.Engine do
end

defp configure_spider_logs(spider_name, crawl_id) do
log_dir =
Crawly.Utils.get_settings(
:log_dir,
spider_name,
System.tmp_dir()
)

current_unix_timestamp = :os.system_time(:second)

log_file_path = Crawly.Utils.spider_log_path(spider_name, crawl_id)
Logger.add_backend({LoggerFileBackend, :debug})

log_file_path =
Path.join([
log_dir,
inspect(spider_name),
# underscore separates the timestamp and the crawl_id
inspect(current_unix_timestamp) <> "_" <> crawl_id
]) <> ".log"

Logger.configure_backend({LoggerFileBackend, :debug},
path: log_file_path,
path: Crawly.Utils.spider_log_path(spider_name, crawl_id),
level: :debug,
metadata_filter: [crawl_id: crawl_id]
)
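
The metadata_filter option above makes the per-crawl file backend keep only log entries whose metadata carries the matching crawl_id. A minimal sketch of how a worker process would tag its output so it ends up in that file (illustrative, not part of this diff):

    # Attach the crawl id to this process's Logger metadata once;
    # subsequent log calls are then routed to the per-crawl log file.
    Logger.metadata(crawl_id: crawl_id)
    Logger.debug("Fetched #{request.url}")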
45 changes: 0 additions & 45 deletions lib/crawly/pipelines/experimental/preview.ex

This file was deleted.

6 changes: 4 additions & 2 deletions lib/crawly/pipelines/write_to_file.ex
@@ -69,20 +69,22 @@ defmodule Crawly.Pipelines.WriteToFile do

:ok = maybe_create_folder(folder)

# Use crawl_id in filename from now on to identify crawls
crawl_id = Map.get(state, :crawl_id, "no_crawl_id")
extension = Map.get(opts, :extension, "jl")

filename =
case Map.get(opts, :include_timestamp, false) do
false ->
"#{inspect(state.spider_name)}.#{extension}"
"#{inspect(state.spider_name)}_#{crawl_id}.#{extension}"

true ->
ts_string =
NaiveDateTime.utc_now()
|> NaiveDateTime.to_string()
|> String.replace(~r/( |-|:|\.)/, "_")

"#{inspect(state.spider_name)}_#{ts_string}.#{extension}"
"#{inspect(state.spider_name)}_#{ts_string}_#{crawl_id}.#{extension}"
end

fd = open_fd(folder, filename)
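
With the crawl id embedded in the file name, the new /spiders/:spider_name/items/:crawl_id route can later locate the file by matching on the crawl id. An illustrative iex session (spider module and crawl id are made up):

    iex> crawl_id = "example_crawl_id"
    iex> "#{inspect(MySpider)}_#{crawl_id}.jl"
    "MySpider_example_crawl_id.jl"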
42 changes: 42 additions & 0 deletions lib/crawly/utils.ex
@@ -336,6 +336,48 @@ defmodule Crawly.Utils do
Code.eval_string(template)
end

@doc """
Composes the log file path for a given spider and crawl ID.
Args:
spider_name (atom): The name of the spider to create the log path for.
crawl_id (string): The ID of the crawl to create the log path for.
Returns:
string: The file path to the log file for the given spider and crawl ID.
Examples:
iex> spider_log_path(:my_spider, "crawl_123")
"/tmp/crawly/my_spider/crawl_123.log"
iex> spider_log_path(:my_spider, "crawl_456")
"/tmp/crawly/my_spider/crawl_456.log"
"""
@spec spider_log_path(spider_name, crawl_id) :: path
when spider_name: atom(),
crawl_id: String.t(),
path: String.t()
def spider_log_path(spider_name, crawl_id) do
spider_name_str =
case Atom.to_string(spider_name) do
"Elixir." <> name_str -> name_str
name_str -> name_str
end

log_dir =
Crawly.Utils.get_settings(
:log_dir,
spider_name,
System.tmp_dir()
)

Path.join([
log_dir,
spider_name_str,
crawl_id
]) <> ".log"
end

##############################################################################
# Private functions
##############################################################################
38 changes: 0 additions & 38 deletions priv/items_list.html.eex

This file was deleted.

23 changes: 17 additions & 6 deletions priv/list.html.eex
@@ -3,23 +3,34 @@
<div class="leftcolumn">
<div class="card">
<h2>Spiders <p floa></p></h2>


<table>
<tr>
<th>Spider name</td>
<th>State</td>
<th>Items scraped</td>
<th>Scheduled Requests</td>
<th>Scraped Items</td>
<th>Log</td>
<th>Command</td>
<th>Modify</td>
</tr>
<%= for spider <- data do %>
<tr>
<td><%= spider.name %></td>
<td><%= spider.state %></td>
<td><a href="/spiders/<%= spider.name %>/items" ><%= spider.scraped %></a> </td>
<td><a href="/spiders/<%= spider.name %>/requests" ><%= spider.scheduled %></td>
<td><%= spider.state %> </td>
<td>
<a href="/spiders/<%= spider.name %>/requests" ><%= spider.scheduled %>
</td>
<td>
<a href="/spiders/<%= spider.name %>/items/<%= spider.crawl_id %>" >
<%= spider.scraped %></a>
</td>
<td>
<%= if spider.state == :idle do %>
N/A
<% else %>
<a href="/spiders/<%= spider.name %>/logs/<%= spider.crawl_id %>"> Logs </a>
<% end %>
</td>
<%= if spider.state == :idle do %>
<td> <input type = "button" onclick = "schedule('<%= spider.name %>')" value = "Schedule"> </td>
<% else %>
