
Improve crawly management homepage
1. Allow viewing the logs of a given spider (direct access to the log file)
2. Allow viewing the items of a given spider (direct access to the items file)
oltarasenko committed Mar 31, 2023
1 parent b3d1db0 commit 0c1f41f
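
The two new routes serve a crawl's raw log file and items file directly. A minimal sketch of how they might be queried from Elixir, assuming the management API is reachable on localhost:4001 and using made-up spider and crawl id values (none of these names come from this commit):

    # Fetch the per-crawl log file (hypothetical spider name and crawl id)
    {:ok, %HTTPoison.Response{status_code: 200, body: log}} =
      HTTPoison.get("http://localhost:4001/spiders/BooksSpider/logs/example_crawl_id")

    # Fetch the items file written by the WriteToFile pipeline for the same crawl
    {:ok, %HTTPoison.Response{status_code: 200, body: items}} =
      HTTPoison.get("http://localhost:4001/spiders/BooksSpider/items/example_crawl_id")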
Showing 15 changed files with 211 additions and 205 deletions.
12 changes: 9 additions & 3 deletions config/config.exs
@@ -2,7 +2,12 @@
# and its dependencies with the aid of the Mix.Config module.
import Config

config :logger,
backends: [:console, {LoggerFileBackend, :info_log}]

config :crawly,
log_dir: "/tmp/spider_logs",
log_to_file: true,
fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
retry: [
retry_codes: [400],
@@ -29,9 +34,10 @@ config :crawly,
]}
],
pipelines: [
{Crawly.Pipelines.Validate, fields: [:title, :author, :time, :url]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder
{Crawly.Pipelines.Validate, fields: ["title", "body", "url"]},
{Crawly.Pipelines.DuplicatesFilter, item_id: "title"},
Crawly.Pipelines.JSONEncoder,
{Crawly.Pipelines.WriteToFile, folder: "/tmp", extension: "jl"}
]

import_config "#{Mix.env()}.exs"
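
A named LoggerFileBackend such as :info_log above conventionally takes its path and level from a separate :logger entry; a minimal sketch under the assumption that info-level output should also go to a fixed file (the path is illustrative and not part of this commit; the per-crawl :debug backend is configured at runtime in Crawly.Engine below):

    # Hypothetical companion entry for the :info_log backend registered above
    config :logger, :info_log,
      path: "/tmp/crawly_info.log",
      level: :info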
1 change: 0 additions & 1 deletion config/test.exs
@@ -16,7 +16,6 @@ config :crawly,
pipelines: [
{Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.Experimental.Preview,
Crawly.Pipelines.JSONEncoder
],
retry: [
88 changes: 42 additions & 46 deletions lib/crawly/api.ex
@@ -56,10 +56,10 @@ defmodule Crawly.API.Router do
Enum.map(
Crawly.list_spiders(),
fn spider ->
state =
{crawl_id, state} =
case Map.get(running_spiders, spider) do
{_pid, _job_id} -> :running
nil -> :idle
{_pid, crawl_id} -> {crawl_id, :running}
nil -> {nil, :idle}
end

spider_name =
@@ -90,6 +90,7 @@

%{
name: spider_name,
crawl_id: crawl_id,
scheduled: scheduled,
scraped: scraped,
state: state,
@@ -196,67 +197,62 @@
send_resp(conn, 200, msg)
end

get "/spiders/:spider_name/requests" do
spider_name = String.to_atom("Elixir.#{spider_name}")
get "/spiders/:spider_name/logs/:crawl_id" do
spider_name = String.to_existing_atom(spider_name)
log_file_path = Crawly.Utils.spider_log_path(spider_name, crawl_id)

result =
case Crawly.RequestsStorage.requests(spider_name) do
{:requests, result} ->
Enum.map(result, fn req ->
%{url: req.url, headers: inspect(req.headers)}
end)
case File.exists?(log_file_path) do
true -> Plug.Conn.send_file(conn, 200, log_file_path)
false -> send_resp(conn, 404, "Oops! Page not found!")
end
end

get "/spiders/:spider_name/items/:crawl_id" do
folder =
Application.get_env(:crawly, :pipelines, [])
|> Keyword.get(Crawly.Pipelines.WriteToFile, [])
|> Keyword.get(:folder, "")

file_paths =
case File.ls(folder) do
{:ok, list} ->
Enum.filter(list, fn path -> String.contains?(path, crawl_id) end)

{:error, _} ->
[]
end

response =
render_template("requests_list.html.eex",
requests: result,
spider_name: spider_name
)
case file_paths do
[] ->
send_resp(conn, 404, "Oops! Page not found!")

send_resp(conn, 200, response)
end
[file_path] ->
full_path = Path.join([folder, file_path])
Plug.Conn.send_file(conn, 200, full_path)

get "/spiders/:spider_name/items" do
pipelines = Application.get_env(:crawly, :pipelines)

preview_enabled? =
Enum.any?(
pipelines,
fn
Crawly.Pipelines.Experimental.Preview -> true
{Crawly.Pipelines.Experimental.Preview, _} -> true
_ -> false
end
)
other ->
Logger.error("Could not get correct items file: #{inspect(other)}")
send_resp(conn, 500, "Unexpected error")
end
end

get "/spiders/:spider_name/requests" do
spider_name = String.to_atom("Elixir.#{spider_name}")

# According to the preview item pipeline we store items under the field below
# use inspect function to get items here
items_preview_field = :"Elixir.Crawly.Pipelines.Experimental.Preview"

result =
case Crawly.DataStorage.inspect(spider_name, items_preview_field) do
{:inspect, nil} ->
[]

{:inspect, result} ->
result
case Crawly.RequestsStorage.requests(spider_name) do
{:requests, result} ->
Enum.map(result, fn req ->
%{url: req.url, headers: inspect(req.headers)}
end)

{:error, _} ->
[]

nil ->
[]
end

response =
render_template("items_list.html.eex",
items: result,
preview_enabled?: preview_enabled?,
render_template("requests_list.html.eex",
requests: result,
spider_name: spider_name
)

20 changes: 2 additions & 18 deletions lib/crawly/engine.ex
@@ -261,27 +261,11 @@ defmodule Crawly.Engine do
end

defp configure_spider_logs(spider_name, crawl_id) do
log_dir =
Crawly.Utils.get_settings(
:log_dir,
spider_name,
System.tmp_dir()
)

current_unix_timestamp = :os.system_time(:second)

log_file_path = Crawly.Utils.spider_log_path(spider_name, crawl_id)
Logger.add_backend({LoggerFileBackend, :debug})

log_file_path =
Path.join([
log_dir,
inspect(spider_name),
# underscore separates the timestamp and the crawl_id
inspect(current_unix_timestamp) <> "_" <> crawl_id
]) <> ".log"

Logger.configure_backend({LoggerFileBackend, :debug},
path: log_file_path,
path: Crawly.Utils.spider_log_path(spider_name, crawl_id),
level: :debug,
metadata_filter: [crawl_id: crawl_id]
)
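
The metadata_filter option above makes the per-crawl file backend keep only log entries whose metadata carries the matching crawl_id. A minimal sketch of how a worker process would tag its output so it ends up in that file (illustrative, not part of this diff):

    # Attach the crawl id to this process's Logger metadata once;
    # subsequent log calls are then routed to the per-crawl log file.
    Logger.metadata(crawl_id: crawl_id)
    Logger.debug("Fetched #{request.url}")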
45 changes: 0 additions & 45 deletions lib/crawly/pipelines/experimental/preview.ex

This file was deleted.

6 changes: 4 additions & 2 deletions lib/crawly/pipelines/write_to_file.ex
@@ -69,20 +69,22 @@ defmodule Crawly.Pipelines.WriteToFile do

:ok = maybe_create_folder(folder)

# Use crawl_id in filename from now on to identify crawls
crawl_id = Map.get(state, :crawl_id, "no_crawl_id")
extension = Map.get(opts, :extension, "jl")

filename =
case Map.get(opts, :include_timestamp, false) do
false ->
"#{inspect(state.spider_name)}.#{extension}"
"#{inspect(state.spider_name)}_#{crawl_id}.#{extension}"

true ->
ts_string =
NaiveDateTime.utc_now()
|> NaiveDateTime.to_string()
|> String.replace(~r/( |-|:|\.)/, "_")

"#{inspect(state.spider_name)}_#{ts_string}.#{extension}"
"#{inspect(state.spider_name)}_#{ts_string}_#{crawl_id}.#{extension}"
end

fd = open_fd(folder, filename)
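
With the crawl id embedded in the file name, the new /spiders/:spider_name/items/:crawl_id route can later locate the file by matching on the crawl id. An illustrative iex session (spider module and crawl id are made up):

    iex> crawl_id = "example_crawl_id"
    iex> "#{inspect(MySpider)}_#{crawl_id}.jl"
    "MySpider_example_crawl_id.jl"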
42 changes: 42 additions & 0 deletions lib/crawly/utils.ex
@@ -336,6 +336,48 @@ defmodule Crawly.Utils do
Code.eval_string(template)
end

@doc """
Composes the log file path for a given spider and crawl ID.
Args:
spider_name (atom): The name of the spider to create the log path for.
crawl_id (string): The ID of the crawl to create the log path for.
Returns:
string: The file path to the log file for the given spider and crawl ID.
Examples:
iex> spider_log_path(:my_spider, "crawl_123")
"/tmp/crawly/my_spider/crawl_123.log"
iex> spider_log_path(:my_spider, "crawl_456")
"/tmp/crawly/my_spider/crawl_456.log"
"""
@spec spider_log_path(spider_name, crawl_id) :: path
when spider_name: atom(),
crawl_id: String.t(),
path: String.t()
def spider_log_path(spider_name, crawl_id) do
spider_name_str =
case Atom.to_string(spider_name) do
"Elixir." <> name_str -> name_str
name_str -> name_str
end

log_dir =
Crawly.Utils.get_settings(
:log_dir,
spider_name,
System.tmp_dir()
)

Path.join([
log_dir,
spider_name_str,
crawl_id
]) <> ".log"
end

##############################################################################
# Private functions
##############################################################################
38 changes: 0 additions & 38 deletions priv/items_list.html.eex

This file was deleted.

23 changes: 17 additions & 6 deletions priv/list.html.eex
@@ -3,23 +3,34 @@
<div class="leftcolumn">
<div class="card">
<h2>Spiders <p floa></p></h2>


<table>
<tr>
<th>Spider name</td>
<th>State</td>
<th>Items scraped</td>
<th>Scheduled Requests</td>
<th>Scraped Items</td>
<th>Log</td>
<th>Command</td>
<th>Modify</td>
</tr>
<%= for spider <- data do %>
<tr>
<td><%= spider.name %></td>
<td><%= spider.state %></td>
<td><a href="/spiders/<%= spider.name %>/items" ><%= spider.scraped %></a> </td>
<td><a href="/spiders/<%= spider.name %>/requests" ><%= spider.scheduled %></td>
<td><%= spider.state %> </td>
<td>
<a href="/spiders/<%= spider.name %>/requests" ><%= spider.scheduled %>
</td>
<td>
<a href="/spiders/<%= spider.name %>/items/<%= spider.crawl_id %>" >
<%= spider.scraped %></a>
</td>
<td>
<%= if spider.state == :idle do %>
N/A
<% else %>
<a href="/spiders/<%= spider.name %>/logs/<%= spider.crawl_id %>"> Logs </a>
<% end %>
</td>
<%= if spider.state == :idle do %>
<td> <input type = "button" onclick = "schedule('<%= spider.name %>')" value = "Schedule"> </td>
<% else %>
