Add possibility to generate spiders from YML definitions
1. Create an EEx template for a basic YML spider
2. Create a few helper functions to extract requests and items from a basic YML spider
3. Create an HTTP API to manage YML spiders (an example definition is shown below)
oltarasenko committed Mar 30, 2023
1 parent 5eeeb2a commit c1a5cf1
Showing 15 changed files with 861 additions and 21 deletions.
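
For context, a spider definition accepted by the validation schema added in lib/crawly/api.ex (see @spider_validation_schema below) might look like the following. This is a minimal sketch; the spider name, URLs, and selectors are hypothetical:

name: BooksSpider
base_url: https://books.toscrape.com
start_urls:
  - https://books.toscrape.com/catalogue/page-1.html
links_to_follow:
  - selector: a.next
    attribute: href
fields:
  - name: title
    selector: h1
  - name: price
    selector: .price_color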
3 changes: 2 additions & 1 deletion .gitignore
@@ -11,4 +11,5 @@ erl_crash.dump
.DS_Store
.idea/
crawly.iml
.write_to_filetests
.write_to_filetests
dets_spiders_storage
2 changes: 1 addition & 1 deletion Dockerfile
@@ -58,4 +58,4 @@ RUN mkdir /app/spiders

EXPOSE 4001

ENTRYPOINT [ "/app/bin/crawly", "start_iex" ]
ENTRYPOINT [ "/app/bin/crawly", "start" ]
148 changes: 146 additions & 2 deletions lib/crawly/api.ex
@@ -3,8 +3,48 @@ defmodule Crawly.API.Router do
Crawly HTTP API. Allows to schedule/stop/get_stats
of all running spiders.
"""

require Logger

use Plug.Router

@spider_validation_schema %{
"type" => "object",
"additionalProperties" => false,
"required" => ["name", "links_to_follow", "fields", "start_urls"],
"properties" => %{
"name" => %{"type" => "string"},
"base_url" => %{"type" => "string", "format" => "uri"},
"start_urls" => %{
"type" => "array",
"items" => %{"type" => "string", "format" => "uri"}
},
"links_to_follow" => %{
"type" => "array",
"items" => %{
"type" => "object",
"additionalProperties" => false,
"properties" => %{
"selector" => %{"type" => "string"},
"attribute" => %{"type" => "string"}
}
}
},
"fields" => %{
"type" => "array",
"items" => %{
"type" => "object",
"additionalProperties" => false,
"properties" => %{
"name" => %{"type" => "string"},
"selector" => %{"type" => "string"}
}
}
}
}
}

plug(Plug.Parsers, parsers: [:urlencoded, :multipart])
plug(:match)
plug(:dispatch)

@@ -41,11 +81,19 @@ defmodule Crawly.API.Router do
{num, scheduled}
end

editable? =
case Crawly.SpidersStorage.get(spider_name) do
{:error, :not_found} -> false
{:ok, _value} -> true
_ -> false
end

%{
name: spider_name,
scheduled: scheduled,
scraped: scraped,
state: state
state: state,
editable?: editable?
}
end
)
@@ -54,6 +102,87 @@ defmodule Crawly.API.Router do
send_resp(conn, 200, response)
end

get "/new" do
spider_name = Map.get(conn.query_params, "spider_name", "")

spider_data =
case spider_name do
"" ->
{:ok, ""}

name ->
Crawly.SpidersStorage.get(name)
end

case spider_data do
{:error, :not_found} ->
send_resp(conn, 404, "Page not found")

{:ok, value} ->
response =
render_template("new.html.eex",
data: %{
"errors" => "",
"spider" => value,
"spider_name" => spider_name
}
)

send_resp(conn, 200, response)
end
end

post "/new" do
name_from_query_params = Map.get(conn.query_params, "spider_name", "")
spider_yml = Map.get(conn.body_params, "spider")

# Validate incoming data with json schema
validation_result =
case validate_new_spider_request(spider_yml) do
{:error, errors} ->
{:error, "#{inspect(errors)}"}

%{"name" => spider_name} = yml ->
# Check if spider already registered, but allow editing spiders
case {is_spider_registered(spider_name),
spider_name == name_from_query_params} do
{true, false} ->
{:error,
"Spider with this name already exists. Try editing it instead of overriding"}

_ ->
{:ok, yml}
end
end

case validation_result do
{:ok, %{"name" => spider_name} = _parsed_yml} ->
:ok = Crawly.SpidersStorage.put(spider_name, spider_yml)

# Now we can finally load the spider
Crawly.Utils.load_yml_spider(spider_yml)

# Now we can redirect to the homepage
conn
|> put_resp_header("location", "/")
|> send_resp(conn.status || 302, "Redirect")

{:error, errors} ->
# Show errors and spider
data = %{"errors" => errors, "spider" => spider_yml}
response = render_template("new.html.eex", data: data)
send_resp(conn, 400, response)
end
end

delete "/spider/:spider_name" do
Crawly.SpidersStorage.delete(spider_name)

conn
|> put_resp_header("location", "/")
|> send_resp(conn.status || 302, "Redirect")
end

get "/spiders" do
msg =
case Crawly.Engine.running_spiders() do
@@ -192,7 +321,7 @@ defmodule Crawly.API.Router do
loaded_spiders =
case Crawly.load_spiders() do
{:ok, spiders} -> spiders
{:error, _} -> []
{:error, :no_spiders_dir} -> []
end

send_resp(
@@ -206,6 +335,21 @@ defmodule Crawly.API.Router do
send_resp(conn, 404, "Oops! Page not found!")
end

defp validate_new_spider_request(maybe_yml) do
with {:ok, yml} <- YamlElixir.read_from_string(maybe_yml),
:ok <- ExJsonSchema.Validator.validate(@spider_validation_schema, yml) do
yml
else
{:error, _err} = err -> err
end
end

defp is_spider_registered(name) do
module_name_str = "Elixir." <> name
module_name = String.to_atom(module_name_str)
Enum.member?(Crawly.Utils.list_spiders(), module_name)
end

defp render_template(template_name, assigns) do
base_dir = :code.priv_dir(:crawly)
template = Path.join(base_dir, template_name)
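
Putting the pieces of lib/crawly/api.ex together, the POST /new handler is essentially a parse, validate, persist, load pipeline. Below is a condensed sketch of that flow, not the handler's literal code: the module and function names are illustrative, while the YamlElixir, ExJsonSchema, Crawly.SpidersStorage, and Crawly.Utils calls are the ones used in the diff above.

defmodule YmlSpiderFlow do
  @moduledoc false

  # Hypothetical helper mirroring the POST /new pipeline.
  # `spider_yml` is the raw YAML string from the form body;
  # `schema` stands in for @spider_validation_schema.
  def create(spider_yml, schema) do
    with {:ok, %{"name" => spider_name} = yml} <-
           YamlElixir.read_from_string(spider_yml),
         :ok <- ExJsonSchema.Validator.validate(schema, yml),
         :ok <- Crawly.SpidersStorage.put(spider_name, spider_yml) do
      # Generates a spider module from the EEx template and registers it
      Crawly.Utils.load_yml_spider(spider_yml)
    else
      {:error, _reason} = error -> error
      other -> {:error, other}
    end
  end
end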
6 changes: 6 additions & 0 deletions lib/crawly/application.ex
@@ -9,6 +9,12 @@ defmodule Crawly.Application do
# Try to load spiders from the SPIDERS_DIR (for crawly standalone setup)
Crawly.load_spiders()

# Open dets storage to store spiders data
Crawly.SpidersStorage.init()

# Load spiders stored in the SpidersStorage
Crawly.Utils.load_yml_spiders()

import Supervisor.Spec, warn: false
# List all child processes to be supervised

101 changes: 101 additions & 0 deletions lib/crawly/spiders_storage.ex
@@ -0,0 +1,101 @@
defmodule Crawly.SpidersStorage do
@moduledoc """
Module for storing spider information using the `:dets` storage mechanism.
This module provides functionality for storing and retrieving
spider information in a term storage.
The `:dets` module is used to store the information in a disk-based table.
Functions:
- `init/0`: Initializes the storage to store spider information.
- `put/2`: Inserts the given spider name and YAML configuration into the storage.
- `get/1`: Retrieves the YAML configuration for the given spider name.
- `list/0`: Returns a list of all spider names stored in the storage.
- `delete/1`: Deletes the YAML configuration for the given spider name.
- `clear/0`: Deletes all spider information from the storage.
"""
@dets_table :dets_spiders_storage

require Logger

@typep spider_name() :: binary() | module()
@typep spider_yml() :: binary()

@doc """
Initialize storage to store spiders information
"""
@spec init :: {:error, any} | {:ok, any}
def init() do
Logger.info("Opening/checking dynamic spiders storage")
:dets.open_file(@dets_table, type: :set)
end

@doc """
Insert a given object in a term storage
iex(1)> Crawly.SpidersStorage.put(Test, "12345")
:ok
"""
@spec put(spider_name(), spider_yml()) :: :ok | {:error, term()}
def put(spider_name, spider_yml) do
:dets.insert(@dets_table, {spider_name, spider_yml})
end

@doc """
Return value for the given key from the term storage.
iex(1)> Crawly.SpidersStorage.get(Test)
{:ok, "12345"}
iex(1)> Crawly.SpidersStorage.get(T)
{:error, :not_found}
"""
@spec get(spider_name()) ::
{:ok, spider_yml()} | {:error, :not_found} | {:error, term()}
def get(spider_name) do
case :dets.lookup(@dets_table, spider_name) do
{:error, _error} = err -> err
[] -> {:error, :not_found}
[{^spider_name, spider_yml}] -> {:ok, spider_yml}
end
end

@doc """
Makes a simple list from the spiders storage.
iex(17)> Crawly.SpidersStorage.list()
[Test4, Test3, Test2, Test1, Test]
"""
@spec list() :: [spider_name()] | {:error, term()}
def list() do
first = :dets.first(@dets_table)
list(first, [])
end

@doc """
Deletes a given object
iex(17)> Crawly.SpidersStorage.delete(Test1)
:ok
"""
@spec delete(spider_name()) :: :ok | {:error, term()}
def delete(spider_name) do
:dets.delete(@dets_table, spider_name)
end

@doc """
Deletes all objects from the storage
iex(17)> Crawly.SpidersStorage.clear()
:ok
"""
@spec clear() :: :ok | {:error, term()}
def clear(), do: :dets.delete_all_objects(@dets_table)

defp list(:"$end_of_table", acc), do: acc

defp list(current_element, acc) do
next = :dets.next(@dets_table, current_element)
list(next, [current_element | acc])
end
end
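
For reference, a round trip through the new storage module might look like this in an iex session; the spider name and YAML string are illustrative, and the return values follow the specs and doctests above.

iex> Crawly.SpidersStorage.init()
{:ok, :dets_spiders_storage}
iex> Crawly.SpidersStorage.put("BooksSpider", "name: BooksSpider")
:ok
iex> Crawly.SpidersStorage.get("BooksSpider")
{:ok, "name: BooksSpider"}
iex> Crawly.SpidersStorage.list()
["BooksSpider"]
iex> Crawly.SpidersStorage.delete("BooksSpider")
:ok
iex> Crawly.SpidersStorage.get("BooksSpider")
{:error, :not_found}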
