Add possibility to generate spiders from YML definitions
1. Create an EEx template for a basic YML spider
2. Create a few helper functions to extract requests and items from a basic YML spider
3. Create an HTTP API to manage YML spiders (an example definition is shown below)
oltarasenko committed Mar 30, 2023
1 parent 5eeeb2a commit c1a5cf1
Showing 15 changed files with 861 additions and 21 deletions.
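
For context, a spider definition accepted by the validation schema added in lib/crawly/api.ex (see @spider_validation_schema below) might look like the following. This is a minimal sketch; the spider name, URLs, and selectors are hypothetical:

name: BooksSpider
base_url: https://books.toscrape.com
start_urls:
  - https://books.toscrape.com/catalogue/page-1.html
links_to_follow:
  - selector: a.next
    attribute: href
fields:
  - name: title
    selector: h1
  - name: price
    selector: .price_color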
3 changes: 2 additions & 1 deletion .gitignore
@@ -11,4 +11,5 @@ erl_crash.dump
.DS_Store
.idea/
crawly.iml
.write_to_filetests
.write_to_filetests
dets_spiders_storage
2 changes: 1 addition & 1 deletion Dockerfile
@@ -58,4 +58,4 @@ RUN mkdir /app/spiders

EXPOSE 4001

ENTRYPOINT [ "/app/bin/crawly", "start_iex" ]
ENTRYPOINT [ "/app/bin/crawly", "start" ]
148 changes: 146 additions & 2 deletions lib/crawly/api.ex
@@ -3,8 +3,48 @@ defmodule Crawly.API.Router do
Crawly HTTP API. Allows to schedule/stop/get_stats
of all running spiders.
"""

require Logger

use Plug.Router

@spider_validation_schema %{
"type" => "object",
"additionalProperties" => false,
"required" => ["name", "links_to_follow", "fields", "start_urls"],
"properties" => %{
"name" => %{"type" => "string"},
"base_url" => %{"type" => "string", "format" => "uri"},
"start_urls" => %{
"type" => "array",
"items" => %{"type" => "string", "format" => "uri"}
},
"links_to_follow" => %{
"type" => "array",
"items" => %{
"type" => "object",
"additionalProperties" => false,
"properties" => %{
"selector" => %{"type" => "string"},
"attribute" => %{"type" => "string"}
}
}
},
"fields" => %{
"type" => "array",
"items" => %{
"type" => "object",
"additionalProperties" => false,
"properties" => %{
"name" => %{"type" => "string"},
"selector" => %{"type" => "string"}
}
}
}
}
}

plug(Plug.Parsers, parsers: [:urlencoded, :multipart])
plug(:match)
plug(:dispatch)

@@ -41,11 +81,19 @@ defmodule Crawly.API.Router do
{num, scheduled}
end

editable? =
case Crawly.SpidersStorage.get(spider_name) do
{:error, :not_found} -> false
{:ok, _value} -> true
_ -> false
end

%{
name: spider_name,
scheduled: scheduled,
scraped: scraped,
state: state
state: state,
editable?: editable?
}
end
)
@@ -54,6 +102,87 @@ defmodule Crawly.API.Router do
send_resp(conn, 200, response)
end

get "/new" do
spider_name = Map.get(conn.query_params, "spider_name", "")

spider_data =
case spider_name do
"" ->
{:ok, ""}

name ->
Crawly.SpidersStorage.get(name)
end

case spider_data do
{:error, :not_found} ->
send_resp(conn, 404, "Page not found")

{:ok, value} ->
response =
render_template("new.html.eex",
data: %{
"errors" => "",
"spider" => value,
"spider_name" => spider_name
}
)

send_resp(conn, 200, response)
end
end

post "/new" do
name_from_query_params = Map.get(conn.query_params, "spider_name", "")
spider_yml = Map.get(conn.body_params, "spider")

# Validate incoming data with json schema
validation_result =
case validate_new_spider_request(spider_yml) do
{:error, errors} ->
{:error, "#{inspect(errors)}"}

%{"name" => spider_name} = yml ->
# Check if spider already registered, but allow editing spiders
case {is_spider_registered(spider_name),
spider_name == name_from_query_params} do
{true, false} ->
{:error,
"Spider with this name already exists. Try editing it instead of overriding"}

_ ->
{:ok, yml}
end
end

case validation_result do
{:ok, %{"name" => spider_name} = _parsed_yml} ->
:ok = Crawly.SpidersStorage.put(spider_name, spider_yml)

# Now we can finally load the spider
Crawly.Utils.load_yml_spider(spider_yml)

# Now we can redirect to the homepage
conn
|> put_resp_header("location", "/")
|> send_resp(conn.status || 302, "Redirect")

{:error, errors} ->
# Show errors and spider
data = %{"errors" => errors, "spider" => spider_yml}
response = render_template("new.html.eex", data: data)
send_resp(conn, 400, response)
end
end

delete "/spider/:spider_name" do
Crawly.SpidersStorage.delete(spider_name)

conn
|> put_resp_header("location", "/")
|> send_resp(conn.status || 302, "Redirect")
end

get "/spiders" do
msg =
case Crawly.Engine.running_spiders() do
@@ -192,7 +321,7 @@ defmodule Crawly.API.Router do
loaded_spiders =
case Crawly.load_spiders() do
{:ok, spiders} -> spiders
{:error, _} -> []
{:error, :no_spiders_dir} -> []
end

send_resp(
@@ -206,6 +335,21 @@ defmodule Crawly.API.Router do
send_resp(conn, 404, "Oops! Page not found!")
end

defp validate_new_spider_request(maybe_yml) do
with {:ok, yml} <- YamlElixir.read_from_string(maybe_yml),
:ok <- ExJsonSchema.Validator.validate(@spider_validation_schema, yml) do
yml
else
{:error, _err} = err -> err
end
end

defp is_spider_registered(name) do
module_name_str = "Elixir." <> name
module_name = String.to_atom(module_name_str)
Enum.member?(Crawly.Utils.list_spiders(), module_name)
end

defp render_template(template_name, assigns) do
base_dir = :code.priv_dir(:crawly)
template = Path.join(base_dir, template_name)
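
Putting the pieces of lib/crawly/api.ex together, the POST /new handler is essentially a parse, validate, persist, load pipeline. Below is a condensed sketch of that flow, not the handler's literal code: the module and function names are illustrative, while the YamlElixir, ExJsonSchema, Crawly.SpidersStorage, and Crawly.Utils calls are the ones used in the diff above.

defmodule YmlSpiderFlow do
  @moduledoc false

  # Hypothetical helper mirroring the POST /new pipeline.
  # `spider_yml` is the raw YAML string from the form body;
  # `schema` stands in for @spider_validation_schema.
  def create(spider_yml, schema) do
    with {:ok, %{"name" => spider_name} = yml} <-
           YamlElixir.read_from_string(spider_yml),
         :ok <- ExJsonSchema.Validator.validate(schema, yml),
         :ok <- Crawly.SpidersStorage.put(spider_name, spider_yml) do
      # Generates a spider module from the EEx template and registers it
      Crawly.Utils.load_yml_spider(spider_yml)
    else
      {:error, _reason} = error -> error
      other -> {:error, other}
    end
  end
end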
6 changes: 6 additions & 0 deletions lib/crawly/application.ex
@@ -9,6 +9,12 @@ defmodule Crawly.Application do
# Try to load spiders from the SPIDERS_DIR (for crawly standalone setup)
Crawly.load_spiders()

# Open dets storage to store spiders data
Crawly.SpidersStorage.init()

# Load spiders stored in the SpidersStorage
Crawly.Utils.load_yml_spiders()

import Supervisor.Spec, warn: false
# List all child processes to be supervised

101 changes: 101 additions & 0 deletions lib/crawly/spiders_storage.ex
@@ -0,0 +1,101 @@
defmodule Crawly.SpidersStorage do
@moduledoc """
Module for storing spider information using the `:dets` storage mechanism.
This module provides functionality for storing and retrieving
spider information in a term storage.
The `:dets` module is used to store the information in a disk-based table.
Functions:
- `init/0`: Initializes the storage to store spider information.
- `put/2`: Inserts the given spider name and YAML configuration into the storage.
- `get/1`: Retrieves the YAML configuration for the given spider name.
- `list/0`: Returns a list of all spider names stored in the storage.
- `delete/1`: Deletes the YAML configuration for the given spider name.
- `clear/0`: Deletes all spider information from the storage.
"""
@dets_table :dets_spiders_storage

require Logger

@typep spider_name() :: binary() | module()
@typep spider_yml() :: binary()

@doc """
Initialize storage to store spiders information
"""
@spec init :: {:error, any} | {:ok, any}
def init() do
Logger.info("Opening/checking dynamic spiders storage")
:dets.open_file(@dets_table, type: :set)
end

@doc """
Insert a given object in a term storage
iex(1)> Crawly.SpidersStorage.put(Test, "12345")
:ok
"""
@spec put(spider_name(), spider_yml()) :: :ok | {:error, term()}
def put(spider_name, spider_yml) do
:dets.insert(@dets_table, {spider_name, spider_yml})
end

@doc """
Return value for the given key from the term storage.
iex(1)> Crawly.SpidersStorage.get(Test)
{:ok, "12345"}
iex(1)> Crawly.SpidersStorage.get(T)
{:error, :not_found}
"""
@spec get(spider_name()) ::
{:ok, spider_yml()} | {:error, :not_found} | {:error, term()}
def get(spider_name) do
case :dets.lookup(@dets_table, spider_name) do
{:error, _error} = err -> err
[] -> {:error, :not_found}
[{^spider_name, spider_yml}] -> {:ok, spider_yml}
end
end

@doc """
Makes a simple list from the spiders storage.
iex(17)> Crawly.SpidersStorage.list()
[Test4, Test3, Test2, Test1, Test]
"""
@spec list() :: [spider_name()] | {:error, term()}
def list() do
first = :dets.first(@dets_table)
list(first, [])
end

@doc """
Deletes a given object
iex(17)> Crawly.SpidersStorage.delete(Test1)
:ok
"""
@spec delete(spider_name()) :: :ok | {:error, term()}
def delete(spider_name) do
:dets.delete(@dets_table, spider_name)
end

@doc """
Deletes all objects from the storage
iex(17)> Crawly.SpidersStorage.clear()
:ok
"""
@spec clear() :: :ok | {:error, term()}
def clear(), do: :dets.delete_all_objects(@dets_table)

defp list(:"$end_of_table", acc), do: acc

defp list(current_element, acc) do
next = :dets.next(@dets_table, current_element)
list(next, [current_element | acc])
end
end
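
For reference, a round trip through the new storage module might look like this in an iex session; the spider name and YAML string are illustrative, and the return values follow the specs and doctests above.

iex> Crawly.SpidersStorage.init()
{:ok, :dets_spiders_storage}
iex> Crawly.SpidersStorage.put("BooksSpider", "name: BooksSpider")
:ok
iex> Crawly.SpidersStorage.get("BooksSpider")
{:ok, "name: BooksSpider"}
iex> Crawly.SpidersStorage.list()
["BooksSpider"]
iex> Crawly.SpidersStorage.delete("BooksSpider")
:ok
iex> Crawly.SpidersStorage.get("BooksSpider")
{:error, :not_found}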
