Merge branch 'master' into 37-spider-management-start-all-spiders
Ziinc committed Nov 2, 2020
2 parents cefb9c1 + a96b1e6 commit 81b2871
Showing 17 changed files with 260 additions and 92 deletions.
2 changes: 2 additions & 0 deletions .github/FUNDING.yml
@@ -0,0 +1,2 @@
+# These are supported funding model platforms
+github: oltarasenko
5 changes: 4 additions & 1 deletion README.md
@@ -24,7 +24,7 @@ historical archival.
 # mix.exs
 defp deps do
   [
-    {:crawly, "~> 0.10.0"},
+    {:crawly, "~> 0.11.0"},
     {:floki, "~> 0.26.0"}
   ]
 end
@@ -101,6 +101,9 @@ You can read more here:
 
 The CrawlyUI project is an add-on that aims to provide an interface for managing and rapidly developing spiders.
 
+Check out the code on [GitHub](https://github.com/oltarasenko/crawly_ui)
+or try it online at [CrawlyUIDemo](http://crawlyui.com).
+
 ![](documentation/assets/main_page.png?raw=true)
 ![](documentation/assets/items_page.png?raw=true)
 ![](documentation/assets/item_with_filters.png?raw=true)
2 changes: 1 addition & 1 deletion docs/README.md
@@ -5,4 +5,4 @@ extracting structured data which can be used for a wide range of
 useful applications, like data mining, information processing or
 historical archival.
 
-**[Documentation has moved to HexDocs](https://hexdocs.pm/crawly)**
+**[Documentation has moved to HexDocs](https://hexdocs.pm/crawly/readme.html)**
5 changes: 4 additions & 1 deletion documentation/basic_concepts.md
@@ -33,7 +33,10 @@ All items are processed sequentially and are processed by Item pipelines.
 
 In order to make a working web crawler, all the behaviour callbacks need to be implemented.
 
-`init()` - a part of the Crawly.Spider behaviour. This function should return a KVList which contains a `start_urls` entry with a list, which defines the starting requests made by Crawly.
+`init()` - a part of the Crawly.Spider behaviour. This function should return a keyword list which contains a `start_urls` entry with a list, which defines the starting requests made by Crawly. Alternatively, you may provide `start_requests` if the first requests
+need to be prepared in `init()`, which might be useful if, for example, you
+want to pass a session cookie with the starting request. Note: `start_requests` are
+processed before `start_urls`.
 
 `base_url()` - defines a base_url of the given Spider. This function is used in order to filter out all requests which are going outside of the crawled website.
 
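For illustration, a spider using the new `start_requests` entry might look like the minimal sketch below. The module name, URLs, and cookie value are hypothetical, and `Crawly.Request.new/2` is assumed to accept a list of headers as its second argument:

```elixir
defmodule ExampleSpider do
  @behaviour Crawly.Spider

  @impl Crawly.Spider
  def base_url(), do: "https://example.com"

  @impl Crawly.Spider
  def init() do
    # A prepared request can carry e.g. a session cookie;
    # such requests are processed before any start_urls.
    request =
      Crawly.Request.new(
        "https://example.com/account",
        [{"Cookie", "session=abc123"}]
      )

    [start_requests: [request], start_urls: ["https://example.com/public"]]
  end

  @impl Crawly.Spider
  def parse_item(_response) do
    %Crawly.ParsedItem{items: [], requests: []}
  end
end
```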
2 changes: 1 addition & 1 deletion documentation/tutorial.md
@@ -53,7 +53,7 @@ file with the following code:
 ```elixir
 defp deps do
   [
-    {:crawly, "~> 0.10.0"},
+    {:crawly, "~> 0.11.0"},
     {:floki, "~> 0.26.0"}
   ]
 end
108 changes: 80 additions & 28 deletions lib/crawly/engine.ex
@@ -8,20 +8,29 @@ defmodule Crawly.Engine do
 
   use GenServer
 
-  @type t :: %__MODULE__{started_spiders: started_spiders()}
+  @type t :: %__MODULE__{
+          started_spiders: started_spiders(),
+          known_spiders: [module()]
+        }
   @type started_spiders() :: %{optional(module()) => identifier()}
   @type list_spiders() :: [
           %{name: module(), state: :stopped | :started, pid: identifier()}
         ]
 
-  defstruct started_spiders: %{}
+  @type spider_info() :: %{
+          name: module(),
+          status: :stopped | :started,
+          pid: identifier() | nil
+        }
 
-  @spec start_spider(module()) ::
+  defstruct(started_spiders: %{}, known_spiders: [])
+
+  @spec start_spider(module(), binary()) ::
           :ok
           | {:error, :spider_already_started}
           | {:error, :atom}
-  def start_spider(spider_name) do
-    GenServer.call(__MODULE__, {:start_spider, spider_name})
+  def start_spider(spider_name, crawl_id \\ UUID.uuid1()) do
+    GenServer.call(__MODULE__, {:start_spider, spider_name, crawl_id})
   end
 
   @spec start_all_spiders() :: :ok
@@ -35,19 +44,23 @@ defmodule Crawly.Engine do
       :error ->
         {:error, :spider_not_found}
 
-      {:ok, pid_sup} ->
+      {:ok, {pid_sup, _job_tag}} ->
         Supervisor.which_children(pid_sup)
         |> Enum.find(&({Crawly.Manager, _, :worker, [Crawly.Manager]} = &1))
         |> case do
-          nil -> {:error, :spider_not_found}
-          {_, pid, :worker, _} -> pid
+          nil ->
+            {:error, :spider_not_found}
+
+          {_, pid, :worker, _} ->
+            pid
         end
     end
   end
 
   @spec stop_spider(module(), reason) :: result
         when reason: :itemcount_limit | :itemcount_timeout | atom(),
-             result: :ok | {:error, :spider_not_running}
+             result:
+               :ok | {:error, :spider_not_running} | {:error, :spider_not_found}
   def stop_spider(spider_name, reason \\ :ignore) do
     case Crawly.Utils.get_settings(:on_spider_closed_callback, spider_name) do
       nil -> :ignore
@@ -57,23 +70,39 @@ defmodule Crawly.Engine do
     GenServer.call(__MODULE__, {:stop_spider, spider_name})
   end
 
-  @spec list_spiders() :: list_spiders()
-  def list_spiders() do
-    GenServer.call(__MODULE__, :list_spiders)
+  @spec list_known_spiders() :: [spider_info()]
+  def list_known_spiders() do
+    GenServer.call(__MODULE__, :list_known_spiders)
   end
 
   @spec running_spiders() :: started_spiders()
   def running_spiders() do
     GenServer.call(__MODULE__, :running_spiders)
   end
 
+  @spec get_spider_info(module()) :: spider_info()
+  def get_spider_info(name) do
+    GenServer.call(__MODULE__, {:get_spider, name})
+  end
+
+  def refresh_spider_list() do
+    GenServer.cast(__MODULE__, :refresh_spider_list)
+  end
+
   def start_link() do
     GenServer.start_link(__MODULE__, [], name: __MODULE__)
   end
 
+  @spec get_crawl_id(atom()) :: {:error, :spider_not_running} | {:ok, binary()}
+  def get_crawl_id(spider_name) do
+    GenServer.call(__MODULE__, {:get_crawl_id, spider_name})
+  end
+
   @spec init(any) :: {:ok, __MODULE__.t()}
   def init(_args) do
-    {:ok, %Crawly.Engine{}}
+    spiders = get_updated_known_spider_list()
+
+    {:ok, %Crawly.Engine{known_spiders: spiders}}
   end
 
   def handle_call({:get_manager, spider_name}, _, state) do
@@ -89,15 +118,28 @@ defmodule Crawly.Engine do
     {:reply, pid, state}
   end
 
+  def handle_call({:get_crawl_id, spider_name}, _from, state) do
+    msg =
+      case Map.get(state.started_spiders, spider_name) do
+        nil ->
+          {:error, :spider_not_running}
+
+        {_pid, crawl_id} ->
+          {:ok, crawl_id}
+      end
+
+    {:reply, msg, state}
+  end
+
   def handle_call(:running_spiders, _from, state) do
     {:reply, state.started_spiders, state}
   end
 
-  def handle_call(:list_spiders, _from, state) do
-    {:reply, list_all_spider_status(state.started_spiders), state}
+  def handle_call(:list_known_spiders, _from, state) do
+    {:reply, format_spider_info(state), state}
   end
 
-  def handle_call({:start_spider, spider_name}, _form, state) do
+  def handle_call({:start_spider, spider_name, crawl_id}, _form, state) do
     result =
       case Map.get(state.started_spiders, spider_name) do
         nil ->
@@ -110,7 +152,7 @@ defmodule Crawly.Engine do
     {msg, new_started_spiders} =
       case result do
         {:ok, pid} ->
-          {:ok, Map.put(state.started_spiders, spider_name, pid)}
+          {:ok, Map.put(state.started_spiders, spider_name, {pid, crawl_id})}
 
         {:error, _} = err ->
           {err, state.started_spiders}
@@ -147,7 +189,7 @@ defmodule Crawly.Engine do
       {nil, _} ->
         {{:error, :spider_not_running}, state.started_spiders}
 
-      {pid, new_started_spiders} ->
+      {{pid, _crawl_id}, new_started_spiders} ->
         Crawly.EngineSup.stop_spider(pid)
 
         {:ok, new_started_spiders}
@@ -156,18 +198,28 @@ defmodule Crawly.Engine do
     {:reply, msg, %Crawly.Engine{state | started_spiders: new_started_spiders}}
   end
 
-  defp list_all_spider_status(started_spiders) do
-    Crawly.Utils.list_spiders()
-    |> Enum.map(fn name ->
+  def handle_cast(:refresh_spider_list, state) do
+    updated = get_updated_known_spider_list(state.known_spiders)
+    {:noreply, %Crawly.Engine{state | known_spiders: updated}}
+  end
+
+  # this function generates a spider_info map for each spider known
+  defp format_spider_info(state) do
+    Enum.map(state.known_spiders, fn s ->
+      pid = Map.get(state.started_spiders, s)
+
       %{
-        name: name,
-        state:
-          case Map.has_key?(started_spiders, name) do
-            true -> :started
-            false -> :stopped
-          end,
-        pid: Map.get(started_spiders, name)
+        name: s,
+        status: if(is_nil(pid), do: :stopped, else: :started),
+        pid: pid
       }
     end)
   end
+
+  defp get_updated_known_spider_list(known \\ []) do
+    new = Crawly.Utils.list_spiders()
+
+    (known ++ new)
+    |> Enum.dedup_by(& &1)
+  end
 end
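Taken together, the engine now tags every run with a crawl id. A rough IEx sketch of the new surface (the spider module and the crawl id string are hypothetical; `start_spider/2` falls back to a generated UUID when the id is omitted):

```elixir
# Start a spider under an explicit crawl id.
:ok = Crawly.Engine.start_spider(ExampleSpider, "crawl-2020-11-02")

# The id can be read back while the spider is running.
{:ok, "crawl-2020-11-02"} = Crawly.Engine.get_crawl_id(ExampleSpider)

# Known spiders are reported with their status.
Crawly.Engine.list_known_spiders()
#=> [%{name: ExampleSpider, status: :started, pid: ...}, ...]

:ok = Crawly.Engine.stop_spider(ExampleSpider)
```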
29 changes: 24 additions & 5 deletions lib/crawly/manager.ex
@@ -53,7 +53,7 @@ defmodule Crawly.Manager do
   @impl true
   def init(spider_name) do
     # Getting spider start urls
-    [start_urls: urls] = spider_name.init()
+    init = spider_name.init()
 
     # Start DataStorage worker
     {:ok, data_storage_pid} = Crawly.DataStorage.start_worker(spider_name)
@@ -65,10 +65,29 @@ defmodule Crawly.Manager do
 
     Process.link(request_storage_pid)
 
-    # Store start requests
-    requests = Enum.map(urls, fn url -> Crawly.Request.new(url) end)
+    # Store start urls
+    Enum.each(
+      Keyword.get(init, :start_requests, []),
+      fn
+        %Crawly.Request{} = request ->
+          Crawly.RequestsStorage.store(spider_name, request)
+
+        request ->
+          # We should not attempt to store something which is not a request
+          Logger.error(
+            "#{inspect(request)} does not seem to be a request. Ignoring."
+          )
+
+          :ignore
+      end
+    )
 
-    :ok = Crawly.RequestsStorage.store(spider_name, requests)
+    Enum.each(
+      Keyword.get(init, :start_urls, []),
+      fn url ->
+        Crawly.RequestsStorage.store(spider_name, Crawly.Request.new(url))
+      end
+    )
 
     # Start workers
     num_workers =
@@ -162,7 +181,7 @@ defmodule Crawly.Manager do
   defp maybe_stop_spider_by_itemcount_limit(_, _, _), do: :ok
 
   defp maybe_stop_spider_by_timeout(spider_name, current, limit)
-       when current < limit do
+       when current < limit and is_integer(limit) do
     Logger.info("Stopping #{inspect(spider_name)}, itemcount timeout achieved")
 
     Crawly.Engine.stop_spider(spider_name, :itemcount_timeout)
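The strengthened guard above matters because of Erlang term ordering: every number compares less than every atom, so a non-integer limit (for example, assuming the setting can be an atom such as `:disabled`) would otherwise satisfy `current < limit` and stop the spider spuriously. A minimal illustration of the assumption:

```elixir
# Numbers sort before atoms in Erlang/Elixir term ordering, so without
# is_integer/1 the clause would match even for a non-integer limit:
1000 < :disabled
#=> true
```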
9 changes: 7 additions & 2 deletions lib/crawly/pipelines/experimental/send_to_ui.ex
@@ -7,7 +7,12 @@ defmodule Crawly.Pipelines.Experimental.SendToUI do
 
   @impl Crawly.Pipeline
   def run(item, state, opts \\ []) do
-    job_tag = Map.get(state, :job_tag, UUID.uuid1())
+    job_tag =
+      Map.get_lazy(state, :job_tag, fn ->
+        {:ok, job_tag} = Crawly.Engine.get_crawl_id(state.spider_name)
+        job_tag
+      end)
+
     spider_name = state.spider_name |> Atom.to_string()
 
     case Keyword.get(opts, :ui_node) do
@@ -20,7 +25,7 @@ defmodule Crawly.Pipelines.Experimental.SendToUI do
       ui_node ->
         :rpc.cast(ui_node, CrawlyUI, :store_item, [
           spider_name,
-          item,
+          item,
           job_tag,
           Node.self() |> to_string()
         ])
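For context, the pipeline now falls back to the engine's crawl id when the pipeline state carries no `:job_tag`. It is enabled through the pipelines setting; a hedged config sketch (the node name is hypothetical):

```elixir
# config/config.exs
config :crawly,
  pipelines: [
    {Crawly.Pipelines.Experimental.SendToUI, ui_node: :"ui@127.0.0.1"}
  ]
```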
4 changes: 2 additions & 2 deletions lib/crawly/spider.ex
@@ -3,7 +3,7 @@ defmodule Crawly.Spider do
   A behavior module for implementing a Crawly Spider
   A Spider is a module which is responsible for defining:
-  1. `init/0` function, which must return a keyword list with start_urls list
+  1. `init/0` function, which must return a keyword list with start_urls/start_requests list
   2. `base_url/0` function responsible for filtering out requests not related to
      a given website
   3. `parse_item/1` function which is responsible for parsing the downloaded
@@ -15,7 +15,7 @@ defmodule Crawly.Spider do
   global settings defined in the config.
   """
 
-  @callback init() :: [start_urls: list()]
+  @callback init() :: [start_urls: list(), start_requests: list()]
 
   @callback base_url() :: binary()
 
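Both keys in the updated callback appear to be optional, so either return shape below should satisfy the behaviour (module names and URLs are placeholders):

```elixir
defmodule UrlsOnlySpider do
  # URL-only form of init/0:
  def init(), do: [start_urls: ["https://example.com/"]]
end

defmodule PreparedRequestsSpider do
  # Prepared-request form; these are processed before any start_urls:
  def init(), do: [start_requests: [Crawly.Request.new("https://example.com/")]]
end
```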
4 changes: 2 additions & 2 deletions lib/crawly/utils.ex
@@ -166,8 +166,8 @@ defmodule Crawly.Utils do
         acc
       end
     rescue
-      error ->
-        Logger.debug("Could not get behaviour information for: #{inspect(error)}")
+      _error ->
+        # Just ignore the case, as probably the given module is not a Spider
         acc
     end
   end
2 changes: 1 addition & 1 deletion mix.exs
@@ -1,7 +1,7 @@
 defmodule Crawly.Mixfile do
   use Mix.Project
 
-  @version "0.10.0"
+  @version "0.11.0"
 
   def project do
     [