Merge branch 'master' into write_to_file_improvements
oltarasenko committed May 16, 2020
2 parents 7498999 + deca4ea commit c9b1fd7
Showing 13 changed files with 248 additions and 9 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -97,6 +97,17 @@ of asynchronous elements (for example parts loaded by AJAX).
You can read more here:
- [Browser Rendering](https://hexdocs.pm/crawly/basic_concepts.html#browser-rendering)

## Experimental UI

The CrawlyUI project is an add-on that aims to provide an interface for managing and rapidly developing spiders.

![](documentation/assets/main_page.png?raw=true)
![](documentation/assets/items_page.png?raw=true)
![](documentation/assets/item_with_filters.png?raw=true)
![](documentation/assets/item_preview_example.png?raw=true)

See more at [Experimental UI](https://hexdocs.pm/crawly/experimental_ui.html#content)

## Documentation

- [API Reference](https://hexdocs.pm/crawly/api-reference.html#content)
Binary file added documentation/assets/item_preview_example.png
Binary file added documentation/assets/item_with_filters.png
Binary file added documentation/assets/items_page.png
Binary file added documentation/assets/main_page.png
68 changes: 68 additions & 0 deletions documentation/experimental_ui.md
@@ -0,0 +1,68 @@
# Experimental UI

We believe that web scraping is a process. It might seem easy to extract the first
data items, but we believe that reliable data delivery requires a bit more effort and
a process which supports it!

Our aim is to provide you with the following services:

1. Schedule (start and stop) your spiders in the cloud.
2. View running jobs (performance-based analysis).
3. View and validate scraped items for quality assurance and data analysis purposes.
4. View individual items and compare them with the actual website.

## Setting it up

You can find setup examples [here](https://github.com/oltarasenko/crawly_ui/tree/master/examples).

At a high level, it's required to:
1. Add the SendToUI pipeline to the list of your item pipelines, before the encoder
   pipelines (see the configuration sketch after this list):
   `{Crawly.Pipelines.Experimental.SendToUI, ui_node: :'ui@127.0.0.1'}`
2. Organize an Erlang cluster so that Crawly nodes can find the CrawlyUI node.
   The example above uses the [erlang-node-discovery](https://github.com/oltarasenko/erlang-node-discovery)
   application for this task; however, any other alternative would also work.
   To set up erlang-node-discovery:
   - add the dependency to the deps section of mix.exs:
     `{:erlang_node_discovery, git: "https://github.com/oltarasenko/erlang-node-discovery"}`
   - add the following lines to config.exs:
     ```elixir
     config :erlang_node_discovery,
       hosts: ["127.0.0.1", "crawlyui.com"],
       node_ports: [
         {:ui, 0}
       ]
     ```
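
For reference, here is a minimal sketch of what the item pipelines list in config.exs could look like with `SendToUI` placed before the encoder pipeline. The surrounding pipeline entries are illustrative, not required:

```elixir
# config.exs (illustrative sketch) — SendToUI runs before the encoder/writer
# pipelines so that the UI node receives structured items, not encoded strings.
config :crawly,
  pipelines: [
    {Crawly.Pipelines.Validate, fields: [:title]},
    {Crawly.Pipelines.Experimental.SendToUI, ui_node: :"ui@127.0.0.1"},
    Crawly.Pipelines.JSONEncoder,
    {Crawly.Pipelines.WriteToFile, extension: "jl", folder: "/tmp"}
  ]
```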

## Testing it locally with a docker-compose

CrawlyUI ships with a docker-compose setup that brings up the UI, worker, and database
nodes, so everything is ready for testing with just one command.

In order to try it:
1. Clone the crawly_ui repo: `git clone git@github.com:oltarasenko/crawly_ui.git`
2. Build the UI and worker nodes: `docker-compose build`
3. Apply migrations: `docker-compose run ui bash -c "/crawlyui/bin/ec eval \"CrawlyUI.ReleaseTasks.migrate\""`
4. Run it all: `docker-compose up`

## Live demo

A live demo is available as well. However, it might be a bit unstable due to our continuous release process.
Please give it a try and let us know what you think.

[Live Demo](http://18.216.221.122/)

## Items browser

One of the cool features of CrawlyUI is the items browser, which allows comparing
extracted data with the target website loaded in an iframe. However, as sites may block iframes, a browser extension that ignores X-Frame-Options headers may be used as a workaround.
For example:
[Chrome extension](https://chrome.google.com/webstore/detail/ignore-x-frame-headers/gleekbfjekiniecknbkamfmkohkpodhe)

## Gallery

![Main Page](documentation/assets/main_page.png?raw=true)

![Items browser](documentation/assets/items_page.png?raw=true)

![Items browser search](documentation/assets/item_with_filters.png?raw=true)

![Item preview example](documentation/assets/item_preview_example.png?raw=true)
6 changes: 6 additions & 0 deletions lib/crawly.ex
@@ -53,4 +53,10 @@ defmodule Crawly do
spider.parse_item(response)
end
end

@doc """
  Returns a list of known modules which implement the Crawly.Spider behaviour.
"""
@spec list_spiders() :: [module()]
def list_spiders(), do: Crawly.Utils.list_spiders()
end
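
As an illustration, the new `Crawly.list_spiders/0` delegate could be used like this; the spider module name and the idea of starting every discovered spider are hypothetical usage, not part of the change:

```elixir
# List every module in the started applications that implements Crawly.Spider
spiders = Crawly.list_spiders()
# => e.g. [MyApp.BooksSpider]

# Hypothetical follow-up: start each discovered spider via Crawly's engine
Enum.each(spiders, &Crawly.Engine.start_spider/1)
```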
31 changes: 31 additions & 0 deletions lib/crawly/pipelines/experimental/send_to_ui.ex
@@ -0,0 +1,31 @@
defmodule Crawly.Pipelines.Experimental.SendToUI do
  @moduledoc """
  Experimental pipeline that sends scraped items to a CrawlyUI node.

  Every item is forwarded to the node given in the `ui_node` option via
  `:rpc.cast/4`, and a `job_tag` is stored in the pipeline state so that all
  items from the same run can be grouped together on the UI side.
  """
@behaviour Crawly.Pipeline

require Logger

@impl Crawly.Pipeline
def run(item, state, opts \\ []) do
job_tag = Map.get(state, :job_tag, UUID.uuid1())
spider_name = state.spider_name |> Atom.to_string()

case Keyword.get(opts, :ui_node) do
nil ->
Logger.debug(
"No ui node is set. It's required to set a UI node to use " <>
"this pipeline. Ignoring the pipeline."
)

ui_node ->
:rpc.cast(ui_node, CrawlyUI, :store_item, [
spider_name,
item,
job_tag,
Node.self() |> to_string()
])
end

{item, Map.put(state, :job_tag, job_tag)}
end
end
64 changes: 56 additions & 8 deletions lib/crawly/utils.ex
@@ -93,9 +93,9 @@ defmodule Crawly.Utils do
catch
error, reason ->
Logger.error(
"Pipeline crash: #{module}, error: #{inspect(error)}, reason: #{inspect(reason)}, args: #{
inspect(args)
}"
"Pipeline crash: #{module}, error: #{inspect(error)}, reason: #{
inspect(reason)
}, args: #{inspect(args)}"
)

{item, state}
@@ -122,13 +122,14 @@
taking precedence over the global settings defined in the config.
"""
@spec get_settings(setting_name, spider_name, default) :: result
when setting_name: atom(),
spider_name: atom(),
default: term(),
result: term()
when setting_name: atom(),
spider_name: atom(),
default: term(),
result: term()

def get_settings(setting_name, spider_name \\ nil, default \\ nil) do
global_setting = Application.get_env(:crawly, setting_name, default)

case get_spider_setting(setting_name, spider_name) do
nil ->
# No custom settings for a spider found
@@ -139,6 +140,40 @@
end
end
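  # Illustrative example: a spider can take precedence over a global setting by
  # defining override_settings/0 (MySpider is a hypothetical spider module), e.g.
  #
  #   def override_settings() do
  #     [closespider_timeout: 30]
  #   end
  #
  # get_settings(:closespider_timeout, MySpider, 10) would then return 30 for
  # MySpider, while other spiders keep the global or default value.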

@doc """
  Returns a list of known modules which implement the Crawly.Spider behaviour.
"""
@spec list_spiders() :: [module()]
def list_spiders() do
Enum.reduce(
get_modules_from_applications(),
[],
fn mod, acc ->
try do
behaviors =
Keyword.take(mod.__info__(:attributes), [:behaviour])
|> Keyword.values()
|> List.flatten()

module_has_spider_behaviour =
Enum.any?(behaviors, fn beh -> beh == Crawly.Spider end)

case module_has_spider_behaviour do
true ->
[mod] ++ acc

false ->
acc
end
rescue
error ->
Logger.debug("Could not get behaviour information for: #{inspect(error)}")
acc
end
end
)
end

##############################################################################
# Private functions
##############################################################################
@@ -152,11 +187,24 @@
defp get_spider_setting(setting_name, spider_name) do
case function_exported?(spider_name, :override_settings, 0) do
true ->

Keyword.get(spider_name.override_settings(), setting_name, nil)

false ->
nil
end
end

@spec get_modules_from_applications() :: [module()]
def get_modules_from_applications do
Enum.reduce(Application.started_applications(), [], fn {app, _descr, _vsn},
acc ->
case :application.get_key(app, :modules) do
{:ok, modules} ->
modules ++ acc

_other ->
acc
end
end)
end
end
17 changes: 16 additions & 1 deletion mix.exs
@@ -17,10 +17,24 @@ defmodule Crawly.Mixfile do
elixirc_paths: elixirc_paths(Mix.env()),
docs: docs(),
elixirc_options: [warnings_as_errors: true],
deps: deps()
deps: deps(),
aliases: aliases()
]
end

defp aliases do
[
generate_documentation: &generate_documentation/1,
]
end

defp generate_documentation(_) do
System.cmd("mix", ["docs"])
System.cmd("mkdir", ["-p", "./doc/documentation/assets"])
System.cmd("cp", ["-r", "documentation/assets", "doc/documentation"])
end
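  # With this alias in place, running `mix generate_documentation` should build
  # the ExDoc output and copy documentation/assets into the generated doc folder.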


defp elixirc_paths(:test), do: ["lib", "test"]
defp elixirc_paths(_), do: ["lib"]
# Run "mix help compile.app" to learn about applications.
@@ -103,6 +117,7 @@ defmodule Crawly.Mixfile do
"documentation/configuration.md",
"documentation/http_api.md",
"documentation/ethical_aspects.md",
"documentation/experimental_ui.md",
"readme.md": [title: "Introduction", file: "README.md"]
]
end
24 changes: 24 additions & 0 deletions test/pipelines/experimental/send_to_ui_test.exs
@@ -0,0 +1,24 @@
defmodule Pipelines.Experimental.SendToUITest do
use ExUnit.Case, async: false

@item %{title: "Title", author: "Me"}
test "job tag is added to the state" do
pipelines = [{Crawly.Pipelines.Experimental.SendToUI, ui_node: :'ui@127.0.0.1'}]
state = %{spider_name: PipelineTestSpider}
{@item, state} = Crawly.Utils.pipe(pipelines, @item, state)

assert Map.get(state, :job_tag) != nil
end

test "job tag is not re-generated if pipeline was re-executed" do
pipelines = [{Crawly.Pipelines.Experimental.SendToUI, ui_node: :'ui@127.0.0.1'}]
state = %{spider_name: PipelineTestSpider}
{@item, state} = Crawly.Utils.pipe(pipelines, @item, state)

job_tag = Map.get(state, :job_tag)

{@item, state2} = Crawly.Utils.pipe(pipelines, @item, state)

assert Map.get(state2, :job_tag) == job_tag
end
end
28 changes: 28 additions & 0 deletions test/test_utils.ex
@@ -45,3 +45,31 @@ defmodule TestSpider do
}
end
end

defmodule UtilsTestSpider do
use GenServer
use Crawly.Spider

@impl true
def init(init_arg) do
{:ok, init_arg}
end

@impl Crawly.Spider
def base_url() do
"https://www.example.com"
end

@impl Crawly.Spider
def init() do
[
start_urls: ["https://www.example.com"]
]
end

@impl Crawly.Spider
def parse_item(_response) do
{[], []}
end
end

8 changes: 8 additions & 0 deletions test/utils_test.exs
@@ -91,6 +91,12 @@ defmodule UtilsTest do
assert Map.has_key?(state, :args) == false
end

test "can find CrawlySpider behaviors" do
assert Enum.any?(
Crawly.Utils.list_spiders(),
fn x -> x == UtilsTestSpider end)
end

defp expected_request(url) do
%Crawly.Request{
url: url,
@@ -105,3 +111,5 @@
}
end
end

