diff --git a/README.md b/README.md
index 7c7bda7a..033e758e 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,17 @@ of asynchronous elements (for example parts loaded by AJAX).
 You can read more here:
 - [Browser Rendering](https://hexdocs.pm/crawly/basic_concepts.html#browser-rendering)
 
+## Experimental UI
+
+The CrawlyUI project is an add-on that aims to provide an interface for managing and rapidly developing spiders.
+
+![](doc/assets/main_page.png?raw=true)
+![](doc/assets/items_page.png?raw=true)
+![](doc/assets/item_with_filters.png?raw=true)
+![](doc/assets/item_preview_example.png?raw=true)
+
+See more at [Experimental UI](https://hexdocs.pm/crawly/experimental_ui.html#content)
+
 ## Documentation
 
 - [API Reference](https://hexdocs.pm/crawly/api-reference.html#content)
diff --git a/documentation/experimental_ui.md b/documentation/experimental_ui.md
new file mode 100644
index 00000000..3762d302
--- /dev/null
+++ b/documentation/experimental_ui.md
@@ -0,0 +1,69 @@
+# Experimental UI
+---
+
+We believe that web scraping is a process. It might seem easy to extract the
+first items, but reliable data delivery requires a bit more effort and a
+process that supports it!
+
+Our aim is to provide you with the following services:
+
+1. Schedule (start and stop) your spiders in the cloud
+2. View running jobs (performance-based analysis)
+3. View and validate scraped items for quality assurance and data analysis purposes
+4. View individual items and compare them with the actual website
+
+## Setting it up
+
+You can find setup examples [here](https://github.com/oltarasenko/crawly_ui/tree/master/examples).
+
+At the highest level you need to:
+1. Add the SendToUI pipeline to the list of your item pipelines, before the
+   encoder pipelines (a fuller configuration sketch is shown after this list):
+   `{Crawly.Pipelines.Experimental.SendToUI, ui_node: :'ui@127.0.0.1'}`
+2. Organize an Erlang cluster so that the Crawly nodes can find the CrawlyUI
+   node. The example above uses the
+   [erlang-node-discovery](https://github.com/oltarasenko/erlang-node-discovery)
+   application for this task, however any other alternative would also work.
+   To set up erlang-node-discovery:
+   - add the dependency to the deps section of mix.exs:
+     `{:erlang_node_discovery, git: "https://github.com/oltarasenko/erlang-node-discovery"}`
+   - add the following lines to config.exs:
+```
+config :erlang_node_discovery,
+  hosts: ["127.0.0.1", "crawlyui.com"],
+  node_ports: [
+    {:ui, 0}
+  ]
+```
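+
+For reference, the complete pipelines section of config.exs might then look
+like the sketch below. The Validate and DuplicatesFilter entries, and the
+`:title`/`:author` fields, are placeholders; replace them with the pipelines
+your project actually uses:
+
+```elixir
+config :crawly,
+  pipelines: [
+    # Validation and deduplication run first, on structured items
+    {Crawly.Pipelines.Validate, fields: [:title, :author]},
+    {Crawly.Pipelines.DuplicatesFilter, item_id: :title},
+    # SendToUI is placed before the encoder pipelines, so items arrive
+    # in the UI as maps rather than encoded JSON strings
+    {Crawly.Pipelines.Experimental.SendToUI, ui_node: :'ui@127.0.0.1'},
+    Crawly.Pipelines.JSONEncoder
+  ]
+```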
+
+## Testing it locally with docker-compose
+
+CrawlyUI ships with a docker-compose environment which brings up the UI,
+worker and database nodes, so everything is ready for testing with just a
+few commands.
+
+In order to try it:
+1. Clone the crawly_ui repo: `git clone git@github.com:oltarasenko/crawly_ui.git`
+2. Build the UI and worker nodes: `docker-compose build`
+3. Apply the migrations: `docker-compose run ui bash -c "/crawlyui/bin/ec eval \"CrawlyUI.ReleaseTasks.migrate\""`
+4. Run it all: `docker-compose up`
+
+## Live demo
+
+A live demo is available as well. However, it might be a bit unstable due to
+our continuous release process. Please give it a try and let us know what
+you think.
+
+[Live Demo](http://18.216.221.122/)
+
+## Items browser
+
+One of the cool features of CrawlyUI is the items browser, which allows you
+to compare extracted data with the target website loaded in an iframe.
+However, as sites may block iframes, a browser extension that ignores
+X-Frame headers may be used as a workaround. For example:
+[Chrome extension](https://chrome.google.com/webstore/detail/ignore-x-frame-headers/gleekbfjekiniecknbkamfmkohkpodhe)
+
+## Gallery
+
+![Main Page](assets/main_page.png?raw=true)
+--
+![Items browser](assets/items_page.png?raw=true)
+--
+![Items browser search](assets/item_with_filters.png?raw=true)
+--
+![Item preview](assets/item_preview_example.png?raw=true)
diff --git a/lib/crawly.ex b/lib/crawly.ex
index adcac66f..930de76a 100644
--- a/lib/crawly.ex
+++ b/lib/crawly.ex
@@ -53,4 +53,10 @@ defmodule Crawly do
       spider.parse_item(response)
     end
   end
+
+  @doc """
+  Returns a list of known modules which implement the Crawly.Spider behaviour
+  """
+  @spec list_spiders() :: [module()]
+  def list_spiders(), do: Crawly.Utils.list_spiders()
 end
diff --git a/lib/crawly/pipelines/experimental/send_to_ui.ex b/lib/crawly/pipelines/experimental/send_to_ui.ex
new file mode 100644
index 00000000..73099562
--- /dev/null
+++ b/lib/crawly/pipelines/experimental/send_to_ui.ex
@@ -0,0 +1,31 @@
+defmodule Crawly.Pipelines.Experimental.SendToUI do
+  @moduledoc """
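+  Experimental pipeline which forwards scraped items to a CrawlyUI node.
+
+  Every item is sent to `CrawlyUI.store_item/4` on the node given by the
+  `ui_node` option via `:rpc.cast/4`, together with the spider name, the
+  worker node name and a job tag (a UUID generated for the first item of a
+  run and kept in the pipeline state afterwards). When `ui_node` is not
+  set, the pipeline logs a debug message and passes items through.
+
+  A minimal configuration sketch, assuming a CrawlyUI node is reachable as
+  ui@127.0.0.1:
+
+      config :crawly,
+        pipelines: [
+          {Crawly.Pipelines.Experimental.SendToUI, ui_node: :'ui@127.0.0.1'}
+        ]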
""" @spec get_settings(setting_name, spider_name, default) :: result - when setting_name: atom(), - spider_name: atom(), - default: term(), - result: term() + when setting_name: atom(), + spider_name: atom(), + default: term(), + result: term() def get_settings(setting_name, spider_name \\ nil, default \\ nil) do global_setting = Application.get_env(:crawly, setting_name, default) + case get_spider_setting(setting_name, spider_name) do nil -> # No custom settings for a spider found @@ -139,6 +140,40 @@ defmodule Crawly.Utils do end end + @doc """ + Returns a list of known modules which implements Crawly.Spider behaviour + """ + @spec list_spiders() :: [module()] + def list_spiders() do + Enum.reduce( + get_modules_from_applications(), + [], + fn mod, acc -> + try do + behaviors = + Keyword.take(mod.__info__(:attributes), [:behaviour]) + |> Keyword.values() + |> List.flatten() + + module_has_spider_behaviour = + Enum.any?(behaviors, fn beh -> beh == Crawly.Spider end) + + case module_has_spider_behaviour do + true -> + [mod] ++ acc + + false -> + acc + end + rescue + error -> + Logger.debug("Could not get behaviour information for: #{inspect(error)}") + acc + end + end + ) + end + ############################################################################## # Private functions ############################################################################## @@ -152,11 +187,24 @@ defmodule Crawly.Utils do defp get_spider_setting(setting_name, spider_name) do case function_exported?(spider_name, :override_settings, 0) do true -> - Keyword.get(spider_name.override_settings(), setting_name, nil) false -> nil end end + + @spec get_modules_from_applications() :: [module()] + def get_modules_from_applications do + Enum.reduce(Application.started_applications(), [], fn {app, _descr, _vsn}, + acc -> + case :application.get_key(app, :modules) do + {:ok, modules} -> + modules ++ acc + + _other -> + acc + end + end) + end end diff --git a/mix.exs b/mix.exs index 008609fa..82f67dc2 100644 --- a/mix.exs +++ b/mix.exs @@ -103,6 +103,7 @@ defmodule Crawly.Mixfile do "documentation/configuration.md", "documentation/http_api.md", "documentation/ethical_aspects.md", + "documentation/experimental_ui.md", "readme.md": [title: "Introduction", file: "README.md"] ] end diff --git a/test/pipelines/experimental/send_to_ui_test.exs b/test/pipelines/experimental/send_to_ui_test.exs new file mode 100644 index 00000000..b016ca58 --- /dev/null +++ b/test/pipelines/experimental/send_to_ui_test.exs @@ -0,0 +1,24 @@ +defmodule Pipelines.Experimental.SendToUITest do + use ExUnit.Case, async: false + + @item %{title: "Title", author: "Me"} + test "job tag is added to the state" do + pipelines = [{Crawly.Pipelines.Experimental.SendToUI, ui_node: :'ui@127.0.0.1'}] + state = %{spider_name: PipelineTestSpider} + {@item, state} = Crawly.Utils.pipe(pipelines, @item, state) + + assert Map.get(state, :job_tag) != nil + end + + test "job tag is not re-generated if pipeline was re-executed" do + pipelines = [{Crawly.Pipelines.Experimental.SendToUI, ui_node: :'ui@127.0.0.1'}] + state = %{spider_name: PipelineTestSpider} + {@item, state} = Crawly.Utils.pipe(pipelines, @item, state) + + job_tag = Map.get(state, :job_tag) + + {@item, state2} = Crawly.Utils.pipe(pipelines, @item, state) + + assert Map.get(state2, :job_tag) == job_tag + end +end diff --git a/test/test_utils.ex b/test/test_utils.ex index 3f4319b0..77e8e253 100644 --- a/test/test_utils.ex +++ b/test/test_utils.ex @@ -45,3 +45,31 @@ defmodule TestSpider do } end end + +defmodule 
+  """
+  @spec list_spiders() :: [module()]
+  def list_spiders() do
+    Enum.reduce(
+      get_modules_from_applications(),
+      [],
+      fn mod, acc ->
+        try do
+          behaviors =
+            Keyword.take(mod.__info__(:attributes), [:behaviour])
+            |> Keyword.values()
+            |> List.flatten()
+
+          module_has_spider_behaviour =
+            Enum.any?(behaviors, fn beh -> beh == Crawly.Spider end)
+
+          case module_has_spider_behaviour do
+            true ->
+              [mod | acc]
+
+            false ->
+              acc
+          end
+        rescue
+          error ->
+            Logger.debug(
+              "Could not get behaviour information for #{inspect(mod)}: " <>
+                "#{inspect(error)}"
+            )
+
+            acc
+        end
+      end
+    )
+  end
+
   ##############################################################################
   # Private functions
   ##############################################################################
@@ -152,11 +187,24 @@ defmodule Crawly.Utils do
   defp get_spider_setting(setting_name, spider_name) do
     case function_exported?(spider_name, :override_settings, 0) do
       true ->
-
         Keyword.get(spider_name.override_settings(), setting_name, nil)
 
       false ->
         nil
     end
   end
+
+  @spec get_modules_from_applications() :: [module()]
+  defp get_modules_from_applications do
+    Enum.reduce(Application.started_applications(), [], fn {app, _descr, _vsn},
+                                                           acc ->
+      case :application.get_key(app, :modules) do
+        {:ok, modules} ->
+          modules ++ acc
+
+        _other ->
+          acc
+      end
+    end)
+  end
 end
diff --git a/mix.exs b/mix.exs
index 008609fa..82f67dc2 100644
--- a/mix.exs
+++ b/mix.exs
@@ -103,6 +103,7 @@ defmodule Crawly.Mixfile do
       "documentation/configuration.md",
       "documentation/http_api.md",
       "documentation/ethical_aspects.md",
+      "documentation/experimental_ui.md",
       "readme.md": [title: "Introduction", file: "README.md"]
     ]
   end
diff --git a/test/pipelines/experimental/send_to_ui_test.exs b/test/pipelines/experimental/send_to_ui_test.exs
new file mode 100644
index 00000000..b016ca58
--- /dev/null
+++ b/test/pipelines/experimental/send_to_ui_test.exs
@@ -0,0 +1,24 @@
+defmodule Pipelines.Experimental.SendToUITest do
+  use ExUnit.Case, async: false
+
+  @item %{title: "Title", author: "Me"}
+
+  test "job tag is added to the state" do
+    pipelines = [{Crawly.Pipelines.Experimental.SendToUI, ui_node: :'ui@127.0.0.1'}]
+    state = %{spider_name: PipelineTestSpider}
+    {@item, state} = Crawly.Utils.pipe(pipelines, @item, state)
+
+    assert Map.get(state, :job_tag) != nil
+  end
+
+  test "job tag is not re-generated if the pipeline is re-executed" do
+    pipelines = [{Crawly.Pipelines.Experimental.SendToUI, ui_node: :'ui@127.0.0.1'}]
+    state = %{spider_name: PipelineTestSpider}
+    {@item, state} = Crawly.Utils.pipe(pipelines, @item, state)
+
+    job_tag = Map.get(state, :job_tag)
+
+    # Re-running the pipeline with the returned state must keep the same tag
+    {@item, state2} = Crawly.Utils.pipe(pipelines, @item, state)
+
+    assert Map.get(state2, :job_tag) == job_tag
+  end
+end
diff --git a/test/test_utils.ex b/test/test_utils.ex
index 3f4319b0..77e8e253 100644
--- a/test/test_utils.ex
+++ b/test/test_utils.ex
@@ -45,3 +45,31 @@ defmodule TestSpider do
     }
   end
 end
+
+# Implements both the GenServer and the Crawly.Spider behaviours, so spider
+# discovery can be tested against a module carrying several behaviour
+# attributes.
+defmodule UtilsTestSpider do
+  use GenServer
+  use Crawly.Spider
+
+  @impl true
+  def init(init_arg) do
+    {:ok, init_arg}
+  end
+
+  @impl Crawly.Spider
+  def base_url() do
+    "https://www.example.com"
+  end
+
+  @impl Crawly.Spider
+  def init() do
+    [
+      start_urls: ["https://www.example.com"]
+    ]
+  end
+
+  @impl Crawly.Spider
+  def parse_item(_response) do
+    {[], []}
+  end
+end
+
diff --git a/test/utils_test.exs b/test/utils_test.exs
index 4cea2f93..fffc81f6 100644
--- a/test/utils_test.exs
+++ b/test/utils_test.exs
@@ -91,6 +91,12 @@
     assert Map.has_key?(state, :args) == false
   end
 
+  test "can find modules which implement the Crawly.Spider behaviour" do
+    assert Enum.any?(
+             Crawly.Utils.list_spiders(),
+             fn x -> x == UtilsTestSpider end
+           )
+  end
+
   defp expected_request(url) do
     %Crawly.Request{
       url: url,
@@ -105,3 +111,5 @@
     }
   end
 end
+
+