From 96957c35229f75f874cf46564672ff831a759aa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Varj=C3=A3o?= Date: Wed, 8 Jul 2020 15:24:25 -0300 Subject: [PATCH 1/4] [add] an API to Crawly.Manager --- lib/crawly/engine.ex | 17 +++++++ lib/crawly/manager.ex | 21 ++++++-- lib/crawly/manager_api.ex | 19 ++++++++ test/features/managet_test_spider.ex | 48 ++++++++++++++++++ test/manager_api_test.exs | 60 +++++++++++++++++++++++ test/manager_test.exs | 73 +++++----------------------- 6 files changed, 174 insertions(+), 64 deletions(-) create mode 100644 lib/crawly/manager_api.ex create mode 100644 test/features/managet_test_spider.ex create mode 100644 test/manager_api_test.exs diff --git a/lib/crawly/engine.ex b/lib/crawly/engine.ex index bb5b0b5c..9dcb916d 100644 --- a/lib/crawly/engine.ex +++ b/lib/crawly/engine.ex @@ -21,6 +21,23 @@ defmodule Crawly.Engine do GenServer.call(__MODULE__, {:start_spider, spider_name}) end + @spec get_manager(module()) :: + pid() | {:error, :spider_non_exist} | {:error, :spider_not_found} + def get_manager(spider_name) do + case Map.fetch(running_spiders(), spider_name) do + :error -> + {:error, :spider_non_exist} + + {:ok, pid_sup} -> + Supervisor.which_children(pid_sup) + |> Enum.find(&({Crawly.Manager, _, :worker, [Crawly.Manager]} = &1)) + |> case do + nil -> {:error, :spider_not_found} + {_, pid, :worker, _} -> pid + end + end + end + @spec stop_spider(module(), reason) :: result when reason: :itemcount_limit | :itemcount_timeout | atom(), result: :ok | {:error, :spider_not_running} diff --git a/lib/crawly/manager.ex b/lib/crawly/manager.ex index 04bfed49..1a50a120 100644 --- a/lib/crawly/manager.ex +++ b/lib/crawly/manager.ex @@ -38,6 +38,7 @@ defmodule Crawly.Manager do GenServer.start_link(__MODULE__, spider_name) end + @impl true def init(spider_name) do # Getting spider start urls [start_urls: urls] = spider_name.init() @@ -83,6 +84,18 @@ defmodule Crawly.Manager do %{name: spider_name, tref: tref, prev_scraped_cnt: 0, workers: worker_pids}} end + @impl true + def handle_cast({:add_workers, num_of_workers}, state) do + Logger.info("Adding #{num_of_workers} workers for #{state.name}") + + Enum.each(1..num_of_workers, fn _ -> + DynamicSupervisor.start_child(state.name, {Crawly.Worker, [state.name]}) + end) + + {:noreply, state} + end + + @impl true def handle_info(:operations, state) do Process.cancel_timer(state.tref) @@ -109,11 +122,11 @@ defmodule Crawly.Manager do |> Utils.get_settings(state.name) |> maybe_convert_to_integer() - maybe_stop_spider_by_timeout( - state.name, + maybe_stop_spider_by_timeout( + state.name, delta, - closespider_timeout_limit - ) + closespider_timeout_limit + ) tref = Process.send_after( diff --git a/lib/crawly/manager_api.ex b/lib/crawly/manager_api.ex new file mode 100644 index 00000000..dbf43a52 --- /dev/null +++ b/lib/crawly/manager_api.ex @@ -0,0 +1,19 @@ +defmodule Crawly.ManagerAPI do + @moduledoc """ + TODO + """ + + alias Crawly.Engine + + @spec add_workers(module(), non_neg_integer()) :: + :ok | {:error, :spider_non_exist} + def add_workers(spider_name, num_of_workers) do + case Engine.get_manager(spider_name) do + {:error, reason} -> + {:error, reason} + + pid -> + GenServer.cast(pid, {:add_workers, num_of_workers}) + end + end +end diff --git a/test/features/managet_test_spider.ex b/test/features/managet_test_spider.ex new file mode 100644 index 00000000..ed4c79a1 --- /dev/null +++ b/test/features/managet_test_spider.ex @@ -0,0 +1,48 @@ +defmodule Features.Manager.TestSpider do + use Crawly.Spider + + def override_settings() do + on_spider_closed_callback = fn reason -> + case Process.whereis(:spider_closed_callback_test) do + nil -> + :nothing_to_do + + _pid -> + send(:spider_closed_callback_test, reason) + end + end + + [on_spider_closed_callback: on_spider_closed_callback] + end + + def base_url() do + "https://www.example.com" + end + + def init() do + [ + start_urls: ["https://www.example.com/blog.html"] + ] + end + + def parse_item(_response) do + path = Enum.random(1..100) + + %{ + :items => [ + %{title: "t_#{path}", url: "example.com", author: "Me", time: "not set"} + ], + :requests => [ + Crawly.Utils.request_from_url("https://www.example.com/#{path}") + ] + } + end + + def spider_closed(:manual_stop) do + send(:spider_closed_callback_test, :manual_stop) + end + + def spider_closed(_) do + :ignored + end +end diff --git a/test/manager_api_test.exs b/test/manager_api_test.exs new file mode 100644 index 00000000..44c5409d --- /dev/null +++ b/test/manager_api_test.exs @@ -0,0 +1,60 @@ +defmodule ManagerAPITest do + use ExUnit.Case, async: false + + import ExUnit.CaptureLog + + alias Crawly.ManagerAPI + alias Features.Manager.TestSpider + + setup do + Application.put_env(:crawly, :concurrent_requests_per_domain, 1) + :ok = Crawly.Engine.start_spider(TestSpider) + + :meck.expect(HTTPoison, :get, fn _, _, _ -> + {:ok, + %HTTPoison.Response{ + status_code: 200, + body: "Some page", + headers: [], + request: %{} + }} + end) + + on_exit(fn -> + :meck.unload() + Crawly.Engine.stop_spider(TestSpider) + end) + end + + # test "it can get the spider pid by name" do + # pid = ManagerAPI.get_spider_pid_by_name(TestSpider) + # state = :sys.get_state(pid) + # assert TestSpider == state.name + # end + + test "it is possible to add more workers to a spider" do + spider_name = TestSpider + initial_number_of_workers = 1 + + assert initial_number_of_workers == + DynamicSupervisor.count_children(spider_name)[:workers] + + workers = 2 + assert :ok == ManagerAPI.add_workers(spider_name, workers) + + assert capture_log(fn -> Process.sleep(500) end) =~ + "Adding #{workers} workers for #{spider_name}" + + pid = Crawly.Engine.get_manager(spider_name) + state = :sys.get_state(pid) + assert spider_name == state.name + + assert initial_number_of_workers + workers == + DynamicSupervisor.count_children(spider_name)[:workers] + end + + test "returns error when spider doesn't exist" do + assert {:error, :spider_non_exist} == + ManagerAPI.add_workers(Manager.NonExistentSpider, 2) + end +end diff --git a/test/manager_test.exs b/test/manager_test.exs index 658e4d25..19f21336 100644 --- a/test/manager_test.exs +++ b/test/manager_test.exs @@ -1,6 +1,8 @@ defmodule ManagerTest do use ExUnit.Case, async: false + alias Features.Manager.TestSpider + setup do Application.put_env(:crawly, :concurrent_requests_per_domain, 1) Application.put_env(:crawly, :closespider_itemcount, 10) @@ -18,7 +20,7 @@ defmodule ManagerTest do on_exit(fn -> :meck.unload() - Crawly.Engine.stop_spider(Manager.TestSpider) + Crawly.Engine.stop_spider(TestSpider) Application.put_env(:crawly, :manager_operations_timeout, 30_000) Application.put_env(:crawly, :concurrent_requests_per_domain, 1) Application.put_env(:crawly, :closespider_timeout, 20) @@ -27,16 +29,16 @@ defmodule ManagerTest do end test "max request per minute is respected" do - :ok = Crawly.Engine.start_spider(Manager.TestSpider) + :ok = Crawly.Engine.start_spider(TestSpider) - {:stored_requests, num} = Crawly.RequestsStorage.stats(Manager.TestSpider) + {:stored_requests, num} = Crawly.RequestsStorage.stats(TestSpider) assert num == 1 Process.sleep(1_00) - {:stored_items, num} = Crawly.DataStorage.stats(Manager.TestSpider) + {:stored_items, num} = Crawly.DataStorage.stats(TestSpider) assert num == 1 - :ok = Crawly.Engine.stop_spider(Manager.TestSpider) + :ok = Crawly.Engine.stop_spider(TestSpider) assert %{} == Crawly.Engine.running_spiders() end @@ -45,7 +47,7 @@ defmodule ManagerTest do Application.put_env(:crawly, :manager_operations_timeout, 50) Application.put_env(:crawly, :closespider_itemcount, 1) - :ok = Crawly.Engine.start_spider(Manager.TestSpider) + :ok = Crawly.Engine.start_spider(TestSpider) assert_receive :itemcount_timeout @@ -62,73 +64,24 @@ defmodule ManagerTest do Application.put_env(:crawly, :closespider_timeout, 10) Application.put_env(:crawly, :manager_operations_timeout, 50) - :ok = Crawly.Engine.start_spider(Manager.TestSpider) + :ok = Crawly.Engine.start_spider(TestSpider) assert_receive :itemcount_timeout assert %{} == Crawly.Engine.running_spiders() end test "Can't start already started spider" do - :ok = Crawly.Engine.start_spider(Manager.TestSpider) + :ok = Crawly.Engine.start_spider(TestSpider) assert {:error, :spider_already_started} == - Crawly.Engine.start_spider(Manager.TestSpider) + Crawly.Engine.start_spider(TestSpider) end test "Spider closed callback is called when spider is stopped" do Process.register(self(), :spider_closed_callback_test) - :ok = Crawly.Engine.start_spider(Manager.TestSpider) - :ok = Crawly.Engine.stop_spider(Manager.TestSpider, :manual_stop) + :ok = Crawly.Engine.start_spider(TestSpider) + :ok = Crawly.Engine.stop_spider(TestSpider, :manual_stop) assert_receive :manual_stop end end - -defmodule Manager.TestSpider do - use Crawly.Spider - - def override_settings() do - on_spider_closed_callback = fn reason -> - case Process.whereis(:spider_closed_callback_test) do - nil -> - :nothing_to_do - - _pid -> - send(:spider_closed_callback_test, reason) - end - end - - [on_spider_closed_callback: on_spider_closed_callback] - end - - def base_url() do - "https://www.example.com" - end - - def init() do - [ - start_urls: ["https://www.example.com/blog.html"] - ] - end - - def parse_item(_response) do - path = Enum.random(1..100) - - %{ - :items => [ - %{title: "t_#{path}", url: "example.com", author: "Me", time: "not set"} - ], - :requests => [ - Crawly.Utils.request_from_url("https://www.example.com/#{path}") - ] - } - end - - def spider_closed(:manual_stop) do - send(:spider_closed_callback_test, :manual_stop) - end - - def spider_closed(_) do - :ignored - end -end From d1aa00a56e886ecc0563f0da9daa81196e29bc3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Varj=C3=A3o?= Date: Wed, 8 Jul 2020 15:32:23 -0300 Subject: [PATCH 2/4] [fix] removing the capture_log during test --- test/manager_api_test.exs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/test/manager_api_test.exs b/test/manager_api_test.exs index 44c5409d..499f9dfa 100644 --- a/test/manager_api_test.exs +++ b/test/manager_api_test.exs @@ -1,8 +1,6 @@ defmodule ManagerAPITest do use ExUnit.Case, async: false - import ExUnit.CaptureLog - alias Crawly.ManagerAPI alias Features.Manager.TestSpider @@ -26,12 +24,6 @@ defmodule ManagerAPITest do end) end - # test "it can get the spider pid by name" do - # pid = ManagerAPI.get_spider_pid_by_name(TestSpider) - # state = :sys.get_state(pid) - # assert TestSpider == state.name - # end - test "it is possible to add more workers to a spider" do spider_name = TestSpider initial_number_of_workers = 1 @@ -42,9 +34,6 @@ defmodule ManagerAPITest do workers = 2 assert :ok == ManagerAPI.add_workers(spider_name, workers) - assert capture_log(fn -> Process.sleep(500) end) =~ - "Adding #{workers} workers for #{spider_name}" - pid = Crawly.Engine.get_manager(spider_name) state = :sys.get_state(pid) assert spider_name == state.name From 6a94c232867fda238d70407318af772caf3e02f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Varj=C3=A3o?= Date: Fri, 10 Jul 2020 16:41:39 -0300 Subject: [PATCH 3/4] removing file manager_api --- lib/crawly/manager_api.ex | 19 ----------- test/features/managet_test_spider.ex | 48 --------------------------- test/manager_api_test.exs | 49 ---------------------------- 3 files changed, 116 deletions(-) delete mode 100644 lib/crawly/manager_api.ex delete mode 100644 test/features/managet_test_spider.ex delete mode 100644 test/manager_api_test.exs diff --git a/lib/crawly/manager_api.ex b/lib/crawly/manager_api.ex deleted file mode 100644 index dbf43a52..00000000 --- a/lib/crawly/manager_api.ex +++ /dev/null @@ -1,19 +0,0 @@ -defmodule Crawly.ManagerAPI do - @moduledoc """ - TODO - """ - - alias Crawly.Engine - - @spec add_workers(module(), non_neg_integer()) :: - :ok | {:error, :spider_non_exist} - def add_workers(spider_name, num_of_workers) do - case Engine.get_manager(spider_name) do - {:error, reason} -> - {:error, reason} - - pid -> - GenServer.cast(pid, {:add_workers, num_of_workers}) - end - end -end diff --git a/test/features/managet_test_spider.ex b/test/features/managet_test_spider.ex deleted file mode 100644 index ed4c79a1..00000000 --- a/test/features/managet_test_spider.ex +++ /dev/null @@ -1,48 +0,0 @@ -defmodule Features.Manager.TestSpider do - use Crawly.Spider - - def override_settings() do - on_spider_closed_callback = fn reason -> - case Process.whereis(:spider_closed_callback_test) do - nil -> - :nothing_to_do - - _pid -> - send(:spider_closed_callback_test, reason) - end - end - - [on_spider_closed_callback: on_spider_closed_callback] - end - - def base_url() do - "https://www.example.com" - end - - def init() do - [ - start_urls: ["https://www.example.com/blog.html"] - ] - end - - def parse_item(_response) do - path = Enum.random(1..100) - - %{ - :items => [ - %{title: "t_#{path}", url: "example.com", author: "Me", time: "not set"} - ], - :requests => [ - Crawly.Utils.request_from_url("https://www.example.com/#{path}") - ] - } - end - - def spider_closed(:manual_stop) do - send(:spider_closed_callback_test, :manual_stop) - end - - def spider_closed(_) do - :ignored - end -end diff --git a/test/manager_api_test.exs b/test/manager_api_test.exs deleted file mode 100644 index 499f9dfa..00000000 --- a/test/manager_api_test.exs +++ /dev/null @@ -1,49 +0,0 @@ -defmodule ManagerAPITest do - use ExUnit.Case, async: false - - alias Crawly.ManagerAPI - alias Features.Manager.TestSpider - - setup do - Application.put_env(:crawly, :concurrent_requests_per_domain, 1) - :ok = Crawly.Engine.start_spider(TestSpider) - - :meck.expect(HTTPoison, :get, fn _, _, _ -> - {:ok, - %HTTPoison.Response{ - status_code: 200, - body: "Some page", - headers: [], - request: %{} - }} - end) - - on_exit(fn -> - :meck.unload() - Crawly.Engine.stop_spider(TestSpider) - end) - end - - test "it is possible to add more workers to a spider" do - spider_name = TestSpider - initial_number_of_workers = 1 - - assert initial_number_of_workers == - DynamicSupervisor.count_children(spider_name)[:workers] - - workers = 2 - assert :ok == ManagerAPI.add_workers(spider_name, workers) - - pid = Crawly.Engine.get_manager(spider_name) - state = :sys.get_state(pid) - assert spider_name == state.name - - assert initial_number_of_workers + workers == - DynamicSupervisor.count_children(spider_name)[:workers] - end - - test "returns error when spider doesn't exist" do - assert {:error, :spider_non_exist} == - ManagerAPI.add_workers(Manager.NonExistentSpider, 2) - end -end From 823509f1e300a95752310245ac50dbf6d5bc69e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Varj=C3=A3o?= Date: Fri, 10 Jul 2020 16:43:44 -0300 Subject: [PATCH 4/4] [add] manager api --- lib/crawly/engine.ex | 17 +++++++- lib/crawly/manager.ex | 14 ++++++- test/manager_test.exs | 98 ++++++++++++++++++++++++++++++++++++------- 3 files changed, 112 insertions(+), 17 deletions(-) diff --git a/lib/crawly/engine.ex b/lib/crawly/engine.ex index 9dcb916d..c6faa2be 100644 --- a/lib/crawly/engine.ex +++ b/lib/crawly/engine.ex @@ -22,11 +22,11 @@ defmodule Crawly.Engine do end @spec get_manager(module()) :: - pid() | {:error, :spider_non_exist} | {:error, :spider_not_found} + pid() | {:error, :spider_not_found} def get_manager(spider_name) do case Map.fetch(running_spiders(), spider_name) do :error -> - {:error, :spider_non_exist} + {:error, :spider_not_found} {:ok, pid_sup} -> Supervisor.which_children(pid_sup) @@ -64,6 +64,19 @@ defmodule Crawly.Engine do {:ok, %Crawly.Engine{}} end + def handle_call({:get_manager, spider_name}, _, state) do + pid = + case Map.get(state.started_spiders, spider_name) do + nil -> + {:error, :spider_not_found} + + pid -> + pid + end + + {:reply, pid, state} + end + def handle_call(:running_spiders, _from, state) do {:reply, state.started_spiders, state} end diff --git a/lib/crawly/manager.ex b/lib/crawly/manager.ex index 1a50a120..c01ac194 100644 --- a/lib/crawly/manager.ex +++ b/lib/crawly/manager.ex @@ -31,7 +31,19 @@ defmodule Crawly.Manager do use GenServer - alias Crawly.Utils + alias Crawly.{Engine, Utils} + + @spec add_workers(module(), non_neg_integer()) :: + :ok | {:error, :spider_non_exist} + def add_workers(spider_name, num_of_workers) do + case Engine.get_manager(spider_name) do + {:error, reason} -> + {:error, reason} + + pid -> + GenServer.cast(pid, {:add_workers, num_of_workers}) + end + end def start_link(spider_name) do Logger.debug("Starting the manager for #{spider_name}") diff --git a/test/manager_test.exs b/test/manager_test.exs index 19f21336..1ca3f7e2 100644 --- a/test/manager_test.exs +++ b/test/manager_test.exs @@ -1,8 +1,6 @@ defmodule ManagerTest do use ExUnit.Case, async: false - alias Features.Manager.TestSpider - setup do Application.put_env(:crawly, :concurrent_requests_per_domain, 1) Application.put_env(:crawly, :closespider_itemcount, 10) @@ -20,7 +18,7 @@ defmodule ManagerTest do on_exit(fn -> :meck.unload() - Crawly.Engine.stop_spider(TestSpider) + Crawly.Engine.stop_spider(Manager.TestSpider) Application.put_env(:crawly, :manager_operations_timeout, 30_000) Application.put_env(:crawly, :concurrent_requests_per_domain, 1) Application.put_env(:crawly, :closespider_timeout, 20) @@ -28,17 +26,41 @@ defmodule ManagerTest do end) end + test "it is possible to add more workers to a spider" do + spider_name = Manager.TestSpider + :ok = Crawly.Engine.start_spider(spider_name) + initial_number_of_workers = 1 + + assert initial_number_of_workers == + DynamicSupervisor.count_children(spider_name)[:workers] + + workers = 2 + assert :ok == Crawly.Manager.add_workers(spider_name, workers) + + pid = Crawly.Engine.get_manager(spider_name) + state = :sys.get_state(pid) + assert spider_name == state.name + + assert initial_number_of_workers + workers == + DynamicSupervisor.count_children(spider_name)[:workers] + end + + test "returns error when spider doesn't exist" do + assert {:error, :spider_not_found} == + Crawly.Manager.add_workers(Manager.NonExistentSpider, 2) + end + test "max request per minute is respected" do - :ok = Crawly.Engine.start_spider(TestSpider) + :ok = Crawly.Engine.start_spider(Manager.TestSpider) - {:stored_requests, num} = Crawly.RequestsStorage.stats(TestSpider) + {:stored_requests, num} = Crawly.RequestsStorage.stats(Manager.TestSpider) assert num == 1 Process.sleep(1_00) - {:stored_items, num} = Crawly.DataStorage.stats(TestSpider) + {:stored_items, num} = Crawly.DataStorage.stats(Manager.TestSpider) assert num == 1 - :ok = Crawly.Engine.stop_spider(TestSpider) + :ok = Crawly.Engine.stop_spider(Manager.TestSpider) assert %{} == Crawly.Engine.running_spiders() end @@ -47,7 +69,7 @@ defmodule ManagerTest do Application.put_env(:crawly, :manager_operations_timeout, 50) Application.put_env(:crawly, :closespider_itemcount, 1) - :ok = Crawly.Engine.start_spider(TestSpider) + :ok = Crawly.Engine.start_spider(Manager.TestSpider) assert_receive :itemcount_timeout @@ -55,7 +77,6 @@ defmodule ManagerTest do end test "Closespider timeout is respected" do - Process.register(self(), :spider_closed_callback_test) # Ignore closespider_itemcount @@ -64,24 +85,73 @@ defmodule ManagerTest do Application.put_env(:crawly, :closespider_timeout, 10) Application.put_env(:crawly, :manager_operations_timeout, 50) - :ok = Crawly.Engine.start_spider(TestSpider) + :ok = Crawly.Engine.start_spider(Manager.TestSpider) assert_receive :itemcount_timeout assert %{} == Crawly.Engine.running_spiders() end test "Can't start already started spider" do - :ok = Crawly.Engine.start_spider(TestSpider) + :ok = Crawly.Engine.start_spider(Manager.TestSpider) assert {:error, :spider_already_started} == - Crawly.Engine.start_spider(TestSpider) + Crawly.Engine.start_spider(Manager.TestSpider) end test "Spider closed callback is called when spider is stopped" do Process.register(self(), :spider_closed_callback_test) - :ok = Crawly.Engine.start_spider(TestSpider) - :ok = Crawly.Engine.stop_spider(TestSpider, :manual_stop) + :ok = Crawly.Engine.start_spider(Manager.TestSpider) + :ok = Crawly.Engine.stop_spider(Manager.TestSpider, :manual_stop) assert_receive :manual_stop end end + +defmodule Manager.TestSpider do + use Crawly.Spider + + def override_settings() do + on_spider_closed_callback = fn reason -> + case Process.whereis(:spider_closed_callback_test) do + nil -> + :nothing_to_do + + _pid -> + send(:spider_closed_callback_test, reason) + end + end + + [on_spider_closed_callback: on_spider_closed_callback] + end + + def base_url() do + "https://www.example.com" + end + + def init() do + [ + start_urls: ["https://www.example.com/blog.html"] + ] + end + + def parse_item(_response) do + path = Enum.random(1..100) + + %{ + :items => [ + %{title: "t_#{path}", url: "example.com", author: "Me", time: "not set"} + ], + :requests => [ + Crawly.Utils.request_from_url("https://www.example.com/#{path}") + ] + } + end + + def spider_closed(:manual_stop) do + send(:spider_closed_callback_test, :manual_stop) + end + + def spider_closed(_) do + :ignored + end +end