Skip to content

Commit

Permalink
Merge 414843b into 8e89eb9
Browse files Browse the repository at this point in the history
  • Loading branch information
oltarasenko committed Apr 9, 2020
2 parents 8e89eb9 + 414843b commit 2b0778d
Show file tree
Hide file tree
Showing 13 changed files with 83 additions and 18 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ historical archival.
```elixir
# lib/crawly_example/esl_spider.ex
defmodule EslSpider do
@behaviour Crawly.Spider
use Crawly.Spider

alias Crawly.Utils

@impl Crawly.Spider
Expand Down
6 changes: 6 additions & 0 deletions documentation/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ default: 4001

Allows specifying a custom port on which to start the application. This is important when running more than one application on a single machine, in which case each application must use a different port.

### on_spider_closed_callback :: function()

default: :ignored

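Allows defining a callback function which will be executed when a spider finishes
its work. The callback is called with a single argument: the reason the spider
was stopped (for example `:itemcount_limit` or `:itemcount_timeout`).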

## Overriding global settings on spider level

Expand Down
2 changes: 1 addition & 1 deletion documentation/introduction.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Here’s the code for a spider that scrapes blog posts from the Erlang Solutions

```elixir
defmodule Esl do
@behaviour Crawly.Spider
use Crawly.Spider

@impl Crawly.Spider
def base_url(), do: "https://www.erlang-solutions.com"
Expand Down
2 changes: 1 addition & 1 deletion documentation/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Goals:
```elixir
# lib/crawly_example/esl_spider.ex
defmodule EslSpider do
@behaviour Crawly.Spider
use Crawly.Spider
alias Crawly.Utils

@impl Crawly.Spider
Expand Down
6 changes: 3 additions & 3 deletions documentation/tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ homebase.ex under the lib/tutorial/spiders directory of your project.

```elixir
defmodule Homebase do
@behaviour Crawly.Spider
use Crawly.Spider

@impl Crawly.Spider
def base_url(), do: "https://www.homebase.co.uk"
Expand All @@ -96,7 +96,7 @@ defmodule Homebase do
end
```

As you can see, our Spider implements the Spider behaviour and defines
As you can see, our Spider implements the Crawly.Spider behaviour and defines
some functions:

1. base_url: method which returns base_urls for the given Spider, used in
Expand Down Expand Up @@ -267,7 +267,7 @@ into our spider.

```elixir
defmodule Homebase do
@behaviour Crawly.Spider
use Crawly.Spider

@impl Crawly.Spider
def base_url(), do: "https://www.homebase.co.uk"
Expand Down
12 changes: 9 additions & 3 deletions lib/crawly/engine.ex
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,15 @@ defmodule Crawly.Engine do
GenServer.call(__MODULE__, {:start_spider, spider_name})
end

@spec stop_spider(module()) ::
:ok | {:error, :spider_not_running}
def stop_spider(spider_name) do
@doc """
Stops the given spider and, once it has been stopped, runs the spider's
`:on_spider_closed_callback` (if one is configured).

`reason` describes why the spider is being stopped and is passed to the
callback as its only argument. It defaults to `:ignore`.

Returns whatever the engine replies to the stop request: `:ok` on success or
`{:error, :spider_not_running}`. The callback is only invoked when the engine
replies `:ok`, so it never fires for a spider that was not running.
"""
@spec stop_spider(module(), reason) :: result
      when reason: :itemcount_limit | :itemcount_timeout | atom(),
           result: :ok | {:error, :spider_not_running}
def stop_spider(spider_name, reason \\ :ignore) do
  # Resolve the callback before stopping, keeping the settings lookup in the
  # same position relative to the stop request as before.
  callback = Crawly.Utils.get_settings(:on_spider_closed_callback, spider_name)

  case GenServer.call(__MODULE__, {:stop_spider, spider_name}) do
    :ok ->
      # A nil setting means no callback was configured.
      if callback != nil, do: callback.(reason)
      :ok

    other ->
      # e.g. {:error, :spider_not_running}: nothing was closed, so the
      # "spider closed" callback must not run.
      other
  end
end

Expand Down
4 changes: 2 additions & 2 deletions lib/crawly/manager.ex
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ defmodule Crawly.Manager do
"Stopping #{inspect(spider_name)}, closespider_itemcount achieved"
)

Crawly.Engine.stop_spider(spider_name)
Crawly.Engine.stop_spider(spider_name, :itemcount_limit)
end

defp maybe_stop_spider_by_itemcount_limit(_, _, _), do: :ok
Expand All @@ -140,7 +140,7 @@ defmodule Crawly.Manager do
when current < limit do
Logger.info("Stopping #{inspect(spider_name)}, itemcount timeout achieved")

Crawly.Engine.stop_spider(spider_name)
Crawly.Engine.stop_spider(spider_name, :itemcount_timeout)
end

defp maybe_stop_spider_by_timeout(_, _, _), do: :ok
Expand Down
3 changes: 2 additions & 1 deletion lib/crawly/settings.ex
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ defmodule Crawly.Settings do
# Defines retries
retry: retry(),
middlewares: [middleware()],
pipelines: [pipeline()]
pipelines: [pipeline()],
on_spider_closed_callback: function()
]
end
10 changes: 9 additions & 1 deletion lib/crawly/spider.ex
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,13 @@ defmodule Crawly.Spider do

@callback override_settings() :: Crawly.Settings.t()

@optional_callbacks override_settings: 0
@doc """
Injected by `use Crawly.Spider`.

Declares the `Crawly.Spider` behaviour for the calling module and provides a
default `override_settings/0` returning an empty list. The default is marked
overridable so a spider can define its own `override_settings/0`.
"""
defmacro __using__(_opts) do
  quote do
    @behaviour Crawly.Spider

    # Default: the spider overrides no settings.
    def override_settings do
      []
    end

    defoverridable(override_settings: 0)
  end
end
end
47 changes: 43 additions & 4 deletions test/manager_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,48 @@ defmodule ManagerTest do

test "Can't start already started spider" do
:ok = Crawly.Engine.start_spider(Manager.TestSpider)
assert {:error, :spider_already_started} == Crawly.Engine.start_spider(Manager.TestSpider)

assert {:error, :spider_already_started} ==
Crawly.Engine.start_spider(Manager.TestSpider)

:ok = Crawly.Engine.stop_spider(Manager.TestSpider)
end


test "Can't stop the spider which is not started already started spider" do
:ok = Crawly.Engine.start_spider(Manager.TestSpider)
assert {:error, :spider_already_started} == Crawly.Engine.start_spider(Manager.TestSpider)

assert {:error, :spider_already_started} ==
Crawly.Engine.start_spider(Manager.TestSpider)

:ok = Crawly.Engine.stop_spider(Manager.TestSpider)
end

test "Spider closed callback is called when spider is stopped" do
Process.register(self(), :spider_closed_callback_test)
:ok = Crawly.Engine.start_spider(Manager.TestSpider)
:ok = Crawly.Engine.stop_spider(Manager.TestSpider, :manual_stop)

assert_receive :manual_stop
end
end

defmodule Manager.TestSpider do
use Crawly.Spider

# Spider-level settings for the test spider: installs an
# on_spider_closed_callback that forwards the stop reason to the process
# registered as :spider_closed_callback_test, when such a process exists.
def override_settings() do
  notify_test_process = fn reason ->
    if Process.whereis(:spider_closed_callback_test) do
      send(:spider_closed_callback_test, reason)
    else
      # No test is listening for the callback.
      :nothing_to_do
    end
  end

  [on_spider_closed_callback: notify_test_process]
end

def base_url() do
"https://www.example.com"
end
Expand All @@ -84,12 +113,22 @@ defmodule Manager.TestSpider do

def parse_item(_response) do
path = Enum.random(1..100)

%{
:items => [
%{title: "t_#{path}", url: "example.com", author: "Me", time: "not set"}
],
:requests => [
Crawly.Utils.request_from_url("https://www.example.com/#{path}")]
Crawly.Utils.request_from_url("https://www.example.com/#{path}")
]
}
end

# Forwards :manual_stop to the process registered as
# :spider_closed_callback_test; any other reason yields :ignored.
def spider_closed(reason) do
  case reason do
    :manual_stop -> send(:spider_closed_callback_test, :manual_stop)
    _other -> :ignored
  end
end
end
2 changes: 2 additions & 0 deletions test/settings_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ defmodule SettingsTest do
end

defmodule Elixir.TestSpiderSettingsOverride do
use Crawly.Spider

def base_url() do
"https://www.example.com"
end
Expand Down
2 changes: 2 additions & 0 deletions test/test_utils.ex
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ defmodule TestUtils do
end

defmodule TestSpider do
use Crawly.Spider

def base_url() do
"https://www.example.com"
end
Expand Down
2 changes: 1 addition & 1 deletion test/worker_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ defmodule WorkerTest do
end

defmodule Worker.CrashingTestSpider do
@behaviour Crawly.Spider
use Crawly.Spider

@impl Crawly.Spider
def base_url() do
Expand Down

0 comments on commit 2b0778d

Please sign in to comment.