Simplified logging for spiders
oltarasenko committed Dec 15, 2020
1 parent 98cc40f commit efab4f9
Showing 13 changed files with 34 additions and 53 deletions.
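The change follows one pattern throughout: instead of passing `spider_name:` and `crawl_id:` as per-call options to every `Logger` statement, each worker process attaches the identifiers once with `Logger.metadata/1` in `init/1`, and the per-call options are dropped. A minimal sketch of that pattern, with a hypothetical module name that is not part of the commit:

```elixir
defmodule MySpiderWorker do
  use GenServer
  require Logger

  def start_link(opts), do: GenServer.start_link(__MODULE__, opts)

  def init(spider_name: spider_name, crawl_id: crawl_id) do
    # Attach the identifiers to this process once; every Logger call made
    # from this process now carries spider_name and crawl_id automatically.
    Logger.metadata(spider_name: spider_name, crawl_id: crawl_id)
    Logger.debug("Worker started")
    {:ok, %{spider_name: spider_name, crawl_id: crawl_id}}
  end

  def handle_info(:work, state) do
    # No per-call metadata options needed here anymore.
    Logger.debug("Doing some work")
    {:noreply, state}
  end
end
```

For the keys to actually show up in log output, the console backend has to list them, e.g. `config :logger, :console, metadata: [:spider_name, :crawl_id]` (an illustration; whether Crawly's own config already does this is not visible in this diff).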
46 changes: 22 additions & 24 deletions lib/crawly/data_storage/data_storage.ex
@@ -55,42 +55,23 @@ defmodule Crawly.DataStorage do
   def handle_call({:store, spider, item}, _from, state) do
     %{workers: workers} = state
 
-    {pid, new_workers} =
+    message =
       case Map.get(workers, spider) do
         nil ->
-          {:ok, pid} =
-            DynamicSupervisor.start_child(
-              Crawly.DataStorage.WorkersSup,
-              {Crawly.DataStorage.Worker, [spider_name: spider]}
-            )
-
-          {pid, Map.put(workers, spider, pid)}
+          {:error, :data_storage_worker_not_running}
 
         pid ->
-          {pid, workers}
+          Crawly.DataStorage.Worker.store(pid, item)
       end
 
-    Crawly.DataStorage.Worker.store(pid, item)
-    {:reply, :ok, %{state | workers: new_workers}}
+    {:reply, message, state}
   end
 
   def handle_call({:start_worker, spider_name, crawl_id}, _from, state) do
     {msg, new_state} =
       case Map.get(state.workers, spider_name) do
         nil ->
-          {:ok, pid} =
-            DynamicSupervisor.start_child(
-              Crawly.DataStorage.WorkersSup,
-              %{
-                id: :undefined,
-                restart: :temporary,
-                start:
-                  {Crawly.DataStorage.Worker, :start_link,
-                   [[spider_name: spider_name, crawl_id: crawl_id]]}
-              }
-            )
-
-          Process.monitor(pid)
+          pid = do_start_worker(spider_name, crawl_id)
 
           new_workers = Map.put(state.workers, spider_name, pid)
           new_spider_pids = Map.put(state.pid_spiders, pid, spider_name)
@@ -132,4 +113,21 @@ defmodule Crawly.DataStorage do
 
     {:noreply, new_state}
   end
+
+  defp do_start_worker(spider_name, crawl_id) do
+    {:ok, pid} =
+      DynamicSupervisor.start_child(
+        Crawly.DataStorage.WorkersSup,
+        %{
+          id: :undefined,
+          restart: :temporary,
+          start:
+            {Crawly.DataStorage.Worker, :start_link,
+             [[spider_name: spider_name, crawl_id: crawl_id]]}
+        }
+      )
+
+    Process.monitor(pid)
+    pid
+  end
 end
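Beyond logging, the `:store` call no longer starts a data storage worker on demand: when no worker is registered for the spider, it replies with `{:error, :data_storage_worker_not_running}`, and worker startup is concentrated in the new `do_start_worker/2` helper used by the `:start_worker` call. A caller-side sketch of the new behaviour, assuming the public `Crawly.DataStorage.store/2` wrapper around this `GenServer.call` and a placeholder `MySpider` module:

```elixir
case Crawly.DataStorage.store(MySpider, %{title: "Hello", url: "https://example.com"}) do
  :ok ->
    :stored

  {:error, :data_storage_worker_not_running} ->
    # The worker must be started first (via the :start_worker call issued
    # when a spider is started); items are no longer stored into an
    # implicitly spawned worker.
    :worker_not_started
end
```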
1 change: 1 addition & 0 deletions lib/crawly/data_storage/data_storage_worker.ex
@@ -32,6 +32,7 @@ defmodule Crawly.DataStorage.Worker do
   end
 
   def init(spider_name: spider_name, crawl_id: crawl_id) do
+    Logger.metadata(spider_name: spider_name, crawl_id: crawl_id)
     {:ok, %Worker{spider_name: spider_name, crawl_id: crawl_id}}
   end
 
1 change: 0 additions & 1 deletion lib/crawly/middlewares/auto_cookies_manager.ex
@@ -11,7 +11,6 @@ defmodule Crawly.Middlewares.AutoCookiesManager do
   ]
   ```
   """
-  require Logger
 
   def run(request, state) do
     known_cookies = Map.get(state, :cookies_manager_seen_cookies, MapSet.new())
4 changes: 1 addition & 3 deletions lib/crawly/middlewares/domain_filter.ex
@@ -25,9 +25,7 @@ defmodule Crawly.Middlewares.DomainFilter do
     case host != nil and String.contains?(base_url, host) do
       false ->
         Logger.debug(
-          "Dropping request: #{inspect(request.url)} (domain filter)",
-          spider_name: state.spider_name,
-          crawl_id: state.crawl_id
+          "Dropping request: #{inspect(request.url)} (domain filter)"
         )
 
         {false, state}
5 changes: 1 addition & 4 deletions lib/crawly/middlewares/robotstxt.ex
@@ -23,10 +23,7 @@ defmodule Crawly.Middlewares.RobotsTxt do
   def run(request, state, _opts \\ []) do
     case Gollum.crawlable?("Crawly", request.url) do
       :uncrawlable ->
-        Logger.debug("Dropping request: #{request.url} (robots.txt filter)",
-          spider_name: state.spider_name,
-          crawl_id: state.crawl_id
-        )
+        Logger.debug("Dropping request: #{request.url} (robots.txt filter)")
 
         {false, state}
 
4 changes: 1 addition & 3 deletions lib/crawly/middlewares/unique_request.ex
@@ -24,9 +24,7 @@ defmodule Crawly.Middlewares.UniqueRequest do
 
       _ ->
         Logger.debug(
-          "Dropping request: #{request.url}, as it's already processed",
-          spider_name: state.spider_name,
-          crawl_id: state.crawl_id
+          "Dropping request: #{request.url}, as it's already processed"
         )
 
         {false, state}
1 change: 0 additions & 1 deletion lib/crawly/middlewares/user_agent.ex
@@ -16,7 +16,6 @@ defmodule Crawly.Middlewares.UserAgent do
   ]
   ```
   """
-  require Logger
 
   def run(request, state, opts \\ []) do
     opts = Enum.into(opts, %{user_agents: nil})
4 changes: 1 addition & 3 deletions lib/crawly/pipelines/csv_encoder.ex
@@ -22,9 +22,7 @@ defmodule Crawly.Pipelines.CSVEncoder do
     case opts[:fields] do
       fields when fields in [nil, []] ->
         Logger.error(
-          "Dropping item: #{inspect(item)}. Reason: No fields declared for CSVEncoder",
-          spider_name: state.spider_name,
-          crawl_id: state.crawl_id
+          "Dropping item: #{inspect(item)}. Reason: No fields declared for CSVEncoder"
         )
 
         {false, state}
4 changes: 1 addition & 3 deletions lib/crawly/pipelines/duplicates_filter.ex
@@ -40,9 +40,7 @@ defmodule Crawly.Pipelines.DuplicatesFilter do
       nil ->
         Logger.info(
           "Duplicates filter pipeline is inactive, item_id option is required
-          to make it operational.",
-          spider_name: state.spider_name,
-          crawl_id: state.crawl_id
+          to make it operational."
         )
 
         {item, state}
4 changes: 1 addition & 3 deletions lib/crawly/pipelines/json_encoder.ex
@@ -30,9 +30,7 @@ defmodule Crawly.Pipelines.JSONEncoder do
       {:error, reason} ->
         Logger.error(
           "Could not encode the following item: #{inspect(item)} into json,
-          reason: #{inspect(reason)}",
-          spider_name: state.spider_name,
-          crawl_id: state.crawl_id
+          reason: #{inspect(reason)}"
         )
 
         {false, state}
4 changes: 1 addition & 3 deletions lib/crawly/pipelines/validate.ex
@@ -48,9 +48,7 @@ defmodule Crawly.Pipelines.Validate do
 
       _ ->
         Logger.info(
-          "Dropping item: #{inspect(item)}. Reason: missing required fields",
-          spider_name: state.spider_name,
-          crawl_id: state.crawl_id
+          "Dropping item: #{inspect(item)}. Reason: missing required fields"
         )
 
         {false, state}
4 changes: 2 additions & 2 deletions lib/crawly/requests_storage/requests_storage_worker.ex
@@ -52,12 +52,12 @@ defmodule Crawly.RequestsStorage.Worker do
   end
 
   def start_link(spider_name, crawl_id) do
-    Logger.debug("Starting requests storage worker for #{spider_name}...")
-
     GenServer.start_link(__MODULE__, [spider_name, crawl_id])
   end
 
   def init([spider_name, crawl_id]) do
+    Logger.metadata(spider_name: spider_name, crawl_id: crawl_id)
+    Logger.debug("Starting requests storage worker for #{spider_name}...")
     {:ok, %Worker{requests: [], spider_name: spider_name, crawl_id: crawl_id}}
   end
 
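Note that the debug line moves from `start_link/2` into `init/1`. `Logger` metadata is scoped to the calling process: `start_link/2` runs in the parent process, while `init/1` runs in the newly spawned worker, so logging from `init/1` is what lets the message pick up the freshly set `spider_name`/`crawl_id`. A small stand-alone illustration (the module name is made up):

```elixir
defmodule MetadataScopeDemo do
  use GenServer
  require Logger

  def start_link(tag) do
    # Logged from the *caller's* process; no :tag metadata is attached here.
    Logger.debug("starting worker for #{tag}")
    GenServer.start_link(__MODULE__, tag)
  end

  def init(tag) do
    Logger.metadata(tag: tag)
    # Logged from the worker process itself; carries tag: in its metadata.
    Logger.debug("worker started")
    {:ok, tag}
  end
end
```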
5 changes: 2 additions & 3 deletions lib/crawly/worker.ex
@@ -20,6 +20,7 @@ defmodule Crawly.Worker do
   end
 
   def init(spider_name: spider_name, crawl_id: crawl_id) do
+    Logger.metadata(crawl_id: crawl_id, spider_name: spider_name)
     Crawly.Utils.send_after(self(), :work, 0)
 
     {:ok,
@@ -54,9 +55,7 @@ defmodule Crawly.Worker do
           "Crawly worker could not process the request to #{
             inspect(request.url)
           }
-          reason: #{inspect(reason)}",
-          spider_name: state.spider_name,
-          crawl_id: state.crawl_id
+          reason: #{inspect(reason)}"
         )
 
         @default_backoff
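The middlewares and pipelines above do not set any metadata themselves; the removed `spider_name:`/`crawl_id:` options were the only thing tying their log lines to a crawl. The simplification relies on those `run` functions executing inside worker processes that now call `Logger.metadata/1` in `init/1`, so a plain `Logger` call inherits the identifiers from its hosting process. A hedged sketch of a pipeline after the change (the module is a placeholder, not part of Crawly):

```elixir
defmodule MyPipeline do
  require Logger

  def run(item, state, _opts \\ []) do
    # No spider_name:/crawl_id: options: the hosting worker process already
    # attached them via Logger.metadata/1, so the backend adds them here.
    Logger.info("Processing item: #{inspect(item)}")
    {item, state}
  end
end
```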
