diff --git a/config/config.exs b/config/config.exs
index 89a34e0b..7805d7c5 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -29,48 +29,40 @@ use Mix.Config
 #
 # import_config "#{Mix.env}.exs"
 
-config :crawly, Crawly.Worker, client: HTTPoison
-
 config :crawly,
   fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
-  retry:
-    [
-      retry_codes: [400],
-      max_retries: 3,
-      ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
+  retry: [
+    retry_codes: [400],
+    max_retries: 3,
+    ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
   ],
-  # User agents which are going to be used with requests
-  user_agents: [
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
-  ],
-  # Item definition
-  item: [:title, :author, :time, :url],
-  # Identifier which is used to filter out duplicates
-  item_id: :title,
   # Stop spider after scraping certain amount of items
   closespider_itemcount: 500,
   # Stop spider if it does crawl fast enough
   closespider_timeout: 20,
   concurrent_requests_per_domain: 5,
+
+  # TODO: this looks outdated
   follow_redirect: true,
+  # Request middlewares
   middlewares: [
     Crawly.Middlewares.DomainFilter,
     Crawly.Middlewares.UniqueRequest,
     Crawly.Middlewares.RobotsTxt,
-    Crawly.Middlewares.UserAgent
+    {Crawly.Middlewares.UserAgent,
+     user_agents: [
+       "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
+       "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
+       "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
+     ]}
   ],
   pipelines: [
-    Crawly.Pipelines.Validate,
-    Crawly.Pipelines.DuplicatesFilter,
+    {Crawly.Pipelines.Validate, fields: [:title, :author, :time, :url]},
+    {Crawly.Pipelines.DuplicatesFilter, item_id: :title},
     Crawly.Pipelines.JSONEncoder
   ]
 
-config :crawly, Crawly.Pipelines.WriteToFile,
-  folder: "/tmp",
-  extension: "jl"
-
-import_config "#{Mix.env}.exs"
+import_config "#{Mix.env()}.exs"
diff --git a/config/test.exs b/config/test.exs
index 6a006407..ad9c1d78 100644
--- a/config/test.exs
+++ b/config/test.exs
@@ -2,28 +2,30 @@ use Mix.Config
 
 config :crawly,
   manager_operations_timeout: 30_000,
-  # User agents which are going to be used with requests
-  user_agents: [
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
-  ],
+
   # Stop spider after scraping certain amount of items
   closespider_itemcount: 100,
   # Stop spider if it does crawl fast enough
   closespider_timeout: 20,
   concurrent_requests_per_domain: 5,
   follow_redirect: true,
-  # Request middlewares
+
+  # Request middlewares
+  # User agents which are going to be used with requests
+  user_agents: [
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
+  ],
   middlewares: [
     Crawly.Middlewares.DomainFilter,
     Crawly.Middlewares.UniqueRequest,
     Crawly.Middlewares.RobotsTxt,
-    Crawly.Middlewares.UserAgent
+    {Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}
   ],
   pipelines: [
-    Crawly.Pipelines.Validate,
-    Crawly.Pipelines.DuplicatesFilter,
+    {Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
+    {Crawly.Pipelines.DuplicatesFilter, item_id: :title},
     Crawly.Pipelines.JSONEncoder
   ],
   retry: [
diff --git a/documentation/configuration.md b/documentation/configuration.md
index b98064b4..6b87f472 100644
--- a/documentation/configuration.md
+++ b/documentation/configuration.md
@@ -16,49 +16,6 @@ config :crawly,
 
 ## Options
 
-### base_store_path :: binary() [DEPRECATED in 0.6.0]
-
-default: "/tmp"
-
-Defines the path where items are stored in the filesystem. This setting
-is used by the Crawly.DataStorageWorker process.
-
-> **Deprecated**: This has been deprecated in favour of having pipelines to handle data storage, as of `0.6.0`
-
-### `user_agents` :: list()
-
-default: ["Crawly Bot 1.0"]
-
-Defines a user agent string for Crawly requests. This setting is used
-by the `Crawly.Middlewares.UserAgent` middleware. When the list has more than one
-item, all requests will be executed, each with a user agent string chosen
-randomly from the supplied list.
-
-> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Middlewares.UserAgent` module documentation for correct usage.
-
-### `item` :: [atom()]
-
-default: []
-
-Defines a list of required fields for the item. When none of the default
-fields are added to the following item (or if the values of
-required fields are "" or nil), the item will be dropped. This setting
-is used by the `Crawly.Pipelines.Validate` pipeline
-
-> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Pipelines.Validate` module documentation for correct usage.
-
-### `item_id` :: atom()
-
-default: nil
-
-Defines a field which will be used in order to identify if an item is
-a duplicate or not. In most of the ecommerce websites the desired id
-field is the SKU. This setting is used in
-the `Crawly.Pipelines.DuplicatesFilter` pipeline. If unset, the related
-middleware is effectively disabled.
-
-> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Pipelines.DuplicatesFilter` module documentation for correct usage.
-
 ### `pipelines` :: [module()]
 
 default: []
@@ -99,17 +56,12 @@ default: :disabled
 An integer which specifies a number of items. If the spider scrapes more than
 that amount and those items are passed by the item pipeline, the spider will
 be closed. If set to :disabled the spider will not be stopped.
 
-### closespider_timeout :: pos_integer()
+### closespider_timeout :: pos_integer() | :disabled
 
 default: nil
 
 Defines a minimal amount of items which needs to be scraped by the spider
 within the given timeframe (30s). If the limit is not reached by the spider -
 it will be stopped.
 
-### follow_redirect :: boolean()
-
-default: false
-
-Defines is Crawly spider is supposed to follow HTTP redirects or not.
 ### concurrent_requests_per_domain :: pos_integer()
diff --git a/lib/crawly/data_storage/data_storage_worker.ex b/lib/crawly/data_storage/data_storage_worker.ex
index 3f9bfcf9..35d53f29 100644
--- a/lib/crawly/data_storage/data_storage_worker.ex
+++ b/lib/crawly/data_storage/data_storage_worker.ex
@@ -33,7 +33,7 @@ defmodule Crawly.DataStorage.Worker do
   end
 
   def handle_cast({:store, item}, state) do
-    pipelines = Application.get_env(:crawly, :pipelines, [])
+    pipelines = Crawly.Utils.get_settings(:pipelines, state.spider_name, [])
 
     state =
       case Crawly.Utils.pipe(pipelines, item, state) do
diff --git a/lib/crawly/fetchers/fetcher.ex b/lib/crawly/fetchers/fetcher.ex
index 20a96b13..49e4b4ec 100644
--- a/lib/crawly/fetchers/fetcher.ex
+++ b/lib/crawly/fetchers/fetcher.ex
@@ -6,6 +6,8 @@ defmodule Crawly.Fetchers.Fetcher do
   Crawly.Request, HTTP client options and return Crawly.Response.
   """
 
+  @type t :: {module(), list()}
+
   @callback fetch(request, options) :: {:ok, response} | {:error, reason}
             when request: Crawly.Request.t(),
                  response: Crawly.Response.t(),
diff --git a/lib/crawly/manager.ex b/lib/crawly/manager.ex
index 37b25b0f..dcb5a0db 100644
--- a/lib/crawly/manager.ex
+++ b/lib/crawly/manager.ex
@@ -31,6 +31,8 @@ defmodule Crawly.Manager do
   use GenServer
 
+  alias Crawly.Utils
+
   def start_link(spider_name) do
     Logger.debug("Starting the manager for #{spider_name}")
     GenServer.start_link(__MODULE__, spider_name)
@@ -57,7 +59,7 @@ defmodule Crawly.Manager do
     # Start workers
     num_workers =
-      Application.get_env(:crawly, :concurrent_requests_per_domain, 4)
+      Utils.get_settings(:concurrent_requests_per_domain, spider_name, 4)
 
     worker_pids =
       Enum.map(1..num_workers, fn _x ->
@@ -72,8 +74,15 @@
       )
 
     # Schedule basic service operations for given spider manager
-    tref = Process.send_after(self(), :operations, get_timeout())
-    {:ok, %{name: spider_name, tref: tref, prev_scraped_cnt: 0}}
+    tref =
+      Process.send_after(
+        self(),
+        :operations,
+        Utils.get_settings(:manager_operations_timeout, spider_name, @timeout)
+      )
+
+    {:ok,
+     %{name: spider_name, tref: tref, prev_scraped_cnt: 0, workers: worker_pids}}
   end
 
   def handle_info(:operations, state) do
@@ -85,7 +94,7 @@
     delta = items_count - state.prev_scraped_cnt
     Logger.info("Current crawl speed is: #{delta} items/min")
 
-    case Application.get_env(:crawly, :closespider_itemcount, :disabled) do
+    case Utils.get_settings(:closespider_itemcount, state.name, :disabled) do
       :disabled ->
         :ignored
 
@@ -100,8 +109,8 @@
         :ignoring
     end
 
-    # Close spider in case if it's not scraping itms fast enough
-    case Application.get_env(:crawly, :closespider_timeout) do
+    # Close spider in case if it's not scraping items fast enough
+    case Utils.get_settings(:closespider_timeout, state.name, :disabled) do
       :undefined ->
         :ignoring
 
@@ -116,12 +125,13 @@
         :ignoring
     end
 
-    tref = Process.send_after(self(), :operations, get_timeout())
+    tref =
+      Process.send_after(
+        self(),
+        :operations,
+        Utils.get_settings(:manager_operations_timeout, state.name, @timeout)
+      )
 
     {:noreply, %{state | tref: tref, prev_scraped_cnt: items_count}}
   end
-
-  defp get_timeout() do
-    Application.get_env(:crawly, :manager_operations_timeout, @timeout)
-  end
 end
diff --git a/lib/crawly/middlewares/user_agent.ex b/lib/crawly/middlewares/user_agent.ex
index fe897a18..fa51779e 100644
--- a/lib/crawly/middlewares/user_agent.ex
+++ b/lib/crawly/middlewares/user_agent.ex
@@ -12,7 +12,7 @@ defmodule Crawly.Middlewares.UserAgent do
   ### Example Declaration
   ```
   middlewares: [
-    {UserAgent, user_agents: ["My Custom Bot] }
+    {UserAgent, user_agents: ["My Custom Bot"] }
   ]
   ```
   """
@@ -24,8 +24,7 @@ defmodule Crawly.Middlewares.UserAgent do
     new_headers = List.keydelete(request.headers, "User-Agent", 0)
 
     user_agents =
-      Map.get(opts, :user_agents) ||
-        Application.get_env(:crawly, :user_agents, ["Crawly Bot 1.0"])
+      Map.get(opts, :user_agents, ["Crawly Bot 1.0"])
 
     useragent = Enum.random(user_agents)
 
diff --git a/lib/crawly/pipelines/csv_encoder.ex b/lib/crawly/pipelines/csv_encoder.ex
index f4af184c..a2c00b62 100644
--- a/lib/crawly/pipelines/csv_encoder.ex
+++ b/lib/crawly/pipelines/csv_encoder.ex
@@ -18,7 +18,7 @@ defmodule Crawly.Pipelines.CSVEncoder do
           {false, state :: map} | {csv_line :: String.t(), state :: map}
   def run(item, state, opts \\ []) do
     opts = Enum.into(opts, %{fields: nil})
-    fields = Map.get(opts, :fields) || Application.get_env(:crawly, :item)
+    fields = Map.get(opts, :fields, [])
 
     case fields do
       :undefined ->
diff --git a/lib/crawly/pipelines/duplicates_filter.ex b/lib/crawly/pipelines/duplicates_filter.ex
index 3c0a2003..6aecdb9e 100644
--- a/lib/crawly/pipelines/duplicates_filter.ex
+++ b/lib/crawly/pipelines/duplicates_filter.ex
@@ -32,7 +32,7 @@ defmodule Crawly.Pipelines.DuplicatesFilter do
   def run(item, state, opts \\ []) do
     opts = Enum.into(opts, %{item_id: nil})
 
-    item_id = Map.get(opts, :item_id) || Application.get_env(:crawly, :item_id)
+    item_id = Map.get(opts, :item_id)
 
     item_id = Map.get(item, item_id)
 
diff --git a/lib/crawly/pipelines/validate.ex b/lib/crawly/pipelines/validate.ex
index 711dd8ad..591d3045 100644
--- a/lib/crawly/pipelines/validate.ex
+++ b/lib/crawly/pipelines/validate.ex
@@ -27,7 +27,7 @@ defmodule Crawly.Pipelines.Validate do
   @impl Crawly.Pipeline
   def run(item, state, opts \\ []) do
     opts = Enum.into(opts, %{fields: nil})
-    fields = Map.get(opts, :fields) || Application.get_env(:crawly, :item, [])
+    fields = Map.get(opts, :fields, [])
 
     validation_result =
       fields
diff --git a/lib/crawly/pipelines/write_to_file.ex b/lib/crawly/pipelines/write_to_file.ex
index b451bf65..95761175 100644
--- a/lib/crawly/pipelines/write_to_file.ex
+++ b/lib/crawly/pipelines/write_to_file.ex
@@ -59,20 +59,11 @@ defmodule Crawly.Pipelines.WriteToFile do
   def run(item, state, opts) do
     opts = Enum.into(opts, %{folder: nil, extension: nil})
 
-    global_config =
-      Application.get_env(
-        :crawly,
-        Crawly.Pipelines.WriteToFile,
-        Keyword.new()
-      )
-
     folder =
-      Map.get(opts, :folder) ||
-        Keyword.get(global_config, :folder, System.tmp_dir!())
+      Map.get(opts, :folder, "./")
 
     extension =
-      Map.get(opts, :extension) ||
-        Keyword.get(global_config, :extension, "jl")
+      Map.get(opts, :extension, "jl")
 
     fd = open_fd(state.spider_name, folder, extension)
     :ok = write(fd, item)
diff --git a/lib/crawly/settings.ex b/lib/crawly/settings.ex
new file mode 100644
index 00000000..30ba7bfa
--- /dev/null
+++ b/lib/crawly/settings.ex
@@ -0,0 +1,48 @@
+defmodule Crawly.Settings do
+  @moduledoc """
+  Define Crawly setting types
+  """
+
+  @type numeric_setting() :: pos_integer() | :disabled
+  @type retry() :: [
+          retry_codes: [pos_integer()],
+          max_retries: pos_integer(),
+          ignored_middlewares: [module()]
+        ]
+
+  @type middleware() ::
+          Crawly.Middlewares.DomainFilter
+          | Crawly.Middlewares.UniqueRequest
+          | Crawly.Middlewares.RobotsTxt
+          | Crawly.Middlewares.AutoCookiesManager
+          | {Crawly.Middlewares.UserAgent, user_agents: [binary()]}
+
+  @type pipeline() ::
+          Crawly.Pipelines.JSONEncoder
+          | {Crawly.Pipelines.DuplicatesFilter, item_id: atom()}
+          | {Crawly.Pipelines.Validate, fields: [atom()]}
+          | {Crawly.Pipelines.CSVEncoder, fields: [atom()]}
+          | {Crawly.Pipelines.WriteToFile,
+             folder: binary(), extension: binary()}
+
+  @type t() :: [
+          # Allows to stop spider after a given number of scraped items
+          # :disabled by default.
+          closespider_itemcount: numeric_setting(),
+
+          # Allows to stop spider if it extracts less than a given amount of
+          # items per minute.
+          closespider_timeout: pos_integer(),
+
+          # Allows to control how many workers are started for a given domain
+          concurrent_requests_per_domain: pos_integer(),
+
+          # Allows to define a fetcher to perform HTTP requests
+          fetcher: Crawly.Fetchers.Fetcher.t(),
+
+          # Defines retries
+          retry: retry(),
+          middlewares: [middleware()],
+          pipelines: [pipeline()]
+        ]
+end
diff --git a/lib/crawly/spider.ex b/lib/crawly/spider.ex
index 0be41baf..e38c4a4d 100644
--- a/lib/crawly/spider.ex
+++ b/lib/crawly/spider.ex
@@ -9,6 +9,10 @@ defmodule Crawly.Spider do
   3. `parse_item/1` function which is responsible for parsing the downloaded
      request and converting it into items which can be stored and new requests
      which can be scheduled
+  4. `override_settings/0` an optional callback which can be used in order to
+     provide custom spider-specific settings. Should define a list with custom
+     settings and their values. These values will take precedence over the
+     global settings defined in the config.
   """
 
   @callback init() :: [start_urls: list()]
 
   @callback base_url() :: binary()
 
   @callback parse_item(response :: HTTPoison.Response.t()) ::
-            Crawly.ParsedItem.t()
+              Crawly.ParsedItem.t()
+
+  @callback override_settings() :: Crawly.Settings.t()
+
+  @optional_callbacks override_settings: 0
 end
diff --git a/lib/crawly/utils.ex b/lib/crawly/utils.ex
index 8a94de3e..86b8f86e 100644
--- a/lib/crawly/utils.ex
+++ b/lib/crawly/utils.ex
@@ -93,9 +93,9 @@ defmodule Crawly.Utils do
     catch
       error, reason ->
         Logger.error(
-          "Pipeline crash: #{module}, error: #{inspect(error)}, reason: #{
-            inspect(reason)
-          }, args: #{inspect(args)}"
+          "Pipeline crash: #{module}, error: #{inspect(error)}, reason: #{inspect(reason)}, args: #{
+            inspect(args)
+          }"
         )
 
         {item, state}
@@ -113,4 +113,50 @@ defmodule Crawly.Utils do
   def send_after(pid, message, timeout) do
     Process.send_after(pid, message, timeout)
   end
+
+  @doc """
+  A helper function which extracts a given setting.
+
+  The returned value is the result of merging the global settings with the
+  settings defined via the spider's `override_settings/0` callback. Settings
+  defined on the spider take precedence over the global settings defined in
+  the config.
+ """ + @spec get_settings(setting_name, spider_name, default) :: result + when setting_name: atom(), + spider_name: atom(), + default: term(), + result: term() + + def get_settings(setting_name, spider_name \\ nil, default \\ nil) do + global_setting = Application.get_env(:crawly, setting_name, default) + case get_spider_setting(setting_name, spider_name) do + nil -> + # No custom settings for a spider found + global_setting + + custom_setting -> + custom_setting + end + end + + ############################################################################## + # Private functions + ############################################################################## + @spec get_spider_setting(spider_name, setting_name) :: result + when spider_name: atom(), + setting_name: atom(), + result: nil | term() + + defp get_spider_setting(_setting_name, nil), do: nil + + defp get_spider_setting(setting_name, spider_name) do + case function_exported?(spider_name, :override_settings, 0) do + true -> + + Keyword.get(spider_name.override_settings(), setting_name, nil) + + false -> + nil + end + end end diff --git a/lib/crawly/worker.ex b/lib/crawly/worker.ex index af6cb5ac..addc542c 100644 --- a/lib/crawly/worker.ex +++ b/lib/crawly/worker.ex @@ -42,15 +42,12 @@ defmodule Crawly.Worker do case :epipe.run(functions, {request, spider_name}) do {:error, _step, reason, _step_state} -> - - Logger.debug( - fn -> - "Crawly worker could not process the request to #{ - inspect(request.url) - } + Logger.debug(fn -> + "Crawly worker could not process the request to #{ + inspect(request.url) + } reason: #{inspect(reason)}" - end - ) + end) @default_backoff @@ -73,14 +70,16 @@ defmodule Crawly.Worker do # check if spider-level fetcher is set. Overrides the globally configured fetcher. # if not set, log warning for explicit config preferred, # get the globally-configured fetcher. 
-    {fetcher, options} = Application.get_env(
-        :crawly,
-        :fetcher,
-        {Crawly.Fetchers.HTTPoisonFetcher, []}
-      )
-
-    retry_options = Application.get_env(:crawly, :retry, [])
+    {fetcher, options} =
+      Crawly.Utils.get_settings(
+        :fetcher,
+        spider_name,
+        {Crawly.Fetchers.HTTPoisonFetcher, []}
+      )
+
+    retry_options = Crawly.Utils.get_settings(:retry, spider_name, [])
     retry_codes = Keyword.get(retry_options, :retry_codes, [])
+
     case fetcher.fetch(request, options) do
       {:error, _reason} = err ->
         :ok = maybe_retry_request(spider_name, request)
@@ -93,11 +92,11 @@
           true ->
             :ok = maybe_retry_request(spider_name, request)
             {:error, :retry}
+
           false ->
             {:ok, {response, spider_name}}
         end
     end
-
   end
 
   @spec parse_item({response, spider_name}) :: result
@@ -141,7 +140,8 @@
       fn request ->
         request = Map.put(request, :prev_response, response)
         Crawly.RequestsStorage.store(spider_name, request)
-      end)
+      end
+    )
 
     # Process all items one by one
     Enum.each(
@@ -157,7 +157,7 @@
   ## Retry a request if max retries allows to do so
   defp maybe_retry_request(spider, request) do
     retries = request.retries
-    retry_settings = Application.get_env(:crawly, :retry, Keyword.new())
+    retry_settings = Crawly.Utils.get_settings(:retry, spider, Keyword.new())
 
     ignored_middlewares = Keyword.get(retry_settings, :ignored_middlewares, [])
     max_retries = Keyword.get(retry_settings, :max_retries, 0)
 
@@ -169,16 +169,16 @@
         middlewares = request.middlewares -- ignored_middlewares
 
         request = %Crawly.Request{
-          request |
-          middlewares: middlewares,
-          retries: retries + 1
+          request
+          | middlewares: middlewares,
+            retries: retries + 1
         }
 
         :ok = Crawly.RequestsStorage.store(spider, request)
+
       false ->
         Logger.error("Dropping request to #{request.url}, (max retries)")
         :ok
     end
-
   end
 end
diff --git a/test/data_storage_test.exs b/test/data_storage_test.exs
index a65aff05..fe94bae8 100644
--- a/test/data_storage_test.exs
+++ b/test/data_storage_test.exs
@@ -73,30 +73,4 @@ defmodule DataStorageTest do
     result = Crawly.DataStorage.stats(:unkown)
     assert result == {:error, :data_storage_worker_not_running}
   end
-
-  test "Duplicates pipline is inactive when item_id is not set", context do
-    :meck.expect(Application, :get_env, fn :crawly, :item_id -> :undefined end)
-
-    Crawly.DataStorage.store(context.crawler, %{
-      title: "test title",
-      author: "me",
-      time: "Now",
-      url: "http://example.com"
-    })
-
-    Crawly.DataStorage.store(context.crawler, %{
-      title: "test title",
-      author: "me",
-      time: "Now",
-      url: "http://example.com"
-    })
-
-    Process.sleep(1000)
-    {:stored_items, 2} = Crawly.DataStorage.stats(context.crawler)
-    :meck.unload(Application)
-  end
-
-
-# test "No data is stored if Dev"
-
 end
diff --git a/test/manager_test.exs b/test/manager_test.exs
index 204ecdde..9fb61fce 100644
--- a/test/manager_test.exs
+++ b/test/manager_test.exs
@@ -54,8 +54,6 @@ defmodule ManagerTest do
     Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
     :ok = Crawly.Engine.start_spider(Manager.TestSpider)
     Process.sleep(2_000)
-    stats = Crawly.DataStorage.stats(Manager.TestSpider)
-    IO.puts("Stats: #{inspect(stats)}")
 
     assert %{} == Crawly.Engine.running_spiders()
   end
diff --git a/test/middlewares/user_agent_test.exs b/test/middlewares/user_agent_test.exs
index b07cd116..5d5133e0 100644
--- a/test/middlewares/user_agent_test.exs
+++ b/test/middlewares/user_agent_test.exs
@@ -1,15 +1,8 @@
 defmodule Middlewares.UserAgentTest do
   use ExUnit.Case, async: false
 
-  setup do
-    on_exit(fn ->
-      Application.put_env(:crawly, :user_agents, nil)
-    end)
-  end
-
   test "Adds a user agent to request header with global config" do
-    Application.put_env(:crawly, :user_agents, ["My Custom Bot"])
-    middlewares = [Crawly.Middlewares.UserAgent]
+    middlewares = [{Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}]
     req = %Crawly.Request{}
     state = %{}
 
diff --git a/test/pipelines/csv_encoder_test.exs b/test/pipelines/csv_encoder_test.exs
index ac660a65..a73dfe6f 100644
--- a/test/pipelines/csv_encoder_test.exs
+++ b/test/pipelines/csv_encoder_test.exs
@@ -2,16 +2,8 @@ defmodule Pipelines.CSVEncoderTest do
   use ExUnit.Case, async: false
 
   @valid %{first: "some", second: "data"}
 
-  setup do
-    on_exit(fn ->
-      Application.put_env(:crawly, :item, [:title, :author, :time, :url])
-    end)
-  end
-
   test "Converts a single-level map to a csv string with global config" do
-    Application.put_env(:crawly, :item, [:first, :second])
-
-    pipelines = [Crawly.Pipelines.CSVEncoder]
+    pipelines = [{Crawly.Pipelines.CSVEncoder, fields: [:first, :second]}]
     item = @valid
     state = %{}
 
diff --git a/test/pipelines/duplicates_filter_test.exs b/test/pipelines/duplicates_filter_test.exs
index 383120de..883cfd48 100644
--- a/test/pipelines/duplicates_filter_test.exs
+++ b/test/pipelines/duplicates_filter_test.exs
@@ -2,15 +2,9 @@ defmodule Pipelines.DuplicatesFilterTest do
   use ExUnit.Case, async: false
 
   @valid %{data: [%{some: "nested_data"}], id: "my_id"}
 
-  setup do
-    on_exit(fn ->
-      Application.put_env(:crawly, :item_id, :title)
-    end)
-  end
 
   test "Drops duplicate items with the same item_id value through global config" do
-    Application.put_env(:crawly, :item_id, :id)
-    pipelines = [Crawly.Pipelines.DuplicatesFilter]
+    pipelines = [{Crawly.Pipelines.DuplicatesFilter, item_id: :id}]
     item = @valid
     state = %{}
 
diff --git a/test/pipelines/validate_test.exs b/test/pipelines/validate_test.exs
index 49671eaa..e948a09a 100644
--- a/test/pipelines/validate_test.exs
+++ b/test/pipelines/validate_test.exs
@@ -13,15 +13,9 @@ defmodule Pipelines.ValidateTest do
     author: nil
   }
 
-  setup do
-    on_exit(fn ->
-      Application.put_env(:crawly, :item, [:title, :author, :time, :url])
-    end)
-  end
-
   test "Returns item unchanged when has required fields" do
     Application.put_env(:crawly, :item, [:title, :author])
-    pipelines = [Crawly.Pipelines.Validate]
+    pipelines = [{Crawly.Pipelines.Validate, fields: [:title, :author]}]
     item = @valid
     state = %{}
 
@@ -30,8 +24,7 @@
   end
 
   test "Drops items when missing required fields with global config" do
-    Application.put_env(:crawly, :item, [:title, :author])
-    pipelines = [Crawly.Pipelines.Validate]
+    pipelines = [{Crawly.Pipelines.Validate, fields: [:title, :author]}]
     item = @invalid_missing
     state = %{}
 
@@ -47,8 +40,7 @@
   end
 
   test "Drops items when required fields are equal to nil" do
-    Application.put_env(:crawly, :item, [:title, :author])
-    pipelines = [Crawly.Pipelines.Validate]
+    pipelines = [{Crawly.Pipelines.Validate, fields: [:title, :author]}]
     item = @invalid_nil
     state = %{}
 
diff --git a/test/pipelines/write_to_file_test.exs b/test/pipelines/write_to_file_test.exs
index 3f1a2883..fc249038 100644
--- a/test/pipelines/write_to_file_test.exs
+++ b/test/pipelines/write_to_file_test.exs
@@ -5,7 +5,6 @@ defmodule Pipelines.WriteToFileTest do
   setup do
     on_exit(fn ->
-      Application.put_env(:crawly, :"Crawly.Pipelines.WriteToFile", nil)
       :meck.unload(IO)
       :meck.unload(File)
     end)
@@ -31,15 +30,8 @@ defmodule Pipelines.WriteToFileTest do
       end
     )
 
-    Application.put_env(
-      :crawly,
-      Crawly.Pipelines.WriteToFile,
-      folder: "/tmp",
-      extension: "csv"
-    )
-
     pipelines = [
-      Crawly.Pipelines.WriteToFile
+      {Crawly.Pipelines.WriteToFile, folder: "/tmp", extension: "csv"}
     ]
 
     item = @binary
diff --git a/test/settings_test.exs b/test/settings_test.exs
new file mode 100644
index 00000000..5a15f792
--- /dev/null
+++ b/test/settings_test.exs
@@ -0,0 +1,46 @@
+defmodule SettingsTest do
+  use ExUnit.Case
+
+  setup do
+    Application.ensure_all_started(Crawly)
+    Application.put_env(:crawly, :concurrent_requests_per_domain, 1)
+    Application.put_env(:crawly, :closespider_itemcount, 10)
+
+    on_exit(fn ->
+      Application.put_env(:crawly, :closespider_timeout, 20)
+      Application.put_env(:crawly, :closespider_itemcount, 100)
+    end)
+  end
+
+  test "settings from the spider are overriding globals" do
+    assert 5 ==
+             Crawly.Utils.get_settings(
+               :concurrent_requests_per_domain,
+               TestSpiderSettingsOverride, 1)
+  end
+
+  test "incomplete spider overrides do not break global settings" do
+    assert 10 ==
+             Crawly.Utils.get_settings(
+               :closespider_itemcount,
+               TestSpiderSettingsOverride, 1)
+  end
+end
+
+defmodule Elixir.TestSpiderSettingsOverride do
+  def base_url() do
+    "https://www.example.com"
+  end
+
+  def init() do
+    [
+      start_urls: ["https://www.example.com/blog.html"]
+    ]
+  end
+
+  def parse_item(_response) do
+    %{:items => [], :requests => []}
+  end
+
+  def override_settings(), do: [concurrent_requests_per_domain: 5]
+end
diff --git a/test/utils_test.exs b/test/utils_test.exs
index b47c09de..4cea2f93 100644
--- a/test/utils_test.exs
+++ b/test/utils_test.exs
@@ -100,7 +100,7 @@ defmodule UtilsTest do
         Crawly.Middlewares.DomainFilter,
         Crawly.Middlewares.UniqueRequest,
         Crawly.Middlewares.RobotsTxt,
-        Crawly.Middlewares.UserAgent],
+        {Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}],
       retries: 0
     }
   end
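
For reference: with the deprecated global `item`, `item_id`, `user_agents`, and `config :crawly, Crawly.Pipelines.WriteToFile` options removed, per-component options now live in the tuple-based middleware/pipeline entries themselves. A minimal migration sketch follows — the field names and option values here are placeholders, not taken from any real project:

```elixir
# Before (deprecated global options, removed by this patch):
#
#     config :crawly,
#       user_agents: ["Crawly Bot 1.0"],
#       item: [:title, :url],
#       item_id: :title
#
#     config :crawly, Crawly.Pipelines.WriteToFile, folder: "/tmp", extension: "jl"

# After: every option is passed directly to the middleware/pipeline that uses it.
config :crawly,
  middlewares: [
    Crawly.Middlewares.DomainFilter,
    Crawly.Middlewares.UniqueRequest,
    Crawly.Middlewares.RobotsTxt,
    {Crawly.Middlewares.UserAgent, user_agents: ["Crawly Bot 1.0"]}
  ],
  pipelines: [
    {Crawly.Pipelines.Validate, fields: [:title, :url]},
    {Crawly.Pipelines.DuplicatesFilter, item_id: :title},
    Crawly.Pipelines.JSONEncoder,
    {Crawly.Pipelines.WriteToFile, folder: "/tmp", extension: "jl"}
  ]
```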
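The new optional `override_settings/0` callback lets a single spider override any subset of these settings, with `Crawly.Utils.get_settings/3` falling back to the global config (and then to the supplied default) for everything else. A minimal sketch of a spider using it — the module name, URLs, and values below are illustrative only:

```elixir
defmodule MyApp.ExampleSpider do
  @behaviour Crawly.Spider

  def base_url(), do: "https://www.example.com"

  def init(), do: [start_urls: ["https://www.example.com/blog.html"]]

  def parse_item(_response), do: %{items: [], requests: []}

  # Spider-level settings take precedence over the global `config :crawly` values;
  # anything not listed here keeps its global (or default) value.
  def override_settings() do
    [
      concurrent_requests_per_domain: 2,
      closespider_itemcount: 100,
      pipelines: [
        {Crawly.Pipelines.Validate, fields: [:title, :url]},
        Crawly.Pipelines.JSONEncoder
      ]
    ]
  end
end
```

With this module compiled, `Crawly.Utils.get_settings(:concurrent_requests_per_domain, MyApp.ExampleSpider, 4)` returns `2`, while `Crawly.Utils.get_settings(:closespider_timeout, MyApp.ExampleSpider, :disabled)` falls back to the globally configured value (or `:disabled` if none is set).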