Commit

Merge 905b3a5 into e4ce56a

oltarasenko committed Apr 1, 2020
2 parents e4ce56a + 905b3a5 commit 330573c
Showing 24 changed files with 241 additions and 211 deletions.
40 changes: 16 additions & 24 deletions config/config.exs
@@ -29,48 +29,40 @@ use Mix.Config
#
# import_config "#{Mix.env}.exs"

config :crawly, Crawly.Worker, client: HTTPoison

config :crawly,
fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
retry:
[
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
retry: [
retry_codes: [400],
max_retries: 3,
ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
],

# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
# Item definition
item: [:title, :author, :time, :url],
# Identifier which is used to filter out duplicates
item_id: :title,
# Stop spider after scraping certain amount of items
closespider_itemcount: 500,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,

# TODO: this looks outdated
follow_redirect: true,

# Request middlewares
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
{Crawly.Middlewares.UserAgent,
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
]}
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
{Crawly.Pipelines.Validate, fields: [:title, :author, :time, :url]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder
]

config :crawly, Crawly.Pipelines.WriteToFile,
folder: "/tmp",
extension: "jl"

import_config "#{Mix.env}.exs"
import_config "#{Mix.env()}.exs"
22 changes: 12 additions & 10 deletions config/test.exs
@@ -2,28 +2,30 @@ use Mix.Config

config :crawly,
manager_operations_timeout: 30_000,
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],

# Stop spider after scraping certain amount of items
closespider_itemcount: 100,
# Stop spider if it does not crawl fast enough
closespider_timeout: 20,
concurrent_requests_per_domain: 5,
follow_redirect: true,
# Request middlewares

# Request middlewares
# User agents which are going to be used with requests
user_agents: [
"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41"
],
middlewares: [
Crawly.Middlewares.DomainFilter,
Crawly.Middlewares.UniqueRequest,
Crawly.Middlewares.RobotsTxt,
Crawly.Middlewares.UserAgent
{Crawly.Middlewares.UserAgent, user_agents: ["My Custom Bot"]}
],
pipelines: [
Crawly.Pipelines.Validate,
Crawly.Pipelines.DuplicatesFilter,
{Crawly.Pipelines.Validate, fields: [:title, :url, :time, :author]},
{Crawly.Pipelines.DuplicatesFilter, item_id: :title},
Crawly.Pipelines.JSONEncoder
],
retry: [
50 changes: 1 addition & 49 deletions documentation/configuration.md
@@ -16,49 +16,6 @@ config :crawly,

## Options

### base_store_path :: binary() [DEPRECATED in 0.6.0]

default: "/tmp"

Defines the path where items are stored in the filesystem. This setting
is used by the Crawly.DataStorageWorker process.

> **Deprecated**: This has been deprecated in favour of having pipelines to handle data storage, as of `0.6.0`
### `user_agents` :: list()

default: ["Crawly Bot 1.0"]

Defines the list of user agent strings used for Crawly requests. This setting
is used by the `Crawly.Middlewares.UserAgent` middleware. When the list contains
more than one item, each request is executed with a user agent string chosen
randomly from the supplied list.

> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Middlewares.UserAgent` module documentation for correct usage.
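
A minimal sketch of the tuple-based middleware configuration this commit switches to, mirroring the `config/config.exs` change above (the user agent string here is just a placeholder):

```elixir
config :crawly,
  middlewares: [
    Crawly.Middlewares.DomainFilter,
    Crawly.Middlewares.UniqueRequest,
    Crawly.Middlewares.RobotsTxt,
    {Crawly.Middlewares.UserAgent, user_agents: ["Crawly Bot 1.0"]}
  ]
```
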
### `item` :: [atom()]

default: []

Defines a list of required fields for the item. If any of the required
fields is missing from the scraped item (or if the value of a required
field is "" or nil), the item will be dropped. This setting is used by
the `Crawly.Pipelines.Validate` pipeline.

> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Pipelines.Validate` module documentation for correct usage.
### `item_id` :: atom()

default: nil

Defines a field which is used to identify whether an item is a duplicate.
On most e-commerce websites the natural id field is the SKU. This setting
is used by the `Crawly.Pipelines.DuplicatesFilter` pipeline. If unset, the
pipeline is effectively disabled.

> **Deprecated**: This has been deprecated in favour of tuple-based pipeline configuration instead of global configurations, as of `0.7.0`. Refer to `Crawly.Pipelines.DuplicatesFilter` module documentation for correct usage.
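
A minimal sketch of the corresponding tuple-based pipeline configuration, mirroring the `config/config.exs` change in this commit (field names and the `item_id` value are illustrative):

```elixir
config :crawly,
  pipelines: [
    {Crawly.Pipelines.Validate, fields: [:title, :author, :time, :url]},
    {Crawly.Pipelines.DuplicatesFilter, item_id: :title},
    Crawly.Pipelines.JSONEncoder
  ]
```
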
### `pipelines` :: [module()]

default: []
@@ -99,17 +56,12 @@ default: :disabled

An integer which specifies a maximum number of items. If the spider scrapes more than that amount and those items are passed through the item pipeline, the spider will be closed. If set to :disabled, the spider will not be stopped.

### closespider_timeout :: pos_integer()
### closespider_timeout :: pos_integer() | :disabled

default: nil

Defines the minimum number of items which need to be scraped by the spider within the given timeframe (30s). If the spider does not reach this limit, it will be stopped.

### follow_redirect :: boolean()

default: false

Defines whether the Crawly spider should follow HTTP redirects.

### concurrent_requests_per_domain :: pos_integer()

2 changes: 1 addition & 1 deletion lib/crawly/data_storage/data_storage_worker.ex
@@ -33,7 +33,7 @@ defmodule Crawly.DataStorage.Worker do
end

def handle_cast({:store, item}, state) do
pipelines = Application.get_env(:crawly, :pipelines, [])
pipelines = Crawly.Utils.get_settings(:pipelines, state.spider_name, [])

state =
case Crawly.Utils.pipe(pipelines, item, state) do
2 changes: 2 additions & 0 deletions lib/crawly/fetchers/fetcher.ex
@@ -6,6 +6,8 @@ defmodule Crawly.Fetchers.Fetcher do
Crawly.Request, HTTP client options and return Crawly.Response.
"""

@type t :: {module(), list()}

@callback fetch(request, options) :: {:ok, response} | {:error, reason}
when request: Crawly.Request.t(),
response: Crawly.Response.t(),
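
The new `t()` type makes explicit that a fetcher is configured as a `{module, options}` tuple. A hypothetical custom fetcher sketch is shown below; the `Crawly.Request` fields it reads (`url`, `headers`, `options`) and the acceptance of an HTTPoison response as the return value are assumptions, not documented API:

```elixir
defmodule MyApp.SimpleFetcher do
  # Hypothetical fetcher sketch: delegates the request to HTTPoison and
  # returns its result as-is. The Crawly.Request fields used here
  # (url, headers, options) are assumed, not documented API.
  @behaviour Crawly.Fetchers.Fetcher

  @impl Crawly.Fetchers.Fetcher
  def fetch(request, _client_options) do
    HTTPoison.get(request.url, request.headers, request.options)
  end
end
```

It would then be wired in as `fetcher: {MyApp.SimpleFetcher, []}`, matching the new `{module(), list()}` type.
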
32 changes: 21 additions & 11 deletions lib/crawly/manager.ex
@@ -31,6 +31,8 @@ defmodule Crawly.Manager do

use GenServer

alias Crawly.Utils

def start_link(spider_name) do
Logger.debug("Starting the manager for #{spider_name}")
GenServer.start_link(__MODULE__, spider_name)
@@ -57,7 +59,7 @@

# Start workers
num_workers =
Application.get_env(:crawly, :concurrent_requests_per_domain, 4)
Utils.get_settings(:concurrent_requests_per_domain, spider_name, 4)

worker_pids =
Enum.map(1..num_workers, fn _x ->
@@ -72,8 +74,15 @@
)

# Schedule basic service operations for given spider manager
tref = Process.send_after(self(), :operations, get_timeout())
{:ok, %{name: spider_name, tref: tref, prev_scraped_cnt: 0}}
tref =
Process.send_after(
self(),
:operations,
Utils.get_settings(:manager_operations_timeout, spider_name, @timeout)
)

{:ok,
%{name: spider_name, tref: tref, prev_scraped_cnt: 0, workers: worker_pids}}
end

def handle_info(:operations, state) do
Expand All @@ -85,7 +94,7 @@ defmodule Crawly.Manager do
delta = items_count - state.prev_scraped_cnt
Logger.info("Current crawl speed is: #{delta} items/min")

case Application.get_env(:crawly, :closespider_itemcount, :disabled) do
case Utils.get_settings(:closespider_itemcount, state.name, :disabled) do
:disabled ->
:ignored

@@ -100,8 +109,8 @@
:ignoring
end

# Close spider in case if it's not scraping itms fast enough
case Application.get_env(:crawly, :closespider_timeout) do
# Close spider in case if it's not scraping items fast enough
case Utils.get_settings(:closespider_timeout, state.name, :disabled) do
:undefined ->
:ignoring

@@ -116,12 +125,13 @@
:ignoring
end

tref = Process.send_after(self(), :operations, get_timeout())
tref =
Process.send_after(
self(),
:operations,
Utils.get_settings(:manager_operations_timeout, state.name, @timeout)
)

{:noreply, %{state | tref: tref, prev_scraped_cnt: items_count}}
end

defp get_timeout() do
Application.get_env(:crawly, :manager_operations_timeout, @timeout)
end
end
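
The recurring change in this file — `Application.get_env/3` replaced by `Utils.get_settings/3` taking a spider name — is what enables per-spider settings. The implementation is not shown in this diff; a plausible sketch of the lookup order (spider-level override first, then the global `:crawly` config, then the default), with the `override_settings/0` callback name being an assumption:

```elixir
defmodule MyApp.SettingsLookupSketch do
  # Hypothetical sketch of the lookup order behind Crawly.Utils.get_settings/3 —
  # not the library's actual implementation. It assumes a spider module may
  # export an optional override_settings/0 callback returning a keyword list.
  def get_settings(setting_name, spider_name \\ nil, default \\ nil) do
    global_value = Application.get_env(:crawly, setting_name, default)

    if spider_name != nil and function_exported?(spider_name, :override_settings, 0) do
      Keyword.get(spider_name.override_settings(), setting_name, global_value)
    else
      global_value
    end
  end
end
```
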
5 changes: 2 additions & 3 deletions lib/crawly/middlewares/user_agent.ex
@@ -12,7 +12,7 @@ defmodule Crawly.Middlewares.UserAgent do
### Example Declaration
```
middlewares: [
{UserAgent, user_agents: ["My Custom Bot] }
{UserAgent, user_agents: ["My Custom Bot"] }
]
```
"""
@@ -24,8 +24,7 @@
new_headers = List.keydelete(request.headers, "User-Agent", 0)

user_agents =
Map.get(opts, :user_agents) ||
Application.get_env(:crawly, :user_agents, ["Crawly Bot 1.0"])
Map.get(opts, :user_agents, ["Crawly Bot 1.0"])

useragent = Enum.random(user_agents)

2 changes: 1 addition & 1 deletion lib/crawly/pipelines/csv_encoder.ex
@@ -18,7 +18,7 @@ defmodule Crawly.Pipelines.CSVEncoder do
{false, state :: map} | {csv_line :: String.t(), state :: map}
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{fields: nil})
fields = Map.get(opts, :fields) || Application.get_env(:crawly, :item)
fields = Map.get(opts, :fields, [])

case fields do
:undefined ->
2 changes: 1 addition & 1 deletion lib/crawly/pipelines/duplicates_filter.ex
@@ -32,7 +32,7 @@ defmodule Crawly.Pipelines.DuplicatesFilter do
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{item_id: nil})

item_id = Map.get(opts, :item_id) || Application.get_env(:crawly, :item_id)
item_id = Map.get(opts, :item_id)

item_id = Map.get(item, item_id)

2 changes: 1 addition & 1 deletion lib/crawly/pipelines/validate.ex
@@ -27,7 +27,7 @@ defmodule Crawly.Pipelines.Validate do
@impl Crawly.Pipeline
def run(item, state, opts \\ []) do
opts = Enum.into(opts, %{fields: nil})
fields = Map.get(opts, :fields) || Application.get_env(:crawly, :item, [])
fields = Map.get(opts, :fields, [])

validation_result =
fields
13 changes: 2 additions & 11 deletions lib/crawly/pipelines/write_to_file.ex
@@ -59,20 +59,11 @@ defmodule Crawly.Pipelines.WriteToFile do
def run(item, state, opts) do
opts = Enum.into(opts, %{folder: nil, extension: nil})

global_config =
Application.get_env(
:crawly,
Crawly.Pipelines.WriteToFile,
Keyword.new()
)

folder =
Map.get(opts, :folder) ||
Keyword.get(global_config, :folder, System.tmp_dir!())
Map.get(opts, :folder, "./")

extension =
Map.get(opts, :extension) ||
Keyword.get(global_config, :extension, "jl")
Map.get(opts, :extension, "jl")

fd = open_fd(state.spider_name, folder, extension)
:ok = write(fd, item)
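
With the global `config :crawly, Crawly.Pipelines.WriteToFile, ...` lookup removed here, folder and extension are expected as tuple options on the pipeline entry. A minimal sketch of the new style (the "/tmp" folder is just an example value):

```elixir
config :crawly,
  pipelines: [
    Crawly.Pipelines.JSONEncoder,
    {Crawly.Pipelines.WriteToFile, folder: "/tmp", extension: "jl"}
  ]
```
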
48 changes: 48 additions & 0 deletions lib/crawly/settings.ex
@@ -0,0 +1,48 @@
defmodule Crawly.Settings do
@moduledoc """
Define Crawly setting types
"""

@type numeric_setting() :: pos_integer() | :disabled
@type retry() :: [
retry_codes: [pos_integer()],
max_retries: pos_integer(),
ignored_middlewares: [module()]
]

@type middleware() ::
Crawly.Middlewares.DomainFilter
| Crawly.Middlewares.UniqueRequest
| Crawly.Middlewares.RobotsTxt
| Crawly.Middlewares.AutoCookiesManager
| {Crawly.Middlewares.UserAgent, user_agents: [binary()]}

@type pipeline() ::
Crawly.Pipelines.JSONEncoder
| {Crawly.Pipelines.DuplicatesFilter, item_id: atom()}
| {Crawly.Pipelines.Validate, fields: [atom()]}
| {Crawly.Pipelines.CSVEncoder, fields: [atom()]}
| {Crawly.Pipelines.WriteToFile,
folder: binary(), extension: binary()}

@type t() :: [
# Allows to stop spider after a given number of scraped items
# :disabled by default.
closespider_itemcount: numeric_setting(),

# Allows to stop spider if it extracts less than a given amount of
# items per minute.
closespider_timeout: pos_integer(),

# Allows to control how many workers are started for a given domain
concurrent_requests_per_domain: pos_integer(),

# Allows to define a fetcher to perform HTTP requests
fetcher: Crawly.Fetchers.Fetcher.t(),

# Defines retries
retry: retry(),
middlewares: [middleware()],
pipelines: [pipeline()]
]
end
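
Put together, a keyword list conforming to the new `Crawly.Settings.t()` type looks much like the application config at the top of this diff; a condensed, illustrative sketch:

```elixir
# Condensed sketch of a keyword list conforming to Crawly.Settings.t();
# values mirror config/config.exs above and are illustrative only.
settings = [
  closespider_itemcount: 500,
  closespider_timeout: 20,
  concurrent_requests_per_domain: 5,
  fetcher: {Crawly.Fetchers.HTTPoisonFetcher, []},
  retry: [
    retry_codes: [400],
    max_retries: 3,
    ignored_middlewares: [Crawly.Middlewares.UniqueRequest]
  ],
  middlewares: [
    Crawly.Middlewares.DomainFilter,
    {Crawly.Middlewares.UserAgent, user_agents: ["Crawly Bot 1.0"]}
  ],
  pipelines: [
    {Crawly.Pipelines.Validate, fields: [:title, :url]},
    Crawly.Pipelines.JSONEncoder
  ]
]
```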
