File logging improvements (#170)
* Make file logging optional

* Set default value for enable_file_logging to false

* Update documentation

* Change config name

* Code improvement

* Update documentation

* Add debug log for log file path

* Refined log configuration, added log tests

Co-authored-by: TzeYiing <ty@tzeyiing.com>
oshosanya and Ziinc committed Feb 3, 2021
1 parent 1ec03d7 commit 7e09e53
Showing 4 changed files with 63 additions and 5 deletions.
1 change: 1 addition & 0 deletions config/config.exs
@@ -45,6 +45,7 @@ config :crawly,

# TODO: this looks outdated
follow_redirect: true,
log_to_file: false,

# Request middlewares
middlewares: [
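The new flag defaults to off, so existing setups keep their behaviour. A minimal sketch of a global config that opts in, assuming a standard Mix config file (the `log_dir` value is illustrative, not part of this commit):

```elixir
# config/config.exs — illustrative sketch only
import Config

config :crawly,
  # opt in to per-spider log files (defaults to false)
  log_to_file: true,
  # directory under which each spider's log file is written
  log_dir: "/var/log/crawly"
```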
8 changes: 7 additions & 1 deletion documentation/configuration.md
@@ -133,7 +133,13 @@ Allows to specify a custom HTTP client which will be performing request to the c
default: /tmp

Set spider logs directory. All spiders have their own dedicated log file
stored under the `log_dir` folder.
stored under the `log_dir` folder. This option is ignored if `log_to_file` is not set to `true`.

### log_to_file :: boolean()

default: false

Enables or disables file logging.
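
Because this is a regular Crawly setting, it can also be switched on for a single spider through the `override_settings/0` callback, which is the same mechanism the new test below relies on. A hedged sketch with a hypothetical spider module:

```elixir
defmodule MySpider do
  use Crawly.Spider

  # Enable file logging for this spider only; all other spiders
  # keep the global setting (default: false).
  def override_settings(), do: [log_to_file: true, log_dir: System.tmp_dir()]

  def base_url(), do: "https://example.com"
  def init(), do: [start_urls: ["https://example.com"]]
  def parse_item(_response), do: %{items: [], requests: []}
end
```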

### port :: pos_integer()

28 changes: 24 additions & 4 deletions lib/crawly/engine.ex
@@ -61,7 +61,9 @@ defmodule Crawly.Engine do
|> Map.put_new_lazy(:crawl_id, &UUID.uuid1/0)

# Filter all logs related to a given spider
set_spider_log(spider_name, opts[:crawl_id])
if Crawly.Utils.get_settings(:log_to_file, spider_name) do
configure_spider_logs(spider_name, opts[:crawl_id])
end

GenServer.call(
__MODULE__,
@@ -240,14 +242,32 @@
|> Enum.dedup_by(& &1)
end

defp set_spider_log(spider_name, crawl_id) do
log_dir = Crawly.Utils.get_settings(:log_dir, spider_name, "/tmp")
defp configure_spider_logs(spider_name, crawl_id) do
log_dir =
Crawly.Utils.get_settings(
:log_dir,
spider_name,
System.tmp_dir()
)

current_unix_timestamp = :os.system_time(:second)

Logger.add_backend({LoggerFileBackend, :debug})

log_file_path =
Path.join([
log_dir,
inspect(spider_name),
# underscore separates the timestamp and the crawl_id
inspect(current_unix_timestamp) <> "_" <> crawl_id
]) <> ".log"

Logger.configure_backend({LoggerFileBackend, :debug},
path: "/#{log_dir}/#{spider_name}/#{crawl_id}.log",
path: log_file_path,
level: :debug,
metadata_filter: [crawl_id: crawl_id]
)

Logger.debug("Writing logs to #{log_file_path}")
end
end
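
Taken together, the new helper writes one file per crawl, named `<unix timestamp>_<crawl_id>.log` inside a per-spider folder; it also drops the hard-coded leading slash the old `"/#{log_dir}/..."` interpolation produced. A standalone sketch of the resulting path, using the same calls as the diff (spider name and crawl id are made up):

```elixir
log_dir = System.tmp_dir()            # e.g. "/tmp"
spider_name = MySpider                # hypothetical spider module
crawl_id = "hypothetical-crawl-id"    # the real value comes from UUID.uuid1/0
timestamp = :os.system_time(:second)  # e.g. 1612345678

Path.join([log_dir, inspect(spider_name), inspect(timestamp) <> "_" <> crawl_id]) <> ".log"
# => "/tmp/MySpider/1612345678_hypothetical-crawl-id.log"
```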
31 changes: 31 additions & 0 deletions test/engine_test.exs
@@ -1,6 +1,17 @@
defmodule EngineTest do
use ExUnit.Case

setup do
on_exit(fn ->
:meck.unload()

Crawly.Engine.list_known_spiders()
|> Enum.each(fn s ->
Crawly.Engine.stop_spider(s)
end)
end)
end

test "list_known_spiders/0 lists all spiders and their current status in the engine" do
Crawly.Engine.init([])
Crawly.Engine.refresh_spider_list()
@@ -24,4 +35,24 @@ defmodule EngineTest do
spiders = Crawly.Engine.list_known_spiders()
assert Enum.all?(spiders, fn s -> s.status == :stopped end)
end

test ":log_to_file allows for logging to log file" do
:meck.expect(TestSpider, :override_settings, fn ->
[log_dir: "/my_tmp_dir", log_to_file: true]
end)

:meck.expect(Logger, :configure_backend, fn {_, :debug}, opts ->
log_file_path = Keyword.get(opts, :path)
assert log_file_path =~ "TestSpider"
assert log_file_path =~ "/my_tmp_dir"
end)

Crawly.Engine.init([])
Crawly.Engine.refresh_spider_list()

# test a started spider
Crawly.Engine.start_spider(TestSpider)

assert :meck.num_calls(Logger, :configure_backend, :_) == 1
end
end
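
One detail the test does not exercise: the backend is configured with `metadata_filter: [crawl_id: crawl_id]`, so only log entries carrying the matching `crawl_id` metadata reach the file. Assuming the standard `Logger` metadata mechanism, spider-side code would tag its entries roughly like this sketch:

```elixir
require Logger

crawl_id = "hypothetical-crawl-id"

# Attach the crawl id to the current process's Logger metadata so the
# backend's `metadata_filter: [crawl_id: crawl_id]` matches these entries.
Logger.metadata(crawl_id: crawl_id)
Logger.debug("Fetched a page")
```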
