feat: implement support for structured logging (#7126)
* feat: implement support for structured logging

* docs: add release notes

* style: add explanatory comment

* chore: test + import fixes

* tests: fix windows tests
wochinge committed Feb 27, 2024
1 parent e0b6924 commit 2a59128
Showing 6 changed files with 329 additions and 2 deletions.
5 changes: 5 additions & 0 deletions haystack/__init__.py
@@ -3,6 +3,11 @@
from haystack.core.pipeline import Pipeline
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.dataclasses import Answer, Document, ExtractedAnswer, GeneratedAnswer
import haystack.logging

# Initialize the logging configuration
# This is a no-op unless `structlog` is installed
haystack.logging.configure_logging()

__all__ = [
"component",
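Because `configure_logging()` now runs at import time, plain stdlib logging picks up the structured rendering automatically whenever `structlog` is available. A minimal sketch of the resulting default behavior (the logger name and key-value pairs are illustrative, not from the diff):

```python
import logging

import haystack  # importing haystack calls configure_logging(); a no-op if structlog is missing

logger = logging.getLogger(__name__)

# With structlog installed, the key-value pairs passed via `extra` are rendered
# alongside the message by structlog's console renderer; without structlog this
# behaves like ordinary stdlib logging.
logger.warning("Indexing finished", extra={"document_count": 42})
```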
97 changes: 97 additions & 0 deletions haystack/logging.py
@@ -0,0 +1,97 @@
import logging
import os
import typing
from typing import List

if typing.TYPE_CHECKING:
    from structlog.typing import Processor

HAYSTACK_LOGGING_USE_JSON_ENV_VAR = "HAYSTACK_LOGGING_USE_JSON"
HAYSTACK_LOGGING_IGNORE_STRUCTLOG_ENV_VAR = "HAYSTACK_LOGGING_IGNORE_STRUCTLOG"


def configure_logging(use_json: bool = False) -> None:
"""Configure logging for Haystack.
- If `structlog` is not installed, we keep everything as it is. The user is responsible for configuring logging
themselves.
- If `structlog` is installed, we configure it to format log entries including its key-value data. To disable this
behavior set the environment variable `HAYSTACK_LOGGING_IGNORE_STRUCTLOG` to `true`.
- If `structlog` is installed, you can JSON format all logs. Enable this by
- setting the `use_json` parameter to `True` when calling this function
- setting the environment variable `HAYSTACK_LOGGING_USE_JSON` to `true`
"""
    try:
        import structlog
        from structlog.processors import ExceptionRenderer
        from structlog.tracebacks import ExceptionDictTransformer

    except ImportError:
        # structlog is not installed - fall back to standard logging
        return

    if os.getenv(HAYSTACK_LOGGING_IGNORE_STRUCTLOG_ENV_VAR, "false").lower() == "true":
        # If the user wants to ignore structlog, we don't configure it and fall back to standard logging
        return

    # We roughly follow the structlog documentation here:
    # https://www.structlog.org/en/stable/standard-library.html#rendering-using-structlog-based-formatters-within-logging
    # This means that we use structlog to format entries emitted via both `logging` and `structlog`.

    shared_processors: List[Processor] = [
        # Add the log level to the event_dict for structlog to use
        structlog.stdlib.add_log_level,
        # Add the current timestamp in ISO format to logs
        structlog.processors.TimeStamper(fmt="iso"),
    ]

    structlog.configure(
        processors=shared_processors + [structlog.stdlib.ProcessorFormatter.wrap_for_formatter],
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
        # Filter out log entries that are below the effective log level of the root logger.
        wrapper_class=structlog.make_filtering_bound_logger(min_level=logging.root.getEffectiveLevel()),
    )

    renderers: List[Processor]
    if os.getenv(HAYSTACK_LOGGING_USE_JSON_ENV_VAR, "false").lower() == "true" or use_json:
        renderers = [
            ExceptionRenderer(
                # don't show locals in production logs - this can be quite sensitive information
                ExceptionDictTransformer(show_locals=False)
            ),
            structlog.processors.JSONRenderer(),
        ]
    else:
        renderers = [structlog.dev.ConsoleRenderer()]

    formatter = structlog.stdlib.ProcessorFormatter(
        # These run ONLY on `logging` entries that do NOT originate within structlog.
        foreign_pre_chain=shared_processors
        + [
            # Add the information from the `logging` `extra` keyword arguments to the event dictionary
            structlog.stdlib.ExtraAdder()
        ],
        # These run on ALL entries after the pre_chain is done.
        processors=[
            # Remove `_record` and `_from_structlog` so that this metadata is not added to the final log record
            structlog.stdlib.ProcessorFormatter.remove_processors_meta,
            *renderers,
        ],
    )

    handler = logging.StreamHandler()
    handler.name = "HaystackLoggingHandler"
    # Use OUR `ProcessorFormatter` to format all `logging` entries.
    handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    # Avoid adding our handler twice if configure_logging() is called again
    old_handlers = [
        h
        for h in root_logger.handlers
        if not (isinstance(h, logging.StreamHandler) and h.name == "HaystackLoggingHandler")
    ]
    new_handlers = [handler, *old_handlers]
    root_logger.handlers = new_handlers
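A short sketch of the three configuration paths described in the docstring above, assuming `structlog` is installed (the log message and keys are illustrative):

```python
import logging

import haystack.logging

# Render all logs as JSON, either via the parameter...
haystack.logging.configure_logging(use_json=True)

# ...or equivalently by exporting HAYSTACK_LOGGING_USE_JSON=true before calling
# configure_logging() with no arguments.
#
# Exporting HAYSTACK_LOGGING_IGNORE_STRUCTLOG=true instead makes configure_logging()
# a no-op, leaving the standard logging configuration untouched.

logging.getLogger(__name__).warning("Hello, structured logging!", extra={"key1": "value1"})
```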
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -122,6 +122,9 @@ extra-dependencies = [
  # Tracing
  "opentelemetry-sdk",
  "ddtrace",

  # Structured logging
  "structlog",
]

[tool.hatch.envs.test.scripts]
18 changes: 18 additions & 0 deletions releasenotes/notes/structured-logging-2d6cef3fee2b4f0e.yaml
@@ -0,0 +1,18 @@
---
features:
  - |
    Haystack now supports structured logging out of the box.
    Logging now behaves in one of three ways:
    - If [`structlog`](https://www.structlog.org/en/stable/) is not installed, Haystack will
      use the standard Python logging library with whatever configuration is present.
    - If `structlog` is installed, Haystack will log through [`structlog`](https://www.structlog.org/en/stable/) using
      structlog's console renderer.
      To disable structlog, set the environment variable `HAYSTACK_LOGGING_IGNORE_STRUCTLOG` to `true`.
    - To log in JSON, install [`structlog`](https://www.structlog.org/en/stable/) and either
      - set the environment variable `HAYSTACK_LOGGING_USE_JSON` to `true`, or
      - enable JSON logging from Python:
        ```python
        import haystack.logging

        haystack.logging.configure_logging(use_json=True)
        ```
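Roughly what the JSON mode emits per log call, assuming `structlog` is installed; the expected shape is taken from the assertions in the tests below, while the timestamp value and key order are illustrative:

```python
import logging

import haystack.logging

haystack.logging.configure_logging(use_json=True)
logging.getLogger(__name__).warning("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})

# Expected output on stderr, one JSON object per log entry, roughly:
# {"key1": "value1", "key2": "value2", "event": "Hello, structured logging!",
#  "level": "warning", "timestamp": "2024-02-27T12:00:00.000000Z"}
```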
206 changes: 206 additions & 0 deletions test/test_logging.py
@@ -0,0 +1,206 @@
import builtins
import json
import logging
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from unittest.mock import ANY, Mock

import pytest
from _pytest.capture import CaptureFixture
from _pytest.logging import LogCaptureFixture
from _pytest.monkeypatch import MonkeyPatch

from haystack import logging as haystack_logging


@pytest.fixture(autouse=True)
def reset_logging_config() -> None:
    old_handlers = logging.root.handlers.copy()
    yield
    # Reset the logging configuration after each test to avoid impacting other tests
    logging.root.handlers = old_handlers


class TestSkipLoggingConfiguration:
    def test_skip_logging_configuration(
        self, monkeypatch: MonkeyPatch, capfd: CaptureFixture, caplog: LogCaptureFixture
    ) -> None:
        monkeypatch.setenv("HAYSTACK_LOGGING_IGNORE_STRUCTLOG", "true")
        haystack_logging.configure_logging()

        logger = logging.getLogger(__name__)
        logger.warning("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})

        # the pytest fixture caplog only captures logs being rendered from the stdlib logging module
        assert caplog.messages == ["Hello, structured logging!"]

        # Nothing should be captured by capfd since structlog is not configured
        assert capfd.readouterr().err == ""

    def test_skip_logging_if_structlog_not_installed(
        self, monkeypatch: MonkeyPatch, capfd: CaptureFixture, caplog: LogCaptureFixture
    ) -> None:
        monkeypatch.delitem(sys.modules, "structlog", raising=False)
        monkeypatch.setattr(builtins, "__import__", Mock(side_effect=ImportError))

        haystack_logging.configure_logging()

        logger = logging.getLogger(__name__)
        logger.warning("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})

        # the pytest fixture caplog only captures logs being rendered from the stdlib logging module
        assert caplog.messages == ["Hello, structured logging!"]

        # Nothing should be captured by capfd since structlog is not configured
        assert capfd.readouterr().err == ""


class TestStructuredLoggingConsoleRendering:
    def test_log_filtering_when_using_debug(self, capfd: CaptureFixture) -> None:
        haystack_logging.configure_logging()

        logger = logging.getLogger(__name__)
        logger.debug("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})

        # Use `capfd` to capture the output of the final structlog rendering result
        output = capfd.readouterr().err
        assert output == ""

    def test_log_filtering_when_using_debug_and_log_level_is_debug(self, capfd: CaptureFixture) -> None:
        haystack_logging.configure_logging()

        logger = logging.getLogger(__name__)
        logger.setLevel(logging.DEBUG)

        logger.debug("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})

        # Use `capfd` to capture the output of the final structlog rendering result
        output = capfd.readouterr().err
        assert output != ""

    def test_console_rendered_structured_log(self, capfd: CaptureFixture) -> None:
        haystack_logging.configure_logging()

        logger = logging.getLogger(__name__)
        logger.warning("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})

        # Use `capfd` to capture the output of the final structlog rendering result
        output = capfd.readouterr().err

        # Only check for the minute to be a bit more robust
        today = datetime.now(tz=timezone.utc).isoformat(timespec="minutes").replace("+00:00", "")
        assert today in output

        log_level = "warning"
        assert log_level in output

        assert "Hello, structured logging!" in output

        assert "key1" in output
        assert "value1" in output

    def test_logging_exceptions(self, capfd: CaptureFixture) -> None:
        haystack_logging.configure_logging()

        logger = logging.getLogger(__name__)

        def function_that_raises_and_adds_to_stack_trace():
            raise ValueError("This is an error")

        try:
            function_that_raises_and_adds_to_stack_trace()
        except ValueError:
            logger.exception("An error happened")

        # Use `capfd` to capture the output of the final structlog rendering result
        output = capfd.readouterr().err

        assert "An error happened" in output


class TestStructuredLoggingJSONRendering:
    def test_logging_as_json(self, capfd: CaptureFixture) -> None:
        haystack_logging.configure_logging(use_json=True)

        logger = logging.getLogger(__name__)
        logger.warning("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})

        # Use `capfd` to capture the output of the final structlog rendering result
        output = capfd.readouterr().err
        parsed_output = json.loads(output)  # should not raise an error

        assert parsed_output == {
            "event": "Hello, structured logging!",
            "key1": "value1",
            "key2": "value2",
            "level": "warning",
            "timestamp": ANY,
        }

    def test_logging_as_json_enabling_via_env(self, capfd: CaptureFixture, monkeypatch: MonkeyPatch) -> None:
        monkeypatch.setenv("HAYSTACK_LOGGING_USE_JSON", "true")
        haystack_logging.configure_logging()

        logger = logging.getLogger(__name__)
        logger.warning("Hello, structured logging!", extra={"key1": "value1", "key2": "value2"})

        # Use `capfd` to capture the output of the final structlog rendering result
        output = capfd.readouterr().err
        parsed_output = json.loads(output)  # should not raise an error

        assert parsed_output == {
            "event": "Hello, structured logging!",
            "key1": "value1",
            "key2": "value2",
            "level": "warning",
            "timestamp": ANY,
        }

    def test_logging_exceptions_json(self, capfd: CaptureFixture) -> None:
        haystack_logging.configure_logging(use_json=True)

        logger = logging.getLogger(__name__)

        def function_that_raises_and_adds_to_stack_trace():
            my_local_variable = "my_local_variable"  # noqa: F841
            raise ValueError("This is an error")

        try:
            function_that_raises_and_adds_to_stack_trace()
        except ValueError:
            logger.exception("An error happened ")

        # Use `capfd` to capture the output of the final structlog rendering result
        output = capfd.readouterr().err
        parsed_output = json.loads(output)
        assert parsed_output == {
            "event": "An error happened ",
            "level": "error",
            "timestamp": ANY,
            "exception": [
                {
                    "exc_type": "ValueError",
                    "exc_value": "This is an error",
                    "syntax_error": None,
                    "is_cause": False,
                    "frames": [
                        {
                            "filename": str(Path.cwd() / "test" / "test_logging.py"),
                            "lineno": ANY,  # otherwise the test breaks if you add a line :-)
                            "name": "test_logging_exceptions_json",
                            "line": "",
                            "locals": None,
                        },
                        {
                            "filename": str(Path.cwd() / "test" / "test_logging.py"),
                            "lineno": ANY,  # otherwise the test breaks if you add a line :-)
                            "name": "function_that_raises_and_adds_to_stack_trace",
                            "line": "",
                            "locals": None,
                        },
                    ],
                }
            ],
        }
2 changes: 0 additions & 2 deletions test/tracing/test_datadog.py
@@ -17,8 +17,6 @@ def datadog_tracer(monkeypatch: MonkeyPatch) -> ddtrace.Tracer:

    tracer = ddtrace.Tracer()

-    # monkeypatch.setattr(ddtrace, "tracer", tracer)
-
    return tracer

