Skip to content

Commit

Permalink
feat: change HTML conversion backend from boilerpy3 to Trafilatura (#…
Browse files Browse the repository at this point in the history
…7705)

* change HTML conversion backend to Trafilatura

* rm unused var
  • Loading branch information
anakin87 committed May 17, 2024
1 parent 57af95d commit 7181f6b
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 89 deletions.
96 changes: 36 additions & 60 deletions haystack/components/converters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
#
# SPDX-License-Identifier: Apache-2.0

import warnings
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
from typing import Any, Dict, List, Optional, Union

from boilerpy3 import extractors
from trafilatura import extract

from haystack import Document, component, default_from_dict, default_to_dict, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
Expand All @@ -31,40 +32,35 @@ class HTMLToDocument:
```
"""

known_extractors: ClassVar[List[str]] = [
"DefaultExtractor",
"ArticleExtractor",
"ArticleSentencesExtractor",
"LargestContentExtractor",
"CanolaExtractor",
"KeepEverythingExtractor",
"NumWordsRulesExtractor",
]

def __init__(
self,
extractor_type: Literal[
"DefaultExtractor",
"ArticleExtractor",
"ArticleSentencesExtractor",
"LargestContentExtractor",
"CanolaExtractor",
"KeepEverythingExtractor",
"NumWordsRulesExtractor",
] = "DefaultExtractor",
try_others: bool = True,
extractor_type: Optional[str] = None,
try_others: Optional[bool] = None,
extraction_kwargs: Optional[Dict[str, Any]] = None,
):
"""
Create an HTMLToDocument component.
:param
extractor_type: Name of the extractor class to use. Defaults to `DefaultExtractor`.
For more information on the different types of extractors,
see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
:param try_others: If `True`, the component will try other extractors if the user chosen extractor fails.
:param extractor_type: Ignored. This parameter is kept for compatibility with previous versions. It will be
removed in Haystack 2.4.0. To customize the extraction, use the `extraction_kwargs` parameter.
:param try_others: Ignored. This parameter is kept for compatibility with previous versions. It will be
removed in Haystack 2.4.0.
:param extraction_kwargs: A dictionary containing keyword arguments to customize the extraction process. These
are passed to the underlying Trafilatura `extract` function. For the full list of available arguments, see
the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
"""
self.extractor_type = extractor_type
self.try_others = try_others
if extractor_type is not None:
warnings.warn(
"The `extractor_type` parameter is ignored and will be removed in Haystack 2.4.0. "
"To customize the extraction, use the `extraction_kwargs` parameter.",
DeprecationWarning,
)
if try_others is not None:
warnings.warn(
"The `try_others` parameter is ignored and will be removed in Haystack 2.4.0. ", DeprecationWarning
)

self.extraction_kwargs = extraction_kwargs or {}

def to_dict(self) -> Dict[str, Any]:
"""
Expand All @@ -73,7 +69,7 @@ def to_dict(self) -> Dict[str, Any]:
:returns:
Dictionary with serialized data.
"""
return default_to_dict(self, extractor_type=self.extractor_type, try_others=self.try_others)
return default_to_dict(self, extraction_kwargs=self.extraction_kwargs)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
Expand All @@ -92,6 +88,7 @@ def run(
self,
sources: List[Union[str, Path, ByteStream]],
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
extraction_kwargs: Optional[Dict[str, Any]] = None,
):
"""
Converts a list of HTML files to Documents.
Expand All @@ -104,54 +101,33 @@ def run(
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
:param extraction_kwargs:
Additional keyword arguments to customize the extraction process.
:returns:
A dictionary with the following keys:
- `documents`: Created Documents
"""

merged_extraction_kwargs = {**self.extraction_kwargs, **(extraction_kwargs or {})}

documents = []
meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

# Use all extractor types, ensuring user chosen extractor is first, preserve order, avoid duplicates
extractors_list = (
list(
dict.fromkeys(
[self.extractor_type, *self.known_extractors] # User chosen extractor is always tried first
)
)
if self.try_others
else [self.extractor_type]
)

for source, metadata in zip(sources, meta_list):
try:
bytestream = get_bytestream_from_source(source=source)
except Exception as e:
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
continue

text = None
for extractor_name in extractors_list:
extractor_class = getattr(extractors, extractor_name)
extractor = extractor_class(raise_on_failure=False)
try:
text = extractor.get_content(bytestream.data.decode("utf-8"))
if text:
break
except Exception as conversion_e:
if self.try_others:
logger.warning(
"Failed to extract text using {extractor} from {source}. Trying next extractor. Error: {error}",
extractor=extractor_name,
source=source,
error=conversion_e,
)
if not text:
try:
text = extract(bytestream.data.decode("utf-8"), **merged_extraction_kwargs)
except Exception as conversion_e:
logger.warning(
f"Failed to extract text from {source} using extractors: {extractors_list}. Skipping it.",
"Failed to extract text from {source}. Skipping it. Error: {error}",
source=source,
extractors_list=extractors_list,
error=conversion_e,
)
continue

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ dependencies = [
"more-itertools", # TextDocumentSplitter
"networkx", # Pipeline graphs
"typing_extensions>=4.7", # typing support for Python 3.8
"boilerpy3", # Fulltext extraction from HTML pages
"trafilatura", # Fulltext extraction from HTML pages
"requests",
"numpy",
"python-dateutil",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
enhancements:
- |
`HTMLToDocument`: change the HTML conversion backend from `boilerpy3` to `trafilatura`,
which is more robust and better maintained.
deprecations:
- |
The following parameters of `HTMLToDocument` are ignored and will be removed in Haystack 2.4.0:
`extractor_type` and `try_others`.
43 changes: 15 additions & 28 deletions test/components/converters/test_html_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,6 @@ def test_run(self, test_files_path):
assert "Haystack" in docs[0].content
assert docs[0].meta["test"] == "TEST"

def test_run_different_extractors(self, test_files_path):
"""
        Test if the component runs correctly with different boilerpy3 extractors.
"""
sources = [test_files_path / "html" / "what_is_haystack.html"]

converter_article = HTMLToDocument(extractor_type="ArticleExtractor")
converter_keep_everything = HTMLToDocument(extractor_type="KeepEverythingExtractor")

doc_article = converter_article.run(sources=sources)["documents"][0]
doc_keep_everything = converter_keep_everything.run(sources=sources)["documents"][0]

assert len(doc_keep_everything.content) > len(doc_article.content)

def test_run_doc_metadata(self, test_files_path):
"""
Test if the component runs correctly when metadata is supplied by the user.
Expand Down Expand Up @@ -169,26 +155,27 @@ def test_serde(self):
"""
        Test if the component gets serialized and deserialized correctly.
"""
converter = HTMLToDocument("ArticleExtractor")
converter = HTMLToDocument()
serde_data = converter.to_dict()
new_converter = HTMLToDocument.from_dict(serde_data)
assert new_converter.extractor_type == converter.extractor_type
assert new_converter.try_others == converter.try_others
assert new_converter.extraction_kwargs == converter.extraction_kwargs

def test_run_try_others_false(self, test_files_path, caplog):
converter = HTMLToDocument(try_others=False)
result = converter.run(sources=[Path(test_files_path / "html" / "paul_graham_superlinear.html")])

# paul_graham_superlinear.html is a page that the DefaultExtractor cannot extract text from
assert len(result["documents"]) == 0
assert "Failed to extract text from" in caplog.text
assert "Skipping it" in caplog.text
def test_run_difficult_html(self, test_files_path):
# boilerpy3's DefaultExtractor fails to extract text from this HTML file

def test_run_try_others_true(self, test_files_path, caplog):
# try_others=True is the default value
converter = HTMLToDocument()
result = converter.run(sources=[Path(test_files_path / "html" / "paul_graham_superlinear.html")])

# paul_graham_superlinear.html is a page that the DefaultExtractor cannot extract text from
assert len(result["documents"]) == 1
assert "Superlinear" in result["documents"][0].content

def test_run_with_extraction_kwargs(self, test_files_path):
sources = [test_files_path / "html" / "what_is_haystack.html"]

converter = HTMLToDocument()
precise_converter = HTMLToDocument(extraction_kwargs={"favor_precision": True})

doc = converter.run(sources=sources)["documents"][0]
precise_doc = precise_converter.run(sources=sources)["documents"][0]

assert len(doc.content) > len(precise_doc.content)

0 comments on commit 7181f6b

Please sign in to comment.