Skip to content

Commit

Permalink
feat: change HTML conversion backend from boilerpy3 to Trafilatura (#…
Browse files Browse the repository at this point in the history
…7705)

* change HTML conversion backend to Trafilatura

* rm unused var
  • Loading branch information
anakin87 committed May 17, 2024
1 parent 57af95d commit 7181f6b
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 89 deletions.
96 changes: 36 additions & 60 deletions haystack/components/converters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
#
# SPDX-License-Identifier: Apache-2.0

import warnings
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
from typing import Any, Dict, List, Optional, Union

from boilerpy3 import extractors
from trafilatura import extract

from haystack import Document, component, default_from_dict, default_to_dict, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
Expand All @@ -31,40 +32,35 @@ class HTMLToDocument:
```
"""

known_extractors: ClassVar[List[str]] = [
"DefaultExtractor",
"ArticleExtractor",
"ArticleSentencesExtractor",
"LargestContentExtractor",
"CanolaExtractor",
"KeepEverythingExtractor",
"NumWordsRulesExtractor",
]

def __init__(
self,
extractor_type: Literal[
"DefaultExtractor",
"ArticleExtractor",
"ArticleSentencesExtractor",
"LargestContentExtractor",
"CanolaExtractor",
"KeepEverythingExtractor",
"NumWordsRulesExtractor",
] = "DefaultExtractor",
try_others: bool = True,
extractor_type: Optional[str] = None,
try_others: Optional[bool] = None,
extraction_kwargs: Optional[Dict[str, Any]] = None,
):
"""
Create an HTMLToDocument component.
:param
extractor_type: Name of the extractor class to use. Defaults to `DefaultExtractor`.
For more information on the different types of extractors,
see [boilerpy3 documentation](https://github.com/jmriebold/BoilerPy3?tab=readme-ov-file#extractors).
:param try_others: If `True`, the component will try other extractors if the user chosen extractor fails.
:param extractor_type: Ignored. This parameter is kept for compatibility with previous versions. It will be
removed in Haystack 2.4.0. To customize the extraction, use the `extraction_kwargs` parameter.
:param try_others: Ignored. This parameter is kept for compatibility with previous versions. It will be
removed in Haystack 2.4.0.
:param extraction_kwargs: A dictionary containing keyword arguments to customize the extraction process. These
are passed to the underlying Trafilatura `extract` function. For the full list of available arguments, see
the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
"""
self.extractor_type = extractor_type
self.try_others = try_others
if extractor_type is not None:
warnings.warn(
"The `extractor_type` parameter is ignored and will be removed in Haystack 2.4.0. "
"To customize the extraction, use the `extraction_kwargs` parameter.",
DeprecationWarning,
)
if try_others is not None:
warnings.warn(
"The `try_others` parameter is ignored and will be removed in Haystack 2.4.0. ", DeprecationWarning
)

self.extraction_kwargs = extraction_kwargs or {}

def to_dict(self) -> Dict[str, Any]:
"""
Expand All @@ -73,7 +69,7 @@ def to_dict(self) -> Dict[str, Any]:
:returns:
Dictionary with serialized data.
"""
return default_to_dict(self, extractor_type=self.extractor_type, try_others=self.try_others)
return default_to_dict(self, extraction_kwargs=self.extraction_kwargs)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
Expand All @@ -92,6 +88,7 @@ def run(
self,
sources: List[Union[str, Path, ByteStream]],
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
extraction_kwargs: Optional[Dict[str, Any]] = None,
):
"""
Converts a list of HTML files to Documents.
Expand All @@ -104,54 +101,33 @@ def run(
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
:param extraction_kwargs:
Additional keyword arguments to customize the extraction process.
:returns:
A dictionary with the following keys:
- `documents`: Created Documents
"""

merged_extraction_kwargs = {**self.extraction_kwargs, **(extraction_kwargs or {})}

documents = []
meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

# Use all extractor types, ensuring user chosen extractor is first, preserve order, avoid duplicates
extractors_list = (
list(
dict.fromkeys(
[self.extractor_type, *self.known_extractors] # User chosen extractor is always tried first
)
)
if self.try_others
else [self.extractor_type]
)

for source, metadata in zip(sources, meta_list):
try:
bytestream = get_bytestream_from_source(source=source)
except Exception as e:
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
continue

text = None
for extractor_name in extractors_list:
extractor_class = getattr(extractors, extractor_name)
extractor = extractor_class(raise_on_failure=False)
try:
text = extractor.get_content(bytestream.data.decode("utf-8"))
if text:
break
except Exception as conversion_e:
if self.try_others:
logger.warning(
"Failed to extract text using {extractor} from {source}. Trying next extractor. Error: {error}",
extractor=extractor_name,
source=source,
error=conversion_e,
)
if not text:
try:
text = extract(bytestream.data.decode("utf-8"), **merged_extraction_kwargs)
except Exception as conversion_e:
logger.warning(
f"Failed to extract text from {source} using extractors: {extractors_list}. Skipping it.",
"Failed to extract text from {source}. Skipping it. Error: {error}",
source=source,
extractors_list=extractors_list,
error=conversion_e,
)
continue

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ dependencies = [
"more-itertools", # TextDocumentSplitter
"networkx", # Pipeline graphs
"typing_extensions>=4.7", # typing support for Python 3.8
"boilerpy3", # Fulltext extraction from HTML pages
"trafilatura", # Fulltext extraction from HTML pages
"requests",
"numpy",
"python-dateutil",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
enhancements:
- |
`HTMLToDocument`: change the HTML conversion backend from `boilerpy3` to `trafilatura`,
which is more robust and better maintained.
deprecations:
- |
The following parameters of `HTMLToDocument` are ignored and will be removed in Haystack 2.4.0:
`extractor_type` and `try_others`.
43 changes: 15 additions & 28 deletions test/components/converters/test_html_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,6 @@ def test_run(self, test_files_path):
assert "Haystack" in docs[0].content
assert docs[0].meta["test"] == "TEST"

def test_run_different_extractors(self, test_files_path):
"""
        Test if the component runs correctly with different boilerpy3 extractors.
"""
sources = [test_files_path / "html" / "what_is_haystack.html"]

converter_article = HTMLToDocument(extractor_type="ArticleExtractor")
converter_keep_everything = HTMLToDocument(extractor_type="KeepEverythingExtractor")

doc_article = converter_article.run(sources=sources)["documents"][0]
doc_keep_everything = converter_keep_everything.run(sources=sources)["documents"][0]

assert len(doc_keep_everything.content) > len(doc_article.content)

def test_run_doc_metadata(self, test_files_path):
"""
Test if the component runs correctly when metadata is supplied by the user.
Expand Down Expand Up @@ -169,26 +155,27 @@ def test_serde(self):
"""
        Test if the component gets serialized and deserialized correctly.
"""
converter = HTMLToDocument("ArticleExtractor")
converter = HTMLToDocument()
serde_data = converter.to_dict()
new_converter = HTMLToDocument.from_dict(serde_data)
assert new_converter.extractor_type == converter.extractor_type
assert new_converter.try_others == converter.try_others
assert new_converter.extraction_kwargs == converter.extraction_kwargs

def test_run_try_others_false(self, test_files_path, caplog):
converter = HTMLToDocument(try_others=False)
result = converter.run(sources=[Path(test_files_path / "html" / "paul_graham_superlinear.html")])

# paul_graham_superlinear.html is a page that the DefaultExtractor cannot extract text from
assert len(result["documents"]) == 0
assert "Failed to extract text from" in caplog.text
assert "Skipping it" in caplog.text
def test_run_difficult_html(self, test_files_path):
# boilerpy3's DefaultExtractor fails to extract text from this HTML file

def test_run_try_others_true(self, test_files_path, caplog):
# try_others=True is the default value
converter = HTMLToDocument()
result = converter.run(sources=[Path(test_files_path / "html" / "paul_graham_superlinear.html")])

# paul_graham_superlinear.html is a page that the DefaultExtractor cannot extract text from
assert len(result["documents"]) == 1
assert "Superlinear" in result["documents"][0].content

def test_run_with_extraction_kwargs(self, test_files_path):
sources = [test_files_path / "html" / "what_is_haystack.html"]

converter = HTMLToDocument()
precise_converter = HTMLToDocument(extraction_kwargs={"favor_precision": True})

doc = converter.run(sources=sources)["documents"][0]
precise_doc = precise_converter.run(sources=sources)["documents"][0]

assert len(doc.content) > len(precise_doc.content)

0 comments on commit 7181f6b

Please sign in to comment.