Skip to content

Commit

Permalink
add keep-id to DocumentCleaner (#7703)
Browse files Browse the repository at this point in the history
  • Loading branch information
CarlosFerLo committed May 16, 2024
1 parent 686a499 commit 57af95d
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 1 deletion.
5 changes: 4 additions & 1 deletion haystack/components/preprocessors/document_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def __init__(
remove_empty_lines: bool = True,
remove_extra_whitespaces: bool = True,
remove_repeated_substrings: bool = False,
keep_id: bool = False,
remove_substrings: Optional[List[str]] = None,
remove_regex: Optional[str] = None,
):
Expand All @@ -53,13 +54,15 @@ def __init__(
which is supported by `TextFileToDocument` and `AzureOCRDocumentConverter`.
:param remove_substrings: List of substrings to remove from the text.
:param remove_regex: Regex to match and replace substrings by "".
:param keep_id: If `True`, keeps the IDs of the original documents.
"""

self.remove_empty_lines = remove_empty_lines
self.remove_extra_whitespaces = remove_extra_whitespaces
self.remove_repeated_substrings = remove_repeated_substrings
self.remove_substrings = remove_substrings
self.remove_regex = remove_regex
self.keep_id = keep_id

@component.output_types(documents=List[Document])
def run(self, documents: List[Document]):
Expand Down Expand Up @@ -98,7 +101,7 @@ def run(self, documents: List[Document]):
if self.remove_repeated_substrings:
text = self._remove_repeated_substrings(text)

cleaned_docs.append(Document(content=text, meta=deepcopy(doc.meta)))
cleaned_docs.append(Document(content=text, meta=deepcopy(doc.meta), id=doc.id if self.keep_id else ""))

return {"documents": cleaned_docs}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
The `DocumentCleaner` class has a new optional attribute `keep_id`. When set to `True`, the document ids are kept unchanged after cleanup.
9 changes: 9 additions & 0 deletions test/components/preprocessors/test_document_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def test_init(self):
assert cleaner.remove_repeated_substrings is False
assert cleaner.remove_substrings is None
assert cleaner.remove_regex is None
assert cleaner.keep_id is False

def test_non_text_document(self, caplog):
with caplog.at_level(logging.WARNING):
Expand Down Expand Up @@ -130,3 +131,11 @@ def test_copy_metadata(self):
for doc, cleaned_doc in zip(documents, result["documents"]):
assert doc.meta == cleaned_doc.meta
assert cleaned_doc.content == "Text."

def test_keep_id_does_not_alter_document_ids(self):
    """With ``keep_id=True`` each cleaned document carries the id of its input document."""
    original_ids = ["1", "2"]
    documents = [Document(content="Text. ", id=doc_id) for doc_id in original_ids]
    cleaner = DocumentCleaner(keep_id=True)

    cleaned = cleaner.run(documents=documents)["documents"]

    # Same number of documents, and ids preserved in order.
    assert len(cleaned) == 2
    assert [doc.id for doc in cleaned] == original_ids

0 comments on commit 57af95d

Please sign in to comment.