Skip to content

Commit

Permalink
refactor: DocxToDocument update (#7857)
Browse files Browse the repository at this point in the history
* Some changes

Use tests file path

* Update tests

* Add another unit test

* Shorten _get_docx_metadata

* Update tests

* Remove try block

* Add a dataclass

* Add a to dict unit test

* Remove unused import

* Add release notes

* Update docstrings

* Use optional instead of pipe

* Update docstring

* Remove file
  • Loading branch information
sjrl committed Jun 19, 2024
1 parent fe60eed commit 3db56d9
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 57 deletions.
3 changes: 2 additions & 1 deletion haystack/components/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.components.converters.docx import DocxToDocument
from haystack.components.converters.docx import DocxMetadata, DocxToDocument
from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters.markdown import MarkdownToDocument
from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
Expand All @@ -24,4 +24,5 @@
"OpenAPIServiceToFunctions",
"OutputAdapter",
"DocxToDocument",
"DocxMetadata",
]
106 changes: 63 additions & 43 deletions haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import io
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
Expand All @@ -19,6 +20,45 @@
from docx.document import Document as DocxDocument


@dataclass
class DocxMetadata:
    """
    Describes the metadata of a Docx file.

    The fields mirror the `core_properties` of a Docx document.

    :param author: The author
    :param category: The category
    :param comments: The comments
    :param content_status: The content status
    :param created: The creation date, or `None` if it is not set
    :param identifier: The identifier
    :param keywords: Available keywords
    :param language: The language of the document
    :param last_modified_by: The name of the user who last modified the document
    :param last_printed: The last printed date, or `None` if it is not set
    :param modified: The last modification date, or `None` if it is not set
    :param revision: The revision number
    :param subject: The subject
    :param title: The title
    :param version: The version
    """

    author: str
    category: str
    comments: str
    content_status: str
    created: Optional[datetime]
    identifier: str
    keywords: str
    language: str
    last_modified_by: str
    last_printed: Optional[datetime]
    modified: Optional[datetime]
    revision: int
    subject: str
    title: str
    version: str


@component
class DocxToDocument:
"""
Expand Down Expand Up @@ -72,16 +112,15 @@ def run(
meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

for source, metadata in zip(sources, meta_list):
# Load source ByteStream
try:
bytestream = get_bytestream_from_source(source)
except Exception as e:
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
continue

# Load the Docx Document
try:
file = docx.Document(io.BytesIO(bytestream.data))
paragraphs = [para.text for para in file.paragraphs]
text = "\n".join(paragraphs)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to a Docx Document, skipping. Error: {error}",
Expand All @@ -90,56 +129,37 @@ def run(
)
continue

# Load the Metadata
try:
docx_meta = self._get_docx_metadata(document=file)
except Exception as e:
logger.warning(
"Could not load the metadata from {source}, skipping. Error: {error}", source=source, error=e
)
docx_meta = {}

# Load the content
try:
paragraphs = [para.text for para in file.paragraphs]
text = "\n".join(paragraphs)
except Exception as e:
logger.warning(
"Could not convert {source} to a Document, skipping it. Error: {error}", source=source, error=e
)
continue

merged_metadata = {**bytestream.meta, **docx_meta, **metadata}
docx_metadata = self._get_docx_metadata(document=file)
merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
document = Document(content=text, meta=merged_metadata)

documents.append(document)

return {"documents": documents}

def _get_docx_metadata(self, document: "DocxDocument") -> Dict[str, Union[str, int, datetime]]:
def _get_docx_metadata(self, document: "DocxDocument") -> DocxMetadata:
"""
Get all relevant data from the 'core_properties' attribute from a Docx Document.
:param document:
The Docx Document you want to extract metadata from
:returns:
A dictionary containing all the relevant fields from the 'core_properties'
A `DocxMetadata` dataclass containing all the relevant fields from the 'core_properties'
"""
return {
"author": document.core_properties.author,
"category": document.core_properties.category,
"comments": document.core_properties.comments,
"content_status": document.core_properties.content_status,
"created": document.core_properties.created,
"identifier": document.core_properties.identifier,
"keywords": document.core_properties.keywords,
"language": document.core_properties.language,
"last_modified_by": document.core_properties.last_modified_by,
"last_printed": document.core_properties.last_printed,
"modified": document.core_properties.modified,
"revision": document.core_properties.revision,
"subject": document.core_properties.subject,
"title": document.core_properties.title,
"version": document.core_properties.version,
}
return DocxMetadata(
author=document.core_properties.author,
category=document.core_properties.category,
comments=document.core_properties.comments,
content_status=document.core_properties.content_status,
created=document.core_properties.created,
identifier=document.core_properties.identifier,
keywords=document.core_properties.keywords,
language=document.core_properties.language,
last_modified_by=document.core_properties.last_modified_by,
last_printed=document.core_properties.last_printed,
modified=document.core_properties.modified,
revision=document.core_properties.revision,
subject=document.core_properties.subject,
title=document.core_properties.title,
version=document.core_properties.version,
)
119 changes: 106 additions & 13 deletions test/components/converters/test_docx_file_to_document.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import logging
from unittest.mock import patch
import datetime

import pytest

from haystack.dataclasses import ByteStream
from haystack.components.converters import DocxToDocument
from haystack import Document
from haystack.components.converters.docx import DocxToDocument, DocxMetadata


@pytest.fixture
Expand All @@ -16,7 +17,6 @@ class TestDocxToDocument:
def test_init(self, docx_converter):
assert isinstance(docx_converter, DocxToDocument)

@pytest.mark.integration
def test_run(self, test_files_path, docx_converter):
"""
Test if the component runs correctly
Expand All @@ -26,18 +26,63 @@ def test_run(self, test_files_path, docx_converter):
docs = output["documents"]
assert len(docs) == 1
assert "History" in docs[0].content
assert docs[0].meta.keys() == {"file_path", "docx"}
assert docs[0].meta == {
"file_path": str(paths[0]),
"docx": DocxMetadata(
author="Microsoft Office User",
category="",
comments="",
content_status="",
created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
revision=2,
subject="",
title="",
version="",
),
}

@pytest.mark.integration
def test_run_with_meta(self, test_files_path, docx_converter):
output = docx_converter.run(
sources=[test_files_path / "docx" / "sample_docx_1.docx"], meta={"language": "it", "author": "test_author"}
)
def test_run_with_meta_overwrites(self, test_files_path, docx_converter):
paths = [test_files_path / "docx" / "sample_docx_1.docx"]
output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
doc = output["documents"][0]
assert doc.meta == {
"file_path": str(paths[0]),
"docx": DocxMetadata(
author="Microsoft Office User",
category="",
comments="",
content_status="",
created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
revision=2,
subject="",
title="",
version="",
),
"language": "it",
"author": "test_author",
}

# check that the metadata from the bytestream is merged with that from the meta parameter
assert output["documents"][0].meta["author"] == "test_author"
assert output["documents"][0].meta["language"] == "it"
def test_run_error_wrong_file_type(self, caplog, test_files_path, docx_converter):
    """
    Test that a non-Docx source is skipped with a warning and yields no documents.
    """
    txt_source = str(test_files_path / "txt" / "doc_1.txt")
    with caplog.at_level(logging.WARNING):
        output = docx_converter.run(sources=[txt_source])
    # Check the log first, then the empty result, in the same order as before.
    assert "doc_1.txt and convert it" in caplog.text
    assert output["documents"] == []

def test_run_error_handling(self, test_files_path, docx_converter, caplog):
def test_run_error_non_existent_file(self, test_files_path, docx_converter, caplog):
"""
Test if the component correctly handles errors.
"""
Expand All @@ -46,7 +91,6 @@ def test_run_error_handling(self, test_files_path, docx_converter, caplog):
docx_converter.run(sources=paths)
assert "Could not read non_existing_file.docx" in caplog.text

@pytest.mark.integration
def test_mixed_sources_run(self, test_files_path, docx_converter):
"""
Test if the component runs correctly when mixed sources are provided.
Expand All @@ -60,3 +104,52 @@ def test_mixed_sources_run(self, test_files_path, docx_converter):
assert len(docs) == 2
assert "History and standardization" in docs[0].content
assert "History and standardization" in docs[1].content

def test_document_with_docx_metadata_to_dict(self):
    """
    Test that a Document holding a `DocxMetadata` in its meta converts it to a nested dictionary in `to_dict`.
    """
    created_at = datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc)
    modified_at = datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc)
    # One set of field values feeds both the dataclass and its expected dictionary form.
    docx_fields = {
        "author": "Microsoft Office User",
        "category": "category",
        "comments": "comments",
        "content_status": "",
        "created": created_at,
        "identifier": "",
        "keywords": "",
        "language": "",
        "last_modified_by": "Carlos Fernández Lorán",
        "last_printed": None,
        "modified": modified_at,
        "revision": 2,
        "subject": "",
        "title": "",
        "version": "",
    }
    document = Document(content="content", meta={"test": 1, "docx": DocxMetadata(**docx_fields)}, id="1")
    expected = {
        "blob": None,
        "dataframe": None,
        "content": "content",
        "id": "1",
        "score": None,
        "embedding": None,
        "sparse_embedding": None,
        "meta": {"test": 1, "docx": docx_fields},
    }
    assert document.to_dict(flatten=False) == expected

0 comments on commit 3db56d9

Please sign in to comment.