refactor: DocxToDocument update (#7857)

* Some changes Use tests file path
* Update tests
* Add another unit test
* Shorten _get_docx_metadata
* Update tests
* Remove try block
* Add a dataclass
* Add a to dict unit test
* Remove unused import
* Add release notes
* Update docstrings
* Use optional instead of pipe
* Update docstring
* Remove file
deepset-ai · Jun 19, 2024 · 3db56d9 · 3db56d9
1 parent fe60eed
commit 3db56d9
Show file tree

Hide file tree

Showing 3 changed files with 171 additions and 57 deletions.
diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from haystack.components.converters.azure import AzureOCRDocumentConverter
-from haystack.components.converters.docx import DocxToDocument
+from haystack.components.converters.docx import DocxMetadata, DocxToDocument
 from haystack.components.converters.html import HTMLToDocument
 from haystack.components.converters.markdown import MarkdownToDocument
 from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
@@ -24,4 +24,5 @@
     "OpenAPIServiceToFunctions",
     "OutputAdapter",
     "DocxToDocument",
+    "DocxMetadata",
 ]
diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import io
+from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
@@ -19,6 +20,45 @@
     from docx.document import Document as DocxDocument
 
 
+@dataclass
+class DocxMetadata:
+    """
+    Describes the metadata of a Docx file.
+
+    :param author: The author
+    :param category: The category
+    :param comments: The comments
+    :param content_status: The content status
+    :param created: The creation date
+    :param identifier: The identifier
+    :param keywords: Available keywords
+    :param language: The language of the document
+    :param last_modified_by: The user who last modified the document
+    :param last_printed: The last printed date
+    :param modified: The last modification date
+    :param revision: The revision number
+    :param subject: The subject
+    :param title: The title
+    :param version: The version
+    """
+
+    author: str
+    category: str
+    comments: str
+    content_status: str
+    created: Optional[datetime]
+    identifier: str
+    keywords: str
+    language: str
+    last_modified_by: str
+    last_printed: Optional[datetime]
+    modified: Optional[datetime]
+    revision: int
+    subject: str
+    title: str
+    version: str
+
+
 @component
 class DocxToDocument:
     """
@@ -72,16 +112,15 @@ def run(
         meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
 
         for source, metadata in zip(sources, meta_list):
-            # Load source ByteStream
             try:
                 bytestream = get_bytestream_from_source(source)
             except Exception as e:
                 logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                 continue
-
-            # Load the Docx Document
             try:
                 file = docx.Document(io.BytesIO(bytestream.data))
+                paragraphs = [para.text for para in file.paragraphs]
+                text = "\n".join(paragraphs)
             except Exception as e:
                 logger.warning(
                     "Could not read {source} and convert it to a Docx Document, skipping. Error: {error}",
@@ -90,56 +129,37 @@ def run(
                 )
                 continue
 
-            # Load the Metadata
-            try:
-                docx_meta = self._get_docx_metadata(document=file)
-            except Exception as e:
-                logger.warning(
-                    "Could not load the metadata from {source}, skipping. Error: {error}", source=source, error=e
-                )
-                docx_meta = {}
-
-            # Load the content
-            try:
-                paragraphs = [para.text for para in file.paragraphs]
-                text = "\n".join(paragraphs)
-            except Exception as e:
-                logger.warning(
-                    "Could not convert {source} to a Document, skipping it. Error: {error}", source=source, error=e
-                )
-                continue
-
-            merged_metadata = {**bytestream.meta, **docx_meta, **metadata}
+            docx_metadata = self._get_docx_metadata(document=file)
+            merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
             document = Document(content=text, meta=merged_metadata)
-
             documents.append(document)
 
         return {"documents": documents}
 
-    def _get_docx_metadata(self, document: "DocxDocument") -> Dict[str, Union[str, int, datetime]]:
+    def _get_docx_metadata(self, document: "DocxDocument") -> DocxMetadata:
         """
         Get all relevant data from the 'core_properties' attribute from a Docx Document.
 
         :param document:
             The Docx Document you want to extract metadata from
 
         :returns:
-            A dictionary containing all the relevant fields from the 'core_properties'
+            A `DocxMetadata` dataclass containing all the relevant fields from the 'core_properties'
         """
-        return {
-            "author": document.core_properties.author,
-            "category": document.core_properties.category,
-            "comments": document.core_properties.comments,
-            "content_status": document.core_properties.content_status,
-            "created": document.core_properties.created,
-            "identifier": document.core_properties.identifier,
-            "keywords": document.core_properties.keywords,
-            "language": document.core_properties.language,
-            "last_modified_by": document.core_properties.last_modified_by,
-            "last_printed": document.core_properties.last_printed,
-            "modified": document.core_properties.modified,
-            "revision": document.core_properties.revision,
-            "subject": document.core_properties.subject,
-            "title": document.core_properties.title,
-            "version": document.core_properties.version,
-        }
+        return DocxMetadata(
+            author=document.core_properties.author,
+            category=document.core_properties.category,
+            comments=document.core_properties.comments,
+            content_status=document.core_properties.content_status,
+            created=document.core_properties.created,
+            identifier=document.core_properties.identifier,
+            keywords=document.core_properties.keywords,
+            language=document.core_properties.language,
+            last_modified_by=document.core_properties.last_modified_by,
+            last_printed=document.core_properties.last_printed,
+            modified=document.core_properties.modified,
+            revision=document.core_properties.revision,
+            subject=document.core_properties.subject,
+            title=document.core_properties.title,
+            version=document.core_properties.version,
+        )
diff --git a/test/components/converters/test_docx_file_to_document.py b/test/components/converters/test_docx_file_to_document.py
@@ -1,10 +1,11 @@
 import logging
-from unittest.mock import patch
+import datetime
 
 import pytest
 
 from haystack.dataclasses import ByteStream
-from haystack.components.converters import DocxToDocument
+from haystack import Document
+from haystack.components.converters.docx import DocxToDocument, DocxMetadata
 
 
 @pytest.fixture
@@ -16,7 +17,6 @@ class TestDocxToDocument:
     def test_init(self, docx_converter):
         assert isinstance(docx_converter, DocxToDocument)
 
-    @pytest.mark.integration
     def test_run(self, test_files_path, docx_converter):
         """
         Test if the component runs correctly
@@ -26,18 +26,63 @@ def test_run(self, test_files_path, docx_converter):
         docs = output["documents"]
         assert len(docs) == 1
         assert "History" in docs[0].content
+        assert docs[0].meta.keys() == {"file_path", "docx"}
+        assert docs[0].meta == {
+            "file_path": str(paths[0]),
+            "docx": DocxMetadata(
+                author="Microsoft Office User",
+                category="",
+                comments="",
+                content_status="",
+                created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
+                identifier="",
+                keywords="",
+                language="",
+                last_modified_by="Carlos Fernández Lorán",
+                last_printed=None,
+                modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
+                revision=2,
+                subject="",
+                title="",
+                version="",
+            ),
+        }
 
-    @pytest.mark.integration
-    def test_run_with_meta(self, test_files_path, docx_converter):
-        output = docx_converter.run(
-            sources=[test_files_path / "docx" / "sample_docx_1.docx"], meta={"language": "it", "author": "test_author"}
-        )
+    def test_run_with_meta_overwrites(self, test_files_path, docx_converter):
+        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
+        output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
+        doc = output["documents"][0]
+        assert doc.meta == {
+            "file_path": str(paths[0]),
+            "docx": DocxMetadata(
+                author="Microsoft Office User",
+                category="",
+                comments="",
+                content_status="",
+                created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
+                identifier="",
+                keywords="",
+                language="",
+                last_modified_by="Carlos Fernández Lorán",
+                last_printed=None,
+                modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
+                revision=2,
+                subject="",
+                title="",
+                version="",
+            ),
+            "language": "it",
+            "author": "test_author",
+        }
 
-        # check that the metadata from the bytestream is merged with that from the meta parameter
-        assert output["documents"][0].meta["author"] == "test_author"
-        assert output["documents"][0].meta["language"] == "it"
+    def test_run_error_wrong_file_type(self, caplog, test_files_path, docx_converter):
+        sources = [str(test_files_path / "txt" / "doc_1.txt")]
+        with caplog.at_level(logging.WARNING):
+            results = docx_converter.run(sources=sources)
+            assert "doc_1.txt and convert it" in caplog.text
+            assert results["documents"] == []
 
-    def test_run_error_handling(self, test_files_path, docx_converter, caplog):
+    def test_run_error_non_existent_file(self, test_files_path, docx_converter, caplog):
         """
         Test if the component correctly handles errors.
         """
@@ -46,7 +91,6 @@ def test_run_error_handling(self, test_files_path, docx_converter, caplog):
             docx_converter.run(sources=paths)
             assert "Could not read non_existing_file.docx" in caplog.text
 
-    @pytest.mark.integration
     def test_mixed_sources_run(self, test_files_path, docx_converter):
         """
         Test if the component runs correctly when mixed sources are provided.
@@ -60,3 +104,52 @@ def test_mixed_sources_run(self, test_files_path, docx_converter):
         assert len(docs) == 2
         assert "History and standardization" in docs[0].content
         assert "History and standardization" in docs[1].content
+
+    def test_document_with_docx_metadata_to_dict(self):
+        docx_metadata = DocxMetadata(
+            author="Microsoft Office User",
+            category="category",
+            comments="comments",
+            content_status="",
+            created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
+            identifier="",
+            keywords="",
+            language="",
+            last_modified_by="Carlos Fernández Lorán",
+            last_printed=None,
+            modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
+            revision=2,
+            subject="",
+            title="",
+            version="",
+        )
+        doc = Document(content="content", meta={"test": 1, "docx": docx_metadata}, id="1")
+        assert doc.to_dict(flatten=False) == {
+            "blob": None,
+            "dataframe": None,
+            "content": "content",
+            "id": "1",
+            "score": None,
+            "embedding": None,
+            "sparse_embedding": None,
+            "meta": {
+                "test": 1,
+                "docx": {
+                    "author": "Microsoft Office User",
+                    "category": "category",
+                    "comments": "comments",
+                    "content_status": "",
+                    "created": datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
+                    "identifier": "",
+                    "keywords": "",
+                    "language": "",
+                    "last_modified_by": "Carlos Fernández Lorán",
+                    "last_printed": None,
+                    "modified": datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
+                    "revision": 2,
+                    "subject": "",
+                    "title": "",
+                    "version": "",
+                },
+            },
+        }