Skip to content

Commit

Permalink
PyPDFToDocument: remove deprecated converter_name and CONVERTERS_REGI…
Browse files Browse the repository at this point in the history
…STRY (#7910)
  • Loading branch information
anakin87 committed Jun 21, 2024
1 parent 08104e0 commit c51f8ff
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 76 deletions.
40 changes: 3 additions & 37 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

import io
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Protocol, Union

Expand Down Expand Up @@ -56,12 +55,6 @@ def from_dict(cls, data):
return default_from_dict(cls, data)


# This registry is used to store converters names and instances.
# It can be used to register custom converters.
# It is now deprecated and will be removed in Haystack 2.3.0.
CONVERTERS_REGISTRY: Dict[str, PyPDFConverter] = {"default": DefaultConverter()}


@component
class PyPDFToDocument:
"""
Expand All @@ -82,40 +75,16 @@ class PyPDFToDocument:
```
"""

def __init__(self, converter_name: Optional[str] = None, converter: Optional[PyPDFConverter] = None):
def __init__(self, converter: Optional[PyPDFConverter] = None):
"""
Create an PyPDFToDocument component.
:param converter_name:
The name of a PyPDFConverter instance stored in the CONVERTERS_REGISTRY. Deprecated.
:param converter:
An instance of a PyPDFConverter compatible class.
"""
pypdf_import.check()

self.converter_name = converter_name
if converter_name:
warnings.warn(
"The `converter_name` parameter is deprecated and will be removed in Haystack 2.3.0. "
"Please use the `converter` parameter instead.",
DeprecationWarning,
)
try:
converter = CONVERTERS_REGISTRY[converter_name]
except KeyError:
msg = (
f"Invalid converter_name: {converter_name}.\n "
f"Available converters: {list(CONVERTERS_REGISTRY.keys())}"
f"To specify a custom converter, it is recommended to use the `converter` parameter."
)
raise ValueError(msg) from KeyError

self.converter = converter

elif converter:
self.converter = converter
else:
self.converter = DefaultConverter()
self.converter = converter or DefaultConverter()

def to_dict(self):
"""
Expand All @@ -124,7 +93,7 @@ def to_dict(self):
:returns:
Dictionary with serialized data.
"""
return default_to_dict(self, converter_name=self.converter_name, converter=self.converter.to_dict())
return default_to_dict(self, converter=self.converter.to_dict())

@classmethod
def from_dict(cls, data):
Expand All @@ -137,9 +106,6 @@ def from_dict(cls, data):
:returns:
Deserialized component.
"""
if data["init_parameters"].get("converter_name"):
return default_from_dict(cls, data)

converter_class = deserialize_type(data["init_parameters"]["converter"]["type"])
data["init_parameters"]["converter"] = converter_class.from_dict(data["init_parameters"]["converter"])
return default_from_dict(cls, data)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
upgrade:
- |
The deprecated `converter_name` parameter has been removed from `PyPDFToDocument`.
To specify a custom converter for `PyPDFToDocument`, use the `converter` initialization parameter and
pass an instance of a class that implements the `PyPDFConverter` protocol.
The `PyPDFConverter` protocol defines the methods `convert`, `to_dict` and `from_dict`.
A default implementation of `PyPDFConverter` is provided in the `DefaultConverter` class.
41 changes: 2 additions & 39 deletions test/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pytest

from haystack import Document, default_from_dict, default_to_dict
from haystack.components.converters.pypdf import CONVERTERS_REGISTRY, DefaultConverter, PyPDFToDocument
from haystack.components.converters.pypdf import DefaultConverter, PyPDFToDocument
from haystack.dataclasses import ByteStream


Expand All @@ -19,19 +19,13 @@ def pypdf_converter():
class TestPyPDFToDocument:
def test_init(self, pypdf_converter):
assert isinstance(pypdf_converter.converter, DefaultConverter)
assert pypdf_converter.converter_name is None

def test_init_fail_nonexisting_converter(self):
with pytest.raises(ValueError):
PyPDFToDocument(converter_name="non_existing_converter")

def test_to_dict(self, pypdf_converter):
data = pypdf_converter.to_dict()
assert data == {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {
"converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}},
"converter_name": None,
"converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}}
},
}

Expand All @@ -45,18 +39,6 @@ def test_from_dict(self):
instance = PyPDFToDocument.from_dict(data)
assert isinstance(instance, PyPDFToDocument)
assert isinstance(instance.converter, DefaultConverter)
assert instance.converter_name is None

def test_from_dict_with_converter_name(self):
data = {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {"converter_name": "default"},
}

instance = PyPDFToDocument.from_dict(data)
assert isinstance(instance, PyPDFToDocument)
assert isinstance(instance.converter, DefaultConverter)
assert instance.converter_name == "default"

@pytest.mark.integration
def test_run(self, test_files_path, pypdf_converter):
Expand Down Expand Up @@ -140,22 +122,3 @@ def from_dict(cls, data):
assert len(docs) == 1
assert "ReAct" not in docs[0].content
assert "I don't care about converting given pdfs, I always return this" in docs[0].content

@pytest.mark.integration
def test_custom_converter_deprecated(self, test_files_path):
from pypdf import PdfReader

paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]

class MyCustomConverter:
def convert(self, reader: PdfReader) -> Document:
return Document(content="I don't care about converting given pdfs, I always return this")

CONVERTERS_REGISTRY["custom"] = MyCustomConverter()

converter = PyPDFToDocument(converter_name="custom")
output = converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert "ReAct" not in docs[0].content
assert "I don't care about converting given pdfs, I always return this" in docs[0].content

0 comments on commit c51f8ff

Please sign in to comment.