Skip to content

Commit

Permalink
refactor!: Remove routing from DocumentLanguageClassifier and rename …
Browse files Browse the repository at this point in the history
…TextLanguageClassifier (#6307)

* remove routing from DocumentLanguageClassifier

* fix MetadataRouter typo
  • Loading branch information
julian-risch committed Nov 15, 2023
1 parent 5295b40 commit 08ec492
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 38 deletions.
3 changes: 1 addition & 2 deletions haystack/preview/components/classifiers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from haystack.preview.components.classifiers.document_language_classifier import DocumentLanguageClassifier
from haystack.preview.components.classifiers.text_language_classifier import TextLanguageClassifier

__all__ = ["DocumentLanguageClassifier", "TextLanguageClassifier"]
__all__ = ["DocumentLanguageClassifier"]
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@
@component
class DocumentLanguageClassifier:
"""
Routes documents onto different output connections depending on their language.
This is useful for routing documents to different models in a pipeline depending on their language.
Classify the language of documents and add the detected language to their metadata.
A MetadataRouter can then route them onto different output connections depending on their language.
This is useful to route documents to different models in a pipeline depending on their language.
The set of supported languages can be specified.
For routing plain text using the same logic, use the related TextLanguageClassifier component instead.
For routing plain text using the same logic, use the related TextLanguageRouter component instead.
Example usage within an indexing pipeline, storing in a Document Store
only documents written in English:
Expand All @@ -26,9 +27,11 @@ class DocumentLanguageClassifier:
p = Pipeline()
p.add_component(instance=TextFileToDocument(), name="text_file_converter")
p.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
p.add_component(instance=MetadataRouter(rules={"en": {"language": {"$eq": "en"}}}), name="router")
p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
p.connect("text_file_converter.documents", "language_classifier.documents")
p.connect("language_classifier.en", "writer.documents")
p.connect("language_classifier.documents", "router.documents")
p.connect("router.en", "writer.documents")
```
"""

Expand All @@ -42,17 +45,15 @@ def __init__(self, languages: Optional[List[str]] = None):
if not languages:
languages = ["en"]
self.languages = languages
component.set_output_types(
self, unmatched=List[Document], **{language: List[Document] for language in languages}
)

@component.output_types(documents=List[Document])
def run(self, documents: List[Document]):
"""
Run the DocumentLanguageClassifier. This method routes the documents to different edges based on their language.
If a Document's text does not match any of the languages specified at initialization, it is routed to
a connection named "unmatched".
Run the DocumentLanguageClassifier. This method classifies the documents' language and adds it to their metadata.
If a Document's text does not match any of the languages specified at initialization, the metadata value "unmatched" will be stored.
:param documents: A list of documents to route to different edges.
:param documents: A list of documents whose language should be classified.
:return: List of Documents with an added metadata field called language.
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError(
Expand All @@ -66,11 +67,11 @@ def run(self, documents: List[Document]):
for document in documents:
detected_language = self.detect_language(document)
if detected_language in self.languages:
output[detected_language].append(document)
document.meta["language"] = detected_language
else:
output["unmatched"].append(document)
document.meta["language"] = "unmatched"

return output
return {"documents": documents}

def detect_language(self, document: Document) -> Optional[str]:
try:
Expand Down
3 changes: 2 additions & 1 deletion haystack/preview/components/routers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from haystack.preview.components.routers.file_type_router import FileTypeRouter
from haystack.preview.components.routers.metadata_router import MetadataRouter
from haystack.preview.components.routers.text_language_router import TextLanguageRouter

__all__ = ["FileTypeRouter", "MetadataRouter"]
__all__ = ["FileTypeRouter", "MetadataRouter", "TextLanguageRouter"]
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,23 @@


@component
class TextLanguageClassifier:
class TextLanguageRouter:
"""
Routes a text input onto one of different output connections depending on its language.
This is useful for routing queries to different models in a pipeline depending on their language.
The set of supported languages can be specified.
For routing Documents based on their language use the related DocumentLanguageClassifier component.
For routing Documents based on their language, use the related DocumentLanguageClassifier component to first
classify the documents and then the MetadataRouter to route them.
Example usage in a retrieval pipeline that passes only English language queries to the retriever:
```python
document_store = InMemoryDocumentStore()
p = Pipeline()
p.add_component(instance=TextLanguageClassifier(), name="text_language_classifier")
p.add_component(instance=TextLanguageRouter(), name="text_language_router")
p.add_component(instance=InMemoryBM25Retriever(document_store=document_store), name="retriever")
p.connect("text_language_classifier.en", "retriever.query")
p.run({"text_language_classifier": {"text": "What's your query?"}})
p.connect("text_language_router.en", "retriever.query")
p.run({"text_language_router": {"text": "What's your query?"}})
```
"""

Expand All @@ -42,15 +43,15 @@ def __init__(self, languages: Optional[List[str]] = None):

def run(self, text: str) -> Dict[str, str]:
"""
Run the TextLanguageClassifier. This method routes the text one of different edges based on its language.
Run the TextLanguageRouter. This method routes the text to one of different edges based on its language.
If the text does not match any of the languages specified at initialization, it is routed to
a connection named "unmatched".
:param text: A str to route to one of different edges.
"""
if not isinstance(text, str):
raise TypeError(
"TextLanguageClassifier expects a str as input. In case you want to classify a document, please use the DocumentLanguageClassifier."
"TextLanguageRouter expects a str as input. In case you want to classify a document, please use the DocumentLanguageClassifier and MetaDataRouter."
)

output: Dict[str, str] = {}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
preview:
- |
Remove routing functionality from DocumentLanguageClassifier and rename TextLanguageClassifier to TextLanguageRouter.
Classifiers in Haystack 2.x change metadata values but do not route inputs to multiple outputs. The latter is reserved for routers.
Use DocumentLanguageClassifier in combination with MetadataRouter to classify and route documents in indexing pipelines.
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_single_document(self):
def test_empty_list(self):
classifier = DocumentLanguageClassifier()
result = classifier.run(documents=[])
assert result == {"en": [], "unmatched": []}
assert result == {"documents": []}

@pytest.mark.unit
def test_detect_language(self):
Expand All @@ -36,12 +36,13 @@ def test_detect_language(self):
assert detected_language == "en"

@pytest.mark.unit
def test_route_to_en_and_unmatched(self):
def test_classify_as_en_and_unmatched(self):
classifier = DocumentLanguageClassifier()
english_document = Document(content="This is an english sentence.")
german_document = Document(content="Ein deutscher Satz ohne Verb.")
result = classifier.run(documents=[english_document, german_document])
assert result == {"en": [english_document], "unmatched": [german_document]}
assert result["documents"][0].meta["language"] == "en"
assert result["documents"][1].meta["language"] == "unmatched"

@pytest.mark.unit
def test_warning_if_no_language_detected(self, caplog):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,51 +2,51 @@
import pytest

from haystack.preview import Document
from haystack.preview.components.classifiers import TextLanguageClassifier
from haystack.preview.components.routers import TextLanguageRouter


class TestTextLanguageClassifier:
class TestTextLanguageRouter:
@pytest.mark.unit
def test_non_string_input(self):
with pytest.raises(TypeError, match="TextLanguageClassifier expects a str as input."):
classifier = TextLanguageClassifier()
with pytest.raises(TypeError, match="TextLanguageRouter expects a str as input."):
classifier = TextLanguageRouter()
classifier.run(text=Document(content="This is an english sentence."))

@pytest.mark.unit
def test_list_of_string(self):
with pytest.raises(TypeError, match="TextLanguageClassifier expects a str as input."):
classifier = TextLanguageClassifier()
with pytest.raises(TypeError, match="TextLanguageRouter expects a str as input."):
classifier = TextLanguageRouter()
classifier.run(text=["This is an english sentence."])

@pytest.mark.unit
def test_empty_string(self):
classifier = TextLanguageClassifier()
classifier = TextLanguageRouter()
result = classifier.run(text="")
assert result == {"unmatched": ""}

@pytest.mark.unit
def test_detect_language(self):
classifier = TextLanguageClassifier()
classifier = TextLanguageRouter()
detected_language = classifier.detect_language("This is an english sentence.")
assert detected_language == "en"

@pytest.mark.unit
def test_route_to_en(self):
classifier = TextLanguageClassifier()
classifier = TextLanguageRouter()
english_sentence = "This is an english sentence."
result = classifier.run(text=english_sentence)
assert result == {"en": english_sentence}

@pytest.mark.unit
def test_route_to_unmatched(self):
classifier = TextLanguageClassifier()
classifier = TextLanguageRouter()
german_sentence = "Ein deutscher Satz ohne Verb."
result = classifier.run(text=german_sentence)
assert result == {"unmatched": german_sentence}

@pytest.mark.unit
def test_warning_if_no_language_detected(self, caplog):
with caplog.at_level(logging.WARNING):
classifier = TextLanguageClassifier()
classifier = TextLanguageRouter()
classifier.run(text=".")
assert "Langdetect cannot detect the language of text: ." in caplog.text

0 comments on commit 08ec492

Please sign in to comment.