# DocumentLanguageClassifier

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that edited
# modules are re-imported automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

## On its own

In [2]:
from haystack.components.classifiers import DocumentLanguageClassifier
from haystack import Document

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Sample inputs for the standalone classifier demo: three German sentences,
# three English sentences and one Vietnamese sentence.
documents = [
    Document(content="Mein Name ist Jean und ich wohne in Paris."),
    Document(content="Mein Name ist Mark und ich wohne in Berlin."),
    Document(content="Mein Name ist Giorgio und ich wohne in Rome."),
    Document(content="My name is Pierre and I live in Paris"),
    Document(content="My name is Paul and I live in Berlin."),
    Document(content="My name is Alessia and I live in Rome."),
    # Vietnamese sample, stored as proper UTF-8 text so the classifier sees
    # real Vietnamese rather than mis-decoded bytes.
    Document(content="Hôm nay Lan đến trường lúc 7 giờ sáng"),
]

In [4]:
# Limit detection to English, German and Vietnamese.
document_classifier = DocumentLanguageClassifier(languages=["en", "de", "vi"])

# Left as the bare last expression so the notebook renders the returned dict;
# the displayed documents carry a `language` entry in their meta.
document_classifier.run(documents=documents)

{'documents': [Document(id=0ac9a37058e1916b6579ee6c9a291539ea6eb0f6005295298b40af2f8aaf9d63, content: 'Mein Name ist Jean und ich wohne in Paris.', meta: {'language': 'de'}),
  Document(id=5887d219ff2a7b92ca2ca4e55ac4fe3b158a62f3a33ee40e39aaa009f8cf9765, content: 'Mein Name ist Mark und ich wohne in Berlin.', meta: {'language': 'de'}),
  Document(id=5e38bdd62b445a8677366bf4c530b42d4e222443c4a8164ba2fe4e1db20ddfe8, content: 'Mein Name ist Giorgio und ich wohne in Rome.', meta: {'language': 'de'}),
  Document(id=b81701beac28b48fed874f79eb04912d7459efbc113397e3d69a67a905f041fc, content: 'My name is Pierre and I live in Paris', meta: {'language': 'en'}),
  Document(id=20a080c1cbca6f558b33bff71de7d66b6972e3e166a9e03b18995eea494d545a, content: 'My name is Paul and I live in Berlin.', meta: {'language': 'en'}),
  Document(id=667a031310c4f8d7f2fada3f77555438797387a9b0a6c2ae26c0f5a9b0f4287f, content: 'My name is Alessia and I live in Rome.', meta: {'language': 'en'}),
  Document(id=e307512abbdb

## In pipeline

In [5]:
from haystack import Pipeline
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.classifiers import DocumentLanguageClassifier
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.components.routers import MetadataRouter

In [6]:
# One independent in-memory store per language (English, German, Vietnamese).
# The generator yields a fresh InMemoryDocumentStore for each target name.
document_store_en, document_store_de, document_store_vi = (
    InMemoryDocumentStore() for _ in range(3)
)

In [7]:
# Tags each incoming document with one of the three supported language codes.
document_classifier = DocumentLanguageClassifier(languages=["en", "de", "vi"])

# One routing rule per language code: the rule named `code` matches documents
# whose `meta.language` equals that same code.
metadata_router = MetadataRouter(
    rules={
        code: {"field": "meta.language", "operator": "==", "value": code}
        for code in ("en", "de", "vi")
    }
)

# A dedicated embedder per language. The English embedder is created without
# a `model` argument; the other two name their model explicitly.
english_embedder = SentenceTransformersDocumentEmbedder()
german_embedder = SentenceTransformersDocumentEmbedder(
    model="PM-AI/bi-encoder_msmarco_bert-base_german"
)
vietnam_embedder = SentenceTransformersDocumentEmbedder(
    model="hiieu/halong_embedding"
)

# Each writer targets the document store of its own language.
en_writer = DocumentWriter(document_store=document_store_en)
de_writer = DocumentWriter(document_store=document_store_de)
vi_writer = DocumentWriter(document_store=document_store_vi)

In [8]:
indexing_pipeline = Pipeline()

# Register every component under the name that the connection cell refers to.
# The dict preserves insertion order, so components are added in the same
# sequence as before: classifier, router, embedders, writers.
components_by_name = {
    "document_classifier": document_classifier,
    "metadata_router": metadata_router,
    "english_embedder": english_embedder,
    "german_embedder": german_embedder,
    "vietnam_embedder": vietnam_embedder,
    "en_writer": en_writer,
    "de_writer": de_writer,
    "vi_writer": vi_writer,
}
for component_name, component in components_by_name.items():
    indexing_pipeline.add_component(instance=component, name=component_name)

In [9]:
# Wire the graph: classifier -> router -> per-language embedder -> writer.
# `connect` hands back the pipeline, so the calls are chained and the final
# value is still the pipeline object that the notebook displays.
(
    indexing_pipeline
    .connect("document_classifier.documents", "metadata_router.documents")
    # Fan out on the router's per-language outputs.
    .connect("metadata_router.en", "english_embedder.documents")
    .connect("metadata_router.de", "german_embedder.documents")
    .connect("metadata_router.vi", "vietnam_embedder.documents")
    # Socket names are omitted here and resolved by the pipeline.
    .connect("english_embedder", "en_writer")
    .connect("german_embedder", "de_writer")
    .connect("vietnam_embedder", "vi_writer")
)

<haystack.core.pipeline.pipeline.Pipeline object at 0x74854c5fc770>
🚅 Components
  - document_classifier: DocumentLanguageClassifier
  - metadata_router: MetadataRouter
  - english_embedder: SentenceTransformersDocumentEmbedder
  - german_embedder: SentenceTransformersDocumentEmbedder
  - vietnam_embedder: SentenceTransformersDocumentEmbedder
  - en_writer: DocumentWriter
  - de_writer: DocumentWriter
  - vi_writer: DocumentWriter
🛤️ Connections
  - document_classifier.documents -> metadata_router.documents (List[Document])
  - metadata_router.en -> english_embedder.documents (List[Document])
  - metadata_router.de -> german_embedder.documents (List[Document])
  - metadata_router.vi -> vietnam_embedder.documents (List[Document])
  - english_embedder.documents -> en_writer.documents (List[Document])
  - german_embedder.documents -> de_writer.documents (List[Document])
  - vietnam_embedder.documents -> vi_writer.documents (List[Document])

In [10]:
# First indexing run: one document per supported language, fed to the
# classifier, which is the entry point of the pipeline.
res = indexing_pipeline.run(
    {
        "document_classifier": {
            "documents": [
                Document(content="This is an English sentence."),
                # Vietnamese sample, stored as proper UTF-8 text so language
                # detection sees real Vietnamese rather than mis-decoded bytes.
                Document(content="Nhiều người Việt Nam thích ăn bún bò hơn là phở."),
                Document(content="Dies ist ein deutscher Satz."),
            ]
        }
    }
)


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 113.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 379.58it/s]


In [15]:
# Print the contents of each store (Vietnamese, German, English, in that
# order) to check where the documents were written.
for store in (document_store_vi, document_store_de, document_store_en):
    print(store.filter_documents())

[Document(id=7828bf2bb39635a39784cac60427cd7b07b1c2070f315ce0b3073ea0a5d396d1, content: 'Nhiều người Việt Nam thích ăn bún bò hơn là phở.', meta: {'language': 'vi'}, embedding: vector of size 768)]
[Document(id=a3ff882d640400c1fc9dc2feec70377ebfeb94369615a419a9fa4969d669cbad, content: 'Dies ist ein deutscher Satz.', meta: {'language': 'de'}, embedding: vector of size 768)]
[Document(id=03eb817ff558f4bdc84e4ac84025f109dd32f5f537135813eb0db6947387593d, content: 'This is an English sentence.', meta: {'language': 'en'}, embedding: vector of size 768)]


In [16]:
# Second indexing run on the same pipeline: one English and one Vietnamese
# document, and no German document this time.
res = indexing_pipeline.run(
    {
        "document_classifier": {
            "documents": [
                Document(content="Data science is a difficult field to master."),
                # Vietnamese sample, stored as proper UTF-8 text so language
                # detection sees real Vietnamese rather than mis-decoded bytes.
                Document(
                    content="Ở Việt Nam học sinh giỏi tính toán trong khi ở Mỹ học sinh giỏi dùng toán."
                ),
            ]
        }
    }
)


Batches: 100%|██████████| 1/1 [00:00<00:00, 68.14it/s]
Batches: 0it [00:00, ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 343.88it/s]


In [17]:
# Print each store again (Vietnamese, German, English) to see what the
# second run added.
stores_in_display_order = [document_store_vi, document_store_de, document_store_en]
for store in stores_in_display_order:
    print(store.filter_documents())

[Document(id=7828bf2bb39635a39784cac60427cd7b07b1c2070f315ce0b3073ea0a5d396d1, content: 'Nhiều người Việt Nam thích ăn bún bò hơn là phở.', meta: {'language': 'vi'}, embedding: vector of size 768), Document(id=a844e963df14668c8f9abeff34b4211b5f227621ff04d2b14803a8737ae2a697, content: 'Ở Việt Nam học sinh giỏi tính toán trong khi ở Mỹ học sinh giỏi dùng toán.', meta: {'language': 'vi'}, embedding: vector of size 768)]
[Document(id=a3ff882d640400c1fc9dc2feec70377ebfeb94369615a419a9fa4969d669cbad, content: 'Dies ist ein deutscher Satz.', meta: {'language': 'de'}, embedding: vector of size 768)]
[Document(id=03eb817ff558f4bdc84e4ac84025f109dd32f5f537135813eb0db6947387593d, content: 'This is an English sentence.', meta: {'language': 'en'}, embedding: vector of size 768), Document(id=f45382c518930799322f45b33cb304177c4745e2cb068230ab54146383d2de22, content: 'Data science is a difficult field to master.', meta: {'language': 'en'}, embedding: vector of size 7

In [18]:
# Show every document in the Vietnamese store: its text, its metadata and its
# full embedding vector, followed by blank lines as a separator.
for stored_doc in document_store_vi.filter_documents():
    # The trailing "\n" argument plus print's own line ending gives the same
    # two-newline separator as a separate print("\n") call.
    print(stored_doc.content, stored_doc.meta, stored_doc.embedding, "\n", sep="\n")

Nhiều người Việt Nam thích ăn bún bò hơn là phở.
{'language': 'vi'}
[-0.059575799852609634, 0.06467528641223907, -0.028164446353912354, -0.026802372187376022, 0.05627518147230148, -0.06304595619440079, 0.021191854029893875, 0.013952977024018764, 0.07550060749053955, -0.04218462109565735, -0.019325023517012596, 0.06983602046966553, 0.095492422580719, -0.13055546581745148, 0.018392028287053108, 0.029443593695759773, -0.009790194220840931, 0.05078203231096268, -0.02911238558590412, -0.0007790701929479837, -0.028066975995898247, 0.06279373168945312, 8.610669465269893e-05, -0.012271805666387081, 0.06677582114934921, 0.006719446275383234, 0.0014813455054536462, -0.009660515934228897, 0.026375871151685715, 0.05030980333685875, -0.027215592563152313, 0.06045902520418167, -0.04460475221276283, -0.04111124947667122, 0.05967257544398308, 0.0016679103719070554, 0.017224086448550224, -0.02797710709273815, -0.027413377538323402, 0.02683025225996971, 0.02539050206542015, -0.03303680196