In [None]:
from haystack.dataclasses import Document
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack import Pipeline
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

In [None]:
# Notebook configuration. Every value below is left blank/zero here;
# presumably each must be filled in before the later cells are run.

# Passed as `model` to SentenceTransformersDocumentEmbedder.
EMBEDDING_MODEL = ""
# Passed as `embedding_dim` to QdrantDocumentStore.
# NOTE(review): presumably has to match the output size of EMBEDDING_MODEL — confirm.
EMBEDDING_DIMENSIONS = 0
# Passed as `url` to QdrantDocumentStore.
VECTOR_STORE_URL = ""
# Passed as `index` to QdrantDocumentStore.
COLLECTION_NAME = ""
# Root directory that is globbed recursively ("**/*") to collect input files.
PDFS_PATH = ""

In [None]:
# Qdrant-backed document store, configured from the notebook constants.
document_store = QdrantDocumentStore(
    url=VECTOR_STORE_URL,
    index=COLLECTION_NAME,
    embedding_dim=EMBEDDING_DIMENSIONS,
)

# Router that sorts incoming sources by MIME type.
file_type_router = FileTypeRouter(
    mime_types=[
        "text/plain",
        "application/pdf",
        "text/markdown",
    ]
)

# Converters for plain-text and PDF sources.
text_file_converter = TextFileToDocument()
pdf_converter = PyPDFToDocument()

# Joiner that merges the converter outputs into a single document stream.
document_joiner = DocumentJoiner()

In [None]:
# Cleaner with its default settings.
document_cleaner = DocumentCleaner()

# Word-based chunking: 150-word windows, with 50 words shared between
# consecutive chunks.
document_splitter = DocumentSplitter(
    split_by="word",
    split_length=150,
    split_overlap=50,
)

In [None]:
# Embedder driven by the model named in the configuration cell.
document_embedder = SentenceTransformersDocumentEmbedder(model=EMBEDDING_MODEL)

# Writer that persists documents into the Qdrant store created earlier.
document_writer = DocumentWriter(document_store=document_store)

In [None]:
from dataclasses import replace
from typing import List

from haystack import component

# For mixedbread-ai model only!

@component
class QueryAdder:
  """
  Prepend an instruction prefix to the content of every document.

  Intended for embedding models that expect such a prefix (the notebook uses it
  for the mixedbread-ai model only).
  NOTE(review): confirm against the model card that this prompt is meant to be
  applied to indexed documents and not only to search queries.

  Input documents are never modified: documents with content are returned as
  copies carrying the prefixed text, so running the component again on the same
  list does not stack the prefix. Documents without content are passed through
  unchanged.
  """

  DEFAULT_PREFIX = "Represent this sentence for searching relevant passages: "

  def __init__(self, prefix: str = DEFAULT_PREFIX):
    """
    :param prefix: Text placed in front of each document's content, including
      any trailing separator. Defaults to the original hard-coded prompt.
    """
    self.prefix = prefix

  @component.output_types(documents=List[Document])
  def run(self, documents: List[Document]):
    """
    :param documents: Documents to prefix.
    :returns: ``{"documents": [...]}`` with one entry per input document, in
      the same order.
    """
    prefixed = [
      # `replace` copies every field (id, meta, ...) and swaps only the content,
      # leaving the caller's Document objects untouched.
      replace(doc, content=f"{self.prefix}{doc.content}") if doc.content else doc
      for doc in documents
    ]
    return {"documents": prefixed}

query_adder = QueryAdder()

In [None]:
# Create the ingestion pipeline and register its stages under fixed names.
# The order below is the registration order.
preprocessing_pipeline = Pipeline()

_pipeline_stages = [
    ("file_type_router", file_type_router),
    ("text_file_converter", text_file_converter),
    ("pypdf_converter", pdf_converter),
    ("document_joiner", document_joiner),
    ("document_cleaner", document_cleaner),
    ("document_splitter", document_splitter),
    # If mixedbread-ai model
    # ("query_adder", query_adder),
    ("document_embedder", document_embedder),
    ("document_writer", document_writer),
]

for _stage_name, _stage_instance in _pipeline_stages:
    preprocessing_pipeline.add_component(instance=_stage_instance, name=_stage_name)

In [None]:
# Route each MIME type emitted by the router to its converter.
preprocessing_pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")
preprocessing_pipeline.connect("file_type_router.application/pdf", "pypdf_converter.sources")

# The router is configured with "text/markdown", but no converter was attached
# to that output, so markdown files were silently dropped. Register and wire a
# markdown converter here so that branch actually reaches the joiner.
markdown_converter = MarkdownToDocument()
preprocessing_pipeline.add_component(instance=markdown_converter, name="markdown_converter")
preprocessing_pipeline.connect("file_type_router.text/markdown", "markdown_converter.sources")

# Merge all converter outputs, then clean and split.
preprocessing_pipeline.connect("text_file_converter", "document_joiner")
preprocessing_pipeline.connect("pypdf_converter", "document_joiner")
preprocessing_pipeline.connect("markdown_converter", "document_joiner")
preprocessing_pipeline.connect("document_joiner", "document_cleaner")
preprocessing_pipeline.connect("document_cleaner", "document_splitter")

# If mixedbread-ai model: use these three connections INSTEAD of the two below
# (and register "query_adder" in the pipeline first).
# preprocessing_pipeline.connect("document_splitter", "query_adder")
# preprocessing_pipeline.connect("query_adder", "document_embedder")
# preprocessing_pipeline.connect("document_embedder", "document_writer")

# If not mixedbread
preprocessing_pipeline.connect("document_splitter", "document_embedder")
preprocessing_pipeline.connect("document_embedder", "document_writer")

In [None]:
from pathlib import Path

# Collect every regular file below PDFS_PATH. A bare "**/*" glob also yields the
# directories themselves, which are not ingestible sources, so they are filtered
# out. Sorting makes the ingestion order reproducible across runs.
sources = sorted(path for path in Path(PDFS_PATH).rglob("*") if path.is_file())

In [None]:
# Run the pipeline: the collected paths are handed to the router's `sources`
# input; the run result is the cell's displayed output.
preprocessing_pipeline.run(
    {
        "file_type_router": {"sources": sources},
    }
)