In [None]:
from haystack.dataclasses import Document
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.joiners import DocumentJoiner
from haystack import Pipeline
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

In [None]:
# --- Notebook configuration: fill in every placeholder before running ---
# Path of the CSV file read by pd.read_csv below.
CSV_PATH = ""
# Assumed CSV columns: text, title, doi, publication_name, subject_areas, etc.
# Model identifier handed to SentenceTransformersDocumentEmbedder.
EMBEDDING_MODEL = ""
# Passed to QdrantDocumentStore as embedding_dim.
# NOTE(review): presumably must equal the output size of EMBEDDING_MODEL — confirm.
EMBEDDING_DIMENSIONS = 0
# URL passed to QdrantDocumentStore.
VECTOR_STORE_URL = ""
# Passed to QdrantDocumentStore as `index`.
COLLECTION_NAME = ""


In [None]:
import pandas as pd

# Load the CSV file as a pandas dataframe
df = pd.read_csv(CSV_PATH)

print("Dataframe loaded")

# Drop rows where text is missing or blank.
# dropna() only removes NaN/None cells, so blank (e.g. whitespace-only) strings
# are filtered out explicitly; otherwise they would turn into content-less
# Documents later in the notebook.
df = df.dropna(subset=['text'])
df = df[df['text'].astype(str).str.strip() != ""]

In [None]:
# Show the column labels of the loaded dataframe
column_labels = df.columns
print(column_labels)

In [None]:
# Create the list of Haystack Documents from the dataframe: the text column is
# mapped to `content`, and the bibliographic columns listed below are mapped
# to `meta` (any other column in the CSV is ignored).
meta_columns = ["title", "doi", "publication_name", "subject_areas"]

documents = [
    Document(
        content=row["text"],
        # pandas represents missing cells as float NaN, which is not valid JSON
        # and may be rejected when the metadata is serialized for the vector
        # store; store None for missing values instead.
        meta={
            column: None if pd.isna(row[column]) else row[column]
            for column in meta_columns
        },
    )
    for _, row in df.iterrows()
]

In [None]:
# Joiner component (registered first when the pipeline is assembled)
document_joiner = DocumentJoiner()

# Qdrant-backed document store configured from the notebook constants
document_store = QdrantDocumentStore(
    url=VECTOR_STORE_URL,
    index=COLLECTION_NAME,
    embedding_dim=EMBEDDING_DIMENSIONS,
)

In [None]:
# Cleaner with its default settings
document_cleaner = DocumentCleaner()

# Word-based splitter: chunks of 150 words, with 50 words of overlap
document_splitter = DocumentSplitter(
    split_by="word",
    split_length=150,
    split_overlap=50,
)

In [None]:
# Writer that persists documents into the configured document store
document_writer = DocumentWriter(document_store)

# Embedder built on the sentence-transformers model named in the config
document_embedder = SentenceTransformersDocumentEmbedder(model=EMBEDDING_MODEL)

In [None]:
from dataclasses import replace
from typing import List

from haystack import component

# For mixedbread-ai model only!
# NOTE(review): this prompt looks like the one mixedbread recommends for search
# queries rather than for indexed passages — confirm against the model card
# before enabling this component in the indexing pipeline.

@component
class QueryAdder:
    """
    A component that adds a query prompt to each doc for specific embedding models.

    Every document with non-empty content is returned as a copy whose content
    is prefixed with the retrieval prompt. Documents without content are
    passed through unchanged.
    """

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Prefix the content of each document with the retrieval prompt.

        :param documents: Documents to process. They are not modified; prefixed
            documents are returned as new objects so the caller's list and any
            other holder of the same Document instances stay untouched.
        :returns: A dict with a single key ``documents`` holding the resulting
            list, in the same order as the input.
        """
        prefixed_documents = [
            replace(
                doc,
                content=f'Represent this sentence for searching relevant passages: {doc.content}',
            )
            if doc.content
            else doc
            for doc in documents
        ]

        return {"documents": prefixed_documents}


query_adder = QueryAdder()

In [None]:
# Assemble the indexing pipeline; components are registered in processing order.
preprocessing_pipeline = Pipeline()

pipeline_components = [
    ("document_joiner", document_joiner),
    ("document_cleaner", document_cleaner),
    ("document_splitter", document_splitter),
    # If mixedbread-ai model
    # ("query_adder", query_adder),
    ("document_embedder", document_embedder),
    ("document_writer", document_writer),
]

for component_name, component_instance in pipeline_components:
    preprocessing_pipeline.add_component(instance=component_instance, name=component_name)

In [None]:
# Wire the registered components together, sender -> receiver.
pipeline_connections = [
    ("document_joiner", "document_cleaner"),
    ("document_cleaner", "document_splitter"),
    # If mixedbread-ai model, route through query_adder instead of the direct
    # splitter -> embedder connection below:
    # ("document_splitter", "query_adder"),
    # ("query_adder", "document_embedder"),
    # If not mixedbread
    ("document_splitter", "document_embedder"),
    ("document_embedder", "document_writer"),
]

for sender, receiver in pipeline_connections:
    preprocessing_pipeline.connect(sender, receiver)

In [None]:
# Run the indexing pipeline on the prepared documents; the run result is the
# cell's last expression, so the notebook displays it.
pipeline_inputs = {"documents": documents}
preprocessing_pipeline.run(pipeline_inputs)