In [None]:
from talk_rag_scheduler.indexing_pipeline import (
    ingest_knowledge_and_run_indexing,
    KEYWORD_INDEXING_PIPE_IMG,
    SEMANTIC_INDEXING_PIPE_IMG,
    SEMANTIC_SPLIT_INDEXING_PIPE_IMG,
)
from talk_rag_scheduler.utils import prettify_document_info
from haystack.document_stores.in_memory import InMemoryDocumentStore

from IPython.display import Image

## BM25 Indexing Pipeline


In [None]:
# Keyword indexing: create a new InMemoryDocumentStore instance and pass it to
# the project's indexing helper with indexing_mode="keyword".
document_store = InMemoryDocumentStore()
indexing_pipeline = ingest_knowledge_and_run_indexing(
    document_store, indexing_mode="keyword"
)
# Last expression of the cell: render the image referenced by
# KEYWORD_INDEXING_PIPE_IMG (presumably the pipeline diagram -- confirm).
Image(KEYWORD_INDEXING_PIPE_IMG)

## Semantic Vector Indexing Pipeline


In [None]:
# Semantic indexing: a separate store instance is used so this run does not
# share state with the keyword store created earlier in the notebook.
document_store_semantic = InMemoryDocumentStore()
# The helper is called in "semantic" mode with the Ollama embedding model name
# "mxbai-embed-large" (presumably the model must be available in the local
# Ollama instance -- confirm before running).
indexing_pipeline_semantic = ingest_knowledge_and_run_indexing(
    document_store_semantic,
    indexing_mode="semantic",
    ollama_embedding_model="mxbai-embed-large",
)
# Last expression of the cell: render the image referenced by
# SEMANTIC_INDEXING_PIPE_IMG.
Image(SEMANTIC_INDEXING_PIPE_IMG)

In [None]:
# Fetch every stored document through the store's public API instead of
# reading its `storage` mapping directly; `filter_documents()` without filters
# returns all documents as a list.
documents_semantic = document_store_semantic.filter_documents()
print(f"Number of documents: {len(documents_semantic)}")
# Iterator consumed one document at a time by the inspection cell below
# (the name says "generator", but this is a plain list iterator).
document_semantic_generator = iter(documents_semantic)

In [None]:
# Each run of this cell advances to the next document; once the iterator is
# exhausted, next() raises StopIteration.
document_semantic = next(document_semantic_generator)
document_semantic_info = prettify_document_info(document_semantic)
# Print the formatted document info, an 80-character "=" rule, then the content.
print(document_semantic_info, end=f"\n{80*'='}\n\n")
print(document_semantic.content)

## Semantic Vector Indexing Pipeline with Document Splitting


In [None]:
# Semantic indexing with document splitting: again a dedicated store instance,
# so the three indexing variants in this notebook can be compared side by side.
document_store_semantic_split = InMemoryDocumentStore()
indexing_pipeline_semantic_split = ingest_knowledge_and_run_indexing(
    document_store_semantic_split,
    indexing_mode="semantic_split",
    ollama_embedding_model="mxbai-embed-large",
    # Splitter settings forwarded to the helper; tuning these is one of the
    # improvement options listed at the end of the notebook.
    split_by="passage",
    split_length=16,
    split_overlap=4,
    split_threshold=12,
)
# Last expression of the cell: render the image referenced by
# SEMANTIC_SPLIT_INDEXING_PIPE_IMG.
Image(SEMANTIC_SPLIT_INDEXING_PIPE_IMG)

In [None]:
# Fetch every stored document through the store's public API instead of
# reading its `storage` mapping directly; `filter_documents()` without filters
# returns all documents as a list.
documents_semantic_split = document_store_semantic_split.filter_documents()
print(f"Number of documents: {len(documents_semantic_split)}")
# Iterator consumed one document at a time by the inspection cell below
# (the name says "generator", but this is a plain list iterator).
document_semantic_split_generator = iter(documents_semantic_split)

In [None]:
# Each run of this cell advances to the next split document; once the iterator
# is exhausted, next() raises StopIteration.
document_semantic_split = next(document_semantic_split_generator)
document_semantic_split_info = prettify_document_info(document_semantic_split)
# Print the formatted document info, an 80-character "=" rule, then the content.
print(document_semantic_split_info, end=f"\n{80*'='}\n\n")
print(document_semantic_split.content)

## Improvement Options

- Tune the document splitting parameters
- Embed metadata into the documents, e.g. via the `meta_fields_to_embed` argument of `OllamaDocumentEmbedder`
- Try out different embedding models
- Try out different options in `InMemoryDocumentStore` (e.g. `bm25_algorithm`, `embedding_similarity_function`)
- Use hybrid embedding/retrieval methods & reranking

Every embedder is different; to use an embedding model to its full potential, read its documentation. The presented implementation does not take the following into account:

- some embedding models require different prompt prefixes for queries (at retrieval time) and for documents (at indexing time)
- some embedding models are dual encoders, i.e. they use a different model for indexing than for retrieval
- we are ignoring the maximum sequence length of the embedding model used, which can lead to the input text being truncated before it is embedded