# Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
import json
from loguru import logger
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time

import mlflow

import sys
sys.path.insert(0, '..')

In [3]:
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

nest_asyncio.apply()

In [4]:
from dotenv import load_dotenv

load_dotenv()

True

## Constants

In [46]:
TESTING = False
DEBUG = False
OBSERVABILITY = True
LOG_TO_MLFLOW = True

# Run metadata
RUN_NAME = "exp_007_semantic_chunking_full_refresh"
RUN_DESCRIPTION = """
# Try Semantic Chunking

## Changelog
### Compares to exp_006
- Previously do not recreate the response eval dataset. This experiment will recreate them since the nodes are generated differently by a new chunker.
"""

# Vector Store Index
RECREATE_INDEX = False
if not RECREATE_INDEX:
    COLLECTION = "huggingface__BAAI_bge_large_en_v1_5__exp_006_semantic_chunking"
    NODES_PERSIST_FP = 'data/001/exp_006_semantic_chunking/nodes.pkl'
    logger.info(f"{COLLECTION=}")
    logger.info(f"{NODES_PERSIST_FP=}")

# Retrieval eval
RECREATE_RETRIEVAL_EVAL_DATASET = False
# Currently can not reuse retrieval_eval_dataset because the retrieval evaluation is based on ids
if not RECREATE_RETRIEVAL_EVAL_DATASET:
    RETRIEVAL_EVAL_DATASET_FP = f"data/001/exp_006_semantic_chunking/llamaindex_blog_retrieval_eval_dataset.json"
    logger.info(f"{RETRIEVAL_EVAL_DATASET_FP=}")

# Response eval
RECREATE_SYNTHETIC_EVAL_DATASET = True
if not RECREATE_SYNTHETIC_EVAL_DATASET:
    RESPONSE_EVAL_DATASET_FP = f"dataset at data/001/exp_007_semantic_chunking_full_refresh/llamaindex_blog_response_eval_dataset.json"
    logger.info(f"{RESPONSE_EVAL_DATASET_FP=}")

[32m2024-07-25 12:55:46.362[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mCOLLECTION='huggingface__BAAI_bge_large_en_v1_5__exp_006_semantic_chunking'[0m
[32m2024-07-25 12:55:46.363[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m22[0m - [1mNODES_PERSIST_FP='data/001/exp_006_semantic_chunking/nodes.pkl'[0m
[32m2024-07-25 12:55:46.364[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mRETRIEVAL_EVAL_DATASET_FP='data/001/exp_006_semantic_chunking/llamaindex_blog_retrieval_eval_dataset.json'[0m


In [6]:
if OBSERVABILITY:
    import phoenix as px
    px.launch_app()
    import llama_index.core
    llama_index.core.set_global_handler("arize_phoenix")

I0000 00:00:1721886852.718689 3152400 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [7]:
import logging
import sys

if DEBUG:
    logging.getLogger('llama_index').addHandler(logging.StreamHandler(stream=sys.stdout))
    logging.getLogger('llama_index').setLevel(logging.DEBUG)

In [8]:
if LOG_TO_MLFLOW:
    mlflow.set_experiment("Chain Frost - LlamaIndex Blog QnA Chatbot")
    mlflow.start_run(run_name=RUN_NAME, description=RUN_DESCRIPTION)
    mlflow.log_param("TESTING", TESTING)

In [9]:
NOTEBOOK_CACHE_DP = f'data/001/{RUN_NAME}'
os.makedirs(NOTEBOOK_CACHE_DP, exist_ok=True)

if LOG_TO_MLFLOW:
    mlflow.log_param("NOTEBOOK_CACHE_DP", NOTEBOOK_CACHE_DP)

# Load data

In [10]:
DATA_FP = '../crawl_llamaindex_blog/data/blogs-v2.json'
with open(DATA_FP, 'r') as f:
    data = json.load(f)

In [11]:
len(data)

160

In [12]:
data[:5]

[{'title': 'Supercharge your LlamaIndex RAG Pipeline with UpTrain Evaluations',
  'content': "This is a guest post from Uptrain. We are excited to announce the recent integration of LlamaIndex with UpTrain - an open-source LLM evaluation framework to evaluate your RAG pipelines and experiment with different configurations. As an increasing number of companies are graduating their LLM prototypes to production-ready systems, robust evaluations provide a systematic framework to make decisions rather than going with the ‘vibes’. By combining LlamaIndex's flexibility and UpTrain's evaluation framework, developers can experiment with different configurations, fine-tuning their LLM-based applications for optimal performance. About UpTrain UpTrain  [ github  ||  website  ||  docs ] is an open-source platform to evaluate and improve LLM applications. It provides grades for 20+ preconfigured checks (covering language, code, embedding use cases), performs root cause analyses on instances of failu

# Check data

In [13]:
data[0]['content']

"This is a guest post from Uptrain. We are excited to announce the recent integration of LlamaIndex with UpTrain - an open-source LLM evaluation framework to evaluate your RAG pipelines and experiment with different configurations. As an increasing number of companies are graduating their LLM prototypes to production-ready systems, robust evaluations provide a systematic framework to make decisions rather than going with the ‘vibes’. By combining LlamaIndex's flexibility and UpTrain's evaluation framework, developers can experiment with different configurations, fine-tuning their LLM-based applications for optimal performance. About UpTrain UpTrain  [ github  ||  website  ||  docs ] is an open-source platform to evaluate and improve LLM applications. It provides grades for 20+ preconfigured checks (covering language, code, embedding use cases), performs root cause analyses on instances of failure cases and provides guidance for resolving them. Key Highlights: Data Security:  As an open

# Prepare documents

In [14]:
input_data = data
if TESTING:
    input_data = data[:2]
logger.info(f"{len(input_data)=}")
if LOG_TO_MLFLOW:
    mlflow.log_param("len_input_data", len(input_data))

[32m2024-07-25 12:54:13.662[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mlen(input_data)=160[0m


In [15]:
from llama_index.core import Document

documents = []
for record in input_data:
    title = record['title']
    metadata = {
        'title': title,
        'author': record['author'],
        'date': record['date'],
        'tags': ', '.join(record['tags']),
        'url': record['url']
    }
    text = f"{title}\n{record['content']}"
    doc = Document(text=text, metadata=metadata)
    documents.append(doc)

In [16]:
documents[0]

Document(id_='24a0c6e4-def2-4cd2-9600-ba71e14a9804', embedding=None, metadata={'title': 'Supercharge your LlamaIndex RAG Pipeline with UpTrain Evaluations', 'author': 'Uptrain', 'date': 'Mar 19, 2024', 'tags': 'AI, Evaluation, Rag', 'url': 'https://www.llamaindex.ai/blog/supercharge-your-llamaindex-rag-pipeline-with-uptrain-evaluations'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="Supercharge your LlamaIndex RAG Pipeline with UpTrain Evaluations\nThis is a guest post from Uptrain. We are excited to announce the recent integration of LlamaIndex with UpTrain - an open-source LLM evaluation framework to evaluate your RAG pipelines and experiment with different configurations. As an increasing number of companies are graduating their LLM prototypes to production-ready systems, robust evaluations provide a systematic framework to make decisions rather than going with the ‘vibes’. By combining LlamaIndex's flexibility and UpTrain's evaluation fram

In [17]:
documents[1].metadata

{'title': 'LlamaIndex Newsletter 2024-04-02',
 'author': 'LlamaIndex',
 'date': 'Apr 2, 2024',
 'tags': 'LLM',
 'url': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-04-02'}

In [18]:
if LOG_TO_MLFLOW:
    mlflow.log_param("len_documents", len(documents))

## Setting LLM

In [19]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings, ServiceContext

In [20]:
# LLM_OPTION = 'openai'
# LLM_OPTION = 'ollama'
LLM_OPTION = 'togetherai'

# LLM_MODEL_NAME = 'llama3'
# LLM_MODEL_NAME = 'gpt-3.5-turbo'
LLM_MODEL_NAME = 'meta-llama/Meta-Llama-3-8B-Instruct-Lite'

# EMBED_OPTION = 'openai'
# EMBED_OPTION = 'togetherai'
# EMBED_OPTION = 'ollama'
EMBED_OPTION = 'huggingface'

# EMBED_MODEL_NAME = 'llama3'
# EMBED_MODEL_NAME = 'togethercomputer/m2-bert-80M-2k-retrieval'
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"

if LOG_TO_MLFLOW:
    mlflow.log_param("LLM_OPTION", LLM_OPTION)
    mlflow.log_param("LLM_MODEL_NAME", LLM_MODEL_NAME)
    mlflow.log_param("EMBED_OPTION", EMBED_OPTION)
    mlflow.log_param("EMBED_MODEL_NAME", EMBED_MODEL_NAME)

In [21]:
# LLM options
if LLM_OPTION == 'ollama':
    LLM_SERVER_HOST = '192.168.100.14'
    LLM_SERVER_PORT = 11434
    base_url = f'http://{LLM_SERVER_HOST}:{LLM_SERVER_PORT}'
    llm = Ollama(base_url=base_url, model=LLM_MODEL_NAME, request_timeout=60.0)
    !ping -c 1 $LLM_SERVER_HOST
elif LLM_OPTION == 'openai':
    from llama_index.llms.openai import OpenAI
    llm = OpenAI(model=LLM_MODEL_NAME)
elif LLM_OPTION == 'togetherai':
    from llama_index.llms.together import TogetherLLM
    llm = TogetherLLM(model=LLM_MODEL_NAME)

# Embed options
if EMBED_OPTION == 'huggingface':
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
    embed_model = HuggingFaceEmbedding(
        model_name=EMBED_MODEL_NAME
    )
elif EMBED_OPTION == 'openai':
    from llama_index.embeddings.openai import OpenAIEmbedding
    embed_model = OpenAIEmbedding()
elif EMBED_OPTION == 'togetherai':
    from llama_index.embeddings.together import TogetherEmbedding
    embed_model = TogetherEmbedding(EMBED_MODEL_NAME)
elif EMBED_OPTION == 'ollama':
    from llama_index.embeddings.ollama import OllamaEmbedding
    embed_model = OllamaEmbedding(
        model_name=EMBED_MODEL_NAME,
        base_url=base_url,
        ollama_additional_kwargs={"mirostat": 0},
    )

logger.info(f"LLM:\n{repr(llm)}")
logger.info(f"Embed model:\n{repr(embed_model)}")

[32m2024-07-25 12:54:20.033[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1mLLM:
TogetherLLM(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x76f2c01bd750>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x76f2cc385d00>, completion_to_prompt=<function default_completion_to_prompt at 0x76f2cc2200e0>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, model='meta-llama/Meta-Llama-3-8B-Instruct-Lite', temperature=0.1, max_tokens=None, logprobs=None, top_logprobs=0, additional_kwargs={}, max_retries=3, timeout=60.0, default_headers=None, reuse_client=True, api_key='3cf613093b6eb9b479c341126dc8d3761c67f9340d0a4a8e1fdc62ed41b58126', api_base='https://api.together.xyz/v1', api_version='', context_window=3900, is_chat_model=True, is_function_calling_model=False, tokenizer=None)[0m
[32m2024-07-25 12:54:20.034[0m | [1mINFO    [0m | [36m__main__[0

In [22]:
embed_model_dim = len(embed_model.get_text_embedding('sample text to find embedding dimensions'))
Settings.embed_model = embed_model
Settings.llm = llm

logger.info(f"{embed_model_dim=}")

if LOG_TO_MLFLOW:
    mlflow.log_param("embedding_model_dim", embed_model_dim)
    mlflow.log_param("LLM_MODEL", repr(llm))
    mlflow.log_param("EMBEDDING_MODEL", repr(embed_model))

[32m2024-07-25 12:54:20.735[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1membed_model_dim=1024[0m


# Index embeddings

In [23]:
import string

def substitute_punctuation(text):
    # Create a translation table that maps each punctuation character to an underscore
    translator = str.maketrans(string.punctuation, '_' * len(string.punctuation))
    # Translate the text using the translation table
    return text.translate(translator)

In [24]:
if RECREATE_INDEX:
    collection_raw_name = f"{EMBED_OPTION}__{EMBED_MODEL_NAME}__{RUN_NAME}"
    COLLECTION = substitute_punctuation(collection_raw_name)
    NODES_PERSIST_FP = f'{NOTEBOOK_CACHE_DP}/nodes.pkl'
logger.info(f"{COLLECTION=}")
logger.info(f"{NODES_PERSIST_FP=}")

if LOG_TO_MLFLOW:
    mlflow.log_param(f"COLLECTION", COLLECTION)
    mlflow.log_param(f"NODES_PERSIST_FP", NODES_PERSIST_FP)

[32m2024-07-25 12:54:20.911[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mCOLLECTION='huggingface__BAAI_bge_large_en_v1_5__exp_006_semantic_chunking'[0m
[32m2024-07-25 12:54:20.911[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mNODES_PERSIST_FP='data/001/exp_006_semantic_chunking/nodes.pkl'[0m


## Qdrant as VectorStore

In [25]:
import qdrant_client
from qdrant_client.models import Distance, VectorParams
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

In [26]:
qdrantdb = qdrant_client.QdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    # location=":memory:"
    # otherwise set Qdrant instance address with:
    # url="http://<host>:<port>"
    # otherwise set Qdrant instance with host and port:
    host="localhost",
    port=6333
    # set API KEY for Qdrant Cloud
    # api_key="<qdrant-api-key>",
)
aqdrantdb = qdrant_client.AsyncQdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    # location=":memory:"
    # otherwise set Qdrant instance address with:
    # url="http://<host>:<port>"
    # otherwise set Qdrant instance with host and port:
    host="localhost",
    port=6333
    # set API KEY for Qdrant Cloud
    # api_key="<qdrant-api-key>",
)
collection_exists = qdrantdb.collection_exists(COLLECTION)
if RECREATE_INDEX or not collection_exists:
    if collection_exists:
        logger.info(f"Deleting existing Qdrant collection...")
        qdrantdb.delete_collection(COLLECTION)
    if os.path.exists(NODES_PERSIST_FP):
        logger.info(f"Deleting persisted nodes object at {NODES_PERSIST_FP}...")
        os.remove(NODES_PERSIST_FP)
    logger.info(f"Creating new Qdrant collection...")
    qdrantdb.create_collection(
        COLLECTION,
        vectors_config=VectorParams(size=embed_model_dim, distance=Distance.COSINE),
    )
else:
    logger.info(f"Use existing Qdrant collection")
db_collection = qdrantdb.get_collection(COLLECTION)
vector_store = QdrantVectorStore(
    client=qdrantdb,
    collection_name=COLLECTION,
    aclient=aqdrantdb,
    prefer_grpc=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

[32m2024-07-25 12:54:21.779[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m41[0m - [1mUse existing Qdrant collection[0m
WARNI [llama_index.vector_stores.qdrant.base] Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.


In [27]:
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser

# CHUNKER = SentenceSplitter
# CHUNKER_CONFIG = {
#     "chunk_size": 512,
#     "chunk_overlap": 10
# }
# Observation: When using SemanticSplitterNodeParser it could take about 5 mins
# eating GPU to do something (assuming prep work) and after that would run the ingestion pipeline 
CHUNKER = SemanticSplitterNodeParser
CHUNKER_CONFIG = {
    "buffer_size": 1,
    "breakpoint_percentile_threshold": 95,
    "embed_model": embed_model
}
if LOG_TO_MLFLOW:
    mlflow.log_param("CHUNKER", CHUNKER)
    for k, v in CHUNKER_CONFIG.items():
        logged_v = v
        if not (isinstance(v, int) or isinstance(v, str)):
            logged_v = repr(v)
        mlflow.log_param(f"CHUNKER__{k}", logged_v)

In [28]:
t0 = time.perf_counter()
# TODO: TO understand the differences between points_count and indexed_vector_counts.
# Here indexed_vector_counts = 0
db_collection_count = db_collection.points_count

if db_collection_count > 0 and RECREATE_INDEX == False:
    logger.info(f"Loading index from existing DB...")
    with open(NODES_PERSIST_FP, 'rb') as f:
        logger.info(f"Loading cached `nodes` at {NODES_PERSIST_FP}...")
        nodes = pickle.load(f)
else:
    logger.info(f"Creating new DB index...")
    # Generate nodes
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/
    
    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline, IngestionCache
    
    # create the pipeline with transformations
    pipeline = IngestionPipeline(
        transformations=[
            CHUNKER(**CHUNKER_CONFIG),
            TitleExtractor(),
            embed_model,
        ],
        vector_store = vector_store
    )

    num_workers = None
    # Currently setting num_workers leads to error `AttributeError: 'HuggingFaceEmbedding' object has no attribute '_model'`
    # num_workers = os.cpu_count() - 1
    # logger.info(f"Running Ingestion Pipeline with {num_workers=}...")
    nodes = await pipeline.arun(documents=documents, num_workers=num_workers)
    with open(NODES_PERSIST_FP, 'wb') as f:
        pickle.dump(nodes, f)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)
t1 = time.perf_counter()

[32m2024-07-25 12:54:21.930[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mLoading index from existing DB...[0m
[32m2024-07-25 12:54:21.931[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mLoading cached `nodes` at data/001/exp_006_semantic_chunking/nodes.pkl...[0m


In [29]:
logger.info(f"Indexing {len(documents)} into VectorStoreIndex took {t1 - t0:,.0f}s")
if LOG_TO_MLFLOW:
    mlflow.log_param("len_nodes", len(nodes))

[32m2024-07-25 12:54:22.754[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mIndexing 160 into VectorStoreIndex took 1s[0m


# Query engine

In [30]:
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.schema import MetadataMode
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

In [31]:
from src.features.append_reference.custom_query_engine import ManualAppendReferenceQueryEngine

In [32]:
RETRIEVAL_TOP_K = 5
RERANK_TOP_K = 2
# Need to be able to control this cutoff until specify it
RETRIEVAL_SIMILARITY_CUTOFF = None
# RETRIEVAL_SIMILARITY_CUTOFF = 0.3
# APPEND_REF_MODE = 'response_synthesizer'
APPEND_REF_MODE = 'query_engine'

if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_TOP_K", RETRIEVAL_TOP_K)
    mlflow.log_param("RERANK_TOP_K", RERANK_TOP_K)
    if RETRIEVAL_SIMILARITY_CUTOFF:
        mlflow.log_param("RETRIEVAL_SIMILARITY_CUTOFF", RETRIEVAL_SIMILARITY_CUTOFF)

In [33]:
# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=RETRIEVAL_TOP_K,
)

node_postprocessors = []

if RETRIEVAL_SIMILARITY_CUTOFF is not None:
    node_postprocessors.append(SimilarityPostprocessor(similarity_cutoff=RETRIEVAL_SIMILARITY_CUTOFF))

reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=RERANK_TOP_K)
node_postprocessors.append(reranker)

if APPEND_REF_MODE == 'response_synthesizer':
    response_synthesizer = ManualAppendReferenceSynthesizer(verbose=0)
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=node_postprocessors,
    )
elif APPEND_REF_MODE == 'query_engine':
    response_synthesizer = get_response_synthesizer()
    query_engine = ManualAppendReferenceQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=node_postprocessors,
    )
else:
    response_synthesizer = get_response_synthesizer()
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=node_postprocessors,
    )

if LOG_TO_MLFLOW:
    mlflow.log_param("reranker", repr(reranker))
    mlflow.log_param("response_synthesizer", repr(response_synthesizer))
    mlflow.log_param("query_engine", repr(query_engine))

In [34]:
from llama_index.core.response.notebook_utils import (
    display_source_node,
    display_response,
)

In [35]:
question = "How can we address points of failures in RAG pipeline?"
response = query_engine.query(question)
display_response(response, show_source=True, show_metadata=True, show_source_metadata=True)

**`Final Response:`** To address points of failures in RAG pipeline, you can leverage the QueryPipeline's declarative orchestration abstraction. This allows you to compose both sequential chains and directed acyclic graphs (DAGs) of arbitrary complexity, making it easier to identify and troubleshoot potential issues. By using the QueryPipeline, you can reduce boilerplate code and increase readability, which can help you pinpoint and resolve problems more efficiently. Additionally, the QueryPipeline's end-to-end observability feature enables you to get callback integration across the entire pipeline, even for arbitrarily nested DAGs, making it easier to debug and identify potential failures.


Sources:
- [Introducing Query Pipelines](https://www.llamaindex.ai/blog/introducing-query-pipelines-025dc2bb0537)
- [RAGArch: Building a No-Code RAG Pipeline Configuration & One-Click RAG Code Generation Tool Powered by LlamaIndex](https://www.llamaindex.ai/blog/ragarch-building-a-no-code-rag-pipeline-configuration-one-click-rag-code-generation-tool-powered-b6e8eeb70089)

---

**`Source Node 1/2`**

**Node ID:** c1d767a4-bd20-4c9d-94e0-638fb3132515<br>**Similarity:** -4.749475955963135<br>**Text:** Check out our comprehensive  introduction guide , as well as our  docs page  for more details. Ex...<br>**Metadata:** {'title': 'Introducing Query Pipelines', 'author': 'Jerry Liu', 'date': 'Jan 8, 2024', 'tags': 'Llamaindex, Retrieval Augmented, LLM, AI', 'url': 'https://www.llamaindex.ai/blog/introducing-query-pipelines-025dc2bb0537', 'document_title': 'Based on the analysis, I would suggest the following comprehensive title:\n\n"Introducing Query Pipelines: A Declarative Orchestration Abstraction for Advanced RAG Pipelines - Query Pipeline and its Components: Running, Defining Custom Queries, and Supported Modules - Conclusion and Additional Resources for Query Pipelines"\n\nThis title captures the main themes and entities found in the context, including the introduction of Query Pipelines, its components, and the conclusion and additional resources related to query pipelines. It provides a clear and concise summary of the content, making it easy to understand the topic and its scope.'}<br>

---

**`Source Node 2/2`**

**Node ID:** 793c1e5e-6147-4514-9e43-8d35ab064df1<br>**Similarity:** -5.54605770111084<br>**Text:** RAGArch: Building a No-Code RAG Pipeline Configuration & One-Click RAG Code Generation Tool Power...<br>**Metadata:** {'title': 'RAGArch: Building a No-Code RAG Pipeline Configuration & One-Click RAG Code Generation Tool Powered by LlamaIndex', 'author': 'Harshad Suryawanshi', 'date': 'Feb 2, 2024', 'tags': 'Rag, No Code, Llamaindex, OpenAI, Code Generation', 'url': 'https://www.llamaindex.ai/blog/ragarch-building-a-no-code-rag-pipeline-configuration-one-click-rag-code-generation-tool-powered-b6e8eeb70089', 'document_title': 'Based on the context, I would suggest the following comprehensive title:\n\n"RAGArch: A No-Code RAG Pipeline Configuration and One-Click Code Generation Tool Powered by LlamaIndex: A Collaborative Platform for Developers and AI Enthusiasts to Accelerate Idea-to-Implementation Pipelines"\n\nThis title captures the main entities and themes present in the context, including RAGArch, LlamaIndex, no-code platform, collaboration, developers, AI enthusiasts, and idea-to-implementation pipelines. It provides a clear and concise summary of the platform\'s purpose and functionality, making it an effective title for the document.'}<br>

{'c1d767a4-bd20-4c9d-94e0-638fb3132515': {'title': 'Introducing Query Pipelines',
  'author': 'Jerry Liu',
  'date': 'Jan 8, 2024',
  'tags': 'Llamaindex, Retrieval Augmented, LLM, AI',
  'url': 'https://www.llamaindex.ai/blog/introducing-query-pipelines-025dc2bb0537',
  'document_title': 'Based on the analysis, I would suggest the following comprehensive title:\n\n"Introducing Query Pipelines: A Declarative Orchestration Abstraction for Advanced RAG Pipelines - Query Pipeline and its Components: Running, Defining Custom Queries, and Supported Modules - Conclusion and Additional Resources for Query Pipelines"\n\nThis title captures the main themes and entities found in the context, including the introduction of Query Pipelines, its components, and the conclusion and additional resources related to query pipelines. It provides a clear and concise summary of the content, making it easy to understand the topic and its scope.'},
 '793c1e5e-6147-4514-9e43-8d35ab064df1': {'title': 'RAGArch: 

# Evaluation

## Retrieval Evaluation

### Building synthetic evaluation dataset

In [36]:
with open(NODES_PERSIST_FP, 'rb') as f:
    nodes = pickle.load(f)

In [37]:
from llama_index.core.evaluation import generate_question_context_pairs, EmbeddingQAFinetuneDataset

In [38]:
RETRIEVAL_NUM_SAMPLE_NODES = 10
RETRIEVAL_NUM_SAMPLE_NODES = min(len(nodes), RETRIEVAL_NUM_SAMPLE_NODES)
RETRIEVAL_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
# RETRIEVAL_EVAL_LLM_MODEL = 'gpt-4'
RETRIEVAL_EVAL_LLM_MODEL_CONFIG = {
    "temperature": 0.3
}
RETRIEVAL_NUM_QUESTIONS_PER_CHUNK = 2
if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_NUM_QUESTIONS_PER_CHUNK", RETRIEVAL_NUM_QUESTIONS_PER_CHUNK)
    mlflow.log_param("RETRIEVAL_NUM_SAMPLE_NODES", RETRIEVAL_NUM_SAMPLE_NODES)

In [47]:
if RECREATE_RETRIEVAL_EVAL_DATASET or not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    RETRIEVAL_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_retrieval_eval_dataset.json"
    if RETRIEVAL_NUM_SAMPLE_NODES:
        logger.info(f"Sampling {RETRIEVAL_NUM_SAMPLE_NODES} nodes for retrieval evaluation...")
        np.random.seed(41)
        retrieval_eval_nodes = np.random.choice(nodes, RETRIEVAL_NUM_SAMPLE_NODES)
    else:
        logger.info(f"Using all nodes for retrieval evaluation")
        retrieval_eval_nodes = nodes
else:
    logger.info(f"Loading retrieval_eval_nodes from {RETRIEVAL_EVAL_DATASET_FP}...")
    with open(RETRIEVAL_EVAL_DATASET_FP, 'r') as f:
        retrieval_eval_nodes = json.load(f)

[32m2024-07-25 12:55:59.642[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mLoading retrieval_eval_nodes from data/001/exp_006_semantic_chunking/llamaindex_blog_retrieval_eval_dataset.json...[0m


In [48]:
QUESTION_GEN_QUERY = """
You are a Retriever Evaluator. Your task is to generate {num_questions_per_chunk} questions to assess the accuracy/relevancy of an information retrieval system.
The information retrieval system would then be asked your generated question and assessed on how well it can look up and return the correct context.

IMPORTANT RULES:
- Restrict the generated questions to the context information provided.
- Do not mention anything about the context in the generated questions.
- The generated questions should be diverse in nature and in difficulty across the documents.
- When being asked the generated question, a human with no prior knowledge can still answer perfectly given the input context.
"""
QA_GENERATE_PROMPT_TMPL = f"""
Context information is below.

---------------------
{{context_str}}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

{QUESTION_GEN_QUERY}
"""

if RECREATE_RETRIEVAL_EVAL_DATASET or not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    # Use good model to generate the eval dataset
    from llama_index.llms.openai import OpenAI
    retrieval_eval_llm = OpenAI(model=RETRIEVAL_EVAL_LLM_MODEL, **RETRIEVAL_EVAL_LLM_MODEL_CONFIG)

    logger.info(f"Creating new synthetic retrieval eval dataset...")
    retrieval_eval_dataset = generate_question_context_pairs(
        retrieval_eval_nodes,
        llm=retrieval_eval_llm,
        num_questions_per_chunk=RETRIEVAL_NUM_QUESTIONS_PER_CHUNK,
        qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL
    )
    logger.info(f"Persisting synthetic retrieval eval dataset to {RETRIEVAL_EVAL_DATASET_FP}...")
    retrieval_eval_dataset.save_json(RETRIEVAL_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic retrieval eval dataset at {RETRIEVAL_EVAL_DATASET_FP}...")
    retrieval_eval_dataset = EmbeddingQAFinetuneDataset.from_json(RETRIEVAL_EVAL_DATASET_FP)

[32m2024-07-25 12:56:00.800[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m39[0m - [1mLoading existing synthetic retrieval eval dataset at data/001/exp_006_semantic_chunking/llamaindex_blog_retrieval_eval_dataset.json...[0m


In [49]:
retrieval_eval_dataset.queries

{'492819bb-c1ee-43da-a90c-0f89d14442e0': 'What new features have been introduced in LlamaCloud, and which platforms have been integrated with it?',
 'd1542279-9c49-4576-83fb-65b9c706251f': 'How has Scaleport AI utilized LlamaCloud and LlamaIndex to enhance their development speed and sales across different industries?',
 'a4f2465b-5c0a-4373-bc3a-736b61269f99': 'How can the alignment and safety of LLMs and LMMs be evaluated?',
 '652a5aeb-c95f-4199-a9bf-354c7b397753': 'What are some important dimensions to consider when evaluating LLMs and LMMs, aside from knowledge and reasoning capabilities?',
 '8d6701d8-2439-4d22-b319-ff9aaa145949': 'Who conducted a workshop at the LlamaIndex + Replit Pune Generative AI meetup?',
 '42a0603b-004d-40ae-ad08-8e4ac7ba3ad6': 'Which individuals were involved in the webinar on LLM Challenges in Production?',
 '543c9bd0-1725-48c0-9375-7ab9cc1c87f4': 'How does the MultiOn agent manage the action of sending an email through the web browser?',
 'b7a74a1a-f305-4d

### Evaluate

In [50]:
from llama_index.core.evaluation import RetrieverEvaluator

In [51]:
RETRIEVAL_METRICS = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    RETRIEVAL_METRICS, retriever=retriever
)

retrieval_eval_results = await retriever_evaluator.aevaluate_dataset(retrieval_eval_dataset)

In [52]:
def display_results(name, eval_results, metrics=['hit_rate', 'mrr'], include_cohere_rerank=False):
    """Display results from evaluate."""

    eval_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        eval_dict = {
            "query": eval_result.query,
            "expected_ids": eval_result.expected_ids,
            "retrieved_texts": eval_result.retrieved_texts,
            **metric_dict
        }
        eval_dicts.append(eval_dict)

    full_df = pd.DataFrame(eval_dicts)

    columns = {
        "retrievers": [name],
        **{k: [full_df[k].mean()] for k in metrics},
    }

    if include_cohere_rerank:
        crr_relevancy = full_df["cohere_rerank_relevancy"].mean()
        columns.update({"cohere_rerank_relevancy": [crr_relevancy]})

    metric_df = pd.DataFrame(columns)

    return metric_df, full_df

In [53]:
metric_prefix = f"top_{RETRIEVAL_TOP_K}_retrieval_eval"
retrieval_eval_results_df, retrieval_eval_results_full_df = display_results(metric_prefix, retrieval_eval_results, metrics=RETRIEVAL_METRICS)
retrieval_eval_results_df

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,top_5_retrieval_eval,0.7,0.5725,0.14,0.7,0.5725,0.204842


In [54]:
retrieval_eval_results_full_df

Unnamed: 0,query,expected_ids,retrieved_texts,hit_rate,mrr,precision,recall,ap,ndcg
0,What new features have been introduced in Llam...,[503ebcc2-3c47-4215-a6c8-d5cd3a52ac00],[Introducing LlamaCloud and LlamaParse\nToday ...,0.0,0.0,0.0,0.0,0.0,0.0
1,How has Scaleport AI utilized LlamaCloud and L...,[503ebcc2-3c47-4215-a6c8-d5cd3a52ac00],[Case Study: How Scaleport.ai Accelerated Deve...,1.0,0.25,0.2,1.0,0.25,0.146068
2,How can the alignment and safety of LLMs and L...,[e0877b54-7ce0-4778-bef7-2051f8ef9ebf],[Though studies have shown that strong LLMs ca...,1.0,1.0,0.2,1.0,1.0,0.33916
3,What are some important dimensions to consider...,[e0877b54-7ce0-4778-bef7-2051f8ef9ebf],[Though studies have shown that strong LLMs ca...,1.0,1.0,0.2,1.0,1.0,0.33916
4,Who conducted a workshop at the LlamaIndex + R...,[7522899d-0ec2-40f0-b367-c8e3342fa283],[Ravi Theja conducted a workshop at LlamaIn...,1.0,1.0,0.2,1.0,1.0,0.33916
5,Which individuals were involved in the webinar...,[7522899d-0ec2-40f0-b367-c8e3342fa283],[Raymond \n P.S. — This article is titled Par...,0.0,0.0,0.0,0.0,0.0,0.0
6,How does the MultiOn agent manage the action o...,[8aac6c35-0afc-41b8-9412-d7778d8b2cb9],"[3. Send Email through MultiOn : Finally, the ...",1.0,1.0,0.2,1.0,1.0,0.33916
7,Where can developers find information about th...,[8aac6c35-0afc-41b8-9412-d7778d8b2cb9],[Automate online tasks with MultiOn and LlamaI...,1.0,0.5,0.2,1.0,0.5,0.213986
8,What is the cost for evaluation of the Prometh...,[575a1ebd-5b35-4c14-a826-7ef4ec3b4654],[This is in line with the information provided...,1.0,1.0,0.2,1.0,1.0,0.33916
9,How does the Prometheus model differ from GPT-...,[575a1ebd-5b35-4c14-a826-7ef4ec3b4654],[This is in line with the information provided...,1.0,1.0,0.2,1.0,1.0,0.33916


In [55]:
if LOG_TO_MLFLOW:
    for metric, metric_value in retrieval_eval_results_df.to_dict(orient='records')[0].items():
        if metric in RETRIEVAL_METRICS:
            mlflow.log_metric(f"{metric_prefix}_{metric}", metric_value)
    retrieval_eval_results_full_df.to_html(f"{NOTEBOOK_CACHE_DP}/retrieval_eval_results_full_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/retrieval_eval_results_full_df.html", "retrieval_eval_results_full_df")

#### Error Analysis

In [56]:
retrieval_eval_irrelevance_df = (
    retrieval_eval_results_full_df
    .loc[lambda df: df['hit_rate'].lt(1)]
    .sort_values(['hit_rate', 'mrr', 'precision', 'recall', 'ap', 'ndcg'])
)
retrieval_eval_irrelevance_df

Unnamed: 0,query,expected_ids,retrieved_texts,hit_rate,mrr,precision,recall,ap,ndcg
0,What new features have been introduced in Llam...,[503ebcc2-3c47-4215-a6c8-d5cd3a52ac00],[Introducing LlamaCloud and LlamaParse\nToday ...,0.0,0.0,0.0,0.0,0.0,0.0
5,Which individuals were involved in the webinar...,[7522899d-0ec2-40f0-b367-c8e3342fa283],[Raymond \n P.S. — This article is titled Par...,0.0,0.0,0.0,0.0,0.0,0.0
10,How does the system ensure that each user only...,[e3ad9dd8-2c6d-482e-9497-94252f75b327],[Retrieving Privacy-Safe Documents Over A Netw...,0.0,0.0,0.0,0.0,0.0,0.0
14,How does the RetrieverEvaluator module enhance...,[b1788bca-a4eb-4657-8554-6631fb3fdc58],[Keyword Queries GTR retriever recall rate 5. ...,0.0,0.0,0.0,0.0,0.0,0.0
16,How does the Launch of Seven Advanced Retrieva...,[f3802677-a18d-4a3c-a066-ffa5b53296a3],[Building Scalable RAG Applications with Llama...,0.0,0.0,0.0,0.0,0.0,0.0
17,What enhancement was achieved in structured me...,[f3802677-a18d-4a3c-a066-ffa5b53296a3],[Fine-Tuning Embeddings for RAG with Synthetic...,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
for i, row in retrieval_eval_irrelevance_df.reset_index(drop=True).iterrows():
    print(f"\n\n============Error #{i+1}=============\n\n")
    print(f"Query:\n{row.query}\n")
    expected_contexts = [json.loads(record.payload['_node_content'])['text'] for record in qdrantdb.retrieve(COLLECTION, ids=row.expected_ids)]
    expected_contexts = '\n\n'.join(expected_contexts)
    print(f"Expected Contexts:\n{expected_contexts}\n")
    contexts = '\n\n'.join(row.retrieved_texts)
    print(f"Retrieved Contexts:\n{contexts}\n")





Query:
What new features have been introduced in LlamaCloud, and which platforms have been integrated with it?

Expected Contexts:
LlamaIndex Newsletter 2024-07-23
Hello, Llama Followers! 🦙 Welcome to this week’s edition of the LlamaIndex newsletter! We’re thrilled to share some exciting updates about our products, including LlamaCloud, LlamaParse, and LlamaAgents. You’ll also find success stories with LlamaCloud, extensive guides, in-depth tutorials, and information about upcoming hackathons. 🤩  The highlights: LlamaCloud Updates:  New features including LlamaCloud Chat, enhanced Teams collaboration, and expanded integrations with Notion, Slack, Jira, and SharePoint.  Blogpost ,  Tweet . Scaleport AI’s Accelerated Development with LlamaCloud:  Scaleport AI boosts development speed and sales with LlamaCloud and LlamaIndex, improving data handling and OCR accuracy across multiple industries.  Blogpost . Claude Sonnet-3.5 Integration with LlamaParse:  Integration of Claude Sonnet-3.5

### Manually curated dataset
Ref: https://docs.llamaindex.ai/en/stable/module_guides/evaluating/usage_pattern_retrieval/

In [58]:
MANUAL_EVAL_QA = [
("What are key features of llama-agents?",
"""
Key features of llama-agents are:
1. Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. Ease of deployment: launch, scale and monitor each agent and your control plane independently.
5. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service
"""
),
("What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?",
"""
Retrieval System and Response Generation.
"""
),
("What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?",
"""
Hit rate and Mean Reciprocal Rank (MRR)

Hit Rate: Hit rate calculates the fraction of queries where the correct answer is found within the top-k retrieved documents. In simpler terms, it’s about how often our system gets it right within the top few guesses.

Mean Reciprocal Rank (MRR): For each query, MRR evaluates the system’s accuracy by looking at the rank of the highest-placed relevant document. Specifically, it’s the average of the reciprocals of these ranks across all the queries. So, if the first relevant document is the top result, the reciprocal rank is 1; if it’s second, the reciprocal rank is 1/2, and so on.
"""
),
# Below question is hard because LLM needs to follow the URL in the blog to get the information to answer
("How does the MemoryCache project by Mozilla utilize PrivateGPT_AI and LlamaIndex to enhance personal knowledge management while maintaining privacy? Provide a brief overview of the project and its key features.",
"""
The MemoryCache project by Mozilla aims to transform local desktop environments into on-device AI agents, utilizing PrivateGPT_AI and LlamaIndex to enhance personal knowledge management. It saves browser history and other local files to the user’s machine, allowing a local AI model to ingest and augment responses. This approach maintains privacy by avoiding cloud-based processing, focusing instead on generating insights from personal data. The project emphasizes creating a personalized AI experience that mirrors the original vision of personal computers as companions for thought.
"""
)
]

## Response Evaluation
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/downloading_llama_datasets/

In [59]:
def evaluate_labelled_rag_dataset(response_eval_dataset, response_eval_prediction_dataset, dataset_name="synthetic", batch_size=8, judge_model='gpt-3.5-turbo', cache_dp='.'):
    # Instantiate the judges
    judges = {
        "correctness": CorrectnessEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "relevancy": RelevancyEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "faithfulness": FaithfulnessEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        # "semantic_similarity": SemanticSimilarityEvaluator(),
    }

    # Initialize evaluations dictionary
    evals = {
        "correctness": [],
        "relevancy": [],
        "faithfulness": [],
        "contexts": [],
    }

    # Evaluate each prediction
    for example, prediction in tqdm(
        zip(response_eval_dataset.examples, response_eval_prediction_dataset.predictions),
        total=len(response_eval_dataset.examples)
    ):
        correctness_result = judges["correctness"].evaluate(
            query=example.query,
            response=prediction.response,
            reference=example.reference_answer,
        )

        relevancy_result = judges["relevancy"].evaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        faithfulness_result = judges["faithfulness"].evaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        evals["correctness"].append(correctness_result)
        evals["relevancy"].append(relevancy_result)
        evals["faithfulness"].append(faithfulness_result)
        evals["contexts"].append(prediction.contexts)

    # Save evaluations to JSON
    evaluations_objects = {
        "correctness": [e.dict() for e in evals["correctness"]],
        "faithfulness": [e.dict() for e in evals["faithfulness"]],
        "relevancy": [e.dict() for e in evals["relevancy"]],
        "contexts": evals['contexts'],
    }

    with open(f"{cache_dp}/{dataset_name}_evaluations.json", "w") as json_file:
        json.dump(evaluations_objects, json_file)

    # Generate evaluation results DataFrames
    deep_eval_correctness_df, mean_correctness_df = get_eval_results_df(
        ["base_rag"] * len(evals["correctness"]),
        evals["correctness"],
        metric="correctness",
    )
    deep_eval_relevancy_df, mean_relevancy_df = get_eval_results_df(
        ["base_rag"] * len(evals["relevancy"]),
        evals["relevancy"],
        metric="relevancy",
    )
    deep_eval_faithfulness_df, mean_faithfulness_df = get_eval_results_df(
        ["base_rag"] * len(evals["faithfulness"]),
        evals["faithfulness"],
        metric="faithfulness",
    )

    mean_scores_df = pd.concat(
        [
            mean_correctness_df.reset_index(),
            mean_relevancy_df.reset_index(),
            mean_faithfulness_df.reset_index(),
        ],
        axis=0,
        ignore_index=True,
    )
    mean_scores_df = mean_scores_df.set_index("index")
    mean_scores_df.index = mean_scores_df.index.set_names(["metrics"])

    deep_eval_df = pd.concat([
        deep_eval_correctness_df[['query', 'answer']],
        deep_eval_relevancy_df[['scores']].rename(columns={'scores': 'relevancy_score'}),
        deep_eval_correctness_df[['scores']].rename(columns={'scores': 'correctness_score'}),
        deep_eval_faithfulness_df[['scores']].rename(columns={'scores': 'faithfulness_score'}),
        pd.Series(evals['contexts'], name='contexts')
    ], axis=1)

    return mean_scores_df, deep_eval_df

### Generate synthetic Llama Dataset

In [60]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.llama_dataset import LabeledRagDataset
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    SemanticSimilarityEvaluator,
)
from llama_index.core.evaluation.notebook_utils import get_eval_results_df

In [61]:
RESPONSE_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
# RESPONSE_EVAL_LLM_MODEL = 'gpt-4'
RESPONSE_EVAL_LLM_MODEL_CONFIG = {
    "temperature": 0.3
}
SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK = 1
RESPONSE_NUM_SAMPLE_DOCUMENTS = 10
RESPONSE_NUM_SAMPLE_DOCUMENTS = min(len(documents), RESPONSE_NUM_SAMPLE_DOCUMENTS)
BATCH_SIZE = 16

if LOG_TO_MLFLOW:
    mlflow.log_param("SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK", SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK)
    mlflow.log_param("RESPONSE_EVAL_LLM_MODEL", RESPONSE_EVAL_LLM_MODEL)
    mlflow.log_param("RESPONSE_NUM_SAMPLE_DOCUMENTS", RESPONSE_NUM_SAMPLE_DOCUMENTS)
    for k, v in RESPONSE_EVAL_LLM_MODEL_CONFIG.items():
        mlflow.log_param(f"RESPONSE_EVAL_LLM_MODEL_CONFIG__{k}", v)

In [62]:
if RESPONSE_NUM_SAMPLE_DOCUMENTS:
    logger.info(f"Sampling {RESPONSE_NUM_SAMPLE_DOCUMENTS} documents for response evaluation...")
    np.random.seed(41)
    response_eval_documents = np.random.choice(documents, RESPONSE_NUM_SAMPLE_DOCUMENTS)
else:
    logger.info(f"Using all documents for retrieval evaluation")
    response_eval_documents = documents

[32m2024-07-25 12:56:19.320[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mSampling 10 documents for response evaluation...[0m


In [64]:
if RECREATE_SYNTHETIC_EVAL_DATASET or not os.path.exists(RESPONSE_EVAL_DATASET_FP):
    RESPONSE_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_response_eval_dataset.json"
    logger.info(f"Creating synthetic response eval dataset...")
    # Use good model to generate the eval dataset
    from llama_index.llms.openai import OpenAI
    response_eval_llm = OpenAI(model=RESPONSE_EVAL_LLM_MODEL, **RESPONSE_EVAL_LLM_MODEL_CONFIG)

    # instantiate a DatasetGenerator
    response_dataset_generator = RagDatasetGenerator.from_documents(
        response_eval_documents,
        llm=response_eval_llm,
        num_questions_per_chunk=SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK,  # set the number of questions per nodes
        question_gen_query=QUESTION_GEN_QUERY,  # Reuse the same format from the above Retrieval Question Gen Query
        show_progress=True,
        workers=(os.cpu_count() - 1)
    )

    synthetic_response_eval_dataset = response_dataset_generator.generate_dataset_from_nodes()

    logger.info(f"Persisting synthetic response eval dataset at {RESPONSE_EVAL_DATASET_FP}...")
    synthetic_response_eval_dataset.save_json(RESPONSE_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic response eval dataset at {RESPONSE_EVAL_DATASET_FP}...")
    synthetic_response_eval_dataset = LabeledRagDataset.from_json(RESPONSE_EVAL_DATASET_FP)

[32m2024-07-25 12:56:58.246[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mCreating synthetic response eval dataset...[0m


Parsing nodes:   0%|          | 0/10 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:06<00:00,  4.18it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.63s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.51s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,

In [65]:
synthetic_response_eval_prediction_dataset = await synthetic_response_eval_dataset.amake_predictions_with(
    predictor=query_engine, batch_size=BATCH_SIZE, show_progress=True
)

Batch processing of predictions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:12<00:00,  1.26it/s]
Batch processing of predictions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:10<00:00,  1.18it/s]


In [66]:
synthetic_mean_scores_df, synthetic_deep_eval_df = evaluate_labelled_rag_dataset(
    synthetic_response_eval_dataset,
    synthetic_response_eval_prediction_dataset,
    dataset_name="synthetic",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

  0%|          | 0/29 [00:00<?, ?it/s]

In [67]:
synthetic_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,3.793103
mean_relevancy_score,0.827586
mean_faithfulness_score,0.793103


In [68]:
synthetic_deep_eval_df

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score,contexts
0,How is the property graph index different from...,\nThe property graph index is different from t...,1.0,4.5,1.0,"[In the previous integration, the graph was re..."
1,What is the purpose of using the SchemaLLMPath...,\nThe purpose of using the SchemaLLMPathExtrac...,1.0,4.0,1.0,"[Now, let’s examine our retriever options. At ..."
2,How can you accelerate the process of extracti...,\nYou can accelerate the process of extracting...,1.0,2.0,1.0,[Note the increased cost of $0.60 USD per page...
3,How can you customize the property graph index...,\nYou can customize the property graph index i...,1.0,4.0,1.0,[O’Brien expressed confidence that the dispute...
4,What methods are used in the custom retriever ...,\nThe custom retriever uses the `entity_extrac...,1.0,3.0,1.0,"[Now, let’s examine our retriever options. At ..."
5,How many sections are included in the OpenAI C...,\nThe OpenAI Cookbook for evaluating RAG syste...,1.0,4.5,1.0,[OpenAI Cookbook: Evaluating RAG systems\nWe’r...
6,How does KOSMOS-2 contribute to the multi-moda...,\nKOSMOS-2 contributes to the multi-modal prot...,1.0,4.5,1.0,"[Building My Own ChatGPT Vision with PaLM, KOS..."
7,How does the application handle user interface...,\nThe application does not explicitly discuss ...,0.0,2.0,0.0,[It's remarkable how seamless the integration ...
8,How does the application handle user interacti...,\nThe application ensures an engaging and cont...,1.0,4.5,1.0,"[Your responses always descriptive. "" \n ..."
9,How has the field of AI and large language mod...,\nThe field of AI and large language models ha...,1.0,4.0,1.0,"[Notebook , Tweet . ✍️ Tutorials: Bhavesh Bha..."


In [69]:
if LOG_TO_MLFLOW:
    for k, v in synthetic_mean_scores_df.T.to_dict(orient='records')[0].items():
        mlflow.log_metric(f"synthetic_response_eval__{k}", v)
    synthetic_deep_eval_df.to_html(f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html", "synthetic_deep_eval_df")

#### Error Analysis

In [70]:
synthetic_response_eval_dataset_dict = dict()
for example in synthetic_response_eval_dataset.examples:
    synthetic_response_eval_dataset_dict[example.query] = {
        "reference_answer": example.reference_answer,
        "reference_contexts": example.reference_contexts,
    }

In [71]:
synthetic_deep_eval_df.sort_values(['relevancy_score', 'correctness_score', 'faithfulness_score'])

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score,contexts
16,What is the length of the 2021 Lyft SEC 10-K d...,\nThe 2021 Lyft SEC 10-K document is not expli...,0.0,1.0,0.0,[It goes into details about the US-China trade...
7,How does the application handle user interface...,\nThe application does not explicitly discuss ...,0.0,2.0,0.0,[It's remarkable how seamless the integration ...
24,What new offering integrates local LLMs and em...,\nLlamaCloud.\n\n\nSources:\n- [Introducing Ll...,0.0,3.0,0.0,[(We’ve also launched a new version of our we...
12,How does the integration between Create-llama ...,\nThe integration between Create-llama and Lla...,0.0,3.0,1.0,"[Blogpost , Tweet . create-llama Integrated ..."
28,What is the author's name of the document on b...,\nThe author's name of the document on becomin...,0.0,5.0,0.0,"[If you find my content valuable, don’t hesita..."
2,How can you accelerate the process of extracti...,\nYou can accelerate the process of extracting...,1.0,2.0,1.0,[Note the increased cost of $0.60 USD per page...
14,What are the two ways of feeding text into the...,"\nDoc: 9, Relevance: 7\nDoc: 3, Relevance: 4\n...",1.0,3.0,0.0,[A question is also provided.\n Respond with ...
4,What methods are used in the custom retriever ...,\nThe custom retriever uses the `entity_extrac...,1.0,3.0,1.0,"[Now, let’s examine our retriever options. At ..."
19,What is the purpose of setting up Tonic Validate?,\nThe purpose of setting up Tonic Validate is ...,1.0,3.0,1.0,[I highly recommend utilizing the UI to make v...
10,What is the purpose of using Medium articles f...,\nThe purpose of using Medium articles from 20...,1.0,3.5,1.0,"[LlamaIndex Newsletter 2023–12–12\nHowdy, Llam..."


In [72]:
synthetic_response_eval_irrelevance_df = (
    synthetic_deep_eval_df
    .loc[lambda df: df['relevancy_score'].lt(1)]
    .sort_values(['relevancy_score', 'correctness_score', 'faithfulness_score'])
)

for i, row in synthetic_response_eval_irrelevance_df.reset_index(drop=True).iterrows():
    print(f"\n\n==============Error #{i+1}===============\n\n")
    print(f"Query:\n{row.query}\n")
    contexts = '\n\n'.join(row.contexts)
    print(f"Context:\n{contexts}\n")
    print(f"Answer:\n{row.answer}\n----\n")
    expected = synthetic_response_eval_dataset_dict.get(row.query)
    if not expected:
        logger.error(f"Could not find query {row.query} in synthetic_response_eval_dataset_dict!")
        continue
    expected_answer = expected['reference_answer']
    print(f"Expected Answer:\n{expected_answer}\n")
    expected_contexts = expected['reference_contexts']
    print(f"Expected Contexts:\n{expected_contexts}\n")





Query:
What is the length of the 2021 Lyft SEC 10-K document?

Context:
It goes into details about the US-China trade war, slowing economic growth, Brexit, and more (keep in mind 2019 is pre-COVID). Token Usage and Latency The document contains around ~170K tokens. For some reason, this number is not reflected on the Anthropic usage logs (the “Prompt Tokens” section seems capped at 10240). But the Prompt Length (in characters) is logged, as well as the model latency. Given the pricing, ~170K tokens would be equivalent to $1.5–2 USD. A query through one Uber SEC-10K takes around  150   seconds , including all LLM calls. This is actually a bit faster than repeated calls to ChatGPT/davinci. Each ChatGPT/davinci call (with the 4K token window maximized), empirically can take 6–10 seconds to complete →  125–250 seconds ( or more). Analyzing Multiple Documents A popular example in our  previous blog post  was showcasing that you could compare/contrast different documents with LlamaIndex 

### Manually curated
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/ragdataset_submission_template/#1c-creating-a-labelledragdataset-from-scratch-with-manually-constructed-examples

In [73]:
from llama_index.core.llama_dataset import LabelledRagDataset, LabelledRagDataExample, CreatedBy, CreatedByType

examples = []

for question, expected_anwser in MANUAL_EVAL_QA:
    example = LabelledRagDataExample(
        query=question,
        query_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_answer=expected_anwser,
        reference_answer_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_contexts=[],
    )
    examples.append(example)

curated_response_eval_dataset = LabelledRagDataset(examples=examples)

# save this dataset as it is required for the submission
curated_response_eval_dataset.save_json(f"{NOTEBOOK_CACHE_DP}/curated_response_eval_dataset.json")

In [74]:
curated_response_eval_prediction_dataset = await curated_response_eval_dataset.amake_predictions_with(
    predictor=query_engine, batch_size=BATCH_SIZE, show_progress=True
)

Batch processing of predictions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.32s/it]


In [75]:
curated_mean_scores_df, curated_deep_eval_df = evaluate_labelled_rag_dataset(
    curated_response_eval_dataset,
    curated_response_eval_prediction_dataset,
    dataset_name="curated",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

  0%|          | 0/4 [00:00<?, ?it/s]

In [76]:
curated_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,4.5
mean_relevancy_score,1.0
mean_faithfulness_score,1.0


In [77]:
with pd.option_context('display.max_colwidth', None):
    display(curated_deep_eval_df)

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score,contexts
0,What are key features of llama-agents?,"\nDistributed Service-Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.\n\nCommunication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.\n\nDefine agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.\n\nEase of deployment: launch, scale, and monitor each agent and your control plane independently.\n\nScalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service.\n\n\nSources:\n- [Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems](https://www.llamaindex.ai/blog/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems)\n\n\n",1.0,,1.0,"[Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems\nWe're excited to announce the alpha release of llama-agents , a new open-source framework designed to simplify the process of building, iterating, and deploying multi-agent AI systems and turn your agents into production microservices. Whether you're working on complex question-answering systems, collaborative AI assistants, or distributed AI workflows, llama-agents provides the tools and structure you need to bring your ideas to life. Key Features of llama-agents Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task. Ease of deployment: launch, scale and monitor each agent and your control plane independently. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service Let's dive into how you can start using llama-agents to build your own multi-agent systems. Getting Started with llama-agents First, install the framework using pip: pip install llama-agents llama-index-agent-openai Basic System Setup Here's a simple example of how to set up a basic multi-agent system using llama-agents. First we’ll bring in our dependencies and set up our control plane, which contains our LLM-powered orchestrator import dotenv\ndotenv.load_dotenv() # our .env file defines OPENAI_API_KEY \n from llama_agents import (\n AgentService,\n ControlPlaneServer,\n SimpleMessageQueue,\n AgentOrchestrator,\n)\n from llama_index.core.agent import FunctionCallingAgentWorker\n from llama_index.core.tools import FunctionTool\n from llama_index.llms.openai import OpenAI\n import logging\n\n # turn on logging so we can see the system working \nlogging.getLogger( ""llama_agents"" ).setLevel(logging.INFO)\n\n # Set up the message queue and control plane \nmessage_queue = SimpleMessageQueue()\ncontrol_plane = ControlPlaneServer(\n message_queue=message_queue,\n orchestrator=AgentOrchestrator(llm=OpenAI()),\n) Next we create our tools using LlamaIndex’s existing abstractions, provide those tools to an agent, and turn that agent into an independent microservice: # create a tool \n def get_the_secret_fact () -> str :\n """"""Returns the secret fact."""""" \n return ""The secret fact is: A baby llama is called a 'Cria'."" \n\ntool = FunctionTool.from_defaults(fn=get_the_secret_fact)\n\n # Define an agent \nworker = FunctionCallingAgentWorker.from_tools([tool], llm=OpenAI())\nagent = worker.as_agent()\n\n # Create an agent service \nagent_service = AgentService(\n agent=agent,\n message_queue=message_queue,\n description= ""General purpose assistant"" ,\n service_name= ""assistant"" ,\n) Finally we launch the service and the control plane. Note that here we’re using a helper function to run a single query through the system and then exit; next we’ll show how to deploy this to production. # Set up the launcher for local testing \n from llama_agents import LocalLauncher\n\nlauncher = LocalLauncher(\n [agent_service],\n control_plane,\n message_queue,\n)\n\n # Run a single query through the system \nresult = launcher.launch_single( ""What's the secret fact?"" )\n print (result) Deploying Your Multi-Agent System Once you've tested your system locally, you can deploy it as a set of services for real production use. Here's how you might set that up. , We’re actively seeking public feedback on what works for you and what doesn’t. Dive in! llama-agents provides a powerful, flexible framework for building complex multi-agent AI systems. Whether you're prototyping a new idea or scaling to production, llama-agents offers the tools you need to bring your AI vision to life. Check out the repo to learn more, especially our library of examples . We're excited to see what the community builds with llama-agents . Happy coding!]"
1,What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?,\nThe two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook are the Retrieval System and Response Generation.\n\n\nSources:\n- [OpenAI Cookbook: Evaluating RAG systems](https://www.llamaindex.ai/blog/openai-cookbook-evaluating-rag-systems-fe393c61fb93)\n\n\n,1.0,5.0,1.0,"[OpenAI Cookbook: Evaluating RAG systems\nWe’re excited to unveil our OpenAI Cookbook , a guide to evaluating Retrieval-Augmented Generation (RAG) systems using LlamaIndex. We hope you’ll find it useful in enhancing the effectiveness of your RAG systems, and we’re thrilled to share it with you. The OpenAI Cookbook has three sections: Understanding Retrieval-Augmented Generation (RAG): provides a detailed overview of RAG systems, including the various stages involved in building the RAG system. Building RAG with LlamaIndex: Here, we dive into the practical aspects, demonstrating how to construct a RAG system using LlamaIndex, specifically applied to Paul Graham’s essay, utilizing the VectorStoreIndex . Evaluating RAG with LlamaIndex: The final section focuses on assessing the RAG system’s performance in two critical areas: the Retrieval System and Response Generation. We use our unique synthetic dataset generation method, generate_question_context_pairs to conduct thorough evaluations in these areas. Our goal with this cookbook is to provide the community with an essential resource for effectively evaluating and enhancing RAG systems developed using LlamaIndex. , Join us in exploring the depths of RAG system evaluation and discover how to leverage the full potential of your RAG implementations with LlamaIndex. Keep building with LlamaIndex!🦙]"
2,What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?,\nThe two main metrics used to evaluate the performance of the different rerankers in the RAG system are Hit Rate and Mean Reciprocal Rank (MRR).\n\n\nSources:\n- [Boosting RAG: Picking the Best Embedding & Reranker models](https://www.llamaindex.ai/blog/boosting-rag-picking-the-best-embedding-reranker-models-42d079022e83)\n\n\n,1.0,,1.0,"[However, actual outcomes may differ based on data characteristics, dataset size, and other variables like chunk_size, similarity_top_k, and so on. The table below showcases the evaluation results based on the metrics of Hit Rate and Mean Reciprocal Rank (MRR): Analysis: Performance by Embedding: OpenAI : Showcases top-tier performance, especially with the CohereRerank (0.926966 hit rate, 0.86573 MRR) and bge-reranker-large (0.910112 hit rate, 0.855805 MRR), indicating strong compatibility with reranking tools. bge-large : Experiences significant improvement with rerankers, with the best results from CohereRerank (0.876404 hit rate, 0.822753 MRR). llm-embedder : Benefits greatly from reranking, particularly with CohereRerank (0.882022 hit rate, 0.830243 MRR), which offers a substantial performance boost. Cohere : Cohere’s latest v3.0 embeddings outperform v2.0 and, with the integration of native CohereRerank, significantly improve its metrics, boasting a 0.88764 hit rate and a 0.836049 MRR. Voyage : Has strong initial performance that is further amplified by CohereRerank (0.91573 hit rate, 0.851217 MRR), suggesting high responsiveness to reranking. JinaAI : Very strong performance, sees notable gains with bge-reranker-large (0.938202 hit rate, 0.868539 MRR) and CohereRerank (0.932584 hit rate, 0.873689), indicating that reranking significantly boosts its performance. Google-PaLM : The model demonstrates strong performance, with measurable gains when using the CohereRerank (0.910112 hit rate, 0.855712 MRR). This indicates that reranking provides a clear boost to its overall results. Impact of Rerankers : WithoutReranker : This provides the baseline performance for each embedding. bge-reranker-base : Generally improves both hit rate and MRR across embeddings. bge-reranker-large : This reranker frequently offers the highest or near-highest MRR for embeddings. For several embeddings, its performance rivals or surpasses that of the CohereRerank . CohereRerank : Consistently enhances performance across all embeddings, often providing the best or near-best results. Necessity of Rerankers : The data clearly indicates the significance of rerankers in refining search results. Nearly all embeddings benefit from reranking, showing improved hit rates and MRRs. Rerankers, especially CohereRerank , have demonstrated their capability to transform any embedding into a competitive one. Overall Superiority : When considering both hit rate and MRR, the combinations of OpenAI + CohereRerank and JinaAI-Base + bge-reranker-large/ CohereRerank emerge as top contenders. However, the consistent improvement brought by the CohereRerank/ bge-reranker-large rerankers across various embeddings make them the standout choice for enhancing search quality, regardless of the embedding in use. In summary, to achieve the peak performance in both hit rate and MRR, the combination of OpenAI or JinaAI-Base embeddings with the CohereRerank/bge-reranker-large reranker stands out. Please be aware that our benchmarks are intended to offer a reproducible script for your own data. Nevertheless, treat these figures as estimates and proceed with caution when interpreting them. Conclusions: In this blog post, we have demonstrated how to evaluate and enhance retriever performance using various embeddings and rerankers. Below are our final conclusions. Embeddings : The OpenAI and JinaAI-Base embeddings, especially when paired with the CohereRerank/bge-reranker-large reranker, set the gold standard for both hit rate and MRR. Rerankers : The influence of rerankers, particularly CohereRerank/bge-reranker-large , cannot be overstated. They play a key role in improving the MRR for many embeddings, showing their importance in making search results better. Foundation is Key : Choosing the right embedding for the initial search is essential; even the best reranker can’t help much if the basic search results aren’t good. Working Together: To get the best out of retrievers, it’s important to find the right mix of embeddings and rerankers. This study shows how important it is to carefully test and find the best pairing., Boosting RAG: Picking the Best Embedding & Reranker models\nUPDATE : The pooling method for the Jina AI embeddings has been adjusted to use mean pooling, and the results have been updated accordingly. Notably, the JinaAI-v2-base-en with bge-reranker-large now exhibits a Hit Rate of 0.938202 and an MRR (Mean Reciprocal Rank) of 0.868539 and with CohereRerank exhibits a Hit Rate of 0.932584, and an MRR of 0.873689. When building a Retrieval Augmented Generation (RAG) pipeline, one key component is the Retriever. We have a variety of embedding models to choose from, including OpenAI, CohereAI, and open-source sentence transformers. Additionally, there are several rerankers available from CohereAI and sentence transformers. But with all these options, how do we determine the best mix for top-notch retrieval performance? How do we know which embedding model fits our data best? Or which reranker boosts our results the most? In this blog post, we’ll use the Retrieval Evaluation module from LlamaIndex to swiftly determine the best combination of embedding and reranker models. Let's dive in! Let’s first start with understanding the metrics available in Retrieval Evaluation Understanding Metrics in Retrieval Evaluation: To gauge the efficacy of our retrieval system, we primarily relied on two widely accepted metrics: Hit Rate and Mean Reciprocal Rank (MRR) . Let’s delve into these metrics to understand their significance and how they operate. Hit Rate: Hit rate calculates the fraction of queries where the correct answer is found within the top-k retrieved documents. In simpler terms, it’s about how often our system gets it right within the top few guesses. Mean Reciprocal Rank (MRR): For each query, MRR evaluates the system’s accuracy by looking at the rank of the highest-placed relevant document. Specifically, it’s the average of the reciprocals of these ranks across all the queries. So, if the first relevant document is the top result, the reciprocal rank is 1; if it’s second, the reciprocal rank is 1/2, and so on. Now that we’ve established the scope and familiarized ourselves with the metrics, it’s time to dive into the experiment. For a hands-on experience, you can also follow along using our Google Colab Notebook Setting Up the Environment !pip install llama-index sentence-transformers cohere anthropic voyageai protobuf pypdf Setting Up the Keys openai_api_key = 'YOUR OPENAI API KEY' \ncohere_api_key = 'YOUR COHEREAI API KEY' \nanthropic_api_key = 'YOUR ANTHROPIC API KEY' \nopenai.api_key = openai_api_key Download the Data We will use Llama2 paper for this experiment. Let’s download the paper. ]"
3,How does the MemoryCache project by Mozilla utilize PrivateGPT_AI and LlamaIndex to enhance personal knowledge management while maintaining privacy? Provide a brief overview of the project and its key features.,"\nThe MemoryCache project by Mozilla utilizes PrivateGPT_AI and LlamaIndex to enhance personal knowledge management while maintaining privacy by integrating private data and knowledge sources. This project allows users to store and query their private data, enabling them to process the data once and then query it for various downstream applications. The project's key features include the ability to connect private knowledge sources using LlamaIndex connectors, load in documents, and utilize LLMs to construct final answers.\n\n\nSources:\n- [Build a ChatGPT with your Private Data using LlamaIndex and MongoDB](https://www.llamaindex.ai/blog/build-a-chatgpt-with-your-private-data-using-llamaindex-and-mongodb-b09850eb154c)\n\n\n",1.0,4.0,1.0,"[The requirement to continuously maintain updates from the underlying data sources. Being able to persist this data enables processing the data once and then being able to query it for various downstream applications. MongoDB Atlas MongoDB offers a free forever Atlas cluster in the public cloud service of your choice. This can be accomplished very quickly by following this tutorial . , Or you can get started directly here . Use of LLMs LlamaIndex uses LangChain’s (another popular framework for building Generative AI applications) LLM modules and allows for customizing the underlying LLM to be used (default being OpenAI’s text-davinci-003 model). The chosen LLM is always used by LlamaIndex to construct the final answer and is sometimes used during index creation as well. The workflow Connect private knowledge sources using LlamaIndex connectors (offered through LlamaHub ). Load in the Documents. ]"


In [78]:
for context in curated_deep_eval_df.iloc[2]['contexts']:
    print(context)
    print('-' * 10)

However, actual outcomes may differ based on data characteristics, dataset size, and other variables like chunk_size, similarity_top_k, and so on. The table below showcases the evaluation results based on the metrics of Hit Rate and Mean Reciprocal Rank (MRR): Analysis: Performance by Embedding: OpenAI : Showcases top-tier performance, especially with the  CohereRerank  (0.926966 hit rate, 0.86573 MRR) and  bge-reranker-large  (0.910112 hit rate, 0.855805 MRR), indicating strong compatibility with reranking tools. bge-large : Experiences significant improvement with rerankers, with the best results from  CohereRerank  (0.876404 hit rate, 0.822753 MRR). llm-embedder : Benefits greatly from reranking, particularly with  CohereRerank  (0.882022 hit rate, 0.830243 MRR), which offers a substantial performance boost. Cohere : Cohere’s latest v3.0 embeddings outperform v2.0 and, with the integration of native CohereRerank, significantly improve its metrics, boasting a 0.88764 hit rate and a 0

In [79]:
if LOG_TO_MLFLOW:
    for k, v in curated_mean_scores_df.T.to_dict(orient='records')[0].items():
        mlflow.log_metric(f"curated_response_eval__{k}", v)
    curated_deep_eval_df.to_html(f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html", "curated_deep_eval_df")

# Clean up

In [80]:
if LOG_TO_MLFLOW:
    mlflow.end_run()

# Archive