# Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
import json
from loguru import logger
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time

import mlflow

import sys
sys.path.insert(0, '..')

In [3]:
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

nest_asyncio.apply()

In [4]:
from dotenv import load_dotenv

load_dotenv()

True

## Constants

In [5]:
TESTING = False
DEBUG = False
OBSERVABILITY = True
LOG_TO_MLFLOW = True

In [6]:
if OBSERVABILITY:
    import phoenix as px
    px.launch_app()
    import llama_index.core
    llama_index.core.set_global_handler("arize_phoenix")

I0000 00:00:1721879287.498035 2834072 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [7]:
import logging
import sys

if DEBUG:
    logging.getLogger('llama_index').addHandler(logging.StreamHandler(stream=sys.stdout))
    logging.getLogger('llama_index').setLevel(logging.DEBUG)

In [8]:
RUN_NAME = "exp_005_tune_eval_qa_dataset"
if LOG_TO_MLFLOW:
    RUN_DESCRIPTION = """
# Making the RAG outputs referenced sources

## Changelog
### Compares to exp_004
- Do not use GPT-4 because of high cost (generate 40 pairs of question-context for retrieval evaluation cost 0.4 USD already)
"""
    mlflow.set_experiment("Chain Frost - LlamaIndex Blog QnA Chatbot")
    mlflow.start_run(run_name=RUN_NAME, description=RUN_DESCRIPTION)
    mlflow.log_param("TESTING", TESTING)

In [9]:
NOTEBOOK_CACHE_DP = f'data/001/{RUN_NAME}'
os.makedirs(NOTEBOOK_CACHE_DP, exist_ok=True)

if LOG_TO_MLFLOW:
    mlflow.log_param("NOTEBOOK_CACHE_DP", NOTEBOOK_CACHE_DP)

# Load data

In [10]:
DATA_FP = '../crawl_llamaindex_blog/data/blogs-v2.json'
with open(DATA_FP, 'r') as f:
    data = json.load(f)

In [11]:
len(data)

160

In [12]:
data[:5]

[{'title': 'Supercharge your LlamaIndex RAG Pipeline with UpTrain Evaluations',
  'content': "This is a guest post from Uptrain. We are excited to announce the recent integration of LlamaIndex with UpTrain - an open-source LLM evaluation framework to evaluate your RAG pipelines and experiment with different configurations. As an increasing number of companies are graduating their LLM prototypes to production-ready systems, robust evaluations provide a systematic framework to make decisions rather than going with the ‘vibes’. By combining LlamaIndex's flexibility and UpTrain's evaluation framework, developers can experiment with different configurations, fine-tuning their LLM-based applications for optimal performance. About UpTrain UpTrain  [ github  ||  website  ||  docs ] is an open-source platform to evaluate and improve LLM applications. It provides grades for 20+ preconfigured checks (covering language, code, embedding use cases), performs root cause analyses on instances of failu

# Check data

In [13]:
data[0]['content']

"This is a guest post from Uptrain. We are excited to announce the recent integration of LlamaIndex with UpTrain - an open-source LLM evaluation framework to evaluate your RAG pipelines and experiment with different configurations. As an increasing number of companies are graduating their LLM prototypes to production-ready systems, robust evaluations provide a systematic framework to make decisions rather than going with the ‘vibes’. By combining LlamaIndex's flexibility and UpTrain's evaluation framework, developers can experiment with different configurations, fine-tuning their LLM-based applications for optimal performance. About UpTrain UpTrain  [ github  ||  website  ||  docs ] is an open-source platform to evaluate and improve LLM applications. It provides grades for 20+ preconfigured checks (covering language, code, embedding use cases), performs root cause analyses on instances of failure cases and provides guidance for resolving them. Key Highlights: Data Security:  As an open

# Prepare documents

In [14]:
input_data = data
if TESTING:
    input_data = data[:2]
logger.info(f"{len(input_data)=}")
if LOG_TO_MLFLOW:
    mlflow.log_param("len_input_data", len(input_data))

[32m2024-07-25 10:48:08.456[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mlen(input_data)=160[0m


In [15]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

documents = []
for record in input_data:
    title = record['title']
    metadata = {
        'title': title,
        'author': record['author'],
        'date': record['date'],
        'tags': ', '.join(record['tags']),
        'url': record['url']
    }
    text = f"{title}\n{record['content']}"
    doc = Document(text=text, metadata=metadata)
    documents.append(doc)

In [16]:
documents[0]

Document(id_='4592c99d-6a97-4e0e-87a2-160423007c10', embedding=None, metadata={'title': 'Supercharge your LlamaIndex RAG Pipeline with UpTrain Evaluations', 'author': 'Uptrain', 'date': 'Mar 19, 2024', 'tags': 'AI, Evaluation, Rag', 'url': 'https://www.llamaindex.ai/blog/supercharge-your-llamaindex-rag-pipeline-with-uptrain-evaluations'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="Supercharge your LlamaIndex RAG Pipeline with UpTrain Evaluations\nThis is a guest post from Uptrain. We are excited to announce the recent integration of LlamaIndex with UpTrain - an open-source LLM evaluation framework to evaluate your RAG pipelines and experiment with different configurations. As an increasing number of companies are graduating their LLM prototypes to production-ready systems, robust evaluations provide a systematic framework to make decisions rather than going with the ‘vibes’. By combining LlamaIndex's flexibility and UpTrain's evaluation fram

In [17]:
documents[1].metadata

{'title': 'LlamaIndex Newsletter 2024-04-02',
 'author': 'LlamaIndex',
 'date': 'Apr 2, 2024',
 'tags': 'LLM',
 'url': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-04-02'}

In [18]:
if LOG_TO_MLFLOW:
    mlflow.log_param("len_documents", len(documents))

## Setting LLM

In [19]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings, ServiceContext

In [20]:
# LLM_OPTION = 'openai'
# LLM_OPTION = 'ollama'
LLM_OPTION = 'togetherai'

# LLM_MODEL_NAME = 'llama3'
# LLM_MODEL_NAME = 'gpt-3.5-turbo'
LLM_MODEL_NAME = 'meta-llama/Meta-Llama-3-8B-Instruct-Lite'

# EMBED_OPTION = 'openai'
# EMBED_OPTION = 'togetherai'
# EMBED_OPTION = 'ollama'
EMBED_OPTION = 'huggingface'

# EMBED_MODEL_NAME = 'llama3'
# EMBED_MODEL_NAME = 'togethercomputer/m2-bert-80M-2k-retrieval'
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"

if LOG_TO_MLFLOW:
    mlflow.log_param("LLM_OPTION", LLM_OPTION)
    mlflow.log_param("LLM_MODEL_NAME", LLM_MODEL_NAME)
    mlflow.log_param("EMBED_OPTION", EMBED_OPTION)
    mlflow.log_param("EMBED_MODEL_NAME", EMBED_MODEL_NAME)

In [21]:
# LLM options
if LLM_OPTION == 'ollama':
    LLM_SERVER_HOST = '192.168.100.14'
    LLM_SERVER_PORT = 11434
    base_url = f'http://{LLM_SERVER_HOST}:{LLM_SERVER_PORT}'
    llm = Ollama(base_url=base_url, model=LLM_MODEL_NAME, request_timeout=60.0)
    !ping -c 1 $LLM_SERVER_HOST
elif LLM_OPTION == 'openai':
    from llama_index.llms.openai import OpenAI
    llm = OpenAI(model=LLM_MODEL_NAME)
elif LLM_OPTION == 'togetherai':
    from llama_index.llms.together import TogetherLLM
    llm = TogetherLLM(model=LLM_MODEL_NAME)

# Embed options
if EMBED_OPTION == 'huggingface':
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
    embed_model = HuggingFaceEmbedding(
        model_name=EMBED_MODEL_NAME
    )
elif EMBED_OPTION == 'openai':
    from llama_index.embeddings.openai import OpenAIEmbedding
    embed_model = OpenAIEmbedding()
elif EMBED_OPTION == 'togetherai':
    from llama_index.embeddings.together import TogetherEmbedding
    embed_model = TogetherEmbedding(EMBED_MODEL_NAME)
elif EMBED_OPTION == 'ollama':
    from llama_index.embeddings.ollama import OllamaEmbedding
    embed_model = OllamaEmbedding(
        model_name=EMBED_MODEL_NAME,
        base_url=base_url,
        ollama_additional_kwargs={"mirostat": 0},
    )

logger.info(f"LLM:\n{repr(llm)}")
logger.info(f"Embed model:\n{repr(embed_model)}")

[32m2024-07-25 10:48:14.230[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1mLLM:
TogetherLLM(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x708d117bfe10>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x708d1aba49a0>, completion_to_prompt=<function default_completion_to_prompt at 0x708d1a10ad40>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, model='meta-llama/Meta-Llama-3-8B-Instruct-Lite', temperature=0.1, max_tokens=None, logprobs=None, top_logprobs=0, additional_kwargs={}, max_retries=3, timeout=60.0, default_headers=None, reuse_client=True, api_key='3cf613093b6eb9b479c341126dc8d3761c67f9340d0a4a8e1fdc62ed41b58126', api_base='https://api.together.xyz/v1', api_version='', context_window=3900, is_chat_model=True, is_function_calling_model=False, tokenizer=None)[0m
[32m2024-07-25 10:48:14.231[0m | [1mINFO    [0m | [36m__main__[0

In [22]:
embed_model_dim = len(embed_model.get_text_embedding('sample text to find embedding dimensions'))
Settings.embed_model = embed_model
Settings.llm = llm

logger.info(f"{embed_model_dim=}")

if LOG_TO_MLFLOW:
    mlflow.log_param("embedding_model_dim", embed_model_dim)
    mlflow.log_param("LLM_MODEL", repr(llm))
    mlflow.log_param("EMBEDDING_MODEL", repr(embed_model))

[32m2024-07-25 10:48:14.811[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1membed_model_dim=1024[0m


# Index embeddings

## Qdrant as VectorStore

In [23]:
import qdrant_client
from qdrant_client.models import Distance, VectorParams
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

In [24]:
import string

def substitute_punctuation(text):
    # Create a translation table that maps each punctuation character to an underscore
    translator = str.maketrans(string.punctuation, '_' * len(string.punctuation))
    # Translate the text using the translation table
    return text.translate(translator)

In [25]:
RECREATE_INDEX = False

# collection_raw_name = f"{EMBED_OPTION}__{EMBED_MODEL_NAME}__{RUN_NAME}"
# COLLECTION = substitute_punctuation(collection_raw_name)
COLLECTION = "huggingface__BAAI_bge_large_en_v1_5__exp_003_reranker_flag_embedding_bge_large"

logger.info(f"{COLLECTION=}")

# NODES_PERSIST_FP = f'{NOTEBOOK_CACHE_DP}/nodes.pkl'
NODES_PERSIST_FP = 'data/001/exp_003_reranker_flag_embedding_bge_large/nodes.pkl'

logger.info(f"{NODES_PERSIST_FP=}")

if LOG_TO_MLFLOW:
    mlflow.log_param(f"COLLECTION", COLLECTION)

[32m2024-07-25 10:48:15.738[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mCOLLECTION='huggingface__BAAI_bge_large_en_v1_5__exp_003_reranker_flag_embedding_bge_large'[0m
[32m2024-07-25 10:48:15.739[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mNODES_PERSIST_FP='data/001/exp_003_reranker_flag_embedding_bge_large/nodes.pkl'[0m


In [26]:
qdrantdb = qdrant_client.QdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    # location=":memory:"
    # otherwise set Qdrant instance address with:
    # url="http://<host>:<port>"
    # otherwise set Qdrant instance with host and port:
    host="localhost",
    port=6333
    # set API KEY for Qdrant Cloud
    # api_key="<qdrant-api-key>",
)
aqdrantdb = qdrant_client.AsyncQdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    # location=":memory:"
    # otherwise set Qdrant instance address with:
    # url="http://<host>:<port>"
    # otherwise set Qdrant instance with host and port:
    host="localhost",
    port=6333
    # set API KEY for Qdrant Cloud
    # api_key="<qdrant-api-key>",
)
collection_exists = qdrantdb.collection_exists(COLLECTION)
if RECREATE_INDEX or not collection_exists:
    if collection_exists:
        logger.info(f"Deleting existing Qdrant collection...")
        qdrantdb.delete_collection(COLLECTION)
    if os.path.exists(NODES_PERSIST_FP):
        logger.info(f"Deleting persisted nodes object at {NODES_PERSIST_FP}...")
        os.remove(NODES_PERSIST_FP)
    logger.info(f"Creating new Qdrant collection...")
    qdrantdb.create_collection(
        COLLECTION,
        vectors_config=VectorParams(size=embed_model_dim, distance=Distance.COSINE),
    )
else:
    logger.info(f"Use existing Qdrant collection")
db_collection = qdrantdb.get_collection(COLLECTION)
vector_store = QdrantVectorStore(
    client=qdrantdb,
    collection_name=COLLECTION,
    aclient=aqdrantdb,
    prefer_grpc=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

[32m2024-07-25 10:48:15.827[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m41[0m - [1mUse existing Qdrant collection[0m
WARNI [llama_index.vector_stores.qdrant.base] Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.


In [27]:
CHUNKER = "SentenceSplitter"
CHUNKER_CONFIG = {
    "chunk_size": 512,
    "chunk_overlap": 10
}
if LOG_TO_MLFLOW:
    mlflow.log_param("CHUNKER", CHUNKER)
    for k, v in CHUNKER_CONFIG.items():
        mlflow.log_param(f"CHUNKER__{k}", v)

In [28]:
t0 = time.perf_counter()
# TODO: TO understand the differences between points_count and indexed_vector_counts.
# Here indexed_vector_counts = 0
db_collection_count = db_collection.points_count

if db_collection_count > 0 and RECREATE_INDEX == False:
    logger.info(f"Loading index from existing DB...")
    with open(NODES_PERSIST_FP, 'rb') as f:
        logger.info(f"Loading cached `nodes` at {NODES_PERSIST_FP}...")
        nodes = pickle.load(f)
else:
    logger.info(f"Creating new DB index...")
    # Generate nodes
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/
    
    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline, IngestionCache
    
    # create the pipeline with transformations
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(**CHUNKER_CONFIG),
            TitleExtractor(),
            embed_model,
        ],
        vector_store = vector_store
    )

    num_workers = None
    # Currently setting num_workers leads to error `AttributeError: 'HuggingFaceEmbedding' object has no attribute '_model'`
    # num_workers = os.cpu_count() - 1
    # logger.info(f"Running Ingestion Pipeline with {num_workers=}...")
    nodes = await pipeline.arun(documents=documents, num_workers=num_workers)
    with open(NODES_PERSIST_FP, 'wb') as f:
        pickle.dump(nodes, f)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)
t1 = time.perf_counter()

[32m2024-07-25 10:48:15.962[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mLoading index from existing DB...[0m
[32m2024-07-25 10:48:15.963[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mLoading cached `nodes` at data/001/exp_003_reranker_flag_embedding_bge_large/nodes.pkl...[0m


In [29]:
logger.info(f"Indexing {len(documents)} into VectorStoreIndex took {t1 - t0:,.0f}s")
if LOG_TO_MLFLOW:
    mlflow.log_param("len_nodes", len(nodes))

[32m2024-07-25 10:48:16.872[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mIndexing 160 into VectorStoreIndex took 1s[0m


# Query engine

In [30]:
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.schema import MetadataMode
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

In [31]:
from src.features.append_reference.custom_query_engine import ManualAppendReferenceQueryEngine

In [32]:
RETRIEVAL_TOP_K = 5
RERANK_TOP_K = 2
# Need to be able to control this cutoff until specify it
RETRIEVAL_SIMILARITY_CUTOFF = None
# RETRIEVAL_SIMILARITY_CUTOFF = 0.3
# APPEND_REF_MODE = 'response_synthesizer'
APPEND_REF_MODE = 'query_engine'

if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_TOP_K", RETRIEVAL_TOP_K)
    mlflow.log_param("RERANK_TOP_K", RERANK_TOP_K)
    if RETRIEVAL_SIMILARITY_CUTOFF:
        mlflow.log_param("RETRIEVAL_SIMILARITY_CUTOFF", RETRIEVAL_SIMILARITY_CUTOFF)

In [33]:
# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=RETRIEVAL_TOP_K,
)

node_postprocessors = []

if RETRIEVAL_SIMILARITY_CUTOFF is not None:
    node_postprocessors.append(SimilarityPostprocessor(similarity_cutoff=RETRIEVAL_SIMILARITY_CUTOFF))

reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=RERANK_TOP_K)
node_postprocessors.append(reranker)

if APPEND_REF_MODE == 'response_synthesizer':
    response_synthesizer = ManualAppendReferenceSynthesizer(verbose=0)
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=node_postprocessors,
    )
elif APPEND_REF_MODE == 'query_engine':
    response_synthesizer = get_response_synthesizer()
    query_engine = ManualAppendReferenceQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=node_postprocessors,
    )
else:
    response_synthesizer = get_response_synthesizer()
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=node_postprocessors,
    )

if LOG_TO_MLFLOW:
    mlflow.log_param("reranker", repr(reranker))
    mlflow.log_param("response_synthesizer", repr(response_synthesizer))
    mlflow.log_param("query_engine", repr(query_engine))

In [34]:
from llama_index.core.response.notebook_utils import (
    display_source_node,
    display_response,
)

In [35]:
question = "How can we address points of failures in RAG pipeline?"
response = query_engine.query(question)
display_response(response, show_source=True, show_metadata=True, show_source_metadata=True)

**`Final Response:`** To address points of failures in RAG pipeline, you can leverage the live testing feature provided by RAGArch. This feature allows you to instantly test your RAG pipeline with your own data and see how different configurations affect the outcome. This enables you to identify and troubleshoot potential issues early on, ensuring that your pipeline is working as expected. Additionally, the one-click code generation feature can help you quickly generate the Python code for your custom RAG pipeline, allowing you to implement and test your pipeline more efficiently.


Sources:
- [RAGArch: Building a No-Code RAG Pipeline Configuration & One-Click RAG Code Generation Tool Powered by LlamaIndex](https://www.llamaindex.ai/blog/ragarch-building-a-no-code-rag-pipeline-configuration-one-click-rag-code-generation-tool-powered-b6e8eeb70089)

---

**`Source Node 1/2`**

**Node ID:** cb7da65e-0eae-42f1-81a6-1a50c6e0e9a7<br>**Similarity:** -5.265542507171631<br>**Text:** RAGArch: Building a No-Code RAG Pipeline Configuration & One-Click RAG Code Generation Tool Power...<br>**Metadata:** {'title': 'RAGArch: Building a No-Code RAG Pipeline Configuration & One-Click RAG Code Generation Tool Powered by LlamaIndex', 'author': 'Harshad Suryawanshi', 'date': 'Feb 2, 2024', 'tags': 'Rag, No Code, Llamaindex, OpenAI, Code Generation', 'url': 'https://www.llamaindex.ai/blog/ragarch-building-a-no-code-rag-pipeline-configuration-one-click-rag-code-generation-tool-powered-b6e8eeb70089', 'document_title': 'Based on the provided context, I would suggest the following comprehensive title:\n\n"RAGArch: A No-Code RAG Pipeline Configuration and One-Click Code Generation Tool for Streamlined AI Development, File Handling, and Large Language Model Selection for Natural Language Processing"\n\nThis title captures the main entities and themes present in the context, including:\n\n* RAGArch: the tool itself\n* No-code pipeline configuration\n* One-click code generation\n* AI development\n* File handling\n* Large Language Model selection\n* Natural Language Processing (NLP)\n\nThis title provides a clear and concise overview of the document\'s content, making it easy for readers to understand the main topics and themes discussed.'}<br>

---

**`Source Node 2/2`**

**Node ID:** b9ae64b7-7f37-4523-ae89-7b2c9c6da607<br>**Similarity:** -5.767147541046143<br>**Text:** def   generate_rag_pipeline ( file, llm, embed_model, node_parser, response_mode, vector_store ):...<br>**Metadata:** {'title': 'RAGArch: Building a No-Code RAG Pipeline Configuration & One-Click RAG Code Generation Tool Powered by LlamaIndex', 'author': 'Harshad Suryawanshi', 'date': 'Feb 2, 2024', 'tags': 'Rag, No Code, Llamaindex, OpenAI, Code Generation', 'url': 'https://www.llamaindex.ai/blog/ragarch-building-a-no-code-rag-pipeline-configuration-one-click-rag-code-generation-tool-powered-b6e8eeb70089', 'document_title': 'Based on the provided context, I would suggest the following comprehensive title:\n\n"RAGArch: A No-Code RAG Pipeline Configuration and One-Click Code Generation Tool for Streamlined AI Development, File Handling, and Large Language Model Selection for Natural Language Processing"\n\nThis title captures the main entities and themes present in the context, including:\n\n* RAGArch: the tool itself\n* No-code pipeline configuration\n* One-click code generation\n* AI development\n* File handling\n* Large Language Model selection\n* Natural Language Processing (NLP)\n\nThis title provides a clear and concise overview of the document\'s content, making it easy for readers to understand the main topics and themes discussed.'}<br>

{'cb7da65e-0eae-42f1-81a6-1a50c6e0e9a7': {'title': 'RAGArch: Building a No-Code RAG Pipeline Configuration & One-Click RAG Code Generation Tool Powered by LlamaIndex',
  'author': 'Harshad Suryawanshi',
  'date': 'Feb 2, 2024',
  'tags': 'Rag, No Code, Llamaindex, OpenAI, Code Generation',
  'url': 'https://www.llamaindex.ai/blog/ragarch-building-a-no-code-rag-pipeline-configuration-one-click-rag-code-generation-tool-powered-b6e8eeb70089',
  'document_title': 'Based on the provided context, I would suggest the following comprehensive title:\n\n"RAGArch: A No-Code RAG Pipeline Configuration and One-Click Code Generation Tool for Streamlined AI Development, File Handling, and Large Language Model Selection for Natural Language Processing"\n\nThis title captures the main entities and themes present in the context, including:\n\n* RAGArch: the tool itself\n* No-code pipeline configuration\n* One-click code generation\n* AI development\n* File handling\n* Large Language Model selection\n* N

# Evaluation

## Retrieval Evaluation

### Building synthetic evaluation dataset

In [36]:
with open(NODES_PERSIST_FP, 'rb') as f:
    nodes = pickle.load(f)

In [37]:
from llama_index.core.evaluation import generate_question_context_pairs, EmbeddingQAFinetuneDataset

In [38]:
RECREATE_RETRIEVAL_EVAL_DATASET = True
# Currently can not reuse retrieval_eval_dataset because the retrieval evaluation is based on ids
# RETRIEVAL_EVAL_DATASET_FP = f"data/001/exp_001_v3/llamaindex_blog_retrieval_eval_dataset.json"
RETRIEVAL_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_retrieval_eval_dataset.json"
RETRIEVAL_NUM_SAMPLE_NODES = 10
RETRIEVAL_NUM_SAMPLE_NODES = min(len(nodes), RETRIEVAL_NUM_SAMPLE_NODES)
RETRIEVAL_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
# RETRIEVAL_EVAL_LLM_MODEL = 'gpt-4'
RETRIEVAL_EVAL_LLM_MODEL_CONFIG = {
    "temperature": 0.3
}
RETRIEVAL_NUM_QUESTIONS_PER_CHUNK = 2
if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_NUM_QUESTIONS_PER_CHUNK", RETRIEVAL_NUM_QUESTIONS_PER_CHUNK)
    mlflow.log_param("RETRIEVAL_NUM_SAMPLE_NODES", RETRIEVAL_NUM_SAMPLE_NODES)

In [39]:
if RECREATE_RETRIEVAL_EVAL_DATASET or not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    if RETRIEVAL_NUM_SAMPLE_NODES:
        logger.info(f"Sampling {RETRIEVAL_NUM_SAMPLE_NODES} nodes for retrieval evaluation...")
        np.random.seed(41)
        retrieval_eval_nodes = np.random.choice(nodes, RETRIEVAL_NUM_SAMPLE_NODES)
    else:
        logger.info(f"Using all nodes for retrieval evaluation")
        retrieval_eval_nodes = nodes
else:
    logger.info(f"Loading retrieval_eval_nodes from {RETRIEVAL_EVAL_DATASET_FP}...")
    with open(RETRIEVAL_EVAL_DATASET_FP, 'r') as f:
        retrieval_eval_nodes = json.load(f)

[32m2024-07-25 10:48:31.674[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mSampling 10 nodes for retrieval evaluation...[0m


In [40]:
QUESTION_GEN_QUERY = """
You are a Retriever Evaluator. Your task is to generate {num_questions_per_chunk} questions to assess the accuracy/relevancy of an information retrieval system.
The information retrieval system would then be asked your generated question and assessed on how well it can look up and return the correct context.

IMPORTANT RULES:
- Restrict the generated questions to the context information provided.
- Do not mention anything about the context in the generated questions.
- The generated questions should be diverse in nature and in difficulty across the documents.
- When being asked the generated question, a human with no prior knowledge can still answer perfectly given the input context.
"""
QA_GENERATE_PROMPT_TMPL = f"""
Context information is below.

---------------------
{{context_str}}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

{QUESTION_GEN_QUERY}
"""

if RECREATE_RETRIEVAL_EVAL_DATASET or not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    # Use good model to generate the eval dataset
    from llama_index.llms.openai import OpenAI
    retrieval_eval_llm = OpenAI(model=RETRIEVAL_EVAL_LLM_MODEL, **RETRIEVAL_EVAL_LLM_MODEL_CONFIG)

    logger.info(f"Creating new synthetic retrieval eval dataset...")
    retrieval_eval_dataset = generate_question_context_pairs(
        retrieval_eval_nodes,
        llm=retrieval_eval_llm,
        num_questions_per_chunk=RETRIEVAL_NUM_QUESTIONS_PER_CHUNK,
        qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL
    )
    retrieval_eval_dataset.save_json(RETRIEVAL_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic retrieval eval dataset at {RETRIEVAL_EVAL_DATASET_FP}...")
    retrieval_eval_dataset = EmbeddingQAFinetuneDataset.from_json(RETRIEVAL_EVAL_DATASET_FP)

[32m2024-07-25 10:48:32.183[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mCreating new synthetic retrieval eval dataset...[0m
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.07it/s]


In [41]:
retrieval_eval_dataset.queries

{'900e3f99-47b8-4865-a707-ce8ca130abf1': 'How does the GPT-3 ReAct agent differ from the GPT-4 ReAct agent in terms of tools used?',
 'be445808-76dd-4832-b36b-9887832a24ba': 'What is the purpose of the Simple Router agent in the given setup?',
 'c4784b90-0bb6-4943-8f02-fdcb8d89104b': 'How does the system handle user preferences and activities to personalize article recommendations?',
 '618a20b5-bd9b-4361-9b99-39a3e61df777': 'What methods are used to perform Named Entity Recognition in the system?',
 'b1960fe9-2cd6-4b85-9974-a1ba95a6c039': 'How can you benefit from incremental syncs in a stream that supports it?',
 '5c576f6c-5fa9-42ce-8b38-47f00ccc3034': 'What are some of the custom sources available as pip packages for Airbyte?',
 '8d9577a6-89eb-4b25-9611-63ac9e2480fa': 'What are the scopes that need to be added in order to install the Slack app?',
 '23a72032-6d40-4657-8295-208cf396639c': 'What environment variables need to be set and what values should be included in the .env file for

### Evaluate

In [42]:
from llama_index.core.evaluation import RetrieverEvaluator

In [43]:
RETRIEVAL_METRICS = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    RETRIEVAL_METRICS, retriever=retriever
)

retrieval_eval_results = await retriever_evaluator.aevaluate_dataset(retrieval_eval_dataset)

In [44]:
def display_results(name, eval_results, metrics=['hit_rate', 'mrr'], include_cohere_rerank=False):
    """Display results from evaluate."""

    eval_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        eval_dict = {
            "query": eval_result.query,
            "expected_ids": eval_result.expected_ids,
            "retrieved_texts": eval_result.retrieved_texts,
            **metric_dict
        }
        eval_dicts.append(eval_dict)

    full_df = pd.DataFrame(eval_dicts)

    columns = {
        "retrievers": [name],
        **{k: [full_df[k].mean()] for k in metrics},
    }

    if include_cohere_rerank:
        crr_relevancy = full_df["cohere_rerank_relevancy"].mean()
        columns.update({"cohere_rerank_relevancy": [crr_relevancy]})

    metric_df = pd.DataFrame(columns)

    return metric_df, full_df

In [45]:
metric_prefix = f"top_{RETRIEVAL_TOP_K}_retrieval_eval"
retrieval_eval_results_df, retrieval_eval_results_full_df = display_results(metric_prefix, retrieval_eval_results, metrics=RETRIEVAL_METRICS)
retrieval_eval_results_df

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,top_5_retrieval_eval,0.473684,0.307018,0.094737,0.473684,0.307018,0.118228


In [46]:
retrieval_eval_results_full_df

Unnamed: 0,query,expected_ids,retrieved_texts,hit_rate,mrr,precision,recall,ap,ndcg
0,How does the GPT-3 ReAct agent differ from the...,[64af897b-400d-465d-9b44-3db5cd82401b],"[agent_chain.run(input=""How much cash did Uber...",0.0,0.0,0.0,0.0,0.0,0.0
1,What is the purpose of the Simple Router agent...,[64af897b-400d-465d-9b44-3db5cd82401b],"[agent_chain.run(input=""How much cash did Uber...",0.0,0.0,0.0,0.0,0.0,0.0
2,How does the system handle user preferences an...,[fe557b79-4c6f-4b13-9f69-38c82723ca06],"[Sophisticated Named-Entity Recognition, Text...",0.0,0.0,0.0,0.0,0.0,0.0
3,What methods are used to perform Named Entity ...,[fe557b79-4c6f-4b13-9f69-38c82723ca06],"[After the preprocessing of the data, we colle...",1.0,1.0,0.2,1.0,1.0,0.33916
4,How can you benefit from incremental syncs in ...,[7f444bd5-230c-4fb1-b39d-8358322a8064],[For example you can still benefit from incre...,1.0,1.0,0.2,1.0,1.0,0.33916
5,What are some of the custom sources available ...,[7f444bd5-230c-4fb1-b39d-8358322a8064],[if you have implemented your own custom Airby...,1.0,0.25,0.2,1.0,0.25,0.146068
6,What are the scopes that need to be added in o...,[29dca1d0-bafd-47d8-9fad-def250d4cf27],[Click the “Permissions” link in the bottom ri...,1.0,1.0,0.2,1.0,1.0,0.33916
7,What environment variables need to be set and ...,[29dca1d0-bafd-47d8-9fad-def250d4cf27],[Otherwise we'll do nothing. @flask_app.route(...,1.0,0.5,0.2,1.0,0.5,0.213986
8,"Based on the context provided, the task is to ...",[c25ee90d-807b-4bf1-9336-633b2060c034],[from llama_index.evaluation import QueryRe...,0.0,0.0,0.0,0.0,0.0,0.0
9,How does the performance of Mistral compare to...,[3b002c76-2225-4aa1-84e0-b8227bc952a4],"[However, Mistral is not displayed on the Know...",0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
if LOG_TO_MLFLOW:
    for metric, metric_value in retrieval_eval_results_df.to_dict(orient='records')[0].items():
        if metric in RETRIEVAL_METRICS:
            mlflow.log_metric(f"{metric_prefix}_{metric}", metric_value)
    retrieval_eval_results_full_df.to_html(f"{NOTEBOOK_CACHE_DP}/retrieval_eval_results_full_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/retrieval_eval_results_full_df.html", "retrieval_eval_results_full_df")

#### Error Analysis

In [48]:
retrieval_eval_irrelevance_df = (
    retrieval_eval_results_full_df
    .loc[lambda df: df['hit_rate'].lt(1)]
    .sort_values(['hit_rate', 'mrr', 'precision', 'recall', 'ap', 'ndcg'])
)
retrieval_eval_irrelevance_df

Unnamed: 0,query,expected_ids,retrieved_texts,hit_rate,mrr,precision,recall,ap,ndcg
0,How does the GPT-3 ReAct agent differ from the...,[64af897b-400d-465d-9b44-3db5cd82401b],"[agent_chain.run(input=""How much cash did Uber...",0.0,0.0,0.0,0.0,0.0,0.0
1,What is the purpose of the Simple Router agent...,[64af897b-400d-465d-9b44-3db5cd82401b],"[agent_chain.run(input=""How much cash did Uber...",0.0,0.0,0.0,0.0,0.0,0.0
2,How does the system handle user preferences an...,[fe557b79-4c6f-4b13-9f69-38c82723ca06],"[Sophisticated Named-Entity Recognition, Text...",0.0,0.0,0.0,0.0,0.0,0.0
8,"Based on the context provided, the task is to ...",[c25ee90d-807b-4bf1-9336-633b2060c034],[from llama_index.evaluation import QueryRe...,0.0,0.0,0.0,0.0,0.0,0.0
9,How does the performance of Mistral compare to...,[3b002c76-2225-4aa1-84e0-b8227bc952a4],"[However, Mistral is not displayed on the Know...",0.0,0.0,0.0,0.0,0.0,0.0
10,At what model size do Llama 2 and Mistral reac...,[3b002c76-2225-4aa1-84e0-b8227bc952a4],[The x-axes on the graphs represent model size...,0.0,0.0,0.0,0.0,0.0,0.0
12,What types of data sources can SuperAGI proces...,[7b7ae6d2-0d60-47a8-8f69-83331cfcc7e0],[Understand your dataset \n \n The first step...,0.0,0.0,0.0,0.0,0.0,0.0
14,What are the four stages involved in the solut...,[f5ccf620-f5bb-42e2-9f08-fcb336ce5678],[This is where D-ID\n comes into play.\n \n \...,0.0,0.0,0.0,0.0,0.0,0.0
15,How does the system handle custom transformati...,[744e2fef-01a5-4167-81fc-143c89772273],[FAQ What’s the difference between a QueryPip...,0.0,0.0,0.0,0.0,0.0,0.0
16,Can the system ingest documents directly into ...,[744e2fef-01a5-4167-81fc-143c89772273],[A Query Engine to Combine Structured Analytic...,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
for i, row in retrieval_eval_irrelevance_df.reset_index(drop=True).iterrows():
    print(f"\n\n============Error #{i+1}=============\n\n")
    print(f"Query:\n{row.query}\n")
    expected_contexts = [json.loads(record.payload['_node_content'])['text'] for record in qdrantdb.retrieve(COLLECTION, ids=row.expected_ids)]
    expected_contexts = '\n\n'.join(expected_contexts)
    print(f"Expected Contexts:\n{expected_contexts}\n")
    contexts = '\n\n'.join(row.retrieved_texts)
    print(f"Retrieved Contexts:\n{contexts}\n")





Query:
How does the GPT-3 ReAct agent differ from the GPT-4 ReAct agent in terms of tools used?

Expected Contexts:
graph = ComposableGraph.from_indices(
    GPTListIndex,
    children_indices=[march_index, june_index, sept_index],
    index_summaries=[
        "Provides information about Uber quarterly financials ending March 2022",
        "Provides information about Uber quarterly financials ending June 2022",
        "Provides information about Uber quarterly financials ending September 2022"
    ]
) The graph can be queried with a  ComposableGraphQueryEngine  : # define decompose_transform 
decompose_transform = DecomposeQueryTransform(verbose= True )

 # define custom query engines 
custom_query_engines = {}
 for  index  in  [march_index, june_index, sept_index]:
    query_engine = index.as_query_engine(service_context=service_context)
    query_engine = TransformQueryEngine(
        query_engine,
        query_transform=decompose_transform,
        transform_extra_info={ 'in

### Manually curated dataset
Ref: https://docs.llamaindex.ai/en/stable/module_guides/evaluating/usage_pattern_retrieval/

In [66]:
MANUAL_EVAL_QA = [
("What are key features of llama-agents?",
"""
Key features of llama-agents are:
1. Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. Ease of deployment: launch, scale and monitor each agent and your control plane independently.
5. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service
"""
),
("What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?",
"""
Retrieval System and Response Generation.
"""
),
("What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?",
"""
Hit rate and Mean Reciprocal Rank (MRR)

Hit Rate: Hit rate calculates the fraction of queries where the correct answer is found within the top-k retrieved documents. In simpler terms, it’s about how often our system gets it right within the top few guesses.

Mean Reciprocal Rank (MRR): For each query, MRR evaluates the system’s accuracy by looking at the rank of the highest-placed relevant document. Specifically, it’s the average of the reciprocals of these ranks across all the queries. So, if the first relevant document is the top result, the reciprocal rank is 1; if it’s second, the reciprocal rank is 1/2, and so on.
"""
),
# Below question is hard because LLM needs to follow the URL in the blog to get the information to answer
("How does the MemoryCache project by Mozilla utilize PrivateGPT_AI and LlamaIndex to enhance personal knowledge management while maintaining privacy? Provide a brief overview of the project and its key features.",
"""
The MemoryCache project by Mozilla aims to transform local desktop environments into on-device AI agents, utilizing PrivateGPT_AI and LlamaIndex to enhance personal knowledge management. It saves browser history and other local files to the user’s machine, allowing a local AI model to ingest and augment responses. This approach maintains privacy by avoiding cloud-based processing, focusing instead on generating insights from personal data. The project emphasizes creating a personalized AI experience that mirrors the original vision of personal computers as companions for thought.
"""
)
]

## Response Evaluation
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/downloading_llama_datasets/

In [51]:
def evaluate_labelled_rag_dataset(response_eval_dataset, response_eval_prediction_dataset, dataset_name="synthetic", batch_size=8, judge_model='gpt-3.5-turbo', cache_dp='.'):
    # Instantiate the judges
    judges = {
        "correctness": CorrectnessEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "relevancy": RelevancyEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "faithfulness": FaithfulnessEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "semantic_similarity": SemanticSimilarityEvaluator(),
    }

    # Initialize evaluations dictionary
    evals = {
        "correctness": [],
        "relevancy": [],
        "faithfulness": [],
        "contexts": [],
    }

    # Evaluate each prediction
    for example, prediction in tqdm(
        zip(response_eval_dataset.examples, response_eval_prediction_dataset.predictions)
    ):
        correctness_result = judges["correctness"].evaluate(
            query=example.query,
            response=prediction.response,
            reference=example.reference_answer,
        )

        relevancy_result = judges["relevancy"].evaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        faithfulness_result = judges["faithfulness"].evaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        evals["correctness"].append(correctness_result)
        evals["relevancy"].append(relevancy_result)
        evals["faithfulness"].append(faithfulness_result)
        evals["contexts"].append(prediction.contexts)

    # Save evaluations to JSON
    evaluations_objects = {
        "correctness": [e.dict() for e in evals["correctness"]],
        "faithfulness": [e.dict() for e in evals["faithfulness"]],
        "relevancy": [e.dict() for e in evals["relevancy"]],
        "contexts": evals['contexts'],
    }

    with open(f"{cache_dp}/{dataset_name}_evaluations.json", "w") as json_file:
        json.dump(evaluations_objects, json_file)

    # Generate evaluation results DataFrames
    deep_eval_correctness_df, mean_correctness_df = get_eval_results_df(
        ["base_rag"] * len(evals["correctness"]),
        evals["correctness"],
        metric="correctness",
    )
    deep_eval_relevancy_df, mean_relevancy_df = get_eval_results_df(
        ["base_rag"] * len(evals["relevancy"]),
        evals["relevancy"],
        metric="relevancy",
    )
    deep_eval_faithfulness_df, mean_faithfulness_df = get_eval_results_df(
        ["base_rag"] * len(evals["faithfulness"]),
        evals["faithfulness"],
        metric="faithfulness",
    )

    mean_scores_df = pd.concat(
        [
            mean_correctness_df.reset_index(),
            mean_relevancy_df.reset_index(),
            mean_faithfulness_df.reset_index(),
        ],
        axis=0,
        ignore_index=True,
    )
    mean_scores_df = mean_scores_df.set_index("index")
    mean_scores_df.index = mean_scores_df.index.set_names(["metrics"])

    deep_eval_df = pd.concat([
        deep_eval_correctness_df[['query', 'answer']],
        deep_eval_relevancy_df[['scores']].rename(columns={'scores': 'relevancy_score'}),
        deep_eval_correctness_df[['scores']].rename(columns={'scores': 'correctness_score'}),
        deep_eval_faithfulness_df[['scores']].rename(columns={'scores': 'faithfulness_score'}),
        pd.Series(evals['contexts'], name='contexts')
    ], axis=1)

    return mean_scores_df, deep_eval_df

### Generate synthetic Llama Dataset

In [52]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.llama_dataset import LabeledRagDataset
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    SemanticSimilarityEvaluator,
)
from llama_index.core.evaluation.notebook_utils import get_eval_results_df

In [53]:
RECREATE_SYNTHETIC_EVAL_DATASET = True
RESPONSE_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_response_eval_dataset.json"
# RESPONSE_EVAL_DATASET_FP = f"data/001/exp_001_v3/llamaindex_blog_response_eval_dataset.json"
RESPONSE_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
# RESPONSE_EVAL_LLM_MODEL = 'gpt-4'
RESPONSE_EVAL_LLM_MODEL_CONFIG = {
    "temperature": 0.3
}
SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK = 1
RESPONSE_NUM_SAMPLE_DOCUMENTS = 10
RESPONSE_NUM_SAMPLE_DOCUMENTS = min(len(documents), RESPONSE_NUM_SAMPLE_DOCUMENTS)
BATCH_SIZE = 16

if LOG_TO_MLFLOW:
    mlflow.log_param("SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK", SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK)
    mlflow.log_param("RESPONSE_EVAL_LLM_MODEL", RESPONSE_EVAL_LLM_MODEL)
    mlflow.log_param("RESPONSE_NUM_SAMPLE_DOCUMENTS", RESPONSE_NUM_SAMPLE_DOCUMENTS)
    for k, v in RESPONSE_EVAL_LLM_MODEL_CONFIG.items():
        mlflow.log_param(f"RESPONSE_EVAL_LLM_MODEL_CONFIG__{k}", v)

In [54]:
if RESPONSE_NUM_SAMPLE_DOCUMENTS:
    logger.info(f"Sampling {RESPONSE_NUM_SAMPLE_DOCUMENTS} documents for response evaluation...")
    np.random.seed(41)
    response_eval_documents = np.random.choice(documents, RESPONSE_NUM_SAMPLE_DOCUMENTS)
else:
    logger.info(f"Using all documents for retrieval evaluation")
    response_eval_documents = documents

[32m2024-07-25 10:49:06.756[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mSampling 10 documents for response evaluation...[0m


In [55]:
if RECREATE_SYNTHETIC_EVAL_DATASET or not os.path.exists(RESPONSE_EVAL_DATASET_FP):
    logger.info(f"Creating synthetic response eval dataset...")
    # Use good model to generate the eval dataset
    response_eval_llm = OpenAI(model=RESPONSE_EVAL_LLM_MODEL, **RESPONSE_EVAL_LLM_MODEL_CONFIG)

    # instantiate a DatasetGenerator
    response_dataset_generator = RagDatasetGenerator.from_documents(
        response_eval_documents,
        llm=response_eval_llm,
        num_questions_per_chunk=SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK,  # set the number of questions per nodes
        question_gen_query=QUESTION_GEN_QUERY,  # Reuse the same format from the above Retrieval Question Gen Query
        show_progress=True,
        workers=(os.cpu_count() - 1)
    )

    synthetic_response_eval_dataset = response_dataset_generator.generate_dataset_from_nodes()

    synthetic_response_eval_dataset.save_json(RESPONSE_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic response eval dataset at {RESPONSE_EVAL_DATASET_FP}...")
    synthetic_response_eval_dataset = LabeledRagDataset.from_json(RESPONSE_EVAL_DATASET_FP)

[32m2024-07-25 10:49:08.012[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mCreating synthetic response eval dataset...[0m


Parsing nodes:   0%|          | 0/10 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:05<00:00,  4.89it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.38s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.32s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,

In [56]:
synthetic_response_eval_prediction_dataset = await synthetic_response_eval_dataset.amake_predictions_with(
    predictor=query_engine, batch_size=BATCH_SIZE, show_progress=True
)

Batch processing of predictions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:24<00:00,  1.17it/s]


In [57]:
synthetic_mean_scores_df, synthetic_deep_eval_df = evaluate_labelled_rag_dataset(
    synthetic_response_eval_dataset,
    synthetic_response_eval_prediction_dataset,
    dataset_name="synthetic",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

0it [00:00, ?it/s]

In [58]:
synthetic_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,3.758621
mean_relevancy_score,0.793103
mean_faithfulness_score,0.793103


In [59]:
synthetic_deep_eval_df

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score,contexts
0,How is the property graph index different from...,\nThe property graph index is different from t...,1.0,4.5,1.0,[Customizing property graph index in LlamaInde...
1,What is the purpose of using the SchemaLLMPath...,\nThe purpose of using the SchemaLLMPathExtrac...,1.0,4.5,1.0,"[In this example, we will use the SchemaLLMPa..."
2,How can you accelerate the process of extracti...,\nYou can accelerate the process of extracting...,1.0,4.5,1.0,"[Now that we have defined the graph schema, we..."
3,How can you adjust the similarity_threshold an...,\nYou can adjust the similarity_threshold and ...,1.0,4.5,1.0,"[size(allCombinedResults)-1, 1) as combinedRes..."
4,What methods are used in the custom retriever ...,\nThe custom retriever uses both Vector search...,1.0,3.5,1.0,[results_df = pd.DataFrame()\n\nembed_name = ...
5,How does the OpenAI Cookbook aim to enhance th...,\nThe OpenAI Cookbook aims to enhance the effe...,1.0,4.5,1.0,[OpenAI Cookbook: Evaluating RAG systems\nWe’r...
6,How does the project integrate KOSMOS-2 and Pa...,\nThe project integrates KOSMOS-2 and PaLM wit...,1.0,4.5,1.0,[Google PaLM API adds the layer of linguisti...
7,How is the sidebar enhanced to improve credibi...,\nThe sidebar is not mentioned in the provided...,0.0,1.0,0.0,[It contains a detailed analysis of all risk f...
8,How does the application handle user interacti...,\nThe application handles user interaction and...,1.0,2.5,1.0,[This triggers the outer loop to run the conti...
9,How has the field of AI and large language mod...,\nThe field of AI and large language models ha...,1.0,4.0,1.0,"[Docs , Tweet . RA-DIT: We drew inspiration ..."


In [60]:
if LOG_TO_MLFLOW:
    for k, v in synthetic_mean_scores_df.T.to_dict(orient='records')[0].items():
        mlflow.log_metric(f"synthetic_response_eval__{k}", v)
    synthetic_deep_eval_df.to_html(f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html", "synthetic_deep_eval_df")

#### Error Analysis

In [61]:
synthetic_deep_eval_df.sort_values(['relevancy_score', 'correctness_score', 'faithfulness_score'])

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score,contexts
7,How is the sidebar enhanced to improve credibi...,\nThe sidebar is not mentioned in the provided...,0.0,1.0,0.0,[It contains a detailed analysis of all risk f...
16,What is the length of the Lyft SEC 10-K document?,\nThe length of the Lyft SEC 10-K document is ...,0.0,1.0,0.0,[If Uber is unable to attract or maintain a cr...
12,How does the integration between Create-llama ...,\nThe integration between Create-llama and E2B...,0.0,2.0,0.0,"[Code , Tweet . We have introduced return_di..."
11,What is the purpose of using a vision model to...,\nThe purpose of using a vision model to filte...,0.0,2.0,1.0,[The SimpleMultiModalQueryEngine first retri...
28,How can one explore the author's Huggingface p...,"\nTo explore the author's HuggingFace profile,...",0.0,3.0,0.0,[)\n\nquery_engine = index. as_query_engine (\...
19,What is the purpose of setting up Tonic Validate?,\nTo create tests for Tonic Validate and to im...,0.0,3.0,1.0,[Setting up Tonic Validate To set up Tonic Val...
8,How does the application handle user interacti...,\nThe application handles user interaction and...,1.0,2.5,1.0,[This triggers the outer loop to run the conti...
24,What new offering was launched by LlamaIndex t...,\nThe new offering launched by LlamaIndex that...,1.0,3.0,1.0,"[Each dataset, designed as a QA set, integrate..."
4,What methods are used in the custom retriever ...,\nThe custom retriever uses both Vector search...,1.0,3.5,1.0,[results_df = pd.DataFrame()\n\nembed_name = ...
27,How does a tree index organize data?,\nA tree index organizes data by building a tr...,1.0,4.0,0.0,"[if child_branch_factor=2, a query will choose..."


In [62]:
synthetic_response_eval_irrelevance_df = (
    synthetic_deep_eval_df
    .loc[lambda df: df['relevancy_score'].lt(1)]
    .sort_values(['relevancy_score', 'correctness_score', 'faithfulness_score'])
)

for i, row in synthetic_response_eval_irrelevance_df.iterrows():
    print(f"Query:\n{row.query}\n")
    contexts = '\n\n'.join(row.contexts)
    print(f"Context:\n{contexts}\n")
    print(f"Answer:\n{row.answer}\n----\n")

Query:
How is the sidebar enhanced to improve credibility and engagement?

Context:
It contains a detailed analysis of all risk factors in bullet points, and offers a comparison across time for all bullet points. Query 3 response = query_engine.query("Analyze Uber revenue growth and risk factors over the last few quarters") Response (intermediate steps): > Current query: Analyze Uber revenue growth and risk factors over quarters
> New query:  What is Uber's revenue growth and risk factors for the quarter ending March 2022?
> Current query: Analyze Uber revenue growth and risk factors over quarters
> New query:  What is Uber's revenue growth and risk factors for the quarter ending March 2022?
> Current query: Analyze Uber revenue growth and risk factors over quarters
> New query:  What is Uber's revenue growth and risk factors for the quarter ending June 2022?
> Current query: Analyze Uber revenue growth and risk factors over quarters
> New query:  What is Uber's revenue growth and risk

### Manually curated
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/ragdataset_submission_template/#1c-creating-a-labelledragdataset-from-scratch-with-manually-constructed-examples

In [67]:
from llama_index.core.llama_dataset import LabelledRagDataset, LabelledRagDataExample, CreatedBy, CreatedByType

examples = []

for question, expected_anwser in MANUAL_EVAL_QA:
    example = LabelledRagDataExample(
        query=question,
        query_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_answer=expected_anwser,
        reference_answer_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_contexts=[],
    )
    examples.append(example)

curated_response_eval_dataset = LabelledRagDataset(examples=examples)

# save this dataset as it is required for the submission
curated_response_eval_dataset.save_json(f"{NOTEBOOK_CACHE_DP}/curated_response_eval_dataset.json")

In [68]:
curated_response_eval_prediction_dataset = await curated_response_eval_dataset.amake_predictions_with(
    predictor=query_engine, batch_size=BATCH_SIZE, show_progress=True
)

Batch processing of predictions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.36s/it]


In [69]:
curated_mean_scores_df, curated_deep_eval_df = evaluate_labelled_rag_dataset(
    curated_response_eval_dataset,
    curated_response_eval_prediction_dataset,
    dataset_name="curated",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

0it [00:00, ?it/s]

In [70]:
curated_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,4.0
mean_relevancy_score,0.75
mean_faithfulness_score,0.75


In [71]:
with pd.option_context('display.max_colwidth', None):
    display(curated_deep_eval_df)

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score,contexts
0,What are key features of llama-agents?,"\nDistributed Service-Oriented Architecture: Every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.\n\nCommunication via Standardized API Interfaces: Interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.\n\nEase of Deployment: Launch, scale, and monitor each agent and your control plane independently.\n\n\nSources:\n- [Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems](https://www.llamaindex.ai/blog/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems)\n\n",1.0,4.0,1.0,"[Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems\nWe're excited to announce the alpha release of llama-agents , a new open-source framework designed to simplify the process of building, iterating, and deploying multi-agent AI systems and turn your agents into production microservices. Whether you're working on complex question-answering systems, collaborative AI assistants, or distributed AI workflows, llama-agents provides the tools and structure you need to bring your ideas to life. Key Features of llama-agents Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task. Ease of deployment: launch, scale and monitor each agent and your control plane independently. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service Let's dive into how you can start using llama-agents to build your own multi-agent systems. Getting Started with llama-agents First, install the framework using pip: pip install llama-agents llama-index-agent-openai Basic System Setup Here's a simple example of how to set up a basic multi-agent system using llama-agents., tool = FunctionTool.from_defaults(fn=get_the_secret_fact)\n\n # create our agents \nworker1 = FunctionCallingAgentWorker.from_tools([tool], llm=OpenAI())\nworker2 = FunctionCallingAgentWorker.from_tools([], llm=OpenAI())\nagent1 = worker1.as_agent()\nagent2 = worker2.as_agent() We turn those agents into services: agent_server_1 = AgentService(\n agent=agent1,\n message_queue=message_queue,\n description= ""Useful for getting the secret fact."" ,\n service_name= ""secret_fact_agent"" ,\n host= ""localhost"" ,\n port= 8003 \n)\nagent_server_2 = AgentService(\n agent=agent2,\n message_queue=message_queue,\n description= ""Useful for getting random dumb facts."" ,\n service_name= ""dumb_fact_agent"" ,\n host= ""localhost"" ,\n port= 8004 \n) And finally we launch each service as an independent agent. Here we’re doing them all from a single script, but each of these could be a totally separate service, launched and scaled independently: from llama_agents import ServerLauncher, CallableMessageConsumer\n\n # Additional human consumer \n def handle_result ( message ) -> None :\n print ( f""Got result:"" , message.data)\n\n\n # the final result is published to a ""human"" consumer \n # so we define one to handle it! \nhuman_consumer = CallableMessageConsumer(\n handler=handle_result, message_type= ""human"" \n)\n\n # Define Launcher \nlauncher = ServerLauncher(\n [agent_server_1, agent_server_2],\n control_plane,\n message_queue,\n additional_consumers=[human_consumer]\n)\n\nlauncher.launch_servers() Real-time monitoring One of the coolest debugging features of our multi-agent system is our agent monitor, which is built right in. You launch it like this: llama-agents monitor --control-plane-url http://127.0.0.1:8000 Once launched, you get an intuitive, point-and-click terminal application.]"
1,What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?,\nThe two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook are the Retrieval System and Response Generation.\n\n\nSources:\n[OpenAI Cookbook: Evaluating RAG systems](https://www.llamaindex.ai/blog/openai-cookbook-evaluating-rag-systems-fe393c61fb93)\n- [Evaluating the Ideal Chunk Size for a RAG System using LlamaIndex](https://www.llamaindex.ai/blog/evaluating-the-ideal-chunk-size-for-a-rag-system-using-llamaindex-6207e5d3fec5)\n\n,1.0,5.0,1.0,"[OpenAI Cookbook: Evaluating RAG systems\nWe’re excited to unveil our OpenAI Cookbook , a guide to evaluating Retrieval-Augmented Generation (RAG) systems using LlamaIndex. We hope you’ll find it useful in enhancing the effectiveness of your RAG systems, and we’re thrilled to share it with you. The OpenAI Cookbook has three sections: Understanding Retrieval-Augmented Generation (RAG): provides a detailed overview of RAG systems, including the various stages involved in building the RAG system. Building RAG with LlamaIndex: Here, we dive into the practical aspects, demonstrating how to construct a RAG system using LlamaIndex, specifically applied to Paul Graham’s essay, utilizing the VectorStoreIndex . Evaluating RAG with LlamaIndex: The final section focuses on assessing the RAG system’s performance in two critical areas: the Retrieval System and Response Generation. We use our unique synthetic dataset generation method, generate_question_context_pairs to conduct thorough evaluations in these areas. Our goal with this cookbook is to provide the community with an essential resource for effectively evaluating and enhancing RAG systems developed using LlamaIndex. Join us in exploring the depths of RAG system evaluation and discover how to leverage the full potential of your RAG implementations with LlamaIndex. Keep building with LlamaIndex!🦙, Faithfulness Evaluator — It is useful for measuring if the response was hallucinated and measures if the response from a query engine matches any source nodes. Relevancy Evaluator — It is useful for measuring if the query was actually answered by the response and measures if the response + source nodes match the query. # We will use GPT-4 for evaluating the responses\ngpt4 = OpenAI(temperature=0, model=""gpt-4"")\n\n# Define service context for GPT-4 for evaluation\nservice_context_gpt4 = ServiceContext.from_defaults(llm=gpt4)\n\n# Define Faithfulness and Relevancy Evaluators which are based on GPT-4\nfaithfulness_gpt4 = FaithfulnessEvaluator(service_context=service_context_gpt4)\nrelevancy_gpt4 = RelevancyEvaluator(service_context=service_context_gpt4) Response Evaluation For A Chunk Size We evaluate each chunk_size based on 3 metrics. Average Response Time. Average Faithfulness. Average Relevancy. Here’s a function, evaluate_response_time_and_accuracy , that does just that which has: VectorIndex Creation. Building the Query Engine. Metrics Calculation. # Define function to calculate average response time, average faithfulness and average relevancy metrics for given chunk size \n # We use GPT-3.5-Turbo to generate response and GPT-4 to evaluate it. \n def evaluate_response_time_and_accuracy ( chunk_size, eval_questions ):\n """"""\n Evaluate the average response time, faithfulness, and relevancy of responses generated by GPT-3.5-turbo for a given chunk size.\n \n Parameters:\n chunk_size (int): The size of data chunks being processed.\n \n Returns:\n tuple: A tuple containing the average response time, faithfulness, and relevancy metrics.\n """"""]"
2,What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?,\nThe two main metrics used to evaluate the performance of the different rerankers in the RAG system are Hit Rate and Mean Reciprocal Rank (MRR).\n\n\nSources:\n- [Boosting RAG: Picking the Best Embedding & Reranker models](https://www.llamaindex.ai/blog/boosting-rag-picking-the-best-embedding-reranker-models-42d079022e83)\n\n,1.0,5.0,1.0,"[bge-large : Experiences significant improvement with rerankers, with the best results from CohereRerank (0.876404 hit rate, 0.822753 MRR). llm-embedder : Benefits greatly from reranking, particularly with CohereRerank (0.882022 hit rate, 0.830243 MRR), which offers a substantial performance boost. Cohere : Cohere’s latest v3.0 embeddings outperform v2.0 and, with the integration of native CohereRerank, significantly improve its metrics, boasting a 0.88764 hit rate and a 0.836049 MRR. Voyage : Has strong initial performance that is further amplified by CohereRerank (0.91573 hit rate, 0.851217 MRR), suggesting high responsiveness to reranking. JinaAI : Very strong performance, sees notable gains with bge-reranker-large (0.938202 hit rate, 0.868539 MRR) and CohereRerank (0.932584 hit rate, 0.873689), indicating that reranking significantly boosts its performance. Google-PaLM : The model demonstrates strong performance, with measurable gains when using the CohereRerank (0.910112 hit rate, 0.855712 MRR). This indicates that reranking provides a clear boost to its overall results. Impact of Rerankers : WithoutReranker : This provides the baseline performance for each embedding. bge-reranker-base : Generally improves both hit rate and MRR across embeddings. bge-reranker-large : This reranker frequently offers the highest or near-highest MRR for embeddings. For several embeddings, its performance rivals or surpasses that of the CohereRerank . CohereRerank : Consistently enhances performance across all embeddings, often providing the best or near-best results., Necessity of Rerankers : The data clearly indicates the significance of rerankers in refining search results. Nearly all embeddings benefit from reranking, showing improved hit rates and MRRs. Rerankers, especially CohereRerank , have demonstrated their capability to transform any embedding into a competitive one. Overall Superiority : When considering both hit rate and MRR, the combinations of OpenAI + CohereRerank and JinaAI-Base + bge-reranker-large/ CohereRerank emerge as top contenders. However, the consistent improvement brought by the CohereRerank/ bge-reranker-large rerankers across various embeddings make them the standout choice for enhancing search quality, regardless of the embedding in use. In summary, to achieve the peak performance in both hit rate and MRR, the combination of OpenAI or JinaAI-Base embeddings with the CohereRerank/bge-reranker-large reranker stands out. Please be aware that our benchmarks are intended to offer a reproducible script for your own data. Nevertheless, treat these figures as estimates and proceed with caution when interpreting them. Conclusions: In this blog post, we have demonstrated how to evaluate and enhance retriever performance using various embeddings and rerankers. Below are our final conclusions. Embeddings : The OpenAI and JinaAI-Base embeddings, especially when paired with the CohereRerank/bge-reranker-large reranker, set the gold standard for both hit rate and MRR. Rerankers : The influence of rerankers, particularly CohereRerank/bge-reranker-large , cannot be overstated. They play a key role in improving the MRR for many embeddings, showing their importance in making search results better.]"
3,How does the MemoryCache project by Mozilla utilize PrivateGPT_AI and LlamaIndex to enhance personal knowledge management while maintaining privacy? Provide a brief overview of the project and its key features.,"\nThe MemoryCache project by Mozilla is not mentioned in the provided context. The context only discusses the integration of LlamaIndex and MongoDB to build a ChatGPT with private data, focusing on large language models, knowledge retrieval, and generation. There is no information about the MemoryCache project, PrivateGPT_AI, or their relationship with LlamaIndex.\n\n\nSources:\n- [Build a ChatGPT with your Private Data using LlamaIndex and MongoDB](https://www.llamaindex.ai/blog/build-a-chatgpt-with-your-private-data-using-llamaindex-and-mongodb-b09850eb154c)\n\n",0.0,2.0,0.0,"[Example question: Who is the most recent UK prime minister? There are 2 main paradigms currently for extending the amazing reasoning and knowledge generation capabilities of LLMs: Model finetuning and in-context learning. Model Finetuning can be more complex and expensive to operationalize. There are also some open questions like how to delete information from a fine-tuned model to ensure you comply with local laws (ex. GDPR in Europe), and for changing data you need to fine-tune again constantly. In-context learning requires inserting the new data as part of the input prompts to the LLM. To perform this data augmentation in a secure, high performance and cost-effective manner is where tools like LlamaIndex and MongoDB Developer Data Platform can help. Introduction to LlamaIndex LlamaIndex provides a simple, flexible interface to connect LLMs with external data. Offers data connectors to various data sources and data formats (APIs, PDFs, docs, etc). Provides indices over the unstructured and structured data for use with LLMs. Structures external information so that it can be used with the prompt window limitations of any LLM. Exposes a query interface which takes in an input prompt and returns a knowledge-augmented output. MongoDB as the Datastore It is effortless to store the ingested documents (i.e. Node objects), index metadata, etc to MongoDB using the inbuilt abstractions in LlamaIndex. There is an option to store the “documents” as an actual collection in MongoDB using MongoDocumentStore . There is an option to persist the “Indexes” using the MongoIndexStore . Storing LlamaIndex’s documents and indexes in a database becomes necessary in a couple of scenarios: Use cases with large datasets may require more than in-memory storage. Ingesting and processing data from various sources (for example, PDFs, Google Docs, Slack). The requirement to continuously maintain updates from the underlying data sources., Build a ChatGPT with your Private Data using LlamaIndex and MongoDB\nCo-authors: Prakul Agarwal — Senior Product Manager, Machine Learning at MongoDB Jerry Liu — co-founder at LlamaIndex Update (6/22/2023): The preferred way to use LlamaIndex + MongoDB is now with our MongoDBAtlasVectorSearch class. Take a look at our guide here: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/MongoDBAtlasVectorSearch.html Summary Large Language Models (LLMs) like ChatGPT have revolutionized the way users can get answers to their questions. However, the “knowledge” of LLMs is restricted by what they were trained on, which for ChatGPT means publicly available information on the internet till September 2021. How can LLMs answer questions using private knowledge sources like your company’s data and unlock its true transformative power? This blog will discuss how LlamaIndex and MongoDB can enable you to achieve this outcome quickly. The attached notebook provides a code walkthrough on how to query any PDF document using English queries. Background Traditionally, AI has been used to analyze data, identify patterns and make predictions based on existing data. The recent advancements have led to AI becoming better at generating new things (rather than just analyzing existing things). This is referred to as Generative AI. Generative AI is powered mainly by machine learning models called Large Language Models (LLM). LLMs are pre-trained on large quantities of publicly available text. There are various proprietary LLMs from companies like OpenAI, Cohere, AI21, as well as a lot of emerging open-source LLMs like Llama, Dolly, etc. There are 2 main scenarios where the knowledge of LLMs falls short: Private data such as your company’s internal knowledge base spread across PDFs, Google Docs, Wiki pages, and applications like Salesforce and Slack Newer data than when the LLMs were last trained.]"


In [72]:
for context in curated_deep_eval_df.iloc[2]['contexts']:
    print(context)
    print('-' * 10)

bge-large : Experiences significant improvement with rerankers, with the best results from  CohereRerank  (0.876404 hit rate, 0.822753 MRR). llm-embedder : Benefits greatly from reranking, particularly with  CohereRerank  (0.882022 hit rate, 0.830243 MRR), which offers a substantial performance boost. Cohere : Cohere’s latest v3.0 embeddings outperform v2.0 and, with the integration of native CohereRerank, significantly improve its metrics, boasting a 0.88764 hit rate and a 0.836049 MRR. Voyage : Has strong initial performance that is further amplified by  CohereRerank  (0.91573 hit rate, 0.851217 MRR), suggesting high responsiveness to reranking. JinaAI : Very strong performance, sees notable gains with  bge-reranker-large  (0.938202 hit rate, 0.868539 MRR) and  CohereRerank  (0.932584 hit rate, 0.873689), indicating that reranking significantly boosts its performance. Google-PaLM : The model demonstrates strong performance, with measurable gains when using the  CohereRerank (0.910112

In [73]:
if LOG_TO_MLFLOW:
    for k, v in curated_mean_scores_df.T.to_dict(orient='records')[0].items():
        mlflow.log_metric(f"curated_response_eval__{k}", v)
    curated_deep_eval_df.to_html(f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html", "curated_deep_eval_df")

# Clean up

In [74]:
if LOG_TO_MLFLOW:
    mlflow.end_run()

# Archive