# Set up

In [1]:
import os
import pickle
import json
from loguru import logger
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time

import mlflow

In [2]:
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

nest_asyncio.apply()

In [3]:
from dotenv import load_dotenv

load_dotenv()

True

## Constants

In [4]:
TESTING = False
DEBUG = True
LOG_TO_MLFLOW = True

In [5]:
import logging
import sys

if DEBUG:
    logging.getLogger('llama_index').addHandler(logging.StreamHandler(stream=sys.stdout))
    logging.getLogger('llama_index').setLevel(logging.DEBUG)

In [6]:
if LOG_TO_MLFLOW:
    RUN_NAME = "exp_001_v3"
    RUN_DESCRIPTION = """
# Qdrant with TogetherAI Llama3 model

## Changelog
### Compares to exp_001_v2
- Use OpenAI GPT-3.5-Turbo for generating question-context retrieval evaluation dataset
"""
    mlflow.set_experiment("Chain Frost - LlamaIndex Blog QnA Chatbot")
    mlflow.start_run(run_name=RUN_NAME, description=RUN_DESCRIPTION)
    mlflow.log_param("TESTING", TESTING)

In [7]:
NOTEBOOK_CACHE_DP = f'data/001/{RUN_NAME}'
os.makedirs(NOTEBOOK_CACHE_DP, exist_ok=True)

if LOG_TO_MLFLOW:
    mlflow.log_param("NOTEBOOK_CACHE_DP", NOTEBOOK_CACHE_DP)

# Load data

In [8]:
DATA_FP = '../crawl_llamaindex_blog/data/blogs.json'
with open(DATA_FP, 'r') as f:
    data = json.load(f)

In [9]:
len(data)

159

In [10]:
data[:5]

[{'title': 'Automate online tasks with MultiOn and LlamaIndex',
  'author': 'MultiOn',
  'date': 'May 23, 2024',
  'tags': ['automation', 'Agents']},
 {'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
  'content': 'We’re happy to announce the recent integration of LlamaIndex with PostgresML — a comprehensive machine learning platform built on PostgreSQL. The PostgresML Managed Index allows LlamaIndex users to seamlessly manage document storage, splitting, embedding, and retrieval. By using PostgresML as the backend, users benefit from a streamlined and optimized process for Retrieval-Augmented Generation (RAG). This integration unifies embedding, vector search, and text generation into a single network call, resulting in faster, more reliable, and easier-to-manage RAG workflows. The problem with typical RAG workflows Typical Retrieval-Augmented Generation (RAG) workflows come with significant drawbacks, particularly for users. Poor performance is a ma

# Check data

In [11]:
data[0]['content']



# Prepare documents

In [12]:
input_data = data
if TESTING:
    input_data = data[:2]
logger.info(f"{len(input_data)=}")
if LOG_TO_MLFLOW:
    mlflow.log_param("len_input_data", len(input_data))

[32m2024-07-23 15:03:16.049[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mlen(input_data)=159[0m


In [13]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

documents = []
for record in input_data:
    title = record['title']
    metadata = {
        'title': title,
        'author': record['author'],
        'date': record['date'],
        'tags': ', '.join(record['tags'])
    }
    text = f"{title}\n{record['content']}"
    doc = Document(text=text, metadata=metadata)
    documents.append(doc)

In [14]:
documents[0]



In [15]:
documents[1].metadata

{'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
 'author': 'PostgresML',
 'date': 'May 28, 2024',
 'tags': 'Managed Indexes'}

In [16]:
if LOG_TO_MLFLOW:
    mlflow.log_param("len_documents", len(documents))

## Setting LLM

In [17]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings, ServiceContext

In [18]:
# LLM_OPTION = 'openai'
# LLM_OPTION = 'ollama'
LLM_OPTION = 'togetherai'

# LLM_MODEL_NAME = 'llama3'
# LLM_MODEL_NAME = 'gpt-3.5-turbo'
LLM_MODEL_NAME = 'meta-llama/Meta-Llama-3-8B-Instruct-Lite'

# EMBED_OPTION = 'openai'
# EMBED_OPTION = 'togetherai'
# EMBED_OPTION = 'ollama'
EMBED_OPTION = 'huggingface'

# EMBED_MODEL_NAME = 'llama3'
# EMBED_MODEL_NAME = 'togethercomputer/m2-bert-80M-2k-retrieval'
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"

if LOG_TO_MLFLOW:
    mlflow.log_param("LLM_OPTION", LLM_OPTION)
    mlflow.log_param("LLM_MODEL_NAME", LLM_MODEL_NAME)
    mlflow.log_param("EMBED_OPTION", EMBED_OPTION)
    mlflow.log_param("EMBED_MODEL_NAME", EMBED_MODEL_NAME)

In [19]:
# LLM options
if LLM_OPTION == 'ollama':
    LLM_SERVER_HOST = '192.168.100.14'
    LLM_SERVER_PORT = 11434
    base_url = f'http://{LLM_SERVER_HOST}:{LLM_SERVER_PORT}'
    llm = Ollama(base_url=base_url, model=LLM_MODEL_NAME, request_timeout=60.0)
    !ping -c 1 $LLM_SERVER_HOST
elif LLM_OPTION == 'openai':
    from llama_index.llms.openai import OpenAI
    llm = OpenAI(model=LLM_MODEL_NAME)
elif LLM_OPTION == 'togetherai':
    from llama_index.llms.together import TogetherLLM
    llm = TogetherLLM(model=LLM_MODEL_NAME)

# Embed options
if EMBED_OPTION == 'huggingface':
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
    embed_model = HuggingFaceEmbedding(
        model_name=EMBED_MODEL_NAME
    )
elif EMBED_OPTION == 'openai':
    from llama_index.embeddings.openai import OpenAIEmbedding
    embed_model = OpenAIEmbedding()
elif EMBED_OPTION == 'togetherai':
    from llama_index.embeddings.together import TogetherEmbedding
    embed_model = TogetherEmbedding(EMBED_MODEL_NAME)
elif EMBED_OPTION == 'ollama':
    from llama_index.embeddings.ollama import OllamaEmbedding
    embed_model = OllamaEmbedding(
        model_name=EMBED_MODEL_NAME,
        base_url=base_url,
        ollama_additional_kwargs={"mirostat": 0},
    )

logger.info(f"LLM:\n{repr(llm)}")
logger.info(f"Embed model:\n{repr(embed_model)}")

[32m2024-07-23 15:03:24.780[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1mLLM:
TogetherLLM(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7ba24ab8f790>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x7ba2baac9080>, completion_to_prompt=<function default_completion_to_prompt at 0x7ba2bab37420>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, model='meta-llama/Meta-Llama-3-8B-Instruct-Lite', temperature=0.1, max_tokens=None, logprobs=None, top_logprobs=0, additional_kwargs={}, max_retries=3, timeout=60.0, default_headers=None, reuse_client=True, api_key='3cf613093b6eb9b479c341126dc8d3761c67f9340d0a4a8e1fdc62ed41b58126', api_base='https://api.together.xyz/v1', api_version='', context_window=3900, is_chat_model=True, is_function_calling_model=False, tokenizer=None)[0m
[32m2024-07-23 15:03:24.782[0m | [1mINFO    [0m | [36m__main__[0

In [20]:
embed_model_dim = len(embed_model.get_text_embedding('sample text to find embedding dimensions'))
Settings.embed_model = embed_model
Settings.llm = llm

if LOG_TO_MLFLOW:
    mlflow.log_param("embedding_model_dim", embed_model_dim)
    mlflow.log_param("LLM_MODEL", repr(llm))
    mlflow.log_param("EMBEDDING_MODEL", repr(embed_model))

# Index embeddings

## Qdrant as VectorStore

In [21]:
import qdrant_client
from qdrant_client.models import Distance, VectorParams
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

In [22]:
import string

def substitute_punctuation(text):
    # Create a translation table that maps each punctuation character to an underscore
    translator = str.maketrans(string.punctuation, '_' * len(string.punctuation))
    # Translate the text using the translation table
    return text.translate(translator)

collection_raw_name = f"{EMBED_OPTION}__{EMBED_MODEL_NAME}"
logger.info(f"{substitute_punctuation(collection_raw_name)=}")

[32m2024-07-23 15:03:26.179[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1msubstitute_punctuation(collection_raw_name)='huggingface__BAAI_bge_small_en_v1_5'[0m


In [23]:
RECREATE_INDEX = False

COLLECTION = substitute_punctuation(collection_raw_name)

NODES_PERSIST_FP = f'{NOTEBOOK_CACHE_DP}/nodes.pkl'
NODES_PERSIST_FP = 'data/001/exp_001_qdrant_togetherai_llama3/nodes.pkl'

if LOG_TO_MLFLOW:
    mlflow.log_param(f"COLLECTION", COLLECTION)

In [24]:
qdrantdb = qdrant_client.QdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    # location=":memory:"
    # otherwise set Qdrant instance address with:
    # url="http://<host>:<port>"
    # otherwise set Qdrant instance with host and port:
    host="localhost",
    port=6333
    # set API KEY for Qdrant Cloud
    # api_key="<qdrant-api-key>",
)
aqdrantdb = qdrant_client.AsyncQdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    # location=":memory:"
    # otherwise set Qdrant instance address with:
    # url="http://<host>:<port>"
    # otherwise set Qdrant instance with host and port:
    host="localhost",
    port=6333
    # set API KEY for Qdrant Cloud
    # api_key="<qdrant-api-key>",
)
collection_exists = qdrantdb.collection_exists(COLLECTION)
if RECREATE_INDEX or not collection_exists:
    if collection_exists:
        logger.info(f"Deleting existing Qdrant collection...")
        qdrantdb.delete_collection(COLLECTION)
    if os.path.exists(NODES_PERSIST_FP):
        logger.info(f"Deleting persisted nodes object at {NODES_PERSIST_FP}...")
        os.remove(NODES_PERSIST_FP)
    logger.info(f"Creating new Qdrant collection...")
    qdrantdb.create_collection(
        COLLECTION,
        vectors_config=VectorParams(size=embed_model_dim, distance=Distance.COSINE),
    )
else:
    logger.info(f"Use existing Qdrant collection")
db_collection = qdrantdb.get_collection(COLLECTION)
vector_store = QdrantVectorStore(
    client=qdrantdb,
    collection_name=COLLECTION,
    aclient=aqdrantdb,
    prefer_grpc=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

[32m2024-07-23 15:03:28.383[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m41[0m - [1mUse existing Qdrant collection[0m


Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.


In [25]:
CHUNKER = "SentenceSplitter"
CHUNKER_CONFIG = {
    "chunk_size": 512,
    "chunk_overlap": 10
}
if LOG_TO_MLFLOW:
    mlflow.log_param("CHUNKER", CHUNKER)
    for k, v in CHUNKER_CONFIG.items():
        mlflow.log_param(f"CHUNKER__{k}", v)

In [26]:
t0 = time.perf_counter()
# TODO: TO understand the differences between points_count and indexed_vector_counts.
# Here indexed_vector_counts = 0
db_collection_count = db_collection.points_count

if db_collection_count > 0 and RECREATE_INDEX == False:
    logger.info(f"Loading index from existing DB...")
    with open(NODES_PERSIST_FP, 'rb') as f:
        nodes = pickle.load(f)
else:
    logger.info(f"Creating new DB index...")
    # Generate nodes
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/
    
    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline, IngestionCache
    
    # create the pipeline with transformations
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(**CHUNKER_CONFIG),
            TitleExtractor(),
            embed_model,
        ],
        vector_store = vector_store
    )

    num_workers = os.cpu_count() - 1
    logger.info(f"Running Ingestion Pipeline with {num_workers=}")
    nodes = await pipeline.arun(documents=documents, num_workers=num_workers)
    with open(NODES_PERSIST_FP, 'wb') as f:
        pickle.dump(nodes, f)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)
t1 = time.perf_counter()
logger.info(f"Indexing {len(documents)} into VectorStoreIndex took {t1 - t0:,.0f}s")

[32m2024-07-23 15:03:29.732[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mLoading index from existing DB...[0m
[32m2024-07-23 15:03:30.410[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1mIndexing 159 into VectorStoreIndex took 1s[0m


In [27]:
logger.info(f"Indexed {len(nodes)} nodes into Vector Store")
if LOG_TO_MLFLOW:
    mlflow.log_param("len_nodes", len(nodes))

[32m2024-07-23 15:03:32.290[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mIndexed 808 nodes into Vector Store[0m


In [23]:
import os
import pickle
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

In [24]:
RECREATE_INDEX = False

COLLECTION = 'togetherai'
NOTEBOOK_CACHE_DP = 'data/001/togetherai'
NODES_PERSIST_FP = f'{NOTEBOOK_CACHE_DP}/nodes.pkl'
os.makedirs(NOTEBOOK_CACHE_DP, exist_ok=True)

In [25]:
db = chromadb.PersistentClient(path=f"{NOTEBOOK_CACHE_DP}/chroma_db")
collection_exists = COLLECTION in [c.name for c in db.list_collections()]
if RECREATE_INDEX or not collection_exists:
    logger.info(f"Creating new ChromaDB collection...")
    if collection_exists:
        logger.info(f"Deleting existing ChromaDB collection...")
        db.delete_collection(COLLECTION)
    if os.path.exists(NODES_PERSIST_FP):
        logger.info(f"Deleting persisted nodes object at {NODES_PERSIST_FP}...")
        os.remove(NODES_PERSIST_FP)
else:
    logger.info(f"Use existing ChromaDB collection")
chroma_collection = db.get_or_create_collection(COLLECTION)

[32m2024-07-23 12:09:50.358[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mUse existing ChromaDB collection[0m


In [26]:
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [27]:
CHUNKER = "SentenceSplitter"
CHUNKER_CONFIG = {
    "chunk_size": 512,
    "chunk_overlap": 10
}
if LOG_TO_MLFLOW:
    mlflow.log_param("CHUNKER", CHUNKER)
    for k, v in CHUNKER_CONFIG.items():
        mlflow.log_param(f"CHUNKER__{k}", v)

In [28]:
if chroma_collection.count() > 0 and RECREATE_INDEX == False:
    logger.info(f"Loading index from existing ChromaDB...")
    with open(NODES_PERSIST_FP, 'rb') as f:
        nodes = pickle.load(f)
else:
    logger.info(f"Creating new ChromaDB index...")
    # Generate nodes
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/
    
    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline, IngestionCache
    
    # create the pipeline with transformations
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(**CHUNKER_CONFIG),
            TitleExtractor(),
            embedding,
        ],
        vector_store = vector_store
    )
    
    # Need to use await and arun here to run the pipeline else error
    # Ref: https://docs.llamaindex.ai/en/stable/examples/ingestion/async_ingestion_pipeline/
    # Ref: https://github.com/run-llama/llama_index/issues/13904#issuecomment-2145561710
    nodes = await pipeline.arun(documents=documents)
    with open(NODES_PERSIST_FP, 'wb') as f:
        pickle.dump(nodes, f)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)

[32m2024-07-23 12:09:51.484[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLoading index from existing ChromaDB...[0m


#### Inspect nodes

# Query engine

In [28]:
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [29]:
RETRIEVAL_TOP_K = 2
# Need to be able to control this cutoff until specify it
RETRIEVAL_SIMILARITY_CUTOFF = None
# RETRIEVAL_SIMILARITY_CUTOFF = 0.3

if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_TOP_K", RETRIEVAL_TOP_K)
    if RETRIEVAL_SIMILARITY_CUTOFF:
        mlflow.log_param("RETRIEVAL_SIMILARITY_CUTOFF", RETRIEVAL_SIMILARITY_CUTOFF)

In [30]:
# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=RETRIEVAL_TOP_K,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer()

node_postprocessors = []

if RETRIEVAL_SIMILARITY_CUTOFF is not None:
    node_postprocessors.append(SimilarityPostprocessor(similarity_cutoff=RETRIEVAL_SIMILARITY_CUTOFF))

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=node_postprocessors,
)

In [31]:
question = "What is MultiOn?"
response = query_engine.query(question)
logger.info(response)

> Top 2 nodes:
> [Node f0812544-41a5-4374-9c28-4a89704c920f] [Similarity score:             0.708697] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> [Node dbc2d800-7fe5-4d49-81cf-a6f5f6d5b1ce] [Similarity score:             0.691203] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...


[32m2024-07-23 15:03:39.286[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mMultiOn is an AI agents platform designed to facilitate the autonomous completion of tasks in any web environment. It empowers developers to build AI agents that can manage online activities from start to finish, handling everything from simple data retrieval to complex interactions.[0m


# Evaluation

## Retrieval Evaluation

### Building synthetic evaluation dataset

In [32]:
with open(NODES_PERSIST_FP, 'rb') as f:
    nodes = pickle.load(f)

In [33]:
from llama_index.core.evaluation import generate_question_context_pairs, EmbeddingQAFinetuneDataset

In [34]:
RECREATE_RETRIEVAL_EVAL_DATASET = True
RETRIEVAL_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_retrieval_eval_dataset.json"
RETRIEVAL_NUM_SAMPLE_NODES = 10
RETRIEVAL_NUM_SAMPLE_NODES = min(len(nodes), RETRIEVAL_NUM_SAMPLE_NODES)
RETRIEVAL_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
RETRIEVAL_EVAL_LLM_MODEL_CONFIG = {
    "temperature": 0.3
}
RETRIEVAL_NUM_QUESTIONS_PER_CHUNK = 2
if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_NUM_QUESTIONS_PER_CHUNK", RETRIEVAL_NUM_QUESTIONS_PER_CHUNK)
    mlflow.log_param("RETRIEVAL_NUM_SAMPLE_NODES", RETRIEVAL_NUM_SAMPLE_NODES)

In [35]:
if RECREATE_RETRIEVAL_EVAL_DATASET or not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    if RETRIEVAL_NUM_SAMPLE_NODES:
        logger.info(f"Sampling {RETRIEVAL_NUM_SAMPLE_NODES} nodes for retrieval evaluation...")
        np.random.seed(41)
        retrieval_eval_nodes = np.random.choice(nodes, RETRIEVAL_NUM_SAMPLE_NODES)
    else:
        logger.info(f"Using all nodes for retrieval evaluation")
        retrieval_eval_nodes = nodes
else:
    logger.info(f"Loading retrieval_eval_nodes from {RETRIEVAL_EVAL_DATASET_FP}...")
    with open(RETRIEVAL_EVAL_DATASET_FP, 'r') as f:
        retrieval_eval_nodes = json.load(f)

[32m2024-07-23 15:03:47.078[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mSampling 10 nodes for retrieval evaluation...[0m


In [37]:
if RECREATE_RETRIEVAL_EVAL_DATASET or not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    # Use good model to generate the eval dataset
    from llama_index.llms.openai import OpenAI
    retrieval_eval_llm = OpenAI(model=RETRIEVAL_EVAL_LLM_MODEL, **RETRIEVAL_EVAL_LLM_MODEL_CONFIG)

    logger.info(f"Creating new synthetic retrieval eval dataset...")
    retrieval_eval_dataset = generate_question_context_pairs(
        retrieval_eval_nodes, llm=retrieval_eval_llm, num_questions_per_chunk=RETRIEVAL_NUM_QUESTIONS_PER_CHUNK
    )
    retrieval_eval_dataset.save_json(RETRIEVAL_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic retrieval eval dataset at {RETRIEVAL_EVAL_DATASET_FP}...")
    retrieval_eval_dataset = EmbeddingQAFinetuneDataset.from_json(RETRIEVAL_EVAL_DATASET_FP)

[32m2024-07-23 15:04:47.476[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mCreating new synthetic retrieval eval dataset...[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00,  1.28s/it]


In [40]:
retrieval_eval_dataset.queries

{'4ad13f0d-6096-4ca7-974e-471a728dffc6': 'How do complex and unconstrained agent interaction techniques, such as ReAct, differ from simple and constrained agent interaction mechanisms in terms of their approach to handling data queries?',
 'b62f9886-c4fc-47fa-b754-4fcaa45ab95a': 'How can agents, specifically those integrated with LlamaIndex query engines, assist users in performing complex user queries across multiple data sources, and what are the potential benefits and drawbacks of utilizing these agents in data tasks?',
 'b90b6c86-e498-447a-b177-d6a6aec2d3b5': 'How does LlamaIndex simplify the evaluation process for LLM and RAG apps, and what are the four key metrics it assesses these apps on?',
 'd2970e24-d222-42ab-927b-99c095a95848': 'Describe the various integrations and enhancements made by LlamaIndex, such as its integration with AskMarvinAI and RunGPT by JinaAI. How do these integrations improve the functionality and usability of LlamaIndex for users?',
 '13857f83-40ae-4e6f-87

### Evaluate

In [41]:
from llama_index.core.evaluation import RetrieverEvaluator

In [42]:
RETRIEVAL_METRICS = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    RETRIEVAL_METRICS, retriever=retriever
)

retrieval_eval_results = await retriever_evaluator.aevaluate_dataset(retrieval_eval_dataset)

> Top 2 nodes:
> [Node 83bb5ac9-f883-48dd-8efa-b4ef00a3d9f7] [Similarity score:             0.890583] The agent then reasons that it needs to call the  read_search_data  tool, which will query the in...
> [Node b3bbaa9c-83fa-42c7-a22e-e0d7d3f5afc0] [Similarity score:             0.887746] As a result some of our existing query capabilities contain “agent-like” components: we have quer...
> Top 2 nodes:
> [Node 8cf60c44-e9e5-4448-a8c6-216130f7e88e] [Similarity score:             0.85269] It repeats these steps in an iterative loop until the task is complete. There are other interacti...
> [Node 7f1a0138-0baa-4746-88fc-3087b254559f] [Similarity score:             0.815742] Dumber LLM Agents Need More Constraints and Better Tools
Summary In this article, we compare how ...
> Top 2 nodes:
> [Node 3b5ad0bd-36ff-41f3-8197-81d880eeeb19] [Similarity score:             0.823939] Next, we can add the code that queries LlamaIndex: def   get_responses ( questions ):
    llm_ans...
> [Node 91ffc125

In [43]:
def display_results(name, eval_results, metrics=['hit_rate', 'mrr'], include_cohere_rerank=False):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    columns = {
        "retrievers": [name],
        **{k: [full_df[k].mean()] for k in metrics},
    }

    if include_cohere_rerank:
        crr_relevancy = full_df["cohere_rerank_relevancy"].mean()
        columns.update({"cohere_rerank_relevancy": [crr_relevancy]})

    metric_df = pd.DataFrame(columns)

    return metric_df

In [44]:
metric_prefix = f"top_{RETRIEVAL_TOP_K}_retrieval_eval"
retrieval_eval_results_df = display_results(metric_prefix, retrieval_eval_results, metrics=RETRIEVAL_METRICS)
retrieval_eval_results_df

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,top_2_retrieval_eval,0.368421,0.368421,0.184211,0.368421,0.368421,0.225896


In [45]:
if LOG_TO_MLFLOW:
    for metric, metric_value in retrieval_eval_results_df.to_dict(orient='records')[0].items():
        if metric in RETRIEVAL_METRICS:
            mlflow.log_metric(f"{metric_prefix}_{metric}", metric_value)

### Manually curated dataset
Ref: https://docs.llamaindex.ai/en/stable/module_guides/evaluating/usage_pattern_retrieval/

In [46]:
MANUAL_EVAL_QA = [
("What are key features of llama-agents?",
"""
Key features of llama-agents are:
1. Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. Ease of deployment: launch, scale and monitor each agent and your control plane independently.
5. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service
"""
),
("What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?",
"""
Retrieval System and Response Generation.
"""
),
("What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?",
"""
Hit rate and Mean Reciprocal Rank (MRR)

Hit Rate: Hit rate calculates the fraction of queries where the correct answer is found within the top-k retrieved documents. In simpler terms, it’s about how often our system gets it right within the top few guesses.

Mean Reciprocal Rank (MRR): For each query, MRR evaluates the system’s accuracy by looking at the rank of the highest-placed relevant document. Specifically, it’s the average of the reciprocals of these ranks across all the queries. So, if the first relevant document is the top result, the reciprocal rank is 1; if it’s second, the reciprocal rank is 1/2, and so on.
"""
)
]

## Response Evaluation
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/downloading_llama_datasets/

In [47]:
def evaluate_labelled_rag_dataset(response_eval_dataset, response_eval_prediction_dataset, dataset_name="synthetic", batch_size=8, judge_model='gpt-3.5-turbo', cache_dp='.'):
    # Instantiate the judges
    judges = {
        "correctness": CorrectnessEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "relevancy": RelevancyEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "faithfulness": FaithfulnessEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "semantic_similarity": SemanticSimilarityEvaluator(),
    }

    # Initialize evaluations dictionary
    evals = {
        "correctness": [],
        "relevancy": [],
        "faithfulness": [],
        "contexts": [],
    }

    # Evaluate each prediction
    for example, prediction in tqdm(
        zip(response_eval_dataset.examples, response_eval_prediction_dataset.predictions)
    ):
        correctness_result = judges["correctness"].evaluate(
            query=example.query,
            response=prediction.response,
            reference=example.reference_answer,
        )

        relevancy_result = judges["relevancy"].evaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        faithfulness_result = judges["faithfulness"].evaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        evals["correctness"].append(correctness_result)
        evals["relevancy"].append(relevancy_result)
        evals["faithfulness"].append(faithfulness_result)
        evals["contexts"].append(prediction.contexts)

    # Save evaluations to JSON
    evaluations_objects = {
        "correctness": [e.dict() for e in evals["correctness"]],
        "faithfulness": [e.dict() for e in evals["faithfulness"]],
        "relevancy": [e.dict() for e in evals["relevancy"]],
        "contexts": evals['contexts'],
    }

    with open(f"{cache_dp}/{dataset_name}_evaluations.json", "w") as json_file:
        json.dump(evaluations_objects, json_file)

    # Generate evaluation results DataFrames
    deep_eval_correctness_df, mean_correctness_df = get_eval_results_df(
        ["base_rag"] * len(evals["correctness"]),
        evals["correctness"],
        metric="correctness",
    )
    deep_eval_relevancy_df, mean_relevancy_df = get_eval_results_df(
        ["base_rag"] * len(evals["relevancy"]),
        evals["relevancy"],
        metric="relevancy",
    )
    deep_eval_faithfulness_df, mean_faithfulness_df = get_eval_results_df(
        ["base_rag"] * len(evals["faithfulness"]),
        evals["faithfulness"],
        metric="faithfulness",
    )

    mean_scores_df = pd.concat(
        [
            mean_correctness_df.reset_index(),
            mean_relevancy_df.reset_index(),
            mean_faithfulness_df.reset_index(),
        ],
        axis=0,
        ignore_index=True,
    )
    mean_scores_df = mean_scores_df.set_index("index")
    mean_scores_df.index = mean_scores_df.index.set_names(["metrics"])

    deep_eval_df = pd.concat([
        deep_eval_correctness_df[['query', 'answer']],
        deep_eval_relevancy_df[['scores']].rename(columns={'scores': 'relevancy_score'}),
        deep_eval_correctness_df[['scores']].rename(columns={'scores': 'correctness_score'}),
        deep_eval_faithfulness_df[['scores']].rename(columns={'scores': 'faithfulness_score'}),
        pd.Series(evals['contexts'], name='contexts')
    ], axis=1)

    return mean_scores_df, deep_eval_df

### Generate synthetic Llama Dataset

In [48]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.llama_dataset import LabeledRagDataset
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    SemanticSimilarityEvaluator,
)
from llama_index.core.evaluation.notebook_utils import get_eval_results_df

In [49]:
RECREATE_SYNTHETIC_EVAL_DATASET = True
RESPONSE_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_response_eval_dataset.json"
RESPONSE_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
RESPONSE_EVAL_LLM_MODEL_CONFIG = {
    "temperature": 0.3
}
SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK = 2
RESPONSE_NUM_SAMPLE_DOCUMENTS = 10
RESPONSE_NUM_SAMPLE_DOCUMENTS = min(len(documents), RESPONSE_NUM_SAMPLE_DOCUMENTS)

if LOG_TO_MLFLOW:
    mlflow.log_param("SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK", SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK)
    mlflow.log_param("RESPONSE_EVAL_LLM_MODEL", RESPONSE_EVAL_LLM_MODEL)
    mlflow.log_param("RESPONSE_NUM_SAMPLE_DOCUMENTS", RESPONSE_NUM_SAMPLE_DOCUMENTS)
    for k, v in RESPONSE_EVAL_LLM_MODEL_CONFIG.items():
        mlflow.log_param(f"RESPONSE_EVAL_LLM_MODEL_CONFIG__{k}", v)

In [50]:
if RESPONSE_NUM_SAMPLE_DOCUMENTS:
    logger.info(f"Sampling {RESPONSE_NUM_SAMPLE_DOCUMENTS} documents for response evaluation...")
    np.random.seed(41)
    response_eval_documents = np.random.choice(documents, RESPONSE_NUM_SAMPLE_DOCUMENTS)
else:
    logger.info(f"Using all documents for retrieval evaluation")
    response_eval_documents = documents

[32m2024-07-23 15:06:27.218[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mSampling 10 documents for response evaluation...[0m


In [52]:
if RECREATE_SYNTHETIC_EVAL_DATASET or not os.path.exists(RESPONSE_EVAL_DATASET_FP):
    logger.info(f"Creating synthetic response eval dataset...")
    # Use good model to generate the eval dataset
    response_eval_llm = OpenAI(model=RESPONSE_EVAL_LLM_MODEL, **RESPONSE_EVAL_LLM_MODEL_CONFIG)

    # instantiate a DatasetGenerator
    response_dataset_generator = RagDatasetGenerator.from_documents(
        response_eval_documents,
        llm=response_eval_llm,
        num_questions_per_chunk=SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK,  # set the number of questions per nodes
        show_progress=True,
        workers=(os.cpu_count() - 1)
    )

    synthetic_response_eval_dataset = response_dataset_generator.generate_dataset_from_nodes()

    synthetic_response_eval_dataset.save_json(RESPONSE_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic response eval dataset at {RESPONSE_EVAL_DATASET_FP}...")
    synthetic_response_eval_dataset = LabeledRagDataset.from_json(RESPONSE_EVAL_DATASET_FP)

[32m2024-07-23 15:06:49.522[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mCreating synthetic response eval dataset...[0m


Parsing nodes:   0%|          | 0/10 [00:00<?, ?it/s]

> Adding chunk: LlamaIndex Newsletter 2024-03-05
Greetings, Lla...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: LlamaIndex turns 1!
It’s our birthday! One year...
> Adding chunk: LlamaIndex Newsletter 2023–12–19
What’s up, Lla...
> Adding chunk: 👀 Community Demos : MemoryCache: Mozilla’s new ...
> Adding chunk: The latest updates to LlamaCloud
To build a pro...
> Adding chunk: Build and Evaluate LLM Apps with LlamaIndex and...
> Adding chunk: Finally, the third feedback function checks how...
> Adding chunk: Agentic RAG With LlamaIndex
The topic of Agenti...
> Adding chunk: These connectors can work with
  APIs, PDFs, SQ...
> Adding chunk: LlamaIndex Accelerates Enterprise Generative AI...
> Adding chunk: Introducing Llama Packs
Today we’re excited to ...
> Adding chunk: Special thanks to Logan Markewich and Andrei Fa...
> Adding chunk: LlamaIndex Newsletter 2023–11–14
Hello Llama Fr...
> Adding chunk: Finally, we released a guide to craft a GPT Bu

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:02<00:00,  7.49it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.03s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.25s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [53]:
synthetic_response_eval_prediction_dataset = await synthetic_response_eval_dataset.amake_predictions_with(
    predictor=query_engine, batch_size=8, show_progress=True
)

Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 89285fc8-241c-41b1-b377-1d49ceccbae1] [Similarity score:             0.797784] Tweet . We introduced day-0 integrations with the MistralAI LLMs (mistral-tiny, mistral-small, mi...
> [Node cae32e51-5fd7-437c-a89e-6fcc6af464b7] [Similarity score:             0.78116] Tweet AI Chatbot Starter (from the DataStax team), a web server powered by AstraDB and LlamaIndex...
> Top 2 nodes:
> [Node e009b8ae-616f-47c3-9cf9-20fa1c683ac3] [Similarity score:             0.899186] OpenAI Cookbook: Evaluating RAG systems
We’re excited to unveil our  OpenAI Cookbook , a guide to...
> [Node 019b26a2-e46a-4df2-909e-0e551e27f1a8] [Similarity score:             0.842587] Define objective function 
 def   objective_function ( params_dict ):
    chunk_size = params_dic...
> Top 2 nodes:
> [Node d9029ced-46e4-4009-996e-410b91376c7c] [Similarity score:             0.840824] Introducing LlamaCloud and LlamaParse
Today is a big day for the LlamaIndex ecosystem: we are ann...
> [Node edf2df9d

Batch processing of predictions:  50%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 4/8 [00:07<00:07,  1.96s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.5161614868990574 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.7919424367226937 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:12<00:00,  1.52s/it]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 3a98052b-19dc-4e9a-9387-993ebe280653] [Similarity score:             0.843707] Transforming Natural Language to SQL and Insights for E-commerce with LlamaIndex, GPT3.5, and Str...
> [Node f14ba7ba-1d1f-4d68-aafa-0b7e536b0744] [Similarity score:             0.835205] Its integration ensures a smooth transition from user inputs to database insights, culminating in...
> Top 2 nodes:
> [Node 886fdc63-e4b0-4e8a-89e3-6eb1cfa94361] [Similarity score:             0.86808] This feature enables transparency, re-use, and generally more rapid development velocity. Improve...
> [Node d7c67a6d-57d9-4ab2-bd82-e39294b964e7] [Similarity score:             0.850782] The latest updates to LlamaCloud
To build a production-quality LLM agent over your data, you need...
> Top 2 nodes:
> [Node dbba146b-8249-4221-a2b6-0e5475bd9e17] [Similarity score:             0.839747] min )


feedbacks = [f_lang_match, f_qa_relevance, f_qs_relevance]

l = TruLlama(app=query_engine...
> [Node e8b5aa20

Batch processing of predictions:  50%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 4/8 [00:06<00:05,  1.44s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.891192177247035 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.07405918574574444 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.48s/it]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 390ed7c9-6024-4f86-bcca-5eba197a02d3] [Similarity score:             0.773554] Agentic RAG With LlamaIndex
The topic of Agentic RAG explores how agents can be incorporated into...
> [Node 924ed153-bb6e-4044-9431-205e36c96274] [Similarity score:             0.759326] Say that you want to build a chatbot Define the dataset (here it’s a web page, can also be a loca...
> Top 2 nodes:
> [Node 9929ccb4-3d3b-4e9d-80ac-dc86ff8df3c1] [Similarity score:             0.838954] Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems
We'...
> [Node b8de4586-9cc8-4278-9c60-ec62b9419239] [Similarity score:             0.838108] Codebase . 🗺️ Guides: Guide  to Building an Agentic RAG Service with our comprehensive notebook t...
> Top 2 nodes:
> [Node 390ed7c9-6024-4f86-bcca-5eba197a02d3] [Similarity score:             0.824891] Agentic RAG With LlamaIndex
The topic of Agentic RAG explores how agents can be incorporated into...
> [Node c9fa91c

Batch processing of predictions:  50%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 4/8 [00:05<00:05,  1.28s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.2825351111462948 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.32931889319086505 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.44s/it]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/6 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 20d04a54-70b8-427e-82e4-f08088283ed1] [Similarity score:             0.70927] Tonic Validate  tutorial on Implementing integration tests for LlamaIndex. Chia Jeng Yang   tutor...
> [Node 7de978b1-b255-4247-b99f-2985fd3486d6] [Similarity score:             0.702937] Tweet ✍️ Tutorials: Build a best-in-class RAG application using Qdrant as a vector store, Jina AI...
> Top 2 nodes:
> [Node 201b4f5f-38d0-4a42-bf6d-9eeae460dff9] [Similarity score:             0.781271] Let’s take a look at the downloaded pack in  voyage_pack/base.py  , and swap out the OpenAI LLM f...
> [Node 80b13eac-a7e7-4ef4-a4ab-c3fd886e7959] [Similarity score:             0.764892] There are 19 folders in here. The main integration categories are: llms embeddings multi_modal_ll...
> Top 2 nodes:
> [Node 20d04a54-70b8-427e-82e4-f08088283ed1] [Similarity score:             0.788622] Tonic Validate  tutorial on Implementing integration tests for LlamaIndex. Chia Jeng Yang   tutor...
> [Node 9f15d4bf

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:12<00:00,  2.06s/it]


In [54]:
synthetic_mean_scores_df, synthetic_deep_eval_df = evaluate_labelled_rag_dataset(
    synthetic_response_eval_dataset,
    synthetic_response_eval_prediction_dataset,
    dataset_name="synthetic",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

0it [00:00, ?it/s]

> Adding chunk: LlamaIndex newsletter 2023–10–24
Hello Llama Fa...
> Adding chunk: Notebook ,  Tweet We launched revamped Python d...
> Adding chunk: LlamaIndex newsletter 2023–10–24
Hello Llama Fa...
> Adding chunk: Notebook ,  Tweet We launched revamped Python d...
> Adding chunk: Introducing LlamaCloud and LlamaParse
Today is ...
> Adding chunk: Launching the first GenAI-native document parsi...
> Adding chunk: Introducing LlamaCloud and LlamaParse
Today is ...
> Adding chunk: Launching the first GenAI-native document parsi...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: LlamaIndex Newsletter 2023–12–05
Hello Llama Co...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: LlamaIndex Newsletter 2023–12–05
Hello Llama Co...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: Define objective function 
 def   objective_fun...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re 

In [55]:
synthetic_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,3.96
mean_relevancy_score,0.9
mean_faithfulness_score,0.966667


In [56]:
synthetic_deep_eval_df

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score,contexts
0,How does the new feature released by LlamaInde...,"The new feature, QueryFusionRetriever, allows ...",1.0,2.5,1.0,[LlamaIndex newsletter 2023–10–24\nHello Llama...
1,Discuss the advancements made by LlamaIndex in...,LlamaIndex has made significant advancements i...,1.0,3.0,1.0,[Introducing LlamaCloud and LlamaParse\nToday ...
2,Explain the three main sections of the OpenAI ...,The OpenAI Cookbook for evaluating RAG systems...,1.0,,1.0,[OpenAI Cookbook: Evaluating RAG systems\nWe’r...
3,How does the OpenAI Cookbook suggest evaluatin...,The OpenAI Cookbook suggests evaluating the pe...,1.0,,1.0,[OpenAI Cookbook: Evaluating RAG systems\nWe’r...
4,How has LlamaIndex evolved over the past year ...,LlamaIndex has experienced significant growth ...,1.0,4.5,1.0,[It’s what gets us up in the morning and keeps...
5,Can you explain the significance of the Retrie...,RAG technology plays a crucial role in LlamaIn...,1.0,,1.0,[MultiModal RAG for Advanced Video Processing ...
6,How does the partnership with Google Gemini be...,The partnership with Google Gemini benefits Ll...,1.0,,1.0,[Linking the resources again below: Gemini (te...
7,Describe the Multi-Doc SEC 10Q Dataset launche...,"The Multi-Doc SEC 10Q Dataset, launched by Taq...",1.0,4.5,1.0,[Tweet . We introduced day-0 integrations with...
8,How does the MemoryCache project by Mozilla ut...,The MemoryCache project by Mozilla utilizes Pr...,0.0,4.0,1.0,[Query “How does GPT4 do on the bar exam?” Res...
9,Discuss the significance of integrating Na2SQL...,The integration of Na2SQL with Llama Index is ...,1.0,4.5,1.0,[Transforming Natural Language to SQL and Insi...


In [57]:
if LOG_TO_MLFLOW:
    for k, v in synthetic_mean_scores_df.T.to_dict(orient='records')[0].items():
        mlflow.log_metric(f"synthetic_response_eval__{k}", v)
    synthetic_deep_eval_df.to_html(f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html", "synthetic_deep_eval_df")

### Manually curated
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/ragdataset_submission_template/#1c-creating-a-labelledragdataset-from-scratch-with-manually-constructed-examples

In [58]:
from llama_index.core.llama_dataset import LabelledRagDataset, LabelledRagDataExample, CreatedBy, CreatedByType

examples = []

for question, expected_anwser in MANUAL_EVAL_QA:
    example = LabelledRagDataExample(
        query=question,
        query_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_answer=expected_anwser,
        reference_answer_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_contexts=[],
    )
    examples.append(example)

curated_response_eval_dataset = LabelledRagDataset(examples=examples)

# save this dataset as it is required for the submission
curated_response_eval_dataset.save_json(f"{NOTEBOOK_CACHE_DP}/curated_response_eval_dataset.json")

In [59]:
curated_response_eval_prediction_dataset = await curated_response_eval_dataset.amake_predictions_with(
    predictor=query_engine, batch_size=8, show_progress=True
)

Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/3 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node e009b8ae-616f-47c3-9cf9-20fa1c683ac3] [Similarity score:             0.855356] OpenAI Cookbook: Evaluating RAG systems
We’re excited to unveil our  OpenAI Cookbook , a guide to...
> [Node 0020f723-6a45-4396-9543-f414375e8936] [Similarity score:             0.781137] LlamaIndex Newsletter 2023–12–05
Hello Llama Community 🦙, We are excited to collaborate with Deep...
> Top 2 nodes:
> [Node 685a355d-9332-41a9-96bd-a4903ec86777] [Similarity score:             0.824755] Implemented by the user.

        """ 
         return  self._retrieve(query_bundle)

     async ...
> [Node d175dd37-b2cb-4772-9709-4cc90227bac0] [Similarity score:             0.81401] # Extract keys from queries and relevant_docs that need to be removed 
    queries_relevant_docs_...
> Top 2 nodes:
> [Node 78ead847-3d01-4db3-8483-59a88f95b1d7] [Similarity score:             0.797389] First we’ll bring in our dependencies and set up our control plane, which contains our LLM-powere...
> [Node 9929ccb4

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.85s/it]


In [60]:
curated_mean_scores_df, curated_deep_eval_df = evaluate_labelled_rag_dataset(
    curated_response_eval_dataset,
    curated_response_eval_prediction_dataset,
    dataset_name="curated",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

0it [00:00, ?it/s]

> Adding chunk: First we’ll bring in our dependencies and set u...
> Adding chunk: Introducing llama-agents: A Powerful Framework ...
> Adding chunk: First we’ll bring in our dependencies and set u...
> Adding chunk: Introducing llama-agents: A Powerful Framework ...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: LlamaIndex Newsletter 2023–12–05
Hello Llama Co...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: LlamaIndex Newsletter 2023–12–05
Hello Llama Co...
> Adding chunk: Implemented by the user.

        """ 
        ...
> Adding chunk: # Extract keys from queries and relevant_docs t...
> Adding chunk: Implemented by the user.

        """ 
        ...
> Adding chunk: # Extract keys from queries and relevant_docs t...


In [61]:
curated_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,4.666667
mean_relevancy_score,1.0
mean_faithfulness_score,1.0


In [62]:
with pd.option_context('display.max_colwidth', None):
    display(curated_deep_eval_df)

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score,contexts
0,What are key features of llama-agents?,"Distributed Service-Oriented Architecture, Communication via standardized API interfaces, Pass messages between agents using a message queue, Define agentic and explicit orchestration flows, Ease of deployment, Scalability and resource management.",1.0,4.5,1.0,"[First we’ll bring in our dependencies and set up our control plane, which contains our LLM-powered orchestrator import dotenv\ndotenv.load_dotenv() # our .env file defines OPENAI_API_KEY \n from llama_agents import (\n AgentService,\n ControlPlaneServer,\n SimpleMessageQueue,\n AgentOrchestrator,\n)\n from llama_index.core.agent import FunctionCallingAgentWorker\n from llama_index.core.tools import FunctionTool\n from llama_index.llms.openai import OpenAI\n import logging\n\n # turn on logging so we can see the system working \nlogging.getLogger( ""llama_agents"" ).setLevel(logging.INFO)\n\n # Set up the message queue and control plane \nmessage_queue = SimpleMessageQueue()\ncontrol_plane = ControlPlaneServer(\n message_queue=message_queue,\n orchestrator=AgentOrchestrator(llm=OpenAI()),\n) Next we create our tools using LlamaIndex’s existing abstractions, provide those tools to an agent, and turn that agent into an independent microservice: # create a tool \n def get_the_secret_fact () -> str :\n """"""Returns the secret fact."""""" \n return ""The secret fact is: A baby llama is called a 'Cria'."" \n\ntool = FunctionTool.from_defaults(fn=get_the_secret_fact)\n\n # Define an agent \nworker = FunctionCallingAgentWorker.from_tools([tool], llm=OpenAI())\nagent = worker.as_agent()\n\n # Create an agent service \nagent_service = AgentService(\n agent=agent,\n message_queue=message_queue,\n description= ""General purpose assistant"" ,\n service_name= ""assistant"" ,\n) Finally we launch the service and the control plane. Note that here we’re using a helper function to run a single query through the system and then exit; next we’ll show how to deploy this to production. # Set up the launcher for local testing \n from llama_agents import LocalLauncher\n\nlauncher = LocalLauncher(\n [agent_service],\n control_plane,\n message_queue,\n)\n\n # Run a single query through the system \nresult = launcher.launch_single( ""What's the secret fact?"", Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems\nWe're excited to announce the alpha release of llama-agents , a new open-source framework designed to simplify the process of building, iterating, and deploying multi-agent AI systems and turn your agents into production microservices. Whether you're working on complex question-answering systems, collaborative AI assistants, or distributed AI workflows, llama-agents provides the tools and structure you need to bring your ideas to life. Key Features of llama-agents Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task. Ease of deployment: launch, scale and monitor each agent and your control plane independently. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service Let's dive into how you can start using llama-agents to build your own multi-agent systems. Getting Started with llama-agents First, install the framework using pip: pip install llama-agents llama-index-agent-openai Basic System Setup Here's a simple example of how to set up a basic multi-agent system using llama-agents.]"
1,What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?,The two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook are the Retrieval System and Response Generation.,1.0,4.5,1.0,"[OpenAI Cookbook: Evaluating RAG systems\nWe’re excited to unveil our OpenAI Cookbook , a guide to evaluating Retrieval-Augmented Generation (RAG) systems using LlamaIndex. We hope you’ll find it useful in enhancing the effectiveness of your RAG systems, and we’re thrilled to share it with you. The OpenAI Cookbook has three sections: Understanding Retrieval-Augmented Generation (RAG): provides a detailed overview of RAG systems, including the various stages involved in building the RAG system. Building RAG with LlamaIndex: Here, we dive into the practical aspects, demonstrating how to construct a RAG system using LlamaIndex, specifically applied to Paul Graham’s essay, utilizing the VectorStoreIndex . Evaluating RAG with LlamaIndex: The final section focuses on assessing the RAG system’s performance in two critical areas: the Retrieval System and Response Generation. We use our unique synthetic dataset generation method, generate_question_context_pairs to conduct thorough evaluations in these areas. Our goal with this cookbook is to provide the community with an essential resource for effectively evaluating and enhancing RAG systems developed using LlamaIndex. Join us in exploring the depths of RAG system evaluation and discover how to leverage the full potential of your RAG implementations with LlamaIndex. Keep building with LlamaIndex!🦙, LlamaIndex Newsletter 2023–12–05\nHello Llama Community 🦙, We are excited to collaborate with DeepLearningAI and TruEraAI to launch an extensive course on advanced Retrieval-Augmented Generation (RAG) and its evaluations. The course includes Sentence Window Retrieval, Auto-merging Retrieval, and Evaluations with TruLensML, providing practical tools for enhanced learning and application. To make the most of this learning opportunity, we invite you to take the course . We appreciate your support and are always excited to see your projects and videos. Feel free to share them at news@llamaindex.ai . Also, remember to subscribe to our newsletter on our website for the latest updates and to connect with our vibrant community. 🤩 First, the highlights: Launch of Seven Advanced Retrieval LlamaPacks : Simplifies building advanced RAG systems to nearly a single line of code, offering techniques like Hybrid Fusion and Auto-merging Retriever. Tweet . Introduction of the OpenAI Cookbook : A comprehensive guide for evaluating RAG systems with LlamaIndex, covering system understanding, building, and performance evaluation. Blog , Notebook Speed Enhancement in Structured Metadata Extraction : Achieved 2x to 10x faster processing in extracting structured metadata from text, boosting RAG performance. Docs , Tweet . We launched versions 3 of RAGs , our project that lets you use natural language to generate a RAG bot customized to your needs. This version incorporates web search, so your bot can incorporate answers fresh from the web. Tweet . Core guide for Full-Stack LLM App Development : Simplifies complex app development with tools like ‘create-llama’ for full-stack apps, ‘SEC Insights’ for multi-document processing, and ‘LlamaIndex Chat’ for chatbot customization. ✨ Feature Releases and Enhancements: We’ve launched seven advanced retrieval LlamaPacks, serving as templates to easily build advanced RAG systems. These packs simplify the process to almost a single line of code, moving away from the traditional notebook approach.]"
2,What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?,The two main metrics used to evaluate the performance of the different rerankers in the RAG system are Hit Rate and Mean Reciprocal Rank (MRR).,1.0,5.0,1.0,"[Implemented by the user.\n\n """""" \n return self._retrieve(query_bundle)\n\n async def aretrieve ( self, str_or_query_bundle: QueryType ) -&gt; List [NodeWithScore]:\n if isinstance (str_or_query_bundle, str ):\n str_or_query_bundle = QueryBundle(str_or_query_bundle)\n return await self._aretrieve(str_or_query_bundle)\n\ncustom_retriever = CustomRetriever(vector_retriever) Evaluation: To evaluate our retriever, we computed the Mean Reciprocal Rank (MRR) and Hit Rate metrics: retriever_evaluator = RetrieverEvaluator.from_metric_names(\n [ ""mrr"" , ""hit_rate"" ], retriever=custom_retriever\n)\neval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset) Results: We put various embedding models and rerankers to the test. Here are the models we considered: Embedding Models : OpenAI Embedding Voyage Embedding CohereAI Embedding (v2.0/ v3.0) Jina Embeddings (small/ base) BAAI/bge-large-en Google PaLM Embedding Rerankers : CohereAI bge-reranker-base bge-reranker-large It’s worth mentioning that these results provide a solid insight into performance for this particular dataset and task. However, actual outcomes may differ based on data characteristics, dataset size, and other variables like chunk_size, similarity_top_k, and so on. The table below showcases the evaluation results based on the metrics of Hit Rate and Mean Reciprocal Rank (MRR): Analysis: Performance by Embedding: OpenAI : Showcases top-tier performance, especially with the CohereRerank (0.926966 hit rate, 0.86573 MRR) and bge-reranker-large (0.910112 hit rate, 0.855805 MRR), indicating strong compatibility with reranking tools., # Extract keys from queries and relevant_docs that need to be removed \n queries_relevant_docs_keys_to_remove = {\n k for k, v in qa_dataset.queries.items()\n if 'Here are 2' in v or 'Here are two' in v\n }\n\n # Filter queries and relevant_docs using dictionary comprehensions \n filtered_queries = {\n k: v for k, v in qa_dataset.queries.items()\n if k not in queries_relevant_docs_keys_to_remove\n }\n filtered_relevant_docs = {\n k: v for k, v in qa_dataset.relevant_docs.items()\n if k not in queries_relevant_docs_keys_to_remove\n }\n\n # Create a new instance of EmbeddingQAFinetuneDataset with the filtered data \n return EmbeddingQAFinetuneDataset(\n queries=filtered_queries,\n corpus=qa_dataset.corpus,\n relevant_docs=filtered_relevant_docs\n )\n\n # filter out pairs with phrases `Here are 2 questions based on provided context` \nqa_dataset = filter_qa_dataset(qa_dataset) Custom Retriever: To identify the optimal retriever, we employ a combination of an embedding model and a reranker. Initially, we establish a base VectorIndexRetriever . Upon retrieving the nodes, we then introduce a reranker to further refine the results. It’s worth noting that for this particular experiment, we’ve set similarity_top_k to 10 and picked top-5 with reranker. However, feel free to adjust this parameter based on the needs of your specific experiment. We are showing the code here with OpenAIEmbedding , please refer to the notebook for code with other embeddings.]"


In [63]:
for context in curated_deep_eval_df.iloc[2]['contexts']:
    print(context)
    print('-' * 10)

Implemented by the user.

        """ 
         return  self._retrieve(query_bundle)

     async   def   aretrieve ( self, str_or_query_bundle: QueryType ) -&gt;  List [NodeWithScore]:
         if   isinstance (str_or_query_bundle,  str ):
            str_or_query_bundle = QueryBundle(str_or_query_bundle)
         return   await  self._aretrieve(str_or_query_bundle)

custom_retriever = CustomRetriever(vector_retriever) Evaluation: To evaluate our retriever, we computed the Mean Reciprocal Rank (MRR) and Hit Rate metrics: retriever_evaluator = RetrieverEvaluator.from_metric_names(
    [ "mrr" ,  "hit_rate" ], retriever=custom_retriever
)
eval_results =  await  retriever_evaluator.aevaluate_dataset(qa_dataset) Results: We put various embedding models and rerankers to the test. Here are the models we considered: Embedding Models : OpenAI Embedding Voyage Embedding CohereAI Embedding  (v2.0/ v3.0) Jina Embeddings  (small/ base) BAAI/bge-large-en Google PaLM Embedding Rerankers : CohereAI b

In [64]:
if LOG_TO_MLFLOW:
    for k, v in curated_mean_scores_df.T.to_dict(orient='records')[0].items():
        mlflow.log_metric(f"curated_response_eval__{k}", v)
    curated_deep_eval_df.to_html(f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html", "curated_deep_eval_df")

# Clean up

In [65]:
if LOG_TO_MLFLOW:
    mlflow.end_run()

# Archive