# Set up

In [1]:
import os
import pickle
import json
from loguru import logger
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time

import mlflow

In [2]:
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

nest_asyncio.apply()

In [3]:
from dotenv import load_dotenv

load_dotenv()

True

## Constants

In [4]:
TESTING = False
DEBUG = True
LOG_TO_MLFLOW = True

In [5]:
import logging
import sys

if DEBUG:
    logging.getLogger('llama_index').addHandler(logging.StreamHandler(stream=sys.stdout))
    logging.getLogger('llama_index').setLevel(logging.DEBUG)

In [6]:
RUN_NAME = "exp_002_bge_large"
if LOG_TO_MLFLOW:
    RUN_DESCRIPTION = """
# Qdrant with TogetherAI Llama3 model

## Changelog
### Compares to exp_001
- Try larger embedding models to see if retrieval improves
"""
    mlflow.set_experiment("Chain Frost - LlamaIndex Blog QnA Chatbot")
    mlflow.start_run(run_name=RUN_NAME, description=RUN_DESCRIPTION)
    mlflow.log_param("TESTING", TESTING)

In [7]:
NOTEBOOK_CACHE_DP = f'data/001/{RUN_NAME}'
os.makedirs(NOTEBOOK_CACHE_DP, exist_ok=True)

if LOG_TO_MLFLOW:
    mlflow.log_param("NOTEBOOK_CACHE_DP", NOTEBOOK_CACHE_DP)

# Load data

In [8]:
DATA_FP = '../crawl_llamaindex_blog/data/blogs.json'
with open(DATA_FP, 'r') as f:
    data = json.load(f)

In [9]:
len(data)

159

In [10]:
data[:5]

[{'title': 'Automate online tasks with MultiOn and LlamaIndex',
  'author': 'MultiOn',
  'date': 'May 23, 2024',
  'tags': ['automation', 'Agents']},
 {'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
  'content': 'We’re happy to announce the recent integration of LlamaIndex with PostgresML — a comprehensive machine learning platform built on PostgreSQL. The PostgresML Managed Index allows LlamaIndex users to seamlessly manage document storage, splitting, embedding, and retrieval. By using PostgresML as the backend, users benefit from a streamlined and optimized process for Retrieval-Augmented Generation (RAG). This integration unifies embedding, vector search, and text generation into a single network call, resulting in faster, more reliable, and easier-to-manage RAG workflows. The problem with typical RAG workflows Typical Retrieval-Augmented Generation (RAG) workflows come with significant drawbacks, particularly for users. Poor performance is a ma

# Check data

In [11]:
data[0]['content']



# Prepare documents

In [12]:
input_data = data
if TESTING:
    input_data = data[:2]
logger.info(f"{len(input_data)=}")
if LOG_TO_MLFLOW:
    mlflow.log_param("len_input_data", len(input_data))

[32m2024-07-23 17:50:55.458[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mlen(input_data)=159[0m


In [13]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

documents = []
for record in input_data:
    title = record['title']
    metadata = {
        'title': title,
        'author': record['author'],
        'date': record['date'],
        'tags': ', '.join(record['tags'])
    }
    text = f"{title}\n{record['content']}"
    doc = Document(text=text, metadata=metadata)
    documents.append(doc)

In [14]:
documents[0]



In [15]:
documents[1].metadata

{'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
 'author': 'PostgresML',
 'date': 'May 28, 2024',
 'tags': 'Managed Indexes'}

In [16]:
if LOG_TO_MLFLOW:
    mlflow.log_param("len_documents", len(documents))

## Setting LLM

In [17]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings, ServiceContext

In [18]:
# LLM_OPTION = 'openai'
# LLM_OPTION = 'ollama'
LLM_OPTION = 'togetherai'

# LLM_MODEL_NAME = 'llama3'
# LLM_MODEL_NAME = 'gpt-3.5-turbo'
LLM_MODEL_NAME = 'meta-llama/Meta-Llama-3-8B-Instruct-Lite'

# EMBED_OPTION = 'openai'
# EMBED_OPTION = 'togetherai'
# EMBED_OPTION = 'ollama'
EMBED_OPTION = 'huggingface'

# EMBED_MODEL_NAME = 'llama3'
# EMBED_MODEL_NAME = 'togethercomputer/m2-bert-80M-2k-retrieval'
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"

if LOG_TO_MLFLOW:
    mlflow.log_param("LLM_OPTION", LLM_OPTION)
    mlflow.log_param("LLM_MODEL_NAME", LLM_MODEL_NAME)
    mlflow.log_param("EMBED_OPTION", EMBED_OPTION)
    mlflow.log_param("EMBED_MODEL_NAME", EMBED_MODEL_NAME)

In [19]:
# LLM options
if LLM_OPTION == 'ollama':
    LLM_SERVER_HOST = '192.168.100.14'
    LLM_SERVER_PORT = 11434
    base_url = f'http://{LLM_SERVER_HOST}:{LLM_SERVER_PORT}'
    llm = Ollama(base_url=base_url, model=LLM_MODEL_NAME, request_timeout=60.0)
    !ping -c 1 $LLM_SERVER_HOST
elif LLM_OPTION == 'openai':
    from llama_index.llms.openai import OpenAI
    llm = OpenAI(model=LLM_MODEL_NAME)
elif LLM_OPTION == 'togetherai':
    from llama_index.llms.together import TogetherLLM
    llm = TogetherLLM(model=LLM_MODEL_NAME)

# Embed options
if EMBED_OPTION == 'huggingface':
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
    embed_model = HuggingFaceEmbedding(
        model_name=EMBED_MODEL_NAME
    )
elif EMBED_OPTION == 'openai':
    from llama_index.embeddings.openai import OpenAIEmbedding
    embed_model = OpenAIEmbedding()
elif EMBED_OPTION == 'togetherai':
    from llama_index.embeddings.together import TogetherEmbedding
    embed_model = TogetherEmbedding(EMBED_MODEL_NAME)
elif EMBED_OPTION == 'ollama':
    from llama_index.embeddings.ollama import OllamaEmbedding
    embed_model = OllamaEmbedding(
        model_name=EMBED_MODEL_NAME,
        base_url=base_url,
        ollama_additional_kwargs={"mirostat": 0},
    )

logger.info(f"LLM:\n{repr(llm)}")
logger.info(f"Embed model:\n{repr(embed_model)}")

[32m2024-07-23 17:51:07.314[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1mLLM:
TogetherLLM(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7402fd428850>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x7402fdf19120>, completion_to_prompt=<function default_completion_to_prompt at 0x7402fdf834c0>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, model='meta-llama/Meta-Llama-3-8B-Instruct-Lite', temperature=0.1, max_tokens=None, logprobs=None, top_logprobs=0, additional_kwargs={}, max_retries=3, timeout=60.0, default_headers=None, reuse_client=True, api_key='3cf613093b6eb9b479c341126dc8d3761c67f9340d0a4a8e1fdc62ed41b58126', api_base='https://api.together.xyz/v1', api_version='', context_window=3900, is_chat_model=True, is_function_calling_model=False, tokenizer=None)[0m
[32m2024-07-23 17:51:07.315[0m | [1mINFO    [0m | [36m__main__[0

In [20]:
embed_model_dim = len(embed_model.get_text_embedding('sample text to find embedding dimensions'))
Settings.embed_model = embed_model
Settings.llm = llm

if LOG_TO_MLFLOW:
    mlflow.log_param("embedding_model_dim", embed_model_dim)
    mlflow.log_param("LLM_MODEL", repr(llm))
    mlflow.log_param("EMBEDDING_MODEL", repr(embed_model))

# Index embeddings

## Qdrant as VectorStore

In [21]:
import qdrant_client
from qdrant_client.models import Distance, VectorParams
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

In [22]:
import string

def substitute_punctuation(text):
    # Create a translation table that maps each punctuation character to an underscore
    translator = str.maketrans(string.punctuation, '_' * len(string.punctuation))
    # Translate the text using the translation table
    return text.translate(translator)

collection_raw_name = f"{EMBED_OPTION}__{EMBED_MODEL_NAME}"
logger.info(f"{substitute_punctuation(collection_raw_name)=}")

[32m2024-07-23 17:51:20.271[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1msubstitute_punctuation(collection_raw_name)='huggingface__BAAI_bge_large_en_v1_5'[0m


In [23]:
RECREATE_INDEX = True

COLLECTION = substitute_punctuation(collection_raw_name)

NODES_PERSIST_FP = f'{NOTEBOOK_CACHE_DP}/nodes.pkl'
# NODES_PERSIST_FP = 'data/001/exp_001_qdrant_togetherai_llama3/nodes.pkl'

if LOG_TO_MLFLOW:
    mlflow.log_param(f"COLLECTION", COLLECTION)

In [24]:
qdrantdb = qdrant_client.QdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    # location=":memory:"
    # otherwise set Qdrant instance address with:
    # url="http://<host>:<port>"
    # otherwise set Qdrant instance with host and port:
    host="localhost",
    port=6333
    # set API KEY for Qdrant Cloud
    # api_key="<qdrant-api-key>",
)
aqdrantdb = qdrant_client.AsyncQdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    # location=":memory:"
    # otherwise set Qdrant instance address with:
    # url="http://<host>:<port>"
    # otherwise set Qdrant instance with host and port:
    host="localhost",
    port=6333
    # set API KEY for Qdrant Cloud
    # api_key="<qdrant-api-key>",
)
collection_exists = qdrantdb.collection_exists(COLLECTION)
if RECREATE_INDEX or not collection_exists:
    if collection_exists:
        logger.info(f"Deleting existing Qdrant collection...")
        qdrantdb.delete_collection(COLLECTION)
    if os.path.exists(NODES_PERSIST_FP):
        logger.info(f"Deleting persisted nodes object at {NODES_PERSIST_FP}...")
        os.remove(NODES_PERSIST_FP)
    logger.info(f"Creating new Qdrant collection...")
    qdrantdb.create_collection(
        COLLECTION,
        vectors_config=VectorParams(size=embed_model_dim, distance=Distance.COSINE),
    )
else:
    logger.info(f"Use existing Qdrant collection")
db_collection = qdrantdb.get_collection(COLLECTION)
vector_store = QdrantVectorStore(
    client=qdrantdb,
    collection_name=COLLECTION,
    aclient=aqdrantdb,
    prefer_grpc=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

[32m2024-07-23 17:51:21.778[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mDeleting existing Qdrant collection...[0m
[32m2024-07-23 17:51:21.786[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m33[0m - [1mDeleting persisted nodes object at data/001/exp_002_bge_large/nodes.pkl...[0m
[32m2024-07-23 17:51:21.788[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1mCreating new Qdrant collection...[0m


Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.


In [25]:
CHUNKER = "SentenceSplitter"
CHUNKER_CONFIG = {
    "chunk_size": 512,
    "chunk_overlap": 10
}
if LOG_TO_MLFLOW:
    mlflow.log_param("CHUNKER", CHUNKER)
    for k, v in CHUNKER_CONFIG.items():
        mlflow.log_param(f"CHUNKER__{k}", v)

In [26]:
t0 = time.perf_counter()
# TODO: TO understand the differences between points_count and indexed_vector_counts.
# Here indexed_vector_counts = 0
db_collection_count = db_collection.points_count

if db_collection_count > 0 and RECREATE_INDEX == False:
    logger.info(f"Loading index from existing DB...")
    with open(NODES_PERSIST_FP, 'rb') as f:
        nodes = pickle.load(f)
else:
    logger.info(f"Creating new DB index...")
    # Generate nodes
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/
    
    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline, IngestionCache
    
    # create the pipeline with transformations
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(**CHUNKER_CONFIG),
            TitleExtractor(),
            embed_model,
        ],
        vector_store = vector_store
    )

    num_workers = None
    # Currently setting num_workers leads to error `AttributeError: 'HuggingFaceEmbedding' object has no attribute '_model'`
    # num_workers = os.cpu_count() - 1
    # logger.info(f"Running Ingestion Pipeline with {num_workers=}...")
    nodes = await pipeline.arun(documents=documents, num_workers=num_workers)
    with open(NODES_PERSIST_FP, 'wb') as f:
        pickle.dump(nodes, f)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)
t1 = time.perf_counter()
logger.info(f"Indexing {len(documents)} into VectorStoreIndex took {t1 - t0:,.0f}s")

[32m2024-07-23 17:51:36.365[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mCreating new DB index...[0m


> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
> Adding chunk: All parameters are required
        
        If...
> Adding chunk: print (agent.chat( "browse to the latest email ...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: As these technologies evolve, they will continu...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: Step 2: Create the PostgresML Managed Index Fir...
> Adding chunk: The PostgresML Managed Index is doing embedding...
> Adding chunk: LlamaIndex Newsletter 2024-06-04
Hello, LlamaIn...
> Adding chunk: Blogpost ,  Tweet . We have integrated with Mil...
> Adding chunk: Memary is a fully open-source reference impleme...
> Adding chunk: Batch inference with MyMagic AI and LlamaIndex
...
> Adding chunk: import  asyncio
 from  llama_index.llms.mymagic...
> Adding chunk: The user queries will be aggregated and append

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.89it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.76it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [27]:
logger.info(f"Indexed {len(nodes)} nodes into Vector Store")
if LOG_TO_MLFLOW:
    mlflow.log_param("len_nodes", len(nodes))

[32m2024-07-23 18:04:04.649[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mIndexed 808 nodes into Vector Store[0m


In [None]:
import os
import pickle
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

In [None]:
RECREATE_INDEX = False

COLLECTION = 'togetherai'
NOTEBOOK_CACHE_DP = 'data/001/togetherai'
NODES_PERSIST_FP = f'{NOTEBOOK_CACHE_DP}/nodes.pkl'
os.makedirs(NOTEBOOK_CACHE_DP, exist_ok=True)

In [25]:
db = chromadb.PersistentClient(path=f"{NOTEBOOK_CACHE_DP}/chroma_db")
collection_exists = COLLECTION in [c.name for c in db.list_collections()]
if RECREATE_INDEX or not collection_exists:
    logger.info(f"Creating new ChromaDB collection...")
    if collection_exists:
        logger.info(f"Deleting existing ChromaDB collection...")
        db.delete_collection(COLLECTION)
    if os.path.exists(NODES_PERSIST_FP):
        logger.info(f"Deleting persisted nodes object at {NODES_PERSIST_FP}...")
        os.remove(NODES_PERSIST_FP)
else:
    logger.info(f"Use existing ChromaDB collection")
chroma_collection = db.get_or_create_collection(COLLECTION)

[32m2024-07-23 12:09:50.358[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mUse existing ChromaDB collection[0m


In [26]:
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [27]:
CHUNKER = "SentenceSplitter"
CHUNKER_CONFIG = {
    "chunk_size": 512,
    "chunk_overlap": 10
}
if LOG_TO_MLFLOW:
    mlflow.log_param("CHUNKER", CHUNKER)
    for k, v in CHUNKER_CONFIG.items():
        mlflow.log_param(f"CHUNKER__{k}", v)

In [28]:
if chroma_collection.count() > 0 and RECREATE_INDEX == False:
    logger.info(f"Loading index from existing ChromaDB...")
    with open(NODES_PERSIST_FP, 'rb') as f:
        nodes = pickle.load(f)
else:
    logger.info(f"Creating new ChromaDB index...")
    # Generate nodes
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/
    
    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline, IngestionCache
    
    # create the pipeline with transformations
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(**CHUNKER_CONFIG),
            TitleExtractor(),
            embedding,
        ],
        vector_store = vector_store
    )
    
    # Need to use await and arun here to run the pipeline else error
    # Ref: https://docs.llamaindex.ai/en/stable/examples/ingestion/async_ingestion_pipeline/
    # Ref: https://github.com/run-llama/llama_index/issues/13904#issuecomment-2145561710
    nodes = await pipeline.arun(documents=documents)
    with open(NODES_PERSIST_FP, 'wb') as f:
        pickle.dump(nodes, f)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)

[32m2024-07-23 12:09:51.484[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLoading index from existing ChromaDB...[0m


#### Inspect nodes

# Query engine

In [33]:
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [34]:
RETRIEVAL_TOP_K = 2
# Need to be able to control this cutoff until specify it
RETRIEVAL_SIMILARITY_CUTOFF = None
# RETRIEVAL_SIMILARITY_CUTOFF = 0.3

if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_TOP_K", RETRIEVAL_TOP_K)
    if RETRIEVAL_SIMILARITY_CUTOFF:
        mlflow.log_param("RETRIEVAL_SIMILARITY_CUTOFF", RETRIEVAL_SIMILARITY_CUTOFF)

In [35]:
# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=RETRIEVAL_TOP_K,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer()

node_postprocessors = []

if RETRIEVAL_SIMILARITY_CUTOFF is not None:
    node_postprocessors.append(SimilarityPostprocessor(similarity_cutoff=RETRIEVAL_SIMILARITY_CUTOFF))

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=node_postprocessors,
)

In [36]:
question = "What is MultiOn?"
response = query_engine.query(question)
logger.info(response)

> Top 2 nodes:
> [Node deb0944c-3a66-420a-b18a-33349ea5c923] [Similarity score:             0.657916] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> [Node 61475053-55a3-47c6-a1f5-92879757a8e5] [Similarity score:             0.63385] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...


[32m2024-07-24 10:03:49.429[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mMultiOn is an AI agents platform designed to facilitate the autonomous completion of tasks in any web environment. It empowers developers to build AI agents that can manage online activities from start to finish, handling everything from simple data retrieval to complex interactions.[0m


# Evaluation

## Retrieval Evaluation

### Building synthetic evaluation dataset

In [37]:
with open(NODES_PERSIST_FP, 'rb') as f:
    nodes = pickle.load(f)

In [38]:
from llama_index.core.evaluation import generate_question_context_pairs, EmbeddingQAFinetuneDataset

In [48]:
RECREATE_RETRIEVAL_EVAL_DATASET = True
# Currently can not reuse retrieval_eval_dataset because the retrieval evaluation is based on ids
# RETRIEVAL_EVAL_DATASET_FP = f"data/001/exp_001_v3/llamaindex_blog_retrieval_eval_dataset.json"
RETRIEVAL_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_retrieval_eval_dataset.json"
RETRIEVAL_NUM_SAMPLE_NODES = 10
RETRIEVAL_NUM_SAMPLE_NODES = min(len(nodes), RETRIEVAL_NUM_SAMPLE_NODES)
RETRIEVAL_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
RETRIEVAL_EVAL_LLM_MODEL_CONFIG = {
    "temperature": 0.3
}
RETRIEVAL_NUM_QUESTIONS_PER_CHUNK = 2
if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_NUM_QUESTIONS_PER_CHUNK", RETRIEVAL_NUM_QUESTIONS_PER_CHUNK)
    mlflow.log_param("RETRIEVAL_NUM_SAMPLE_NODES", RETRIEVAL_NUM_SAMPLE_NODES)

In [49]:
if RECREATE_RETRIEVAL_EVAL_DATASET or not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    if RETRIEVAL_NUM_SAMPLE_NODES:
        logger.info(f"Sampling {RETRIEVAL_NUM_SAMPLE_NODES} nodes for retrieval evaluation...")
        np.random.seed(41)
        retrieval_eval_nodes = np.random.choice(nodes, RETRIEVAL_NUM_SAMPLE_NODES)
    else:
        logger.info(f"Using all nodes for retrieval evaluation")
        retrieval_eval_nodes = nodes
else:
    logger.info(f"Loading retrieval_eval_nodes from {RETRIEVAL_EVAL_DATASET_FP}...")
    with open(RETRIEVAL_EVAL_DATASET_FP, 'r') as f:
        retrieval_eval_nodes = json.load(f)

[32m2024-07-24 10:05:08.086[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mSampling 10 nodes for retrieval evaluation...[0m


In [50]:
if RECREATE_RETRIEVAL_EVAL_DATASET or not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    # Use good model to generate the eval dataset
    from llama_index.llms.openai import OpenAI
    retrieval_eval_llm = OpenAI(model=RETRIEVAL_EVAL_LLM_MODEL, **RETRIEVAL_EVAL_LLM_MODEL_CONFIG)

    logger.info(f"Creating new synthetic retrieval eval dataset...")
    retrieval_eval_dataset = generate_question_context_pairs(
        retrieval_eval_nodes, llm=retrieval_eval_llm, num_questions_per_chunk=RETRIEVAL_NUM_QUESTIONS_PER_CHUNK
    )
    retrieval_eval_dataset.save_json(RETRIEVAL_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic retrieval eval dataset at {RETRIEVAL_EVAL_DATASET_FP}...")
    retrieval_eval_dataset = EmbeddingQAFinetuneDataset.from_json(RETRIEVAL_EVAL_DATASET_FP)

[32m2024-07-24 10:05:09.690[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mCreating new synthetic retrieval eval dataset...[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.56s/it]


In [51]:
retrieval_eval_dataset.queries

{'a0308b12-3a62-4763-b3f5-0497f22d16d2': 'How do complex and unconstrained agent interaction techniques, such as ReAct, differ from simple and constrained agent interaction mechanisms in terms of their approach to handling data queries?',
 'c81bd84e-9978-4da9-bebc-37607bd94bab': 'How can agents, specifically those integrated with LlamaIndex query engines, assist in performing complex user queries across multiple data sources and synthesizing insights for users?',
 'b8611759-7643-48ff-a926-1d144def53e3': 'How does LlamaIndex simplify the evaluation process for LLM and RAG apps, and what are the four key metrics it assesses these apps on?',
 'eb369767-7e81-4a07-a3de-29b0826d9b88': 'Describe the integration of LlamaIndex with various tools and frameworks mentioned in the document, and explain how these integrations enhance the functionality and capabilities of LlamaIndex.',
 'e37ae996-fd00-4277-b695-4362d547f5bc': "How does the integration of videos and code snippets enhance the viewer's 

### Evaluate

In [52]:
from llama_index.core.evaluation import RetrieverEvaluator

In [53]:
RETRIEVAL_METRICS = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    RETRIEVAL_METRICS, retriever=retriever
)

retrieval_eval_results = await retriever_evaluator.aevaluate_dataset(retrieval_eval_dataset)

> Top 2 nodes:
> [Node 87ffd15d-0c04-49c1-b087-a33eb2c0d444] [Similarity score:             0.809213] It repeats these steps in an iterative loop until the task is complete. There are other interacti...
> [Node d5280cda-0a90-4982-a02d-59a3eaaec876] [Similarity score:             0.778968] Dumber LLM Agents Need More Constraints and Better Tools
Summary In this article, we compare how ...
> Top 2 nodes:
> [Node 8c640df7-e039-4425-8878-f70986cc643b] [Similarity score:             0.835663] As a result some of our existing query capabilities contain “agent-like” components: we have quer...
> [Node 109f25a3-1a96-4981-89c1-9481978b5d9f] [Similarity score:             0.831301] The agent then reasons that it needs to call the  read_search_data  tool, which will query the in...
> Top 2 nodes:
> [Node acccfcb5-6e27-4c14-afc6-6328d0538c84] [Similarity score:             0.788382] The evaluations demonstrated here will help you quickly find what’s affecting the quality of your...
> [Node d78e374

In [54]:
def display_results(name, eval_results, metrics=['hit_rate', 'mrr'], include_cohere_rerank=False):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    columns = {
        "retrievers": [name],
        **{k: [full_df[k].mean()] for k in metrics},
    }

    if include_cohere_rerank:
        crr_relevancy = full_df["cohere_rerank_relevancy"].mean()
        columns.update({"cohere_rerank_relevancy": [crr_relevancy]})

    metric_df = pd.DataFrame(columns)

    return metric_df

In [55]:
metric_prefix = f"top_{RETRIEVAL_TOP_K}_retrieval_eval"
retrieval_eval_results_df = display_results(metric_prefix, retrieval_eval_results, metrics=RETRIEVAL_METRICS)
retrieval_eval_results_df

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,top_2_retrieval_eval,0.473684,0.421053,0.236842,0.473684,0.421053,0.266618


In [56]:
if LOG_TO_MLFLOW:
    for metric, metric_value in retrieval_eval_results_df.to_dict(orient='records')[0].items():
        if metric in RETRIEVAL_METRICS:
            mlflow.log_metric(f"{metric_prefix}_{metric}", metric_value)

### Manually curated dataset
Ref: https://docs.llamaindex.ai/en/stable/module_guides/evaluating/usage_pattern_retrieval/

In [57]:
MANUAL_EVAL_QA = [
("What are key features of llama-agents?",
"""
Key features of llama-agents are:
1. Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. Ease of deployment: launch, scale and monitor each agent and your control plane independently.
5. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service
"""
),
("What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?",
"""
Retrieval System and Response Generation.
"""
),
("What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?",
"""
Hit rate and Mean Reciprocal Rank (MRR)

Hit Rate: Hit rate calculates the fraction of queries where the correct answer is found within the top-k retrieved documents. In simpler terms, it’s about how often our system gets it right within the top few guesses.

Mean Reciprocal Rank (MRR): For each query, MRR evaluates the system’s accuracy by looking at the rank of the highest-placed relevant document. Specifically, it’s the average of the reciprocals of these ranks across all the queries. So, if the first relevant document is the top result, the reciprocal rank is 1; if it’s second, the reciprocal rank is 1/2, and so on.
"""
)
]

## Response Evaluation
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/downloading_llama_datasets/

In [58]:
def evaluate_labelled_rag_dataset(response_eval_dataset, response_eval_prediction_dataset, dataset_name="synthetic", batch_size=8, judge_model='gpt-3.5-turbo', cache_dp='.'):
    # Instantiate the judges
    judges = {
        "correctness": CorrectnessEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "relevancy": RelevancyEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "faithfulness": FaithfulnessEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "semantic_similarity": SemanticSimilarityEvaluator(),
    }

    # Initialize evaluations dictionary
    evals = {
        "correctness": [],
        "relevancy": [],
        "faithfulness": [],
        "contexts": [],
    }

    # Evaluate each prediction
    for example, prediction in tqdm(
        zip(response_eval_dataset.examples, response_eval_prediction_dataset.predictions)
    ):
        correctness_result = judges["correctness"].evaluate(
            query=example.query,
            response=prediction.response,
            reference=example.reference_answer,
        )

        relevancy_result = judges["relevancy"].evaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        faithfulness_result = judges["faithfulness"].evaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        evals["correctness"].append(correctness_result)
        evals["relevancy"].append(relevancy_result)
        evals["faithfulness"].append(faithfulness_result)
        evals["contexts"].append(prediction.contexts)

    # Save evaluations to JSON
    evaluations_objects = {
        "correctness": [e.dict() for e in evals["correctness"]],
        "faithfulness": [e.dict() for e in evals["faithfulness"]],
        "relevancy": [e.dict() for e in evals["relevancy"]],
        "contexts": evals['contexts'],
    }

    with open(f"{cache_dp}/{dataset_name}_evaluations.json", "w") as json_file:
        json.dump(evaluations_objects, json_file)

    # Generate evaluation results DataFrames
    deep_eval_correctness_df, mean_correctness_df = get_eval_results_df(
        ["base_rag"] * len(evals["correctness"]),
        evals["correctness"],
        metric="correctness",
    )
    deep_eval_relevancy_df, mean_relevancy_df = get_eval_results_df(
        ["base_rag"] * len(evals["relevancy"]),
        evals["relevancy"],
        metric="relevancy",
    )
    deep_eval_faithfulness_df, mean_faithfulness_df = get_eval_results_df(
        ["base_rag"] * len(evals["faithfulness"]),
        evals["faithfulness"],
        metric="faithfulness",
    )

    mean_scores_df = pd.concat(
        [
            mean_correctness_df.reset_index(),
            mean_relevancy_df.reset_index(),
            mean_faithfulness_df.reset_index(),
        ],
        axis=0,
        ignore_index=True,
    )
    mean_scores_df = mean_scores_df.set_index("index")
    mean_scores_df.index = mean_scores_df.index.set_names(["metrics"])

    deep_eval_df = pd.concat([
        deep_eval_correctness_df[['query', 'answer']],
        deep_eval_relevancy_df[['scores']].rename(columns={'scores': 'relevancy_score'}),
        deep_eval_correctness_df[['scores']].rename(columns={'scores': 'correctness_score'}),
        deep_eval_faithfulness_df[['scores']].rename(columns={'scores': 'faithfulness_score'}),
        pd.Series(evals['contexts'], name='contexts')
    ], axis=1)

    return mean_scores_df, deep_eval_df

### Generate synthetic Llama Dataset

In [59]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.llama_dataset import LabeledRagDataset
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    SemanticSimilarityEvaluator,
)
from llama_index.core.evaluation.notebook_utils import get_eval_results_df

In [60]:
RECREATE_SYNTHETIC_EVAL_DATASET = False
# RESPONSE_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_response_eval_dataset.json"
RESPONSE_EVAL_DATASET_FP = f"data/001/exp_001_v3/llamaindex_blog_response_eval_dataset.json"
RESPONSE_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
RESPONSE_EVAL_LLM_MODEL_CONFIG = {
    "temperature": 0.3
}
SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK = 2
RESPONSE_NUM_SAMPLE_DOCUMENTS = 10
RESPONSE_NUM_SAMPLE_DOCUMENTS = min(len(documents), RESPONSE_NUM_SAMPLE_DOCUMENTS)

if LOG_TO_MLFLOW:
    mlflow.log_param("SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK", SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK)
    mlflow.log_param("RESPONSE_EVAL_LLM_MODEL", RESPONSE_EVAL_LLM_MODEL)
    mlflow.log_param("RESPONSE_NUM_SAMPLE_DOCUMENTS", RESPONSE_NUM_SAMPLE_DOCUMENTS)
    for k, v in RESPONSE_EVAL_LLM_MODEL_CONFIG.items():
        mlflow.log_param(f"RESPONSE_EVAL_LLM_MODEL_CONFIG__{k}", v)

In [61]:
if RESPONSE_NUM_SAMPLE_DOCUMENTS:
    logger.info(f"Sampling {RESPONSE_NUM_SAMPLE_DOCUMENTS} documents for response evaluation...")
    np.random.seed(41)
    response_eval_documents = np.random.choice(documents, RESPONSE_NUM_SAMPLE_DOCUMENTS)
else:
    logger.info(f"Using all documents for retrieval evaluation")
    response_eval_documents = documents

[32m2024-07-24 10:10:58.301[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mSampling 10 documents for response evaluation...[0m


In [62]:
if RECREATE_SYNTHETIC_EVAL_DATASET or not os.path.exists(RESPONSE_EVAL_DATASET_FP):
    logger.info(f"Creating synthetic response eval dataset...")
    # Use good model to generate the eval dataset
    response_eval_llm = OpenAI(model=RESPONSE_EVAL_LLM_MODEL, **RESPONSE_EVAL_LLM_MODEL_CONFIG)

    # instantiate a DatasetGenerator
    response_dataset_generator = RagDatasetGenerator.from_documents(
        response_eval_documents,
        llm=response_eval_llm,
        num_questions_per_chunk=SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK,  # set the number of questions per nodes
        show_progress=True,
        workers=(os.cpu_count() - 1)
    )

    synthetic_response_eval_dataset = response_dataset_generator.generate_dataset_from_nodes()

    synthetic_response_eval_dataset.save_json(RESPONSE_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic response eval dataset at {RESPONSE_EVAL_DATASET_FP}...")
    synthetic_response_eval_dataset = LabeledRagDataset.from_json(RESPONSE_EVAL_DATASET_FP)

[32m2024-07-24 10:10:59.339[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [1mLoading existing synthetic response eval dataset at data/001/exp_001_v3/llamaindex_blog_response_eval_dataset.json...[0m


In [63]:
synthetic_response_eval_prediction_dataset = await synthetic_response_eval_dataset.amake_predictions_with(
    predictor=query_engine, batch_size=8, show_progress=True
)

Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node d14146be-9c16-4437-9597-a6460aa0b37a] [Similarity score:             0.809745] Launching the first GenAI-native document parsing platform
Our mission at LlamaIndex is to connec...
> [Node 9b3bd895-09d8-4f85-b5b2-6c6ad4dd27d5] [Similarity score:             0.803162] LlamaIndex Newsletter 2024-03-19
Greetings, LlamaIndex enthusiasts! 🦙 Welcome to another exciting...
> Top 2 nodes:
> [Node cd0c5df2-0cd7-4ca9-802e-1d1d706bf164] [Similarity score:             0.876471] OpenAI Cookbook: Evaluating RAG systems
We’re excited to unveil our  OpenAI Cookbook , a guide to...
> [Node 78354e9f-fbd8-4ca1-bf15-a75b17992baf] [Similarity score:             0.79349] Setup Weaviate Client url = 'cluster URL'
api_key = 'your api key'

client = get_weaviate_client(...
> Top 2 nodes:
> [Node acf53e3e-a1b0-4341-ae71-2f1e2f048930] [Similarity score:             0.775634] Tweet . We introduced day-0 integrations with the MistralAI LLMs (mistral-tiny, mistral-small, mi...
> [Node 8b5e6871

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.55it/s]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 42e4eb0c-3488-4599-918b-35e8798f73d5] [Similarity score:             0.842008] Wrap A LlamaIndex App with TruLens With TruLens, you can wrap LlamaIndex query engines with a Tru...
> [Node 36455c92-7685-4da6-a81f-ec026ec4dc88] [Similarity score:             0.830373] Build and Evaluate LLM Apps with LlamaIndex and TruLens
Authors:  Anupam Datta, Shayak Sen, Jerry...
> Top 2 nodes:
> [Node 378632bf-a7ea-40b5-a254-d8c2586e25d0] [Similarity score:             0.795793] Retrieving Privacy-Safe Documents Over A Network
In a  recent blog post , we introduced our  llam...
> [Node a9521ec4-62c9-4c8c-9798-6ed7b67e579e] [Similarity score:             0.787769] There are 2 main paradigms currently for extending the amazing reasoning and knowledge generation...
> Top 2 nodes:
> [Node 36455c92-7685-4da6-a81f-ec026ec4dc88] [Similarity score:             0.837976] Build and Evaluate LLM Apps with LlamaIndex and TruLens
Authors:  Anupam Datta, Shayak Sen, Jerry...
> [Node 8abfc95

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:02<00:00,  2.69it/s]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 6e3de820-cdfc-40ab-b64f-1d2c00382be3] [Similarity score:             0.904004] LlamaIndex Accelerates Enterprise Generative AI with NVIDIA NIM
Generative AI is rapidly transfor...
> [Node ed199b1b-2dca-48c8-9cfc-340fdca3fa7e] [Similarity score:             0.775254] (For me, I usually overestimate what I can achieve by 3–10x!) With this in mind, I tried to set a...
> Top 2 nodes:
> [Node 57707274-0eb2-4175-b98d-e3fb5034ad42] [Similarity score:             0.826259] Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems
We'...
> [Node 75e7cc55-23e3-4223-aecb-2ab4722b5cd1] [Similarity score:             0.814519] Codebase . 🗺️ Guides: Guide  to Building an Agentic RAG Service with our comprehensive notebook t...
> Top 2 nodes:
> [Node 6e3de820-cdfc-40ab-b64f-1d2c00382be3] [Similarity score:             0.906435] LlamaIndex Accelerates Enterprise Generative AI with NVIDIA NIM
Generative AI is rapidly transfor...
> [Node e2d16de

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.20it/s]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/6 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 7d3fda45-2028-4a1d-b8b2-4c47adab8ee5] [Similarity score:             0.807019] Register for free! ✨ Feature Releases and Enhancements: We introduced the LlamaIndex 0.9 version ...
> [Node cec1e71e-330c-4eb5-ade9-9eeb1ab95ca6] [Similarity score:             0.798853] LlamaIndex Newsletter 2023–11–21
Hello Llama Fam 🦙 What an amazing week we’ve had! We’re excited ...
> Top 2 nodes:
> [Node e40b7bb4-d699-4e0f-aab6-0f386ed54edb] [Similarity score:             0.815946] Mervin Praison’s   tutorial  on using llama-agents, detailing the framework’s purpose, a step-by-...
> [Node 3aaf1433-c6ca-49cc-98fe-c117932a836b] [Similarity score:             0.815772] Guide  to Building an Agent in LlamaIndex: Our comprehensive guide which covers building a basic ...
> Top 2 nodes:
> [Node d1f5a699-c778-44fe-b1f6-a8c0974f4000] [Similarity score:             0.751528] We now accommodate custom models that align with the OpenAI-compatible API. 🎥 Webinars: Wenqi Gla...
> [Node 4214f9a

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00,  1.16it/s]


In [64]:
synthetic_mean_scores_df, synthetic_deep_eval_df = evaluate_labelled_rag_dataset(
    synthetic_response_eval_dataset,
    synthetic_response_eval_prediction_dataset,
    dataset_name="synthetic",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

0it [00:00, ?it/s]

> Adding chunk: Querying a network of knowledge with llama-inde...
> Adding chunk: Alex has heard about these insightful documents...
> Adding chunk: Querying a network of knowledge with llama-inde...
> Adding chunk: Alex has heard about these insightful documents...
> Adding chunk: Launching the first GenAI-native document parsi...
> Adding chunk: LlamaIndex Newsletter 2024-03-19
Greetings, Lla...
> Adding chunk: Launching the first GenAI-native document parsi...
> Adding chunk: LlamaIndex Newsletter 2024-03-19
Greetings, Lla...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: Setup Weaviate Client url = 'cluster URL'
api_k...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: Setup Weaviate Client url = 'cluster URL'
api_k...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: Below, we list a select few of the evaluation n...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re 

In [65]:
synthetic_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,4.208333
mean_relevancy_score,0.966667
mean_faithfulness_score,1.0


In [66]:
synthetic_deep_eval_df

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score,contexts
0,How does the new feature released by LlamaInde...,"The new feature, llama-index-networks, enables...",1.0,4.0,1.0,[Querying a network of knowledge with llama-in...
1,Discuss the advancements made by LlamaIndex in...,LlamaIndex has made significant advancements i...,1.0,3.5,1.0,[Launching the first GenAI-native document par...
2,Explain the three main sections of the OpenAI ...,The OpenAI Cookbook for evaluating RAG systems...,1.0,,1.0,[OpenAI Cookbook: Evaluating RAG systems\nWe’r...
3,How does the OpenAI Cookbook suggest evaluatin...,The OpenAI Cookbook suggests evaluating the pe...,1.0,4.5,1.0,[OpenAI Cookbook: Evaluating RAG systems\nWe’r...
4,How has LlamaIndex evolved over the past year ...,"Over the past year, LlamaIndex has experienced...",1.0,,1.0,[LlamaIndex turns 1!\nIt’s our birthday! One y...
5,Can you explain the significance of the Retrie...,RAG technology plays a crucial role in the dev...,1.0,4.0,1.0,[MultiModal RAG for Advanced Video Processing ...
6,How does the partnership with Google Gemini be...,The partnership with Google Gemini benefits Ll...,1.0,5.0,1.0,[LlamaIndex + Gemini\n(co-authored by Jerry Li...
7,Describe the Multi-Doc SEC 10Q Dataset launche...,"The Multi-Doc SEC 10Q Dataset, launched by Taq...",1.0,4.5,1.0,[Tweet . We introduced day-0 integrations with...
8,How does the MemoryCache project by Mozilla ut...,The MemoryCache project by Mozilla utilizes Pr...,0.0,,1.0,[Retrieving Privacy-Safe Documents Over A Netw...
9,Discuss the significance of integrating Na2SQL...,The integration of Na2SQL with Llama Index is ...,1.0,4.5,1.0,[Its integration ensures a smooth transition f...


In [67]:
if LOG_TO_MLFLOW:
    for k, v in synthetic_mean_scores_df.T.to_dict(orient='records')[0].items():
        mlflow.log_metric(f"synthetic_response_eval__{k}", v)
    synthetic_deep_eval_df.to_html(f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html", "synthetic_deep_eval_df")

### Manually curated
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/ragdataset_submission_template/#1c-creating-a-labelledragdataset-from-scratch-with-manually-constructed-examples

In [68]:
from llama_index.core.llama_dataset import LabelledRagDataset, LabelledRagDataExample, CreatedBy, CreatedByType

examples = []

for question, expected_anwser in MANUAL_EVAL_QA:
    example = LabelledRagDataExample(
        query=question,
        query_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_answer=expected_anwser,
        reference_answer_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_contexts=[],
    )
    examples.append(example)

curated_response_eval_dataset = LabelledRagDataset(examples=examples)

# save this dataset as it is required for the submission
curated_response_eval_dataset.save_json(f"{NOTEBOOK_CACHE_DP}/curated_response_eval_dataset.json")

In [69]:
curated_response_eval_prediction_dataset = await curated_response_eval_dataset.amake_predictions_with(
    predictor=query_engine, batch_size=8, show_progress=True
)

Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/3 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node cd0c5df2-0cd7-4ca9-802e-1d1d706bf164] [Similarity score:             0.822134] OpenAI Cookbook: Evaluating RAG systems
We’re excited to unveil our  OpenAI Cookbook , a guide to...
> [Node 597f4385-0f4e-4bdf-b17c-9deb5b274d9e] [Similarity score:             0.793708] chunk_sizes = [ 128 ,  256 ,  512 ,  1024 ,  2048 ]

 for  chunk_size  in  chunk_sizes:
  avg_res...
> Top 2 nodes:
> [Node 2c85936f-3734-4425-8d17-12fe2a509549] [Similarity score:             0.739013] bge-large : Experiences significant improvement with rerankers, with the best results from  Coher...
> [Node dc28df72-ed41-41f5-b9d6-05a284d8a2bc] [Similarity score:             0.736419] Boosting RAG: Picking the Best Embedding & Reranker models
UPDATE : The pooling method for the Ji...
> Top 2 nodes:
> [Node 57707274-0eb2-4175-b98d-e3fb5034ad42] [Similarity score:             0.791887] Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems
We'...
> [Node dbeeb5b

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.02s/it]


In [70]:
curated_mean_scores_df, curated_deep_eval_df = evaluate_labelled_rag_dataset(
    curated_response_eval_dataset,
    curated_response_eval_prediction_dataset,
    dataset_name="curated",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

0it [00:00, ?it/s]

> Adding chunk: Introducing llama-agents: A Powerful Framework ...
> Adding chunk: import  dotenv
dotenv.load_dotenv()  # our .env...
> Adding chunk: Introducing llama-agents: A Powerful Framework ...
> Adding chunk: import  dotenv
dotenv.load_dotenv()  # our .env...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: chunk_sizes = [ 128 ,  256 ,  512 ,  1024 ,  20...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: chunk_sizes = [ 128 ,  256 ,  512 ,  1024 ,  20...
> Adding chunk: bge-large : Experiences significant improvement...
> Adding chunk: Boosting RAG: Picking the Best Embedding & Rera...
> Adding chunk: bge-large : Experiences significant improvement...
> Adding chunk: Boosting RAG: Picking the Best Embedding & Rera...


In [71]:
curated_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,4.5
mean_relevancy_score,1.0
mean_faithfulness_score,1.0


In [72]:
with pd.option_context('display.max_colwidth', None):
    display(curated_deep_eval_df)

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score,contexts
0,What are key features of llama-agents?,"Distributed Service-Oriented Architecture: Every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.\n\nCommunication via standardized API interfaces: Interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.\n\nDefine agentic and explicit orchestration flows: Developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.\n\nEase of deployment: Launch, scale, and monitor each agent and your control plane independently.\n\nScalability and resource management: Use our built-in observability tools to monitor the quality and performance of the system and each individual agent service.",1.0,,1.0,"[Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems\nWe're excited to announce the alpha release of llama-agents , a new open-source framework designed to simplify the process of building, iterating, and deploying multi-agent AI systems and turn your agents into production microservices. Whether you're working on complex question-answering systems, collaborative AI assistants, or distributed AI workflows, llama-agents provides the tools and structure you need to bring your ideas to life. Key Features of llama-agents Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task. Ease of deployment: launch, scale and monitor each agent and your control plane independently. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service Let's dive into how you can start using llama-agents to build your own multi-agent systems. Getting Started with llama-agents First, install the framework using pip: pip install llama-agents llama-index-agent-openai Basic System Setup Here's a simple example of how to set up a basic multi-agent system using llama-agents., import dotenv\ndotenv.load_dotenv() # our .env defines OPENAI_API_KEY \n from llama_index.core import VectorStoreIndex, Document\n from llama_index.core.agent import FnAgentWorker\n from llama_index.core import PromptTemplate\n from llama_index.core.query_pipeline import QueryPipeline\n from llama_index.core.query_engine import RetrieverQueryEngine\n from llama_agents import (\n AgentService,\n ControlPlaneServer,\n SimpleMessageQueue,\n PipelineOrchestrator,\n ServiceComponent,\n)\n from llama_agents.launchers import LocalLauncher\n from llama_index.llms.openai import OpenAI\n import logging\n\n # change logging level to enable or disable more verbose logging \nlogging.getLogger( ""llama_agents"" ).setLevel(logging.INFO)\n\n # Load and index your document \ndocs = [Document(text= ""The rabbit is a small mammal with long ears and a fluffy tail. His name is Peter."" )]\nindex = VectorStoreIndex.from_documents(docs)\n\n # Define a query rewrite agent \nHYDE_PROMPT_STR = (\n ""Please rewrite the following query to include more detail:\n{query_str}\n"" \n)\nHYDE_PROMPT_TMPL = PromptTemplate(HYDE_PROMPT_STR)\n\n def run_hyde_fn ( state ):\n prompt_tmpl, llm, input_str = (\n state[ ""prompt_tmpl"" ],\n state[ ""llm"" ],\n state[ ""__task__"" ]. input ,\n )\n qp = QueryPipeline(chain=[prompt_tmpl, llm])\n output = qp.run(query_str=input_str)\n state[ ""__output__"" ] = str (output)\n return state, True \n\nhyde_agent = FnAgentWorker(\n fn=run_hyde_fn,\n initial_state={ ""prompt_tmpl"" : HYDE_PROMPT_TMPL, ""llm"" : OpenAI()}\n).as_agent()\n\n # Define a RAG agent \n def run_rag_fn ( state ):\n retriever, llm, input_str = (\n state[ ""retriever"" ],\n state[ ""llm"" ],\n state[ ""__task__"" ].]"
1,What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?,The two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook are the Retrieval System and Response Generation.,1.0,4.5,1.0,"[OpenAI Cookbook: Evaluating RAG systems\nWe’re excited to unveil our OpenAI Cookbook , a guide to evaluating Retrieval-Augmented Generation (RAG) systems using LlamaIndex. We hope you’ll find it useful in enhancing the effectiveness of your RAG systems, and we’re thrilled to share it with you. The OpenAI Cookbook has three sections: Understanding Retrieval-Augmented Generation (RAG): provides a detailed overview of RAG systems, including the various stages involved in building the RAG system. Building RAG with LlamaIndex: Here, we dive into the practical aspects, demonstrating how to construct a RAG system using LlamaIndex, specifically applied to Paul Graham’s essay, utilizing the VectorStoreIndex . Evaluating RAG with LlamaIndex: The final section focuses on assessing the RAG system’s performance in two critical areas: the Retrieval System and Response Generation. We use our unique synthetic dataset generation method, generate_question_context_pairs to conduct thorough evaluations in these areas. Our goal with this cookbook is to provide the community with an essential resource for effectively evaluating and enhancing RAG systems developed using LlamaIndex. Join us in exploring the depths of RAG system evaluation and discover how to leverage the full potential of your RAG implementations with LlamaIndex. Keep building with LlamaIndex!🦙, chunk_sizes = [ 128 , 256 , 512 , 1024 , 2048 ]\n\n for chunk_size in chunk_sizes:\n avg_response_time, avg_faithfulness, avg_relevancy = evaluate_response_time_and_accuracy(chunk_size, eval_questions)\n print ( f""Chunk size {chunk_size} - Average Response time: {avg_response_time: .2 f} s, Average Faithfulness: {avg_faithfulness: .2 f} , Average Relevancy: {avg_relevancy: .2 f} "" ) Bringing It All Together Let’s compile the processes: import nest_asyncio\n\nnest_asyncio.apply()\n\n from llama_index import (\n SimpleDirectoryReader,\n VectorStoreIndex,\n ServiceContext,\n)\n from llama_index.evaluation import (\n DatasetGenerator,\n FaithfulnessEvaluator,\n RelevancyEvaluator\n)\n from llama_index.llms import OpenAI\n\n import openai\n import time\n\nopenai.api_key = 'OPENAI-API-KEY' \n\n # Download Data \n!mkdir -p 'data/10k/' \n!wget 'https://raw.githubusercontent.com/jerryjliu/llama_index/main/docs/examples/data/10k/uber_2021.pdf' -O 'data/10k/uber_2021.pdf' \n\n # Load Data \nreader = SimpleDirectoryReader( ""./data/10k/"" )\ndocuments = reader.load_data()\n\n # To evaluate for each chunk size, we will first generate a set of 40 questions from first 20 pages. \neval_documents = documents[: 20 ]\ndata_generator = DatasetGenerator.from_documents()\neval_questions = data_generator.generate_questions_from_nodes(num = 20 )\n\n # We will use GPT-4 for evaluating the responses \ngpt4 = OpenAI(temperature= 0 , model= ""gpt-4"" )\n\n # Define service context for GPT-4 for evaluation \nservice_context_gpt4 = ServiceContext.]"
2,What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?,The two main metrics used to evaluate the performance of the different rerankers in the RAG system are Hit Rate and Mean Reciprocal Rank (MRR).,1.0,,1.0,"[bge-large : Experiences significant improvement with rerankers, with the best results from CohereRerank (0.876404 hit rate, 0.822753 MRR). llm-embedder : Benefits greatly from reranking, particularly with CohereRerank (0.882022 hit rate, 0.830243 MRR), which offers a substantial performance boost. Cohere : Cohere’s latest v3.0 embeddings outperform v2.0 and, with the integration of native CohereRerank, significantly improve its metrics, boasting a 0.88764 hit rate and a 0.836049 MRR. Voyage : Has strong initial performance that is further amplified by CohereRerank (0.91573 hit rate, 0.851217 MRR), suggesting high responsiveness to reranking. JinaAI : Very strong performance, sees notable gains with bge-reranker-large (0.938202 hit rate, 0.868539 MRR) and CohereRerank (0.932584 hit rate, 0.873689), indicating that reranking significantly boosts its performance. Google-PaLM : The model demonstrates strong performance, with measurable gains when using the CohereRerank (0.910112 hit rate, 0.855712 MRR). This indicates that reranking provides a clear boost to its overall results. Impact of Rerankers : WithoutReranker : This provides the baseline performance for each embedding. bge-reranker-base : Generally improves both hit rate and MRR across embeddings. bge-reranker-large : This reranker frequently offers the highest or near-highest MRR for embeddings. For several embeddings, its performance rivals or surpasses that of the CohereRerank . CohereRerank : Consistently enhances performance across all embeddings, often providing the best or near-best results. Necessity of Rerankers : The data clearly indicates the significance of rerankers in refining search results., Boosting RAG: Picking the Best Embedding & Reranker models\nUPDATE : The pooling method for the Jina AI embeddings has been adjusted to use mean pooling, and the results have been updated accordingly. Notably, the JinaAI-v2-base-en with bge-reranker-large now exhibits a Hit Rate of 0.938202 and an MRR (Mean Reciprocal Rank) of 0.868539 and with CohereRerank exhibits a Hit Rate of 0.932584, and an MRR of 0.873689. When building a Retrieval Augmented Generation (RAG) pipeline, one key component is the Retriever. We have a variety of embedding models to choose from, including OpenAI, CohereAI, and open-source sentence transformers. Additionally, there are several rerankers available from CohereAI and sentence transformers. But with all these options, how do we determine the best mix for top-notch retrieval performance? How do we know which embedding model fits our data best? Or which reranker boosts our results the most? In this blog post, we’ll use the Retrieval Evaluation module from LlamaIndex to swiftly determine the best combination of embedding and reranker models. Let's dive in! Let’s first start with understanding the metrics available in Retrieval Evaluation Understanding Metrics in Retrieval Evaluation: To gauge the efficacy of our retrieval system, we primarily relied on two widely accepted metrics: Hit Rate and Mean Reciprocal Rank (MRR) . Let’s delve into these metrics to understand their significance and how they operate. Hit Rate: Hit rate calculates the fraction of queries where the correct answer is found within the top-k retrieved documents. In simpler terms, it’s about how often our system gets it right within the top few guesses. Mean Reciprocal Rank (MRR): For each query, MRR evaluates the system’s accuracy by looking at the rank of the highest-placed relevant document. Specifically, it’s the average of the reciprocals of these ranks across all the queries.]"


In [73]:
for context in curated_deep_eval_df.iloc[2]['contexts']:
    print(context)
    print('-' * 10)

bge-large : Experiences significant improvement with rerankers, with the best results from  CohereRerank  (0.876404 hit rate, 0.822753 MRR). llm-embedder : Benefits greatly from reranking, particularly with  CohereRerank  (0.882022 hit rate, 0.830243 MRR), which offers a substantial performance boost. Cohere : Cohere’s latest v3.0 embeddings outperform v2.0 and, with the integration of native CohereRerank, significantly improve its metrics, boasting a 0.88764 hit rate and a 0.836049 MRR. Voyage : Has strong initial performance that is further amplified by  CohereRerank  (0.91573 hit rate, 0.851217 MRR), suggesting high responsiveness to reranking. JinaAI : Very strong performance, sees notable gains with  bge-reranker-large  (0.938202 hit rate, 0.868539 MRR) and  CohereRerank  (0.932584 hit rate, 0.873689), indicating that reranking significantly boosts its performance. Google-PaLM : The model demonstrates strong performance, with measurable gains when using the  CohereRerank (0.910112

In [74]:
if LOG_TO_MLFLOW:
    for k, v in curated_mean_scores_df.T.to_dict(orient='records')[0].items():
        mlflow.log_metric(f"curated_response_eval__{k}", v)
    curated_deep_eval_df.to_html(f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html", "curated_deep_eval_df")

# Clean up

In [75]:
if LOG_TO_MLFLOW:
    mlflow.end_run()

# Archive