# Set up

In [1]:
import json
from loguru import logger
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import mlflow

In [2]:
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

nest_asyncio.apply()

In [3]:
from dotenv import load_dotenv

load_dotenv()

True

## Constants

In [4]:
TESTING = True
DEBUG = True

In [5]:
import logging
import sys

if DEBUG:
    logging.getLogger('llama_index').addHandler(logging.StreamHandler(stream=sys.stdout))
    logging.getLogger('llama_index').setLevel(logging.DEBUG)

In [6]:
LOG_TO_MLFLOW = False
if LOG_TO_MLFLOW:
    RUN_NAME = "qdrant"
    RUN_DESCRIPTION = """
Test Qdrant with Ollama
"""
    mlflow.set_experiment("Chain Frost - LlamaIndex Blog QnA Chatbot")
    mlflow.start_run(run_name=RUN_NAME, description=RUN_DESCRIPTION)
    mlflow.log_param("TESTING", TESTING)

# Load data

In [7]:
DATA_FP = '../crawl_llamaindex_blog/data/blogs.json'
with open(DATA_FP, 'r') as f:
    data = json.load(f)

In [8]:
len(data)

159

In [9]:
data[:5]

[{'title': 'Automate online tasks with MultiOn and LlamaIndex',
  'author': 'MultiOn',
  'date': 'May 23, 2024',
  'tags': ['automation', 'Agents']},
 {'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
  'content': 'We’re happy to announce the recent integration of LlamaIndex with PostgresML — a comprehensive machine learning platform built on PostgreSQL. The PostgresML Managed Index allows LlamaIndex users to seamlessly manage document storage, splitting, embedding, and retrieval. By using PostgresML as the backend, users benefit from a streamlined and optimized process for Retrieval-Augmented Generation (RAG). This integration unifies embedding, vector search, and text generation into a single network call, resulting in faster, more reliable, and easier-to-manage RAG workflows. The problem with typical RAG workflows Typical Retrieval-Augmented Generation (RAG) workflows come with significant drawbacks, particularly for users. Poor performance is a ma

# Check data

In [10]:
data[0]['content']



# Prepare documents

In [11]:
input_data = data
if TESTING:
    input_data = data[:2]
logger.info(f"{len(input_data)=}")

[32m2024-07-23 12:08:59.873[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mlen(input_data)=2[0m


In [12]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

documents = []
for record in input_data:
    title = record['title']
    metadata = {
        'title': title,
        'author': record['author'],
        'date': record['date'],
        'tags': ', '.join(record['tags'])
    }
    text = f"{title}\n{record['content']}"
    doc = Document(text=text, metadata=metadata)
    documents.append(doc)

In [13]:
documents[0]



In [14]:
documents[1].metadata

{'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
 'author': 'PostgresML',
 'date': 'May 28, 2024',
 'tags': 'Managed Indexes'}

## Setting LLM

In [15]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings, ServiceContext

In [16]:
# LLM_OPTION = 'openai'
# LLM_OPTION = 'ollama'
LLM_OPTION = 'togetherai'

if LOG_TO_MLFLOW:
    mlflow.log_param("LLM_OPTION", LLM_OPTION)

In [17]:
if LLM_OPTION == 'ollama':
    logger.info(f"Using local Ollama LLM...")
    # from llama_index.embeddings.ollama import OllamaEmbedding
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
    LLM_SERVER_HOST = '192.168.100.14'
    LLM_SERVER_PORT = 11434
    base_url = f'http://{LLM_SERVER_HOST}:{LLM_SERVER_PORT}'
    OLLAMA_MODEL_NAME = 'llama3'
    llm = Ollama(base_url=base_url, model=OLLAMA_MODEL_NAME, request_timeout=60.0)
    !ping -c 1 $LLM_SERVER_HOST
    Settings.llm = llm
    # embed_model = OllamaEmbedding(
    #     model_name=OLLAMA_MODEL_NAME,
    #     base_url=base_url,
    #     ollama_additional_kwargs={"mirostat": 0},
    # )
    embed_model = HuggingFaceEmbedding(
        model_name="BAAI/bge-small-en-v1.5"
    )
    embed_model_dim = len(embed_model.get_text_embedding('sample text to find embedding dimensions'))
    Settings.embed_model = embed_model
    if LOG_TO_MLFLOW:
        mlflow.log_param("OLLAMA_MODEL_NAME", OLLAMA_MODEL_NAME)
elif LLM_OPTION == 'openai':
    logger.info(f"Using OpenAI LLM...")
    from llama_index.llms.openai import OpenAI
    from llama_index.embeddings.openai import OpenAIEmbedding
    embed_model = OpenAIEmbedding()
    OPENAI_MODEL_NAME = 'gpt-3.5-turbo'
    llm = OpenAI(model=OPENAI_MODEL_NAME)
    Settings.llm = llm
    Settings.embed_model = embed_model.max_length
    if LOG_TO_MLFLOW:
        mlflow.log_param("OPENAI_MODEL_NAME", OPENAI_MODEL_NAME)
elif LLM_OPTION == 'togetherai':
    logger.info(f"Using TogetherAI LLM...")
    from llama_index.llms.together import TogetherLLM
    from llama_index.embeddings.together import TogetherEmbedding
    TOGETHERAI_EMBEDDING_MODEL_NAME = 'togethercomputer/m2-bert-80M-2k-retrieval'
    embed_model = TogetherEmbedding(TOGETHERAI_EMBEDDING_MODEL_NAME)
    embed_model_dim = len(embed_model.get_text_embedding('sample text to find embedding dimensions'))
    TOGETHERAI_MODEL_NAME = 'meta-llama/Meta-Llama-3-8B-Instruct-Lite'
    llm = TogetherLLM(model=TOGETHERAI_MODEL_NAME)
    Settings.llm = llm
    Settings.embed_model = embed_model
    if LOG_TO_MLFLOW:
        mlflow.log_param("TOGETHERAI_MODEL_NAME", TOGETHERAI_MODEL_NAME)
        mlflow.log_param("TOGETHERAI_EMBEDDING_MODEL_NAME", TOGETHERAI_EMBEDDING_MODEL_NAME)

[32m2024-07-23 12:09:03.360[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mUsing TogetherAI LLM...[0m


# Index embeddings

## Qdrant as VectorStore

In [18]:
import os
import pickle
import qdrant_client
from qdrant_client.models import Distance, VectorParams
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

In [19]:
RECREATE_INDEX = True

COLLECTION = 'ollama'
NOTEBOOK_CACHE_DP = 'data/001/togetherai'
NODES_PERSIST_FP = f'{NOTEBOOK_CACHE_DP}/nodes.pkl'
os.makedirs(NOTEBOOK_CACHE_DP, exist_ok=True)

In [20]:
qdrantdb = qdrant_client.QdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    # location=":memory:"
    # otherwise set Qdrant instance address with:
    # url="http://<host>:<port>"
    # otherwise set Qdrant instance with host and port:
    host="localhost",
    port=6333
    # set API KEY for Qdrant Cloud
    # api_key="<qdrant-api-key>",
)
aqdrantdb = qdrant_client.AsyncQdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    # location=":memory:"
    # otherwise set Qdrant instance address with:
    # url="http://<host>:<port>"
    # otherwise set Qdrant instance with host and port:
    host="localhost",
    port=6333
    # set API KEY for Qdrant Cloud
    # api_key="<qdrant-api-key>",
)
collection_exists = qdrantdb.collection_exists(COLLECTION)
if RECREATE_INDEX or not collection_exists:
    if collection_exists:
        logger.info(f"Deleting existing Qdrant collection...")
        qdrantdb.delete_collection(COLLECTION)
    if os.path.exists(NODES_PERSIST_FP):
        logger.info(f"Deleting persisted nodes object at {NODES_PERSIST_FP}...")
        os.remove(NODES_PERSIST_FP)
    logger.info(f"Creating new Qdrant collection...")
    qdrantdb.create_collection(
        COLLECTION,
        vectors_config=VectorParams(size=embed_model_dim, distance=Distance.COSINE),
    )
else:
    logger.info(f"Use existing Qdrant collection")
db_collection = qdrantdb.get_collection(COLLECTION)
vector_store = QdrantVectorStore(
    client=qdrantdb,
    collection_name=COLLECTION,
    aclient=aqdrantdb,
    prefer_grpc=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

[32m2024-07-23 12:09:06.201[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mDeleting existing Qdrant collection...[0m
[32m2024-07-23 12:09:06.207[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m33[0m - [1mDeleting persisted nodes object at data/001/togetherai/nodes.pkl...[0m
[32m2024-07-23 12:09:06.207[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [1mCreating new Qdrant collection...[0m


Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.


In [21]:
CHUNKER = "SentenceSplitter"
CHUNKER_CONFIG = {
    "chunk_size": 512,
    "chunk_overlap": 10
}
if LOG_TO_MLFLOW:
    mlflow.log_param("CHUNKER", CHUNKER)
    for k, v in CHUNKER_CONFIG.items():
        mlflow.log_param(f"CHUNKER__{k}", v)

In [22]:
db_collection_count = db_collection.indexed_vectors_count

if db_collection_count > 0 and RECREATE_INDEX == False:
    logger.info(f"Loading index from existing DB...")
    with open(NODES_PERSIST_FP, 'rb') as f:
        nodes = pickle.load(f)
else:
    logger.info(f"Creating new DB index...")
    # Generate nodes
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/
    
    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline, IngestionCache
    
    # create the pipeline with transformations
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(**CHUNKER_CONFIG),
            TitleExtractor(),
            embed_model,
        ],
        vector_store = vector_store
    )
    
    # Need to use await and arun here to run the pipeline else error
    # Ref: https://docs.llamaindex.ai/en/stable/examples/ingestion/async_ingestion_pipeline/
    # Ref: https://github.com/run-llama/llama_index/issues/13904#issuecomment-2145561710
    nodes = await pipeline.arun(documents=documents)
    with open(NODES_PERSIST_FP, 'wb') as f:
        pickle.dump(nodes, f)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)

[32m2024-07-23 12:09:08.482[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mCreating new DB index...[0m


> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
> Adding chunk: All parameters are required
        
        If...
> Adding chunk: print (agent.chat( "browse to the latest email ...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: As these technologies evolve, they will continu...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: Step 2: Create the PostgresML Managed Index Fir...
> Adding chunk: The PostgresML Managed Index is doing embedding...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.66s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.31s/it]


## ChromaDB as VectorStore

In [23]:
import os
import pickle
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

In [24]:
RECREATE_INDEX = False

COLLECTION = 'togetherai'
NOTEBOOK_CACHE_DP = 'data/001/togetherai'
NODES_PERSIST_FP = f'{NOTEBOOK_CACHE_DP}/nodes.pkl'
os.makedirs(NOTEBOOK_CACHE_DP, exist_ok=True)

In [25]:
db = chromadb.PersistentClient(path=f"{NOTEBOOK_CACHE_DP}/chroma_db")
collection_exists = COLLECTION in [c.name for c in db.list_collections()]
if RECREATE_INDEX or not collection_exists:
    logger.info(f"Creating new ChromaDB collection...")
    if collection_exists:
        logger.info(f"Deleting existing ChromaDB collection...")
        db.delete_collection(COLLECTION)
    if os.path.exists(NODES_PERSIST_FP):
        logger.info(f"Deleting persisted nodes object at {NODES_PERSIST_FP}...")
        os.remove(NODES_PERSIST_FP)
else:
    logger.info(f"Use existing ChromaDB collection")
chroma_collection = db.get_or_create_collection(COLLECTION)

[32m2024-07-23 12:09:50.358[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mUse existing ChromaDB collection[0m


In [26]:
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [27]:
CHUNKER = "SentenceSplitter"
CHUNKER_CONFIG = {
    "chunk_size": 512,
    "chunk_overlap": 10
}
if LOG_TO_MLFLOW:
    mlflow.log_param("CHUNKER", CHUNKER)
    for k, v in CHUNKER_CONFIG.items():
        mlflow.log_param(f"CHUNKER__{k}", v)

In [28]:
if chroma_collection.count() > 0 and RECREATE_INDEX == False:
    logger.info(f"Loading index from existing ChromaDB...")
    with open(NODES_PERSIST_FP, 'rb') as f:
        nodes = pickle.load(f)
else:
    logger.info(f"Creating new ChromaDB index...")
    # Generate nodes
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/
    
    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline, IngestionCache
    
    # create the pipeline with transformations
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(**CHUNKER_CONFIG),
            TitleExtractor(),
            embedding,
        ],
        vector_store = vector_store
    )
    
    # Need to use await and arun here to run the pipeline else error
    # Ref: https://docs.llamaindex.ai/en/stable/examples/ingestion/async_ingestion_pipeline/
    # Ref: https://github.com/run-llama/llama_index/issues/13904#issuecomment-2145561710
    nodes = await pipeline.arun(documents=documents)
    with open(NODES_PERSIST_FP, 'wb') as f:
        pickle.dump(nodes, f)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)

[32m2024-07-23 12:09:51.484[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLoading index from existing ChromaDB...[0m


#### Inspect nodes

# Query engine

In [29]:
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [30]:
RETRIEVAL_TOP_K = 2
# Need to be able to control this cutoff until specify it
RETRIEVAL_SIMILARITY_CUTOFF = None
# RETRIEVAL_SIMILARITY_CUTOFF = 0.3

if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_TOP_K", RETRIEVAL_TOP_K)
    if RETRIEVAL_SIMILARITY_CUTOFF:
        mlflow.log_param("RETRIEVAL_SIMILARITY_CUTOFF", RETRIEVAL_SIMILARITY_CUTOFF)

In [31]:
# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=RETRIEVAL_TOP_K,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer()

node_postprocessors = []

if RETRIEVAL_SIMILARITY_CUTOFF is not None:
    node_postprocessors.append(SimilarityPostprocessor(similarity_cutoff=RETRIEVAL_SIMILARITY_CUTOFF))

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=node_postprocessors,
)

In [32]:
question = "What is MultiOn?"
response = query_engine.query(question)
logger.info(response)

> Top 2 nodes:
> [Node db00c023-90e1-4412-b278-c6764cfe7d09] [Similarity score: 6.458016012624403e-12] print (agent.chat( "browse to the latest email from Julian and open the email" )) Added user mess...
> [Node 9a9c4f7d-c70b-4787-8145-3b31812143ab] [Similarity score: 3.846737714469063e-12] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...
> Top 2 nodes:
> [Node db00c023-90e1-4412-b278-c6764cfe7d09] [Similarity score:             6.45802e-12] print (agent.chat( "browse to the latest email from Julian and open the email" )) Added user mess...
> [Node 9a9c4f7d-c70b-4787-8145-3b31812143ab] [Similarity score:             3.84674e-12] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...


[32m2024-07-23 12:09:56.922[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mMultiOn is a tool used to automate online tasks.[0m


# Evaluation

## Retrieval Evaluation

### Building synthetic evaluation dataset

In [33]:
with open(NODES_PERSIST_FP, 'rb') as f:
    nodes = pickle.load(f)

In [34]:
from llama_index.core.evaluation import generate_question_context_pairs, EmbeddingQAFinetuneDataset

In [35]:
RECREATE_RETRIEVAL_EVAL_DATASET = True
RETRIEVAL_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_retrieval_eval_dataset.json"
RETRIEVAL_NUM_SAMPLE_NODES = 10
RETRIEVAL_NUM_SAMPLE_NODES = min(len(nodes), RETRIEVAL_NUM_SAMPLE_NODES)
RETRIEVAL_NUM_QUESTIONS_PER_CHUNK = 2
if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_NUM_QUESTIONS_PER_CHUNK", RETRIEVAL_NUM_QUESTIONS_PER_CHUNK)
    mlflow.log_param("RETRIEVAL_NUM_SAMPLE_NODES", RETRIEVAL_NUM_SAMPLE_NODES)

In [36]:
if RECREATE_RETRIEVAL_EVAL_DATASET or not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    if RETRIEVAL_NUM_SAMPLE_NODES:
        logger.info(f"Sampling {RETRIEVAL_NUM_SAMPLE_NODES} nodes for retrieval evaluation...")
        np.random.seed(41)
        retrieval_eval_nodes = np.random.choice(nodes, RETRIEVAL_NUM_SAMPLE_NODES)
    else:
        logger.info(f"Using all nodes for retrieval evaluation")
        retrieval_eval_nodes = nodes
else:
    logger.info(f"Loading retrieval_eval_nodes from {RETRIEVAL_EVAL_DATASET_FP}...")
    with open(RETRIEVAL_EVAL_DATASET_FP, 'r') as f:
        retrieval_eval_nodes = json.load(f)

[32m2024-07-23 12:10:06.837[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mSampling 10 nodes for retrieval evaluation...[0m


In [37]:
if RECREATE_RETRIEVAL_EVAL_DATASET or not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    logger.info(f"Creating new synthetic retrieval eval dataset...")
    retrieval_eval_dataset = generate_question_context_pairs(
        retrieval_eval_nodes, llm=llm, num_questions_per_chunk=RETRIEVAL_NUM_QUESTIONS_PER_CHUNK
    )
    retrieval_eval_dataset.save_json(RETRIEVAL_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic retrieval eval dataset at {RETRIEVAL_EVAL_DATASET_FP}...")
    retrieval_eval_dataset = EmbeddingQAFinetuneDataset.from_json(RETRIEVAL_EVAL_DATASET_FP)

[32m2024-07-23 12:10:08.967[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mCreating new synthetic retrieval eval dataset...[0m
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:12<00:00,  1.72s/it]


### Evaluate

In [38]:
from llama_index.core.evaluation import RetrieverEvaluator

In [39]:
RETRIEVAL_METRICS = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    RETRIEVAL_METRICS, retriever=retriever
)

retrieval_eval_results = await retriever_evaluator.aevaluate_dataset(retrieval_eval_dataset)

> Top 2 nodes:
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score: 5.6656155025320805e-11] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> [Node db00c023-90e1-4412-b278-c6764cfe7d09] [Similarity score: 5.319756053366692e-11] print (agent.chat( "browse to the latest email from Julian and open the email" )) Added user mess...
> Top 2 nodes:
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score:             5.66562e-11] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> [Node db00c023-90e1-4412-b278-c6764cfe7d09] [Similarity score:             5.31976e-11] print (agent.chat( "browse to the latest email from Julian and open the email" )) Added user mess...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 8.190184815999299e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [No

In [40]:
def display_results(name, eval_results, metrics=['hit_rate', 'mrr'], include_cohere_rerank=False):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    columns = {
        "retrievers": [name],
        **{k: [full_df[k].mean()] for k in metrics},
    }

    if include_cohere_rerank:
        crr_relevancy = full_df["cohere_rerank_relevancy"].mean()
        columns.update({"cohere_rerank_relevancy": [crr_relevancy]})

    metric_df = pd.DataFrame(columns)

    return metric_df

In [41]:
metric_prefix = f"top_{RETRIEVAL_TOP_K}_retrieval_eval"
retrieval_eval_results_df = display_results(metric_prefix, retrieval_eval_results, metrics=RETRIEVAL_METRICS)
retrieval_eval_results_df

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,top_2_retrieval_eval,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
if LOG_TO_MLFLOW:
    for metric, metric_value in retrieval_eval_results_df.to_dict(orient='records')[0].items():
        if metric in RETRIEVAL_METRICS:
            mlflow.log_metric(f"{metric_prefix}_{metric}", metric_value)

### Manually curated dataset
Ref: https://docs.llamaindex.ai/en/stable/module_guides/evaluating/usage_pattern_retrieval/

In [45]:
MANUAL_EVAL_QA = [
("What are key features of llama-agents?",
"""
Key features of llama-agents are:
1. Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. Ease of deployment: launch, scale and monitor each agent and your control plane independently.
5. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service
"""
),
("What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?",
"""
Retrieval System and Response Generation.
"""
),
("What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?",
"""
Hit rate and Mean Reciprocal Rank (MRR)

Hit Rate: Hit rate calculates the fraction of queries where the correct answer is found within the top-k retrieved documents. In simpler terms, it’s about how often our system gets it right within the top few guesses.

Mean Reciprocal Rank (MRR): For each query, MRR evaluates the system’s accuracy by looking at the rank of the highest-placed relevant document. Specifically, it’s the average of the reciprocals of these ranks across all the queries. So, if the first relevant document is the top result, the reciprocal rank is 1; if it’s second, the reciprocal rank is 1/2, and so on.
"""
)
]

## Response Evaluation
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/downloading_llama_datasets/

In [46]:
async def aevaluate_labelled_rag_dataset(response_eval_dataset, query_engine, dataset_name="synthetic", batch_size=8, judge_model='gpt-3.5-turbo', cache_dp='.'):
    # Make predictions with the dataset
    response_eval_prediction_dataset = await response_eval_dataset.amake_predictions_with(
        predictor=query_engine, batch_size=batch_size, show_progress=True
    )

    # Instantiate the judges
    judges = {
        "correctness": CorrectnessEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "relevancy": RelevancyEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "faithfulness": FaithfulnessEvaluator(
            llm=OpenAI(temperature=0, model=judge_model),
        ),
        "semantic_similarity": SemanticSimilarityEvaluator(),
    }

    # Initialize evaluations dictionary
    evals = {
        "correctness": [],
        "relevancy": [],
        "faithfulness": [],
    }

    # Evaluate each prediction
    for example, prediction in tqdm(
        zip(response_eval_dataset.examples, response_eval_prediction_dataset.predictions)
    ):
        correctness_result = judges["correctness"].evaluate(
            query=example.query,
            response=prediction.response,
            reference=example.reference_answer,
        )

        relevancy_result = judges["relevancy"].evaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        faithfulness_result = judges["faithfulness"].evaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        evals["correctness"].append(correctness_result)
        evals["relevancy"].append(relevancy_result)
        evals["faithfulness"].append(faithfulness_result)

    # Save evaluations to JSON
    evaluations_objects = {
        "correctness": [e.dict() for e in evals["correctness"]],
        "faithfulness": [e.dict() for e in evals["faithfulness"]],
        "relevancy": [e.dict() for e in evals["relevancy"]],
    }

    with open(f"{cache_dp}/{dataset_name}_evaluations.json", "w") as json_file:
        json.dump(evaluations_objects, json_file)

    # Generate evaluation results DataFrames
    deep_eval_correctness_df, mean_correctness_df = get_eval_results_df(
        ["base_rag"] * len(evals["correctness"]),
        evals["correctness"],
        metric="correctness",
    )
    deep_eval_relevancy_df, mean_relevancy_df = get_eval_results_df(
        ["base_rag"] * len(evals["relevancy"]),
        evals["relevancy"],
        metric="relevancy",
    )
    deep_eval_faithfulness_df, mean_faithfulness_df = get_eval_results_df(
        ["base_rag"] * len(evals["faithfulness"]),
        evals["faithfulness"],
        metric="faithfulness",
    )

    mean_scores_df = pd.concat(
        [
            mean_correctness_df.reset_index(),
            mean_relevancy_df.reset_index(),
            mean_faithfulness_df.reset_index(),
        ],
        axis=0,
        ignore_index=True,
    )
    mean_scores_df = mean_scores_df.set_index("index")
    mean_scores_df.index = mean_scores_df.index.set_names(["metrics"])

    deep_eval_df = pd.concat([
        deep_eval_correctness_df[['query', 'answer']],
        deep_eval_relevancy_df[['scores']].rename(columns={'scores': 'relevancy_score'}),
        deep_eval_correctness_df[['scores']].rename(columns={'scores': 'correctness_score'}),
        deep_eval_faithfulness_df[['scores']].rename(columns={'scores': 'faithfulness_score'}),
    ], axis=1)

    return mean_scores_df, deep_eval_df

### Generate synthetic Llama Dataset

In [51]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.llama_dataset import LabeledRagDataset
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    SemanticSimilarityEvaluator,
)
from llama_index.core.evaluation.notebook_utils import get_eval_results_df
from llama_index.llms.openai import OpenAI

In [48]:
RECREATE_SYNTHETIC_EVAL_DATASET = True
RESPONSE_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_response_eval_dataset.json"
RESPONSE_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
RESPONSE_EVAL_LLM_MODEL_CONFIG = {
    "temperature": 0.3
}
SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK = 2
RESPONSE_NUM_SAMPLE_DOCUMENTS = 10
RESPONSE_NUM_SAMPLE_DOCUMENTS = min(len(documents), RESPONSE_NUM_SAMPLE_DOCUMENTS)

if LOG_TO_MLFLOW:
    mlflow.log_param("SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK", SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK)
    mlflow.log_param("RESPONSE_EVAL_LLM_MODEL", RESPONSE_EVAL_LLM_MODEL)
    mlflow.log_param("RESPONSE_NUM_SAMPLE_DOCUMENTS", RESPONSE_NUM_SAMPLE_DOCUMENTS)
    for k, v in RESPONSE_EVAL_LLM_MODEL_CONFIG.items():
        mlflow.log_param(f"RESPONSE_EVAL_LLM_MODEL_CONFIG__{k}", v)

In [49]:
if RESPONSE_NUM_SAMPLE_DOCUMENTS:
    logger.info(f"Sampling {RESPONSE_NUM_SAMPLE_DOCUMENTS} documents for response evaluation...")
    np.random.seed(41)
    response_eval_documents = np.random.choice(documents, RESPONSE_NUM_SAMPLE_DOCUMENTS)
else:
    logger.info(f"Using all documents for retrieval evaluation")
    response_eval_documents = documents

[32m2024-07-23 12:13:46.713[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mSampling 10 documents for response evaluation...[0m


In [52]:
if RECREATE_SYNTHETIC_EVAL_DATASET or not os.path.exists(RESPONSE_EVAL_DATASET_FP):
    logger.info(f"Creating synthetic response eval dataset...")
    # set context for llm provider
    response_eval_llm = OpenAI(model=RESPONSE_EVAL_LLM_MODEL, **RESPONSE_EVAL_LLM_MODEL_CONFIG)

    # instantiate a DatasetGenerator
    response_dataset_generator = RagDatasetGenerator.from_documents(
        response_eval_documents,
        llm=llm,
        num_questions_per_chunk=SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK,  # set the number of questions per nodes
        show_progress=True,
    )

    synthetic_response_eval_dataset = response_dataset_generator.generate_dataset_from_nodes()

    synthetic_response_eval_dataset.save_json(RESPONSE_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic response eval dataset at {RESPONSE_EVAL_DATASET_FP}...")
    synthetic_response_eval_dataset = LabeledRagDataset.from_json(RESPONSE_EVAL_DATASET_FP)

[32m2024-07-23 12:14:29.607[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mCreating synthetic response eval dataset...[0m


Parsing nodes:   0%|          | 0/10 [00:00<?, ?it/s]

> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
 
I have opened the la...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: documents = SimpleDirectoryReader( "data" ).loa...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
 
I have opened the la...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
 
I have opened the la...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
 
I have opened the la...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: documents = SimpleDirectoryReader( "data" ).loa...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: documents = SimpleDirectoryReader( "data" ).loa...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: documents = SimpleDirectoryReader( "data" ).loa...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: documents

 30%|█████████████████████████████████████████████████████████████████████▎                                                                                                                                                                 | 6/20 [00:06<00:12,  1.09it/s]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.42853900910315823 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 12/20 [00:13<00:06,  1.28it/s]

Retrying llama_index.llms.openai.base.OpenAI._achat in 1.2315786498605352 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


 65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                | 13/20 [00:16<00:09,  1.43s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.2172167516245488 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:24<00:00,  1.25s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.38s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.25s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [53]:
synthetic_mean_scores_df, synthetic_deep_eval_df = await aevaluate_labelled_rag_dataset(
    synthetic_response_eval_dataset,
    query_engine,
    dataset_name="synthetic",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 9a9c4f7d-c70b-4787-8145-3b31812143ab] [Similarity score: 3.013818838974388e-12] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...
> [Node db00c023-90e1-4412-b278-c6764cfe7d09] [Similarity score: 3.2198478479970387e-13] print (agent.chat( "browse to the latest email from Julian and open the email" )) Added user mess...
> Top 2 nodes:
> [Node 9a9c4f7d-c70b-4787-8145-3b31812143ab] [Similarity score:             3.01382e-12] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...
> [Node db00c023-90e1-4412-b278-c6764cfe7d09] [Similarity score:             3.21985e-13] print (agent.chat( "browse to the latest email from Julian and open the email" )) Added user mess...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 4.2704071343313573e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [N

Batch processing of predictions:  25%|█████████████████████████████████████████████████▊                                                                                                                                                     | 2/8 [00:04<00:10,  1.74s/it]

> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 7.177282530368129e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 9a9c4f7d-c70b-4787-8145-3b31812143ab] [Similarity score: 1.9958221238601968e-16] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score:             7.17728e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 9a9c4f7d-c70b-4787-8145-3b31812143ab] [Similarity score:             1.99582e-16] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 7.177282530368129e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [No

Batch processing of predictions:  50%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 4/8 [00:14<00:13,  3.41s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.8601720764667513 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 6/8 [00:16<00:04,  2.09s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.087690277880354 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:18<00:00,  2.33s/it]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 9a9c4f7d-c70b-4787-8145-3b31812143ab] [Similarity score: 1.2862159539067924e-12] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score: 1.2169093050872777e-13] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> Top 2 nodes:
> [Node 9a9c4f7d-c70b-4787-8145-3b31812143ab] [Similarity score:             1.28622e-12] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score:             1.21691e-13] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 7.177282530368129e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [N

Batch processing of predictions:  12%|████████████████████████▉                                                                                                                                                                              | 1/8 [00:11<01:23, 11.89s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.8351789383871835 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions:  25%|█████████████████████████████████████████████████▊                                                                                                                                                     | 2/8 [00:12<00:30,  5.09s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.9548157494374506 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions:  62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 5/8 [00:16<00:07,  2.34s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.5651679694242201 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:20<00:00,  2.61s/it]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 9a9c4f7d-c70b-4787-8145-3b31812143ab] [Similarity score: 3.013818838974388e-12] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...
> [Node db00c023-90e1-4412-b278-c6764cfe7d09] [Similarity score: 3.2198478479970387e-13] print (agent.chat( "browse to the latest email from Julian and open the email" )) Added user mess...
> Top 2 nodes:
> [Node 9a9c4f7d-c70b-4787-8145-3b31812143ab] [Similarity score:             3.01382e-12] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...
> [Node db00c023-90e1-4412-b278-c6764cfe7d09] [Similarity score:             3.21985e-13] print (agent.chat( "browse to the latest email from Julian and open the email" )) Added user mess...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 7.177282530368129e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [No

Batch processing of predictions:  12%|████████████████████████▉                                                                                                                                                                              | 1/8 [00:04<00:30,  4.42s/it]

> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 4.2704071343313573e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score: 8.042819804545458e-17] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score:             4.27041e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score:             8.04282e-17] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> Top 2 nodes:
> [Node 29bbc1bc-ded9-4dfc-b8f8-3b84a9810f45] [Similarity score: 2.8842676933158703e-10] The PostgresML Managed Index is doing embedding, retrieval, and augmented generation in one netwo...
> [N

Batch processing of predictions:  25%|█████████████████████████████████████████████████▊                                                                                                                                                     | 2/8 [00:11<00:34,  5.76s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.4279343163973939 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions:  38%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                            | 3/8 [00:12<00:19,  3.96s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.1656959158919381 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions:  50%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 4/8 [00:14<00:11,  2.95s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.5437019035547109 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:18<00:00,  2.36s/it]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 4.2704071343313573e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score: 8.042819804545458e-17] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score:             4.27041e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score:             8.04282e-17] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> Top 2 nodes:
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score: 1.5205991512041048e-05] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> [N

Batch processing of predictions:  12%|████████████████████████▉                                                                                                                                                                              | 1/8 [00:09<01:03,  9.03s/it]

> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 4.2704071343313573e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score: 8.042819804545458e-17] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score:             4.27041e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score:             8.04282e-17] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.14891555208704665 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 querie

Batch processing of predictions:  38%|██████████████████████████████████████████████████████████████████████████▋                                                                                                                            | 3/8 [00:13<00:18,  3.72s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.5120074270787855 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 0.031305556875834806 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions:  62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 5/8 [00:15<00:06,  2.20s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.9026215322134428 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:18<00:00,  2.36s/it]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score: 0.00011441420062940186] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 2.562835198429924e-05] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> Top 2 nodes:
> [Node 5285ed79-dcee-4e10-a86d-21ff930ae480] [Similarity score:             0.000114414] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score:             2.56284e-05] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 4.2704071343313573e-16] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [N

Batch processing of predictions:  12%|████████████████████████▉                                                                                                                                                                              | 1/8 [00:11<01:18, 11.21s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.12478805080255362 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions:  62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 5/8 [00:16<00:06,  2.16s/it]

Retrying llama_index.llms.openai.base.OpenAI._achat in 0.27697754765339977 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'You have been rate limited. Your rate limit is 60 queries per minute. Please navigate to https://api.together.xyz/settings/billing to upgrade to a paid plan.', 'type': 'credit_limit', 'param': None, 'code': None}}.


Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:19<00:00,  2.38s/it]


0it [00:00, ?it/s]

> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: The PostgresML Managed Index is doing embedding...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: The PostgresML Managed Index is doing embedding...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: print (agent.chat( "browse to the latest email ...
> Adding chunk: The email was authenticated and passed SPF and

In [54]:
synthetic_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,3.125
mean_relevancy_score,0.825
mean_faithfulness_score,1.0


In [55]:
synthetic_deep_eval_df

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score
0,"Based on the context information, I've generat...",Here are two potential questions based on the ...,1.0,3.5,1.0
1,**Question 1:** What is the primary function o...,The primary function of the LlamaIndex framewo...,1.0,3.0,1.0
2,"Based on the context information, I've generat...",Here are the two questions based on the provid...,1.0,2.0,1.0
3,**Question 1:** What is the new time for the e...,The new time for the event scheduled for Frida...,1.0,5.0,1.0
4,"Based on the context information, I've generat...",What are the key highlights of PostgresML?,1.0,2.0,1.0
5,**Question 1:**,"The event scheduled for Friday, August 6, 2021...",1.0,2.0,1.0
6,"Based on the context information, I've generat...",What are the key highlights of PostgresML?,1.0,2.0,1.0
7,**Question 1:** What is the primary advantage ...,The primary advantage of using PostgresML Mana...,1.0,4.5,1.0
8,"Based on the context information, I've generat...",Here are two potential questions based on the ...,1.0,2.0,1.0
9,**Question 1:** What is the primary function o...,The primary function of the LlamaIndex framewo...,1.0,3.0,1.0


In [56]:
if LOG_TO_MLFLOW:
    for k, v in synthetic_mean_scores_df.T.to_dict(orient='records')[0].items():
        mlflow.log_metric(f"synthetic_response_eval__{k}", v)
    synthetic_deep_eval_df.to_html(f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html", "synthetic_deep_eval_df")

### Manually curated
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/ragdataset_submission_template/#1c-creating-a-labelledragdataset-from-scratch-with-manually-constructed-examples

In [62]:
from llama_index.core.llama_dataset import LabelledRagDataset, LabelledRagDataExample, CreatedBy, CreatedByType

examples = []

for question, expected_anwser in MANUAL_EVAL_QA:
    example = LabelledRagDataExample(
        query=question,
        query_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_answer=expected_anwser,
        reference_answer_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_contexts=[],
    )
    examples.append(example)

curated_response_eval_dataset = LabelledRagDataset(examples=examples)

# save this dataset as it is required for the submission
curated_response_eval_dataset.save_json(f"{NOTEBOOK_CACHE_DP}/curated_response_eval_dataset.json")

In [63]:
curated_mean_scores_df, curated_deep_eval_df = await aevaluate_labelled_rag_dataset(
    curated_response_eval_dataset,
    query_engine,
    dataset_name="curated",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/3 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 5.978745003930558e-11] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 29bbc1bc-ded9-4dfc-b8f8-3b84a9810f45] [Similarity score: 1.2401104166261437e-11] The PostgresML Managed Index is doing embedding, retrieval, and augmented generation in one netwo...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score:             5.97875e-11] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 29bbc1bc-ded9-4dfc-b8f8-3b84a9810f45] [Similarity score:             1.24011e-11] The PostgresML Managed Index is doing embedding, retrieval, and augmented generation in one netwo...
> Top 2 nodes:
> [Node 97cdeac5-e779-44ec-b562-46040df13401] [Similarity score: 2.9332429461163504e-13] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [N

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:09<00:00,  3.30s/it]


0it [00:00, ?it/s]

> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: The PostgresML Managed Index is doing embedding...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: The PostgresML Managed Index is doing embedding...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: Step 2: Create the PostgresML Managed Index Fir...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: Step 2: Create the PostgresML Managed Index Fir...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: Step 2: Create the PostgresML Managed Index Fir...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: Step 2: Create the PostgresML Managed Index Fir...


In [64]:
curated_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,2.333333
mean_relevancy_score,0.333333
mean_faithfulness_score,1.0


In [65]:
with pd.option_context('display.max_colwidth', None):
    display(curated_deep_eval_df)

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score
0,What are key features of llama-agents?,"Model Serving - GPU accelerated inference engine for interactive applications, with no additional networking latency or reliability costs.",1.0,2.0,1.0
1,What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?,Model Serving and Model Training.,0.0,2.0,1.0
2,What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?,"The two main metrics used to evaluate the performance of the different rerankers in the RAG system are not explicitly mentioned in the provided context. However, it can be inferred that the performance of the rerankers is evaluated based on the speed and accuracy of the query results.",0.0,3.0,1.0


In [66]:
if LOG_TO_MLFLOW:
    for k, v in curated_mean_scores_df.T.to_dict(orient='records')[0].items():
        mlflow.log_metric(f"curated_response_eval__{k}", v)
    curated_deep_eval_df.to_html(f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html")
    mlflow.log_artifact(f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html", "curated_deep_eval_df")

# Clean up

In [67]:
if LOG_TO_MLFLOW:
    mlflow.end_run()

# Archive