# Set up

In [1]:
import json
from loguru import logger
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

# nest_asyncio patches asyncio so an event loop can be re-entered; the notebook
# kernel already runs a loop, and later cells call sync wrappers around async code.
nest_asyncio.apply()

In [3]:
from dotenv import load_dotenv

# Load environment variables from a local .env file
# (presumably the OpenAI API key used further down -- TODO confirm).
load_dotenv()

True

## Constants

In [4]:
# When True, llama_index debug logging is enabled and only the first two
# records of the dataset are indexed.
TESTING = True

In [5]:
import logging
import sys

if TESTING:
    # Stream llama_index debug logs to stdout. The handler is attached only once
    # so that re-running this cell does not duplicate every log line.
    llama_index_logger = logging.getLogger('llama_index')
    llama_index_logger.setLevel(logging.DEBUG)
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, 'stream', None) is sys.stdout
        for handler in llama_index_logger.handlers
    )
    if not has_stdout_handler:
        llama_index_logger.addHandler(logging.StreamHandler(stream=sys.stdout))

# Load data

In [6]:
# Location of the crawled blog posts (a JSON list of records).
DATA_FP = '../crawl_llamaindex_blog/data/blogs.json'
# Decode explicitly as UTF-8: the posts contain non-ASCII characters (curly quotes,
# em dashes) and the platform default encoding is not guaranteed to be UTF-8.
with open(DATA_FP, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [7]:
# Number of records loaded from DATA_FP.
len(data)

159

In [8]:
# Peek at the first five records to see the available fields.
data[:5]

[{'title': 'Automate online tasks with MultiOn and LlamaIndex',
  'author': 'MultiOn',
  'date': 'May 23, 2024',
  'tags': ['automation', 'Agents']},
 {'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
  'content': 'We’re happy to announce the recent integration of LlamaIndex with PostgresML — a comprehensive machine learning platform built on PostgreSQL. The PostgresML Managed Index allows LlamaIndex users to seamlessly manage document storage, splitting, embedding, and retrieval. By using PostgresML as the backend, users benefit from a streamlined and optimized process for Retrieval-Augmented Generation (RAG). This integration unifies embedding, vector search, and text generation into a single network call, resulting in faster, more reliable, and easier-to-manage RAG workflows. The problem with typical RAG workflows Typical Retrieval-Augmented Generation (RAG) workflows come with significant drawbacks, particularly for users. Poor performance is a ma

# Check data

In [9]:
# Inspect the raw text of the first record.
data[0]['content']



# Prepare documents

In [10]:
# Work on just the first two records while TESTING is on; otherwise use everything.
input_data = data[:2] if TESTING else data

In [11]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter


def _record_to_document(record):
    """Turn one blog record into a Document.

    The text is the title followed by the content on a new line; title, author,
    date and the comma-joined tags are attached as metadata.
    """
    heading = record['title']
    meta = {
        'title': heading,
        'author': record['author'],
        'date': record['date'],
        'tags': ', '.join(record['tags']),
    }
    body = f"{heading}\n{record['content']}"
    return Document(text=body, metadata=meta)


documents = [_record_to_document(record) for record in input_data]

In [12]:
# Show the first Document built above.
documents[0]



In [13]:
# Check the metadata attached to the second Document.
documents[1].metadata

{'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
 'author': 'PostgresML',
 'date': 'May 28, 2024',
 'tags': 'Managed Indexes'}

## Setting LLM

In [14]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings, ServiceContext

In [15]:
# Backend used for both the LLM and the embeddings: 'openai' or 'ollama'.
LLM_OPTION = 'openai'
# LLM_OPTION = 'ollama'

In [16]:
if LLM_OPTION == 'ollama':
    logger.info(f"Using local Ollama LLM...")
    from llama_index.embeddings.ollama import OllamaEmbedding
    LLM_SERVER_HOST = '192.168.100.14'
    LLM_SERVER_PORT = 11434
    base_url = f'http://{LLM_SERVER_HOST}:{LLM_SERVER_PORT}'
    OLLAMA_MODEL_NAME = 'llama3'
    llm = Ollama(base_url=base_url, model=model_name, request_timeout=60.0)
    !ping -c 1 $LLM_SERVER_HOST
    Settings.llm = llm
    embedding = OllamaEmbedding(
        model_name=OLLAMA_MODEL_NAME,
        base_url=base_url,
        ollama_additional_kwargs={"mirostat": 0},
    )
    Settings.embed_model = embedding
elif LLM_OPTION == 'openai':
    logger.info(f"Using OpenAI LLM...")
    from llama_index.llms.openai import OpenAI
    from llama_index.embeddings.openai import OpenAIEmbedding
    embedding = OpenAIEmbedding()
    llm = OpenAI()

# Index embeddings

In [18]:
import os
import pickle
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

In [19]:
# Directory holding this notebook's cached artifacts; created on first run.
NOTEBOOK_CACHE_DP = 'data/001'
os.makedirs(NOTEBOOK_CACHE_DP, exist_ok=True)

# Pickle file with the ingested nodes.
NODES_PERSIST_FP = f'{NOTEBOOK_CACHE_DP}/nodes.pkl'

# Name of the ChromaDB collection that stores the embeddings.
COLLECTION = 'mvp'

# Set to True to drop the collection and the cached nodes and rebuild them.
RECREATE_INDEX = False

In [21]:
# Open (or create) the on-disk ChromaDB store.
db = chromadb.PersistentClient(path="./chroma_db")
# list_collections() yields Collection objects in some chromadb releases and plain
# names in others; getattr() handles both.
collection_exists = COLLECTION in [getattr(c, 'name', c) for c in db.list_collections()]
if RECREATE_INDEX:
    if collection_exists:
        logger.info(f"Deleting existing ChromaDB collection...")
        db.delete_collection(COLLECTION)
    # The nodes pickle may be absent (e.g. first run or manual cleanup); removing it
    # unconditionally used to raise FileNotFoundError.
    if os.path.exists(NODES_PERSIST_FP):
        logger.info(f"Deleting persisted nodes object at {NODES_PERSIST_FP}...")
        os.remove(NODES_PERSIST_FP)
elif collection_exists:
    # Only claim reuse when there actually is a collection to reuse.
    logger.info(f"Use existing ChromaDB collection")
chroma_collection = db.get_or_create_collection(COLLECTION)

[32m2024-07-21 20:28:39.288[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mUse existing ChromaDB collection[0m


In [22]:
# Wrap the Chroma collection as a LlamaIndex vector store and build a storage
# context around it; both are passed to VectorStoreIndex further down.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [24]:
if chroma_collection.count() > 0 and RECREATE_INDEX == False:
    logger.info(f"Loading index from existing ChromaDB...")
    with open(NODES_PERSIST_FP, 'rb') as f:
        nodes = pickle.load(f)
else:
    logger.info(f"Creating new ChromaDB index...")
    # Generate nodes
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/
    
    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline, IngestionCache
    
    # create the pipeline with transformations
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=512, chunk_overlap=10),
            TitleExtractor(),
            embedding,
        ],
        vector_store = vector_store
    )
    
    # Need to use await and arun here to run the pipeline else error
    # Ref: https://docs.llamaindex.ai/en/stable/examples/ingestion/async_ingestion_pipeline/
    # Ref: https://github.com/run-llama/llama_index/issues/13904#issuecomment-2145561710
    nodes = await pipeline.arun(documents=documents)
    with open(NODES_PERSIST_FP, 'wb') as f:
        pickle.dump(nodes, f)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)

[32m2024-07-21 20:28:52.926[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLoading index from existing ChromaDB...[0m


#### Inspect nodes

# Query engine

In [25]:
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [26]:
# Passed to the retriever as similarity_top_k.
RETRIEVAL_TOP_K = 2
# Passed to SimilarityPostprocessor as similarity_cutoff in the query engine.
RETRIEVAL_SIMILARITY_CUTOFF = 0.7

In [27]:
# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=RETRIEVAL_TOP_K,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer()

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=RETRIEVAL_SIMILARITY_CUTOFF)],
)

In [28]:
# Smoke-test the query engine with a single question and log the answer.
question = "What is MultiOn?"
response = query_engine.query(question)
logger.info(response)

> Top 2 nodes:
> [Node 0852227d-b31c-4bf0-bc44-6151b69f33a2] [Similarity score: 0.7028767111776245] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node c31413e8-b4ee-4eac-8156-50d0ba3cb407] [Similarity score: 0.6983078035402547] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> Top 2 nodes:
> [Node 0852227d-b31c-4bf0-bc44-6151b69f33a2] [Similarity score:             0.702877] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node c31413e8-b4ee-4eac-8156-50d0ba3cb407] [Similarity score:             0.698308] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...


[32m2024-07-21 20:29:32.371[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mMultiOn is the author of the document discussing the automation of tasks with MultiOn and LlamaIndex.[0m


# Evaluation

## Retrieval Evaluation

### Building synthetic evaluation dataset

In [29]:
# Reload the cached nodes for dataset generation.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(NODES_PERSIST_FP, 'rb') as f:
    nodes = pickle.load(f)

In [30]:
from llama_index.core.evaluation import generate_question_context_pairs, EmbeddingQAFinetuneDataset

In [32]:
# Number of synthetic questions generated per node.
RETRIEVAL_NUM_QUESTIONS_PER_CHUNK = 2
# Set to True to regenerate the dataset even if a cached copy exists.
RECREATE_RETRIEVAL_EVAL_DATASET = False
# Cache file for the synthetic retrieval evaluation dataset.
RETRIEVAL_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_retrieval_eval_dataset.json"

In [33]:
# Reuse the cached dataset unless a rebuild was requested or no cache exists yet.
use_cached_retrieval_dataset = (
    not RECREATE_RETRIEVAL_EVAL_DATASET and os.path.exists(RETRIEVAL_EVAL_DATASET_FP)
)

if use_cached_retrieval_dataset:
    logger.info(f"Loading existing synthetic retrieval eval dataset at {RETRIEVAL_EVAL_DATASET_FP}...")
    retrieval_eval_dataset = EmbeddingQAFinetuneDataset.from_json(RETRIEVAL_EVAL_DATASET_FP)
else:
    logger.info(f"Creating new synthetic retrieval eval dataset...")
    # Generate question/context pairs from the nodes with the LLM, then cache them.
    retrieval_eval_dataset = generate_question_context_pairs(
        nodes,
        llm=llm,
        num_questions_per_chunk=RETRIEVAL_NUM_QUESTIONS_PER_CHUNK,
    )
    retrieval_eval_dataset.save_json(RETRIEVAL_EVAL_DATASET_FP)

[32m2024-07-21 20:31:32.621[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mLoading existing synthetic retrieval eval dataset at data/001/llamaindex_blog_retrieval_eval_dataset.json...[0m


### Evaluate

In [35]:
from llama_index.core.evaluation import RetrieverEvaluator

In [36]:
# Metrics computed for every query of the retrieval evaluation dataset.
RETRIEVAL_METRICS = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

# Evaluator bound to the vector retriever configured in the query-engine section.
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    RETRIEVAL_METRICS, retriever=retriever
)

# Run the evaluation over the whole dataset (top-level await in the notebook).
retrieval_eval_results = await retriever_evaluator.aevaluate_dataset(retrieval_eval_dataset)

> Top 2 nodes:
> [Node c31413e8-b4ee-4eac-8156-50d0ba3cb407] [Similarity score: 0.7706024402397623] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> [Node 83ea5ca4-144c-4d4f-9e16-126437c04d50] [Similarity score: 0.7690872064464744] print (agent.chat( "browse to the latest email from Julian and open the email" )) Added user mess...
> Top 2 nodes:
> [Node c31413e8-b4ee-4eac-8156-50d0ba3cb407] [Similarity score:             0.770602] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> [Node 83ea5ca4-144c-4d4f-9e16-126437c04d50] [Similarity score:             0.769087] print (agent.chat( "browse to the latest email from Julian and open the email" )) Added user mess...
> Top 2 nodes:
> [Node 0852227d-b31c-4bf0-bc44-6151b69f33a2] [Similarity score: 0.7600939133324441] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node c31413e8-b4ee

In [37]:
def display_results(name, eval_results, metrics=('hit_rate', 'mrr'), include_cohere_rerank=False):
    """Summarise retrieval evaluation results as a one-row DataFrame of mean metrics.

    Args:
        name: Label written to the ``retrievers`` column.
        eval_results: Iterable of objects exposing a ``metric_vals_dict`` mapping
            (metric name -> value), one per evaluated query.
        metrics: Names of the metrics to average. Any iterable of strings.
        include_cohere_rerank: When True, also average ``cohere_rerank_relevancy``.

    Returns:
        A DataFrame with a single row: ``retrievers`` followed by one column per
        requested metric holding its mean over ``eval_results``.

    Raises:
        KeyError: If a requested metric is absent from ``eval_results`` (this
            includes the case of empty ``eval_results`` with a non-empty ``metrics``).
    """
    full_df = pd.DataFrame([eval_result.metric_vals_dict for eval_result in eval_results])

    # Copy into a list so the caller's sequence (or the default) is never mutated.
    wanted = list(metrics)
    if include_cohere_rerank:
        wanted.append("cohere_rerank_relevancy")

    # Report every missing metric at once instead of failing on the first column lookup.
    missing = [metric for metric in wanted if metric not in full_df.columns]
    if missing:
        raise KeyError(f"Metrics not present in eval_results: {missing}")

    columns = {"retrievers": [name]}
    columns.update({metric: [full_df[metric].mean()] for metric in wanted})

    return pd.DataFrame(columns)

In [38]:
# Mean of every retrieval metric for the current top-k setting.
display_results(f"top-{RETRIEVAL_TOP_K} eval", retrieval_eval_results, metrics=RETRIEVAL_METRICS)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,top-2 eval,0.888889,0.75,0.444444,0.888889,0.75,0.48216


### Manually curated dataset
Ref: https://docs.llamaindex.ai/en/stable/module_guides/evaluating/usage_pattern_retrieval/

In [39]:
# Hand-written (question, reference answer) pairs for evaluation.
# NOTE(review): the triple-quoted answers keep their leading and trailing newlines.
MANUAL_EVAL_QA = [
("What are key features of llama-agents?",
"""
Key features of llama-agents are:
1. Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. Ease of deployment: launch, scale and monitor each agent and your control plane independently.
5. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service
"""
),
("What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?",
"""
Retrieval System and Response Generation.
"""
),
("What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?",
"""
Hit rate and Mean Reciprocal Rank (MRR)

Hit Rate: Hit rate calculates the fraction of queries where the correct answer is found within the top-k retrieved documents. In simpler terms, it’s about how often our system gets it right within the top few guesses.

Mean Reciprocal Rank (MRR): For each query, MRR evaluates the system’s accuracy by looking at the rank of the highest-placed relevant document. Specifically, it’s the average of the reciprocals of these ranks across all the queries. So, if the first relevant document is the top result, the reciprocal rank is 1; if it’s second, the reciprocal rank is 1/2, and so on.
"""
)
]

## Response Evaluation
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/downloading_llama_datasets/

In [40]:
async def aevaluate_labelled_rag_dataset(response_eval_dataset, query_engine, dataset_name="synthetic", batch_size=8, judge_model='gpt-3.5-turbo', cache_dp='.'):
    """Answer every example of a labelled RAG dataset and score the answers with LLM judges.

    Args:
        response_eval_dataset: Labelled RAG dataset; must expose ``examples`` and
            ``amake_predictions_with``.
        query_engine: Query engine used as the predictor.
        dataset_name: Prefix of the JSON file the raw evaluations are written to.
        batch_size: Batch size forwarded to ``amake_predictions_with``.
        judge_model: OpenAI model name used by all judges (temperature 0).
        cache_dp: Directory receiving ``{dataset_name}_evaluations.json``.

    Returns:
        Tuple ``(mean_scores_df, deep_eval_df)``: the mean correctness / relevancy /
        faithfulness scores indexed by metric, and a per-example frame with the
        query, the answer and the three scores.
    """
    # Imported locally so the function does not depend on the notebook-level OpenAI
    # import, which only happens when LLM_OPTION == 'openai'.
    from llama_index.llms.openai import OpenAI

    # Make predictions with the dataset
    response_eval_prediction_dataset = await response_eval_dataset.amake_predictions_with(
        predictor=query_engine, batch_size=batch_size, show_progress=True
    )

    # One deterministic judge LLM shared by all evaluators.
    judge_llm = OpenAI(temperature=0, model=judge_model)
    judges = {
        "correctness": CorrectnessEvaluator(llm=judge_llm),
        "relevancy": RelevancyEvaluator(llm=judge_llm),
        "faithfulness": FaithfulnessEvaluator(llm=judge_llm),
    }

    # One result list per judge.
    evals = {metric: [] for metric in judges}

    examples = response_eval_dataset.examples
    predictions = response_eval_prediction_dataset.predictions

    # Evaluate each prediction. `total` lets tqdm draw a real progress bar (zip has no
    # length), and the judges are awaited directly instead of going through the sync
    # wrappers, which would re-enter the running event loop.
    for example, prediction in tqdm(
        zip(examples, predictions), total=min(len(examples), len(predictions))
    ):
        correctness_result = await judges["correctness"].aevaluate(
            query=example.query,
            response=prediction.response,
            reference=example.reference_answer,
        )

        relevancy_result = await judges["relevancy"].aevaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        faithfulness_result = await judges["faithfulness"].aevaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        evals["correctness"].append(correctness_result)
        evals["relevancy"].append(relevancy_result)
        evals["faithfulness"].append(faithfulness_result)

    # Save evaluations to JSON
    evaluations_objects = {
        "correctness": [e.dict() for e in evals["correctness"]],
        "faithfulness": [e.dict() for e in evals["faithfulness"]],
        "relevancy": [e.dict() for e in evals["relevancy"]],
    }

    with open(f"{cache_dp}/{dataset_name}_evaluations.json", "w") as json_file:
        json.dump(evaluations_objects, json_file)

    # Per-example ("deep") and mean result frames for each metric.
    deep_dfs = {}
    mean_dfs = {}
    for metric in ("correctness", "relevancy", "faithfulness"):
        deep_dfs[metric], mean_dfs[metric] = get_eval_results_df(
            ["base_rag"] * len(evals[metric]),
            evals[metric],
            metric=metric,
        )

    # Stack the three mean rows and index them by metric name.
    mean_scores_df = pd.concat(
        [
            mean_dfs["correctness"].reset_index(),
            mean_dfs["relevancy"].reset_index(),
            mean_dfs["faithfulness"].reset_index(),
        ],
        axis=0,
        ignore_index=True,
    )
    mean_scores_df = mean_scores_df.set_index("index")
    mean_scores_df.index = mean_scores_df.index.set_names(["metrics"])

    # Query and answer come from the correctness frame; the score columns are aligned by row.
    deep_eval_df = pd.concat([
        deep_dfs["correctness"][['query', 'answer']],
        deep_dfs["relevancy"][['scores']].rename(columns={'scores': 'relevancy_score'}),
        deep_dfs["correctness"][['scores']].rename(columns={'scores': 'correctness_score'}),
        deep_dfs["faithfulness"][['scores']].rename(columns={'scores': 'faithfulness_score'}),
    ], axis=1)

    return mean_scores_df, deep_eval_df

### Generate synthetic Llama Dataset

In [42]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.llama_dataset import LabeledRagDataset
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    SemanticSimilarityEvaluator,
)
from llama_index.core.evaluation.notebook_utils import get_eval_results_df

In [43]:
# Number of synthetic questions generated per chunk.
SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK = 2
# Set to True to regenerate the dataset even if a cached copy exists.
RECREATE_SYNTHETIC_EVAL_DATASET = False
# OpenAI model used to generate the synthetic dataset and to judge responses.
RESPONSE_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
# Extra keyword arguments for the dataset-generation LLM.
RESPONSE_EVAL_LLM_MODEL_CONFIG = {
    "temperature": 0.3
}
# Cache file for the synthetic response evaluation dataset.
RESPONSE_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_response_eval_dataset.json"

In [44]:
if RECREATE_SYNTHETIC_EVAL_DATASET or not os.path.exists(RESPONSE_EVAL_DATASET_FP):
    logger.info(f"Creating synthetic response eval dataset...")
    # Imported locally so this cell also works when LLM_OPTION == 'ollama'
    # (the notebook-level OpenAI import only happens on the 'openai' path).
    from llama_index.llms.openai import OpenAI

    # LLM that writes the synthetic questions and reference answers.
    response_eval_llm = OpenAI(model=RESPONSE_EVAL_LLM_MODEL, **RESPONSE_EVAL_LLM_MODEL_CONFIG)

    # Fixed: the generator used to receive the model *name* (a str) instead of the
    # LLM object built above, so the configured LLM was never used.
    response_dataset_generator = RagDatasetGenerator.from_documents(
        documents,
        llm=response_eval_llm,
        num_questions_per_chunk=SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK,  # set the number of questions per nodes
        show_progress=True,
    )

    synthetic_response_eval_dataset = response_dataset_generator.generate_dataset_from_nodes()

    # Cache the dataset so later runs can skip generation.
    synthetic_response_eval_dataset.save_json(RESPONSE_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic response eval dataset at {RESPONSE_EVAL_DATASET_FP}...")
    synthetic_response_eval_dataset = LabeledRagDataset.from_json(RESPONSE_EVAL_DATASET_FP)

[32m2024-07-21 20:33:18.182[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mLoading existing synthetic response eval dataset at data/001/llamaindex_blog_response_eval_dataset.json...[0m


In [45]:
# Answer and judge the synthetic dataset; the raw evaluations are written to
# {NOTEBOOK_CACHE_DP}/synthetic_evaluations.json by the helper.
synthetic_mean_scores_df, synthetic_deep_eval_df = await aevaluate_labelled_rag_dataset(
    synthetic_response_eval_dataset,
    query_engine,
    dataset_name="synthetic",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

Batch processing of predictions:   0%|                                                                          | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 8f6c7f71-ab97-41b0-add7-2dee53702208] [Similarity score: 0.7939684598032454] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 2fc2500d-0c87-4de0-b710-4baf7363146f] [Similarity score: 0.7869168959723832] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> Top 2 nodes:
> [Node 8f6c7f71-ab97-41b0-add7-2dee53702208] [Similarity score:             0.793968] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 2fc2500d-0c87-4de0-b710-4baf7363146f] [Similarity score:             0.786917] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> Top 2 nodes:
> [Node 0852227d-b31c-4bf0-bc44-6151b69f33a2] [Similarity score: 0.7856961414375103] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node c31413e8-b4ee

Batch processing of predictions:  12%|████████▎                                                         | 1/8 [00:01<00:10,  1.55s/it]

> Top 2 nodes:
> [Node d9b1eb63-625b-40a1-8528-4360c0866714] [Similarity score: 0.8508187598421912] Simplify your RAG application architecture with LlamaIndex + PostgresML
We’re happy to announce t...
> [Node 8f6c7f71-ab97-41b0-add7-2dee53702208] [Similarity score: 0.8238927288472728] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> Top 2 nodes:
> [Node d9b1eb63-625b-40a1-8528-4360c0866714] [Similarity score:             0.850819] Simplify your RAG application architecture with LlamaIndex + PostgresML
We’re happy to announce t...
> [Node 8f6c7f71-ab97-41b0-add7-2dee53702208] [Similarity score:             0.823893] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> Top 2 nodes:
> [Node c8aaf46e-911e-4e05-b8bf-935e3e2cc14a] [Similarity score: 0.7174431084585459] The email was authenticated and passed SPF and DKIM checks.

In response to the last email, I wou...
> [Node 77180175-87f6

Batch processing of predictions: 100%|██████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.45it/s]


0it [00:00, ?it/s]

> Adding chunk: As these technologies evolve, they will continu...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
> Adding chunk: As these technologies evolve, they will continu...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: All parameters are required
        
        If...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: All parameters are required
        
        If...
> Adding chunk: As these technologies evolve, they will continu...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
> Adding chunk: As these technologies evolve, they will continu...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: Simplify your RAG application architecture wit

In [46]:
# Mean judge scores on the synthetic dataset.
synthetic_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,4.0
mean_relevancy_score,0.875
mean_faithfulness_score,0.875


In [47]:
# Per-question answers and scores on the synthetic dataset.
synthetic_deep_eval_df

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score
0,What is the purpose of the LlamaIndex framewor...,The purpose of the LlamaIndex framework in the...,1.0,5.0,1.0
1,"In the technical walkthrough provided, describ...",Empty Response,0.0,1.0,0.0
2,"In the context of the document, describe the p...",The MultiOn agent responds to an email chain b...,1.0,4.0,1.0
3,What is the role of LlamaHub in relation to Mu...,LlamaHub complements MultiOn and LlamaIndex by...,1.0,5.0,1.0
4,Explain the key challenges associated with typ...,The key challenges associated with typical Ret...,1.0,4.5,1.0
5,Describe the process of setting up a PostgresM...,The process of setting up a PostgresML Managed...,1.0,4.5,1.0
6,Explain the process of creating an index using...,The process of creating an index using Postgre...,1.0,,1.0
7,How does the query engine function in Postgres...,The query engine in PostgresML functions by un...,1.0,,1.0


### Manually curated
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/ragdataset_submission_template/#1c-creating-a-labelledragdataset-from-scratch-with-manually-constructed-examples

In [48]:
from llama_index.core.llama_dataset import LabelledRagDataset, LabelledRagDataExample, CreatedBy, CreatedByType

# Wrap every hand-written (question, reference answer) pair as a human-authored
# example; no reference contexts are provided.
examples = [
    LabelledRagDataExample(
        query=question,
        query_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_answer=reference_answer,
        reference_answer_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_contexts=[],
    )
    for question, reference_answer in MANUAL_EVAL_QA
]

curated_response_eval_dataset = LabelledRagDataset(examples=examples)

# Persist the dataset, as it is required for the submission.
curated_dataset_fp = f"{NOTEBOOK_CACHE_DP}/curated_response_eval_dataset.json"
curated_response_eval_dataset.save_json(curated_dataset_fp)

In [49]:
# Answer and judge the curated dataset; the raw evaluations are written to
# {NOTEBOOK_CACHE_DP}/curated_evaluations.json by the helper.
curated_mean_scores_df, curated_deep_eval_df = await aevaluate_labelled_rag_dataset(
    curated_response_eval_dataset,
    query_engine,
    dataset_name="curated",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

Batch processing of predictions:   0%|                                                                          | 0/3 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 0852227d-b31c-4bf0-bc44-6151b69f33a2] [Similarity score: 0.6851398311784824] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node c31413e8-b4ee-4eac-8156-50d0ba3cb407] [Similarity score: 0.6658651314386175] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> Top 2 nodes:
> [Node 0852227d-b31c-4bf0-bc44-6151b69f33a2] [Similarity score:             0.68514] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node c31413e8-b4ee-4eac-8156-50d0ba3cb407] [Similarity score:             0.665865] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...


Batch processing of predictions:  33%|██████████████████████                                            | 1/3 [00:01<00:02,  1.48s/it]

> Top 2 nodes:
> [Node 8f6c7f71-ab97-41b0-add7-2dee53702208] [Similarity score: 0.7133581330203652] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node d9b1eb63-625b-40a1-8528-4360c0866714] [Similarity score: 0.7025082414123069] Simplify your RAG application architecture with LlamaIndex + PostgresML
We’re happy to announce t...
> Top 2 nodes:
> [Node 8f6c7f71-ab97-41b0-add7-2dee53702208] [Similarity score:             0.713358] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node d9b1eb63-625b-40a1-8528-4360c0866714] [Similarity score:             0.702508] Simplify your RAG application architecture with LlamaIndex + PostgresML
We’re happy to announce t...
> Top 2 nodes:
> [Node d9b1eb63-625b-40a1-8528-4360c0866714] [Similarity score: 0.6425521718172984] Simplify your RAG application architecture with LlamaIndex + PostgresML
We’re happy to announce t...
> [Node 8f6c7f71-ab97

Batch processing of predictions: 100%|██████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.27it/s]


0it [00:00, ?it/s]

> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: Simplify your RAG application architecture with...


In [50]:
# Mean judge scores on the curated dataset.
curated_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,1.833333
mean_relevancy_score,0.0
mean_faithfulness_score,0.333333


In [51]:
# Per-question answers and scores on the curated dataset.
curated_deep_eval_df

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score
0,What are key features of llama-agents?,Empty Response,0.0,1.0,0.0
1,What are the two critical areas of RAG system ...,The two critical areas of RAG system performan...,0.0,3.5,1.0
2,What are the two main metrics used to evaluate...,Empty Response,0.0,1.0,0.0


# Archive