# Set up

In [1]:
import json
from loguru import logger
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import mlflow

In [2]:
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

nest_asyncio.apply()

In [3]:
from dotenv import load_dotenv

load_dotenv()

True

## Constants

In [4]:
TESTING = False
DEBUG = True

In [5]:
import logging
import sys

# Route LlamaIndex's internal debug logs to the notebook output when DEBUG is on.
if DEBUG:
    llama_logger = logging.getLogger('llama_index')
    # Re-running this cell must not stack a second stdout handler: every extra
    # handler would print each log record one more time.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, 'stream', None) is sys.stdout
        for handler in llama_logger.handlers
    )
    if not has_stdout_handler:
        llama_logger.addHandler(logging.StreamHandler(stream=sys.stdout))
    llama_logger.setLevel(logging.DEBUG)

In [6]:
# Toggle experiment tracking; every later cell guards its MLflow calls with this flag.
LOG_TO_MLFLOW = True
if LOG_TO_MLFLOW:
    RUN_NAME = "mvp_002"
    RUN_DESCRIPTION = """
Add debug=True and reusing persisted database
"""
    # A run left open by a previous execution of this cell would make
    # start_run() raise; close it so the cell can be re-run safely.
    if mlflow.active_run() is not None:
        mlflow.end_run()
    mlflow.set_experiment("Chain Frost - LlamaIndex Blog QnA Chatbot")
    mlflow.start_run(run_name=RUN_NAME, description=RUN_DESCRIPTION)
    mlflow.log_param("TESTING", TESTING)

# Load data

In [7]:
# Crawled blog posts, stored as a JSON list with one record per post.
DATA_FP = '../crawl_llamaindex_blog/data/blogs.json'
# The posts contain non-ASCII text (curly quotes, emoji), so decode explicitly
# as UTF-8 instead of relying on the platform's default encoding.
with open(DATA_FP, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [8]:
len(data)

159

In [9]:
data[:5]

[{'title': 'Automate online tasks with MultiOn and LlamaIndex',
  'author': 'MultiOn',
  'date': 'May 23, 2024',
  'tags': ['automation', 'Agents']},
 {'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
  'content': 'We’re happy to announce the recent integration of LlamaIndex with PostgresML — a comprehensive machine learning platform built on PostgreSQL. The PostgresML Managed Index allows LlamaIndex users to seamlessly manage document storage, splitting, embedding, and retrieval. By using PostgresML as the backend, users benefit from a streamlined and optimized process for Retrieval-Augmented Generation (RAG). This integration unifies embedding, vector search, and text generation into a single network call, resulting in faster, more reliable, and easier-to-manage RAG workflows. The problem with typical RAG workflows Typical Retrieval-Augmented Generation (RAG) workflows come with significant drawbacks, particularly for users. Poor performance is a ma

# Check data

In [10]:
data[0]['content']



# Prepare documents

In [11]:
# In TESTING mode only the first two records are processed; otherwise all of them.
input_data = data[:2] if TESTING else data
logger.info(f"{len(input_data)=}")

[32m2024-07-22 15:02:05.452[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mlen(input_data)=159[0m


In [12]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

# Wrap every blog record in a LlamaIndex Document: the title is prepended to the
# body text and the descriptive fields are attached as metadata.
documents = []
for record in input_data:
    documents.append(
        Document(
            metadata={
                'title': record['title'],
                'author': record['author'],
                'date': record['date'],
                # tags are flattened from a list into one comma-separated string
                'tags': ', '.join(record['tags']),
            },
            text=f"{record['title']}\n{record['content']}",
        )
    )

In [13]:
documents[0]



In [14]:
documents[1].metadata

{'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
 'author': 'PostgresML',
 'date': 'May 28, 2024',
 'tags': 'Managed Indexes'}

## Setting LLM

In [15]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings, ServiceContext

In [16]:
# Provider used for both the LLM and the embedding model.
# The setup cell below handles 'openai' and 'ollama'.
LLM_OPTION = 'openai'
# LLM_OPTION = 'ollama'

# Record the chosen provider on the active MLflow run.
if LOG_TO_MLFLOW:
    mlflow.log_param("LLM_OPTION", LLM_OPTION)

In [17]:
if LLM_OPTION == 'ollama':
    logger.info(f"Using local Ollama LLM...")
    from llama_index.embeddings.ollama import OllamaEmbedding
    LLM_SERVER_HOST = '192.168.100.14'
    LLM_SERVER_PORT = 11434
    base_url = f'http://{LLM_SERVER_HOST}:{LLM_SERVER_PORT}'
    OLLAMA_MODEL_NAME = 'llama3'
    llm = Ollama(base_url=base_url, model=model_name, request_timeout=60.0)
    !ping -c 1 $LLM_SERVER_HOST
    Settings.llm = llm
    embedding = OllamaEmbedding(
        model_name=OLLAMA_MODEL_NAME,
        base_url=base_url,
        ollama_additional_kwargs={"mirostat": 0},
    )
    Settings.embed_model = embedding
    if LOG_TO_MLFLOW:
        mlflow.log_param("OLLAMA_MODEL_NAME", OLLAMA_MODEL_NAME)
elif LLM_OPTION == 'openai':
    logger.info(f"Using OpenAI LLM...")
    from llama_index.llms.openai import OpenAI
    from llama_index.embeddings.openai import OpenAIEmbedding
    embedding = OpenAIEmbedding()
    OPENAI_MODEL_NAME = 'gpt-3.5-turbo'
    llm = OpenAI(model=OPENAI_MODEL_NAME)
    if LOG_TO_MLFLOW:
        mlflow.log_param("OPENAI_MODEL_NAME", OPENAI_MODEL_NAME)

[32m2024-07-22 15:02:16.601[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m20[0m - [1mUsing OpenAI LLM...[0m


# Index embeddings

In [18]:
import os
import pickle
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

In [19]:
# Set to True to discard the persisted index and rebuild it from the documents.
RECREATE_INDEX = False

# Name of the ChromaDB collection holding the embedded nodes.
COLLECTION = 'mvp'
# Directory where this notebook keeps its persisted artifacts.
NOTEBOOK_CACHE_DP = 'data/001'
# Pickle file holding the nodes produced by the ingestion pipeline.
NODES_PERSIST_FP = f'{NOTEBOOK_CACHE_DP}/nodes.pkl'
os.makedirs(NOTEBOOK_CACHE_DP, exist_ok=True)

In [20]:
# Open (or create) the on-disk ChromaDB and decide whether the persisted index
# can be reused or has to be rebuilt from scratch.
db = chromadb.PersistentClient(path=f"{NOTEBOOK_CACHE_DP}/chroma_db")
# list_collections() yields Collection objects or plain names depending on the
# chromadb version; accept both.
collection_exists = COLLECTION in [getattr(c, "name", c) for c in db.list_collections()]
if RECREATE_INDEX or not collection_exists:
    # Only delete what is really there: deleting a missing collection or a
    # missing file raises.
    if collection_exists:
        logger.info(f"Deleting existing ChromaDB collection...")
        db.delete_collection(COLLECTION)
    if os.path.exists(NODES_PERSIST_FP):
        logger.info(f"Deleting persisted nodes object at {NODES_PERSIST_FP}...")
        os.remove(NODES_PERSIST_FP)
else:
    logger.info(f"Use existing ChromaDB collection")
chroma_collection = db.get_or_create_collection(COLLECTION)

[32m2024-07-22 15:02:31.799[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mUse existing ChromaDB collection[0m


In [21]:
# Expose the Chroma collection to LlamaIndex as a vector store and build a
# storage context on top of it.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [22]:
# Chunking strategy and its options; the options are later unpacked into
# SentenceSplitter.
CHUNKER = "SentenceSplitter"
CHUNKER_CONFIG = dict(chunk_size=512, chunk_overlap=10)
if LOG_TO_MLFLOW:
    mlflow.log_param("CHUNKER", CHUNKER)
    # One MLflow param per chunker option, prefixed with "CHUNKER__".
    for option_name, option_value in CHUNKER_CONFIG.items():
        mlflow.log_param(f"CHUNKER__{option_name}", option_value)

In [23]:
# Reuse the persisted nodes when the Chroma collection already holds entries and no
# rebuild was requested; otherwise run the ingestion pipeline and persist its output.
if chroma_collection.count() > 0 and RECREATE_INDEX == False:
    logger.info(f"Loading index from existing ChromaDB...")
    # NOTE(review): pickle.load can execute arbitrary code; only load a nodes file
    # that this notebook wrote itself.
    with open(NODES_PERSIST_FP, 'rb') as f:
        nodes = pickle.load(f)
else:
    logger.info(f"Creating new ChromaDB index...")
    # Generate nodes
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/
    
    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline, IngestionCache
    
    # create the pipeline with transformations
    # (split into chunks -> extract titles -> embed), writing into the vector store
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(**CHUNKER_CONFIG),
            TitleExtractor(),
            embedding,
        ],
        vector_store = vector_store
    )
    
    # Need to use await and arun here to run the pipeline else error
    # Ref: https://docs.llamaindex.ai/en/stable/examples/ingestion/async_ingestion_pipeline/
    # Ref: https://github.com/run-llama/llama_index/issues/13904#issuecomment-2145561710
    nodes = await pipeline.arun(documents=documents)
    # Persist the nodes so later runs can skip the ingestion above.
    with open(NODES_PERSIST_FP, 'wb') as f:
        pickle.dump(nodes, f)
# In both branches the index is built on top of the vector store.
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)

[32m2024-07-22 15:02:48.982[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLoading index from existing ChromaDB...[0m


#### Inspect nodes

# Query engine

In [24]:
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [26]:
# Number of nodes the retriever returns per query.
RETRIEVAL_TOP_K = 2
# Optional similarity cutoff; when not None it is handed to a
# SimilarityPostprocessor in the query-engine cell. None disables the filter.
RETRIEVAL_SIMILARITY_CUTOFF = None
# RETRIEVAL_SIMILARITY_CUTOFF = 0.3

if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_TOP_K", RETRIEVAL_TOP_K)
    # Test against None (not truthiness) so a cutoff of 0.0 is logged too,
    # matching the check that decides whether the postprocessor is applied.
    if RETRIEVAL_SIMILARITY_CUTOFF is not None:
        mlflow.log_param("RETRIEVAL_SIMILARITY_CUTOFF", RETRIEVAL_SIMILARITY_CUTOFF)

In [27]:
# Retriever: fetch the RETRIEVAL_TOP_K most similar nodes from the index.
retriever = VectorIndexRetriever(index=index, similarity_top_k=RETRIEVAL_TOP_K)

# Response synthesizer with the library defaults.
response_synthesizer = get_response_synthesizer()

# Filter retrieved nodes by similarity only when a cutoff has been configured.
node_postprocessors = (
    [SimilarityPostprocessor(similarity_cutoff=RETRIEVAL_SIMILARITY_CUTOFF)]
    if RETRIEVAL_SIMILARITY_CUTOFF is not None
    else []
)

# Query engine = retriever + postprocessors + response synthesizer.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=node_postprocessors,
)

In [28]:
# Smoke test: run one question through the query engine and log the answer.
question = "What is MultiOn?"
response = query_engine.query(question)
logger.info(response)

> Top 2 nodes:
> [Node eb1f08fa-46d9-47c1-a10a-3f65efd514bf] [Similarity score: 0.6978917999454536] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> [Node b6b6b342-99e9-4459-82d6-941a640ff819] [Similarity score: 0.6923464230331815] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> Top 2 nodes:
> [Node eb1f08fa-46d9-47c1-a10a-3f65efd514bf] [Similarity score:             0.697892] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> [Node b6b6b342-99e9-4459-82d6-941a640ff819] [Similarity score:             0.692346] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...


[32m2024-07-22 15:04:24.838[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mMultiOn is an AI agents platform designed to facilitate the autonomous completion of tasks in any web environment. It empowers developers to build AI agents that can manage online activities from start to finish, handling everything from simple data retrieval to complex interactions.[0m


# Evaluation

## Retrieval Evaluation

### Building synthetic evaluation dataset

In [29]:
# Reload the persisted nodes from disk for building the retrieval evaluation set.
# NOTE(review): pickle.load can execute arbitrary code; only load a nodes file that
# this notebook wrote itself.
with open(NODES_PERSIST_FP, 'rb') as f:
    nodes = pickle.load(f)

In [30]:
from llama_index.core.evaluation import generate_question_context_pairs, EmbeddingQAFinetuneDataset

In [35]:
RETRIEVAL_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_retrieval_eval_dataset.json"
# Set to True to force regeneration of the synthetic retrieval eval dataset.
RECREATE_RETRIEVAL_EVAL_DATASET = False
if not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    # Without a cached dataset the generation cell runs anyway; switch the flag
    # on so the cells gated on it define the sampled nodes and the generation
    # parameters it needs (otherwise it fails with a NameError).
    RECREATE_RETRIEVAL_EVAL_DATASET = True
if RECREATE_RETRIEVAL_EVAL_DATASET:
    RETRIEVAL_NUM_QUESTIONS_PER_CHUNK = 2
    RETRIEVAL_NUM_SAMPLE_NODES = 10

    if LOG_TO_MLFLOW:
        mlflow.log_param("RETRIEVAL_NUM_QUESTIONS_PER_CHUNK", RETRIEVAL_NUM_QUESTIONS_PER_CHUNK)
        mlflow.log_param("RETRIEVAL_NUM_SAMPLE_NODES", RETRIEVAL_NUM_SAMPLE_NODES)

In [36]:
# Pick the nodes the synthetic retrieval questions will be generated from.
if RECREATE_RETRIEVAL_EVAL_DATASET:
    if RETRIEVAL_NUM_SAMPLE_NODES:
        # Never ask for more nodes than exist; sampling without replacement
        # would raise in that case.
        sample_size = min(RETRIEVAL_NUM_SAMPLE_NODES, len(nodes))
        logger.info(f"Sampling {sample_size} nodes for retrieval evaluation...")
        np.random.seed(41)
        # Sample positions without replacement so no node is picked twice, and
        # index the list directly to keep a plain list of node objects.
        sampled_positions = np.random.choice(len(nodes), size=sample_size, replace=False)
        retrieval_eval_nodes = [nodes[position] for position in sampled_positions]
    else:
        logger.info(f"Using all nodes for retrieval evaluation")
        retrieval_eval_nodes = nodes

In [37]:
# Generate the synthetic question/context pairs when requested or when no cached
# dataset file exists; otherwise load the cached dataset from JSON.
if RECREATE_RETRIEVAL_EVAL_DATASET or not os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    logger.info(f"Creating new synthetic retrieval eval dataset...")
    # NOTE(review): this branch reads retrieval_eval_nodes and
    # RETRIEVAL_NUM_QUESTIONS_PER_CHUNK, which earlier cells define only when
    # RECREATE_RETRIEVAL_EVAL_DATASET is truthy.
    retrieval_eval_dataset = generate_question_context_pairs(
        retrieval_eval_nodes, llm=llm, num_questions_per_chunk=RETRIEVAL_NUM_QUESTIONS_PER_CHUNK
    )
    # Cache the generated dataset for later runs.
    retrieval_eval_dataset.save_json(RETRIEVAL_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic retrieval eval dataset at {RETRIEVAL_EVAL_DATASET_FP}...")
    retrieval_eval_dataset = EmbeddingQAFinetuneDataset.from_json(RETRIEVAL_EVAL_DATASET_FP)

[32m2024-07-22 15:05:51.359[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mLoading existing synthetic retrieval eval dataset at data/001/llamaindex_blog_retrieval_eval_dataset.json...[0m


### Evaluate

In [38]:
from llama_index.core.evaluation import RetrieverEvaluator

In [39]:
# Metric names handed to the retriever evaluator; also used below to select
# which result columns are averaged and logged.
RETRIEVAL_METRICS = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    RETRIEVAL_METRICS, retriever=retriever
)

# Evaluate the retriever on the retrieval eval dataset (async API, hence the await).
retrieval_eval_results = await retriever_evaluator.aevaluate_dataset(retrieval_eval_dataset)

> Top 2 nodes:
> [Node fdc7d228-4d26-40b7-9e89-a644305265e4] [Similarity score: 0.8067454274188276] The agent then reasons that it needs to call the  read_search_data  tool, which will query the in...
> [Node 1f4bd31e-e73c-4264-b765-d7d76f2359b9] [Similarity score: 0.788734662447601] Data Agents
Today we’re incredibly excited to announce the launch of a big new capability within ...
> Top 2 nodes:
> [Node fdc7d228-4d26-40b7-9e89-a644305265e4] [Similarity score:             0.806745] The agent then reasons that it needs to call the  read_search_data  tool, which will query the in...
> [Node 1f4bd31e-e73c-4264-b765-d7d76f2359b9] [Similarity score:             0.788735] Data Agents
Today we’re incredibly excited to announce the launch of a big new capability within ...
> Top 2 nodes:
> [Node 8b1f680b-98a8-4088-b680-0ec4dd4f627f] [Similarity score: 0.8044526440159468] It repeats these steps in an iterative loop until the task is complete. There are other interacti...
> [Node fdc7d228-4d26-

In [40]:
def display_results(name, eval_results, metrics=['hit_rate', 'mrr'], include_cohere_rerank=False):
    """Summarise per-query evaluation results as a one-row DataFrame.

    Args:
        name: Label stored in the "retrievers" column of the summary row.
        eval_results: Iterable of result objects exposing ``metric_vals_dict``
            (a mapping of metric name to value).
        metrics: Metric names to average; each becomes one column.
        include_cohere_rerank: When True, also average the
            "cohere_rerank_relevancy" metric into a column of the same name.

    Returns:
        A single-row DataFrame with the "retrievers" column followed by the
        mean of every requested metric.
    """
    per_query_df = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    summary = {"retrievers": [name]}
    for metric_name in metrics:
        summary[metric_name] = [per_query_df[metric_name].mean()]

    if include_cohere_rerank:
        summary["cohere_rerank_relevancy"] = [per_query_df["cohere_rerank_relevancy"].mean()]

    return pd.DataFrame(summary)

In [41]:
# Label for this retrieval setup; stored in the "retrievers" column of the summary
# and reused as the prefix of the metric names logged to MLflow.
metric_prefix = f"top_{RETRIEVAL_TOP_K}_retrieval_eval"
retrieval_eval_results_df = display_results(metric_prefix, retrieval_eval_results, metrics=RETRIEVAL_METRICS)
retrieval_eval_results_df

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,top_2_retrieval_eval,0.473684,0.421053,0.236842,0.473684,0.421053,0.266618


In [42]:
# Log every averaged retrieval metric from the one-row summary to MLflow.
if LOG_TO_MLFLOW:
    summary_row = retrieval_eval_results_df.to_dict(orient='records')[0]
    for column_name, column_value in summary_row.items():
        # Skip columns that are not metrics (e.g. the "retrievers" label).
        if column_name not in RETRIEVAL_METRICS:
            continue
        mlflow.log_metric(f"{metric_prefix}_{column_name}", column_value)

### Manually curated dataset
Ref: https://docs.llamaindex.ai/en/stable/module_guides/evaluating/usage_pattern_retrieval/

In [43]:
# Hand-written evaluation set: a list of (question, reference answer) tuples.
MANUAL_EVAL_QA = [
("What are key features of llama-agents?",
"""
Key features of llama-agents are:
1. Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. Ease of deployment: launch, scale and monitor each agent and your control plane independently.
5. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service
"""
),
("What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?",
"""
Retrieval System and Response Generation.
"""
),
("What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?",
"""
Hit rate and Mean Reciprocal Rank (MRR)

Hit Rate: Hit rate calculates the fraction of queries where the correct answer is found within the top-k retrieved documents. In simpler terms, it’s about how often our system gets it right within the top few guesses.

Mean Reciprocal Rank (MRR): For each query, MRR evaluates the system’s accuracy by looking at the rank of the highest-placed relevant document. Specifically, it’s the average of the reciprocals of these ranks across all the queries. So, if the first relevant document is the top result, the reciprocal rank is 1; if it’s second, the reciprocal rank is 1/2, and so on.
"""
)
]

## Response Evaluation
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/downloading_llama_datasets/

In [44]:
async def aevaluate_labelled_rag_dataset(response_eval_dataset, query_engine, dataset_name="synthetic", batch_size=8, judge_model='gpt-3.5-turbo', cache_dp='.'):
    """Answer a labelled RAG dataset with a query engine and score the answers.

    The query engine first produces a prediction for every example. Each
    prediction is then judged for correctness (against the reference answer),
    relevancy and faithfulness (both against the retrieved contexts). The raw
    judgements are written to ``{cache_dp}/{dataset_name}_evaluations.json``.

    Args:
        response_eval_dataset: Dataset exposing ``examples`` and
            ``amake_predictions_with``.
        query_engine: Predictor used to answer every example query.
        dataset_name: Prefix of the evaluations JSON file name.
        batch_size: Batch size used while making predictions.
        judge_model: OpenAI model name used by the judges.
        cache_dp: Directory the evaluations JSON file is written to.

    Returns:
        A ``(mean_scores_df, deep_eval_df)`` tuple: the mean score per metric
        (index named "metrics"), and one row per example with the query, the
        answer and the three per-example scores.
    """
    # Make predictions with the dataset
    response_eval_prediction_dataset = await response_eval_dataset.amake_predictions_with(
        predictor=query_engine, batch_size=batch_size, show_progress=True
    )

    # All judges share one deterministic (temperature 0) LLM client.
    judge_llm = OpenAI(temperature=0, model=judge_model)
    judges = {
        "correctness": CorrectnessEvaluator(llm=judge_llm),
        "relevancy": RelevancyEvaluator(llm=judge_llm),
        "faithfulness": FaithfulnessEvaluator(llm=judge_llm),
    }

    # One list of evaluation results per judge.
    evals = {metric: [] for metric in judges}

    examples = response_eval_dataset.examples
    predictions = response_eval_prediction_dataset.predictions

    # zip() has no length, so pass the total explicitly to get a real progress bar.
    for example, prediction in tqdm(
        zip(examples, predictions),
        total=len(examples),
        desc=f"Judging {dataset_name} responses",
    ):
        # Await the async evaluator API directly instead of calling the
        # blocking wrappers from inside this coroutine.
        correctness_result = await judges["correctness"].aevaluate(
            query=example.query,
            response=prediction.response,
            reference=example.reference_answer,
        )

        relevancy_result = await judges["relevancy"].aevaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        faithfulness_result = await judges["faithfulness"].aevaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        evals["correctness"].append(correctness_result)
        evals["relevancy"].append(relevancy_result)
        evals["faithfulness"].append(faithfulness_result)

    # Save evaluations to JSON
    evaluations_objects = {
        "correctness": [e.dict() for e in evals["correctness"]],
        "faithfulness": [e.dict() for e in evals["faithfulness"]],
        "relevancy": [e.dict() for e in evals["relevancy"]],
    }

    with open(f"{cache_dp}/{dataset_name}_evaluations.json", "w") as json_file:
        json.dump(evaluations_objects, json_file)

    # Generate evaluation results DataFrames
    deep_eval_correctness_df, mean_correctness_df = get_eval_results_df(
        ["base_rag"] * len(evals["correctness"]),
        evals["correctness"],
        metric="correctness",
    )
    deep_eval_relevancy_df, mean_relevancy_df = get_eval_results_df(
        ["base_rag"] * len(evals["relevancy"]),
        evals["relevancy"],
        metric="relevancy",
    )
    deep_eval_faithfulness_df, mean_faithfulness_df = get_eval_results_df(
        ["base_rag"] * len(evals["faithfulness"]),
        evals["faithfulness"],
        metric="faithfulness",
    )

    # Stack the per-metric means into a single frame indexed by metric name.
    mean_scores_df = pd.concat(
        [
            mean_correctness_df.reset_index(),
            mean_relevancy_df.reset_index(),
            mean_faithfulness_df.reset_index(),
        ],
        axis=0,
        ignore_index=True,
    )
    mean_scores_df = mean_scores_df.set_index("index")
    mean_scores_df.index = mean_scores_df.index.set_names(["metrics"])

    # Per-example view: query and answer next to the three scores.
    deep_eval_df = pd.concat([
        deep_eval_correctness_df[['query', 'answer']],
        deep_eval_relevancy_df[['scores']].rename(columns={'scores': 'relevancy_score'}),
        deep_eval_correctness_df[['scores']].rename(columns={'scores': 'correctness_score'}),
        deep_eval_faithfulness_df[['scores']].rename(columns={'scores': 'faithfulness_score'}),
    ], axis=1)

    return mean_scores_df, deep_eval_df

### Generate synthetic Llama Dataset

In [45]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.llama_dataset import LabeledRagDataset
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    SemanticSimilarityEvaluator,
)
from llama_index.core.evaluation.notebook_utils import get_eval_results_df

In [47]:
# Configuration of the synthetic response-evaluation dataset.
RECREATE_SYNTHETIC_EVAL_DATASET = False
RESPONSE_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_response_eval_dataset.json"
RESPONSE_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
RESPONSE_EVAL_LLM_MODEL_CONFIG = dict(temperature=0.3)
SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK = 2
RESPONSE_NUM_SAMPLE_DOCUMENTS = 10

if LOG_TO_MLFLOW:
    # Fixed params first, then one param per LLM config entry.
    response_eval_params = {
        "SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK": SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK,
        "RESPONSE_EVAL_LLM_MODEL": RESPONSE_EVAL_LLM_MODEL,
        "RESPONSE_NUM_SAMPLE_DOCUMENTS": RESPONSE_NUM_SAMPLE_DOCUMENTS,
    }
    for config_name, config_value in RESPONSE_EVAL_LLM_MODEL_CONFIG.items():
        response_eval_params[f"RESPONSE_EVAL_LLM_MODEL_CONFIG__{config_name}"] = config_value
    for param_name, param_value in response_eval_params.items():
        mlflow.log_param(param_name, param_value)

In [48]:
# Pick the documents the synthetic response-evaluation questions are generated from.
if RESPONSE_NUM_SAMPLE_DOCUMENTS:
    # Never ask for more documents than exist; sampling without replacement
    # would raise in that case.
    sample_size = min(RESPONSE_NUM_SAMPLE_DOCUMENTS, len(documents))
    logger.info(f"Sampling {sample_size} documents for response evaluation...")
    np.random.seed(41)
    # Sample positions without replacement so no document is picked twice, and
    # index the list directly to keep a plain list of Document objects.
    sampled_positions = np.random.choice(len(documents), size=sample_size, replace=False)
    response_eval_documents = [documents[position] for position in sampled_positions]
else:
    logger.info(f"Using all documents for response evaluation")
    response_eval_documents = documents

[32m2024-07-22 15:07:12.403[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mSampling 10 documents for response evaluation...[0m


In [49]:
# Generate the synthetic response-evaluation dataset when requested or when no
# cached file exists; otherwise load the cached dataset from JSON.
if RECREATE_SYNTHETIC_EVAL_DATASET or not os.path.exists(RESPONSE_EVAL_DATASET_FP):
    logger.info(f"Creating synthetic response eval dataset...")
    # Dedicated LLM for dataset generation, configured by the RESPONSE_EVAL_*
    # constants that are also logged to MLflow.
    response_eval_llm = OpenAI(model=RESPONSE_EVAL_LLM_MODEL, **RESPONSE_EVAL_LLM_MODEL_CONFIG)

    # instantiate a DatasetGenerator
    response_dataset_generator = RagDatasetGenerator.from_documents(
        response_eval_documents,
        # Use the dedicated generation LLM, not the RAG pipeline's `llm`;
        # otherwise the logged model and temperature would not be the ones used.
        llm=response_eval_llm,
        num_questions_per_chunk=SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK,  # set the number of questions per nodes
        show_progress=True,
    )

    synthetic_response_eval_dataset = response_dataset_generator.generate_dataset_from_nodes()

    # Cache the generated dataset for later runs.
    synthetic_response_eval_dataset.save_json(RESPONSE_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic response eval dataset at {RESPONSE_EVAL_DATASET_FP}...")
    synthetic_response_eval_dataset = LabeledRagDataset.from_json(RESPONSE_EVAL_DATASET_FP)

[32m2024-07-22 15:07:19.560[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mLoading existing synthetic response eval dataset at data/001/llamaindex_blog_response_eval_dataset.json...[0m


In [50]:
# Answer and judge the synthetic dataset; the evaluations JSON is written into
# the notebook cache directory under the "synthetic" prefix.
synthetic_mean_scores_df, synthetic_deep_eval_df = await aevaluate_labelled_rag_dataset(
    synthetic_response_eval_dataset,
    query_engine,
    dataset_name="synthetic",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 294ecbb3-ea9a-4cc9-889f-fad2fb89a032] [Similarity score: 0.7902765003425318] This stack is different from any ETL stack before it, because unlike traditional software, every ...
> [Node 90bee6f8-8ac2-4da6-8127-f13e4488155d] [Similarity score: 0.7768721898180933] This is a surprisingly prevalent use case across a variety of data types and verticals, from ArXi...
> Top 2 nodes:
> [Node 294ecbb3-ea9a-4cc9-889f-fad2fb89a032] [Similarity score:             0.790277] This stack is different from any ETL stack before it, because unlike traditional software, every ...
> [Node 90bee6f8-8ac2-4da6-8127-f13e4488155d] [Similarity score:             0.776872] This is a surprisingly prevalent use case across a variety of data types and verticals, from ArXi...
> Top 2 nodes:
> [Node b09a7306-7211-49b1-8153-df9fdd50c2f0] [Similarity score: 0.7442927068795645] It’s what gets us up in the morning and keeps us motivated to keep pushing the boundaries of what...
> [Node 72301e09-d5ca

Batch processing of predictions:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 7/8 [00:04<00:00,  3.22it/s]

> Top 2 nodes:
> [Node a869feb3-4f92-406f-bc83-4336f5fbecf2] [Similarity score: 0.7958722664575402] LlamaIndex Newsletter 2023–12–19
What’s up, Llama Followers 🦙, We’re excited to bring you another...
> [Node ca46dbfc-c51d-4174-8e17-bb516ba98ae3] [Similarity score: 0.7687064012879486] Linking the resources again below: Gemini (text-only) Guide Gemini (multi-modal) Guide Semantic R...
> Top 2 nodes:
> [Node a869feb3-4f92-406f-bc83-4336f5fbecf2] [Similarity score:             0.795872] LlamaIndex Newsletter 2023–12–19
What’s up, Llama Followers 🦙, We’re excited to bring you another...
> [Node ca46dbfc-c51d-4174-8e17-bb516ba98ae3] [Similarity score:             0.768706] Linking the resources again below: Gemini (text-only) Guide Gemini (multi-modal) Guide Semantic R...


Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:07<00:00,  1.06it/s]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node e0f90e61-0084-497c-91aa-067748038341] [Similarity score: 0.7829275434542138] Utilizing LlamaIndex
      connectors allows you to seamlessly integrate your data into the
     ...
> [Node c1f13df6-488f-425c-aa9c-5b1eb3e0af36] [Similarity score: 0.7765200492167021] Agentic RAG With LlamaIndex
The topic of Agentic RAG explores how agents can be incorporated into...
> Top 2 nodes:
> [Node e0f90e61-0084-497c-91aa-067748038341] [Similarity score:             0.782928] Utilizing LlamaIndex
      connectors allows you to seamlessly integrate your data into the
     ...
> [Node c1f13df6-488f-425c-aa9c-5b1eb3e0af36] [Similarity score:             0.77652] Agentic RAG With LlamaIndex
The topic of Agentic RAG explores how agents can be incorporated into...
> Top 2 nodes:
> [Node d3b4fbb6-70ff-4970-89a1-caf6b2d634a0] [Similarity score: 0.8352047269882725] Build and Evaluate LLM Apps with LlamaIndex and TruLens
Authors:  Anupam Datta, Shayak Sen, Jerry...
> [Node f6bc96f3-6d28-

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.20it/s]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 3078847e-199e-4273-a522-2e3e2f5e648a] [Similarity score: 0.8642420660772184] LlamaIndex Accelerates Enterprise Generative AI with NVIDIA NIM
Generative AI is rapidly transfor...
> [Node dea8c47b-84b7-4768-b03d-568182ada8eb] [Similarity score: 0.756826920715372] “Now, developers can abstract complexities associated with data ingestion, simplify RAG pipeline ...
> Top 2 nodes:
> [Node 3078847e-199e-4273-a522-2e3e2f5e648a] [Similarity score:             0.864242] LlamaIndex Accelerates Enterprise Generative AI with NVIDIA NIM
Generative AI is rapidly transfor...
> [Node dea8c47b-84b7-4768-b03d-568182ada8eb] [Similarity score:             0.756827] “Now, developers can abstract complexities associated with data ingestion, simplify RAG pipeline ...
> Top 2 nodes:
> [Node e0f90e61-0084-497c-91aa-067748038341] [Similarity score: 0.740371794695132] Utilizing LlamaIndex
      connectors allows you to seamlessly integrate your data into the
     ...
> [Node c1f13df6-488f-4

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00,  1.83it/s]
Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/6 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node c32329ef-6d60-4636-a599-9ec12928ac95] [Similarity score: 0.68926271740739] from  transformers  import  pipeline
 from  transformers  import  AutoTokenizer, AutoModelForToke...
> [Node 4a4de14c-9803-4597-ad0c-47a616719af3] [Similarity score: 0.6851382079260567] NewsGPT(Neotice): Summarize news articles with LlamaIndex — Hackathon winning app
We’re excited t...
> Top 2 nodes:
> [Node c32329ef-6d60-4636-a599-9ec12928ac95] [Similarity score:             0.689263] from  transformers  import  pipeline
 from  transformers  import  AutoTokenizer, AutoModelForToke...
> [Node 4a4de14c-9803-4597-ad0c-47a616719af3] [Similarity score:             0.685138] NewsGPT(Neotice): Summarize news articles with LlamaIndex — Hackathon winning app
We’re excited t...
> Top 2 nodes:
> [Node cf13437a-eb99-40cc-bbab-804d77e1de98] [Similarity score: 0.79416673127969] LlamaIndex Newsletter 2023–11–14
Hello Llama Friends 🦙 LlamaIndex is 1 year old this week! 🎉 To c...
> [Node 2d8ceeda-0623-498

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00,  1.27it/s]


0it [00:00, ?it/s]

> Adding chunk: Querying a network of knowledge with llama-inde...
> Adding chunk: A place where data suppliers package their data...
> Adding chunk: Querying a network of knowledge with llama-inde...
> Adding chunk: A place where data suppliers package their data...
> Adding chunk: This stack is different from any ETL stack befo...
> Adding chunk: This is a surprisingly prevalent use case acros...
> Adding chunk: This stack is different from any ETL stack befo...
> Adding chunk: This is a surprisingly prevalent use case acros...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: Notebook . 🗺️ Guides: Guide  to Integrating Lla...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: Notebook . 🗺️ Guides: Guide  to Integrating Lla...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: Below, we list a select few of the evaluation n...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re 

In [51]:
synthetic_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,4.134615
mean_relevancy_score,0.933333
mean_faithfulness_score,1.0


In [52]:
synthetic_deep_eval_df

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score
0,How does the new feature released by LlamaInde...,"The new feature released by LlamaIndex, llama-...",1.0,4.0,1.0
1,Discuss the advancements made in LlamaIndex's ...,The advancements made in LlamaIndex's PDF pars...,1.0,4.0,1.0
2,Explain the three main sections of the OpenAI ...,The three main sections of the OpenAI Cookbook...,1.0,4.5,1.0
3,How does the OpenAI Cookbook suggest evaluatin...,The OpenAI Cookbook suggests evaluating RAG sy...,1.0,4.5,1.0
4,How has LlamaIndex evolved over the past year ...,LlamaIndex has evolved over the past year by e...,1.0,4.5,1.0
5,Can you explain the significance of Retrieval-...,Retrieval-Augmented Generation (RAG) plays a s...,1.0,4.0,1.0
6,How does the partnership with Google Gemini be...,The partnership with Google Gemini benefits Ll...,1.0,4.5,1.0
7,Describe the new Multi-Doc SEC 10Q Dataset lau...,The new Multi-Doc SEC 10Q Dataset launched by ...,1.0,4.5,1.0
8,How does the MemoryCache project by Mozilla ut...,The MemoryCache project by Mozilla utilizes Pr...,1.0,4.0,1.0
9,What are the key features of the LionAGI agent...,The LionAGI agent framework by Ocean Li incorp...,1.0,4.0,1.0


In [53]:
# Log the mean response-evaluation scores as MLflow metrics and attach the
# per-example table as an HTML artifact.
if LOG_TO_MLFLOW:
    synthetic_mean_scores = synthetic_mean_scores_df.T.to_dict(orient='records')[0]
    for score_name, score_value in synthetic_mean_scores.items():
        mlflow.log_metric(f"synthetic_response_eval__{score_name}", score_value)

    deep_eval_html_fp = f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html"
    synthetic_deep_eval_df.to_html(deep_eval_html_fp)
    mlflow.log_artifact(deep_eval_html_fp, "synthetic_deep_eval_df")

### Manually curated
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/ragdataset_submission_template/#1c-creating-a-labelledragdataset-from-scratch-with-manually-constructed-examples

In [54]:
from llama_index.core.llama_dataset import (
    LabelledRagDataset,
    LabelledRagDataExample,
    CreatedBy,
    CreatedByType,
)

# Wrap every (question, expected answer) pair from MANUAL_EVAL_QA in a labelled
# example. Both the query and the reference answer are attributed to a human,
# and no reference contexts are supplied.
examples = [
    LabelledRagDataExample(
        query=question,
        query_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_answer=expected_answer,
        reference_answer_by=CreatedBy(type=CreatedByType.HUMAN),
        reference_contexts=[],
    )
    for question, expected_answer in MANUAL_EVAL_QA
]

curated_response_eval_dataset = LabelledRagDataset(examples=examples)

# Persist the dataset to the notebook cache; it is required for the submission.
curated_response_eval_dataset.save_json(f"{NOTEBOOK_CACHE_DP}/curated_response_eval_dataset.json")

In [55]:
# Evaluate the curated dataset against the query engine. The call yields two
# frames: the mean scores and the per-query results, both displayed below.
# NOTE(review): aevaluate_labelled_rag_dataset is defined outside this section;
# presumably it generates predictions with `query_engine` and scores them with
# `judge_model`, caching intermediates under `cache_dp` - confirm against its definition.
curated_mean_scores_df, curated_deep_eval_df = await aevaluate_labelled_rag_dataset(
    curated_response_eval_dataset,
    query_engine,
    dataset_name="curated",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/3 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 08a0b1fc-d3e7-4152-aec0-622e31cd1059] [Similarity score: 0.7783543956834692] Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems
We'...
> [Node aa548ec3-ce01-4681-a11d-4438af0078c6] [Similarity score: 0.7576050237219937] ,
    service_name= "dumb_fact_agent" ,
    host= "localhost" ,
    port= 8004 
) And finally we ...
> Top 2 nodes:
> [Node 08a0b1fc-d3e7-4152-aec0-622e31cd1059] [Similarity score:             0.778354] Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems
We'...
> [Node aa548ec3-ce01-4681-a11d-4438af0078c6] [Similarity score:             0.757605] ,
    service_name= "dumb_fact_agent" ,
    host= "localhost" ,
    port= 8004 
) And finally we ...
> Top 2 nodes:
> [Node c64cb1f0-5316-46ba-a6d9-9534ce72ab4d] [Similarity score: 0.8118085001908792] OpenAI Cookbook: Evaluating RAG systems
We’re excited to unveil our  OpenAI Cookbook , a guide to...
> [Node c33db99d-e2f0

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.00s/it]


0it [00:00, ?it/s]

> Adding chunk: Introducing llama-agents: A Powerful Framework ...
> Adding chunk: ,
    service_name= "dumb_fact_agent" ,
    hos...
> Adding chunk: Introducing llama-agents: A Powerful Framework ...
> Adding chunk: ,
    service_name= "dumb_fact_agent" ,
    hos...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: Advanced RAG With the success requirements defi...
> Adding chunk: OpenAI Cookbook: Evaluating RAG systems
We’re e...
> Adding chunk: Advanced RAG With the success requirements defi...
> Adding chunk: Boosting RAG: Picking the Best Embedding & Rera...
> Adding chunk: Reranking involves using a semantic search mode...
> Adding chunk: Boosting RAG: Picking the Best Embedding & Rera...
> Adding chunk: Reranking involves using a semantic search mode...


In [56]:
# Show the mean score per metric for the manually curated evaluation set.
curated_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,4.75
mean_relevancy_score,1.0
mean_faithfulness_score,1.0


In [57]:
# Lift the column-width limit for this display only, so the full query and
# answer text is shown instead of being truncated.
# NOTE(review): a row can come back without a correctness_score (one did in the
# saved output); check how the mean score treats such rows.
with pd.option_context('display.max_colwidth', None):
    display(curated_deep_eval_df)

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score
0,What are key features of llama-agents?,"The key features of llama-agents include a Distributed Service Oriented Architecture where each agent can function as an independently running microservice, communication between agents via standardized API interfaces using a central control plane orchestrator, the ability to define agentic and explicit orchestration flows, ease of deployment allowing for independent launching, scaling, and monitoring of each agent and the control plane, as well as built-in observability tools for scalability and resource management.",1.0,4.5,1.0
1,What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?,The two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook are the Retrieval System and Response Generation.,1.0,,1.0
2,What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?,Hit Rate and Mean Reciprocal Rank,1.0,5.0,1.0


In [58]:
# Record the curated-set evaluation in MLflow: one metric per mean score,
# plus the per-query table rendered as an HTML artifact.
if LOG_TO_MLFLOW:
    # Transposing and taking the first record yields {metric name: value}
    # for the first column of the mean-scores frame.
    curated_mean_scores = curated_mean_scores_df.T.to_dict(orient='records')[0]
    for metric_name, metric_value in curated_mean_scores.items():
        mlflow.log_metric(f"curated_response_eval__{metric_name}", metric_value)

    # Write the table to the notebook cache first; MLflow uploads it from there.
    curated_deep_eval_html_fp = f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html"
    curated_deep_eval_df.to_html(curated_deep_eval_html_fp)
    mlflow.log_artifact(curated_deep_eval_html_fp, "curated_deep_eval_df")

# Clean up

In [59]:
# Close the MLflow run that the set-up section starts when LOG_TO_MLFLOW is True.
mlflow.end_run()

# Archive