# Set up

In [2]:
import json
from loguru import logger
import pandas as pd
from tqdm.notebook import tqdm

import mlflow

In [3]:
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

# Patch asyncio via nest_asyncio so the event loop can be re-entered.
# NOTE(review): presumably needed because later cells call synchronous LlamaIndex
# helpers while the notebook's own event loop is already running — confirm.
nest_asyncio.apply()

In [4]:
from dotenv import load_dotenv

# Load environment variables from a .env file.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

True

## Constants

In [5]:
# Debug switch: when True, llama_index DEBUG logs are echoed to stdout and only
# the first two blog records are indexed (see the cells that read TESTING).
TESTING = True

In [6]:
import logging
import sys

if TESTING:
    # Echo llama_index DEBUG-level logs into the cell output while testing.
    llama_index_logger = logging.getLogger('llama_index')
    llama_index_logger.addHandler(logging.StreamHandler(stream=sys.stdout))
    llama_index_logger.setLevel(logging.DEBUG)

In [7]:
# Master switch for experiment tracking; every mlflow.log_* call in this
# notebook is guarded by it.
LOG_TO_MLFLOW = True
if LOG_TO_MLFLOW:
    mlflow.set_experiment("Chain Frost - LlamaIndex Blog QnA Chatbot")
    # start_run() raises if a run is already active, which happens whenever this
    # cell is re-executed in the same kernel; only open a run when none exists.
    if mlflow.active_run() is None:
        mlflow.start_run()
    mlflow.log_param("TESTING", TESTING)

2024/07/22 14:24:55 INFO mlflow.tracking.fluent: Experiment with name 'Chain Frost - LlamaIndex Blog QnA Chatbot' does not exist. Creating a new experiment.


# Load data

In [8]:
# Path to the crawled blog posts (a JSON array of records).
DATA_FP = '../crawl_llamaindex_blog/data/blogs.json'
# Read explicitly as UTF-8: the posts contain non-ASCII characters, and the
# platform default encoding is not UTF-8 everywhere.
with open(DATA_FP, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [9]:
# Number of records loaded from DATA_FP.
len(data)

159

In [10]:
# Peek at the first five records to see the available fields.
data[:5]

[{'title': 'Automate online tasks with MultiOn and LlamaIndex',
  'author': 'MultiOn',
  'date': 'May 23, 2024',
  'tags': ['automation', 'Agents']},
 {'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
  'content': 'We’re happy to announce the recent integration of LlamaIndex with PostgresML — a comprehensive machine learning platform built on PostgreSQL. The PostgresML Managed Index allows LlamaIndex users to seamlessly manage document storage, splitting, embedding, and retrieval. By using PostgresML as the backend, users benefit from a streamlined and optimized process for Retrieval-Augmented Generation (RAG). This integration unifies embedding, vector search, and text generation into a single network call, resulting in faster, more reliable, and easier-to-manage RAG workflows. The problem with typical RAG workflows Typical Retrieval-Augmented Generation (RAG) workflows come with significant drawbacks, particularly for users. Poor performance is a ma

# Check data

In [11]:
# Inspect the 'content' field of the first record.
data[0]['content']



# Prepare documents

In [12]:
# Use only the first two records while TESTING; otherwise use the full dataset.
input_data = data[:2] if TESTING else data

In [13]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

# Turn each blog record into a LlamaIndex Document: the text is the title
# followed by the content, and the remaining fields travel as metadata.
documents = []
for record in input_data:
    title = record['title']
    metadata = dict(
        title=title,
        author=record['author'],
        date=record['date'],
        tags=', '.join(record['tags']),  # metadata value is a flat string, not a list
    )
    text = "{}\n{}".format(title, record['content'])
    documents.append(Document(text=text, metadata=metadata))

In [14]:
# Inspect the first Document built above.
documents[0]



In [15]:
# Check the metadata attached to the second Document.
documents[1].metadata

{'title': 'Simplify your RAG application architecture with LlamaIndex + PostgresML',
 'author': 'PostgresML',
 'date': 'May 28, 2024',
 'tags': 'Managed Indexes'}

## Setting LLM

In [16]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings, ServiceContext

In [17]:
# Backend used for the LLM and the embeddings; the next cell handles
# 'openai' and 'ollama'.
LLM_OPTION = 'openai'
# LLM_OPTION = 'ollama'

if LOG_TO_MLFLOW:
    mlflow.log_param("LLM_OPTION", LLM_OPTION)

In [18]:
if LLM_OPTION == 'ollama':
    logger.info(f"Using local Ollama LLM...")
    from llama_index.embeddings.ollama import OllamaEmbedding
    LLM_SERVER_HOST = '192.168.100.14'
    LLM_SERVER_PORT = 11434
    base_url = f'http://{LLM_SERVER_HOST}:{LLM_SERVER_PORT}'
    OLLAMA_MODEL_NAME = 'llama3'
    llm = Ollama(base_url=base_url, model=model_name, request_timeout=60.0)
    !ping -c 1 $LLM_SERVER_HOST
    Settings.llm = llm
    embedding = OllamaEmbedding(
        model_name=OLLAMA_MODEL_NAME,
        base_url=base_url,
        ollama_additional_kwargs={"mirostat": 0},
    )
    Settings.embed_model = embedding
    if LOG_TO_MLFLOW:
        mlflow.log_param("OLLAMA_MODEL_NAME", OLLAMA_MODEL_NAME)
elif LLM_OPTION == 'openai':
    logger.info(f"Using OpenAI LLM...")
    from llama_index.llms.openai import OpenAI
    from llama_index.embeddings.openai import OpenAIEmbedding
    embedding = OpenAIEmbedding()
    OPENAI_MODEL_NAME = 'gpt-3.5-turbo'
    llm = OpenAI(model=OPENAI_MODEL_NAME)
    if LOG_TO_MLFLOW:
        mlflow.log_param("OPENAI_MODEL_NAME", OPENAI_MODEL_NAME)

[32m2024-07-22 14:24:58.610[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m20[0m - [1mUsing OpenAI LLM...[0m


# Index embeddings

In [19]:
import os
import pickle
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

In [20]:
# When True, the existing collection and the cached nodes are discarded and the
# index is rebuilt from `documents`.
RECREATE_INDEX = True

# Name of the ChromaDB collection used by this notebook.
COLLECTION = 'mvp'
# Directory for this notebook's cached artifacts (ChromaDB files, pickled nodes,
# evaluation datasets and reports).
NOTEBOOK_CACHE_DP = 'data/001'
# Pickle file in which the ingested nodes are persisted.
NODES_PERSIST_FP = f'{NOTEBOOK_CACHE_DP}/nodes.pkl'
os.makedirs(NOTEBOOK_CACHE_DP, exist_ok=True)

In [21]:
# Open (or create) the on-disk ChromaDB and reset the collection when requested.
db = chromadb.PersistentClient(path=f"{NOTEBOOK_CACHE_DP}/chroma_db")
# list_collections() yields Collection objects in some chromadb versions and
# plain names in others; accept both.
collection_exists = COLLECTION in [getattr(c, "name", c) for c in db.list_collections()]
if RECREATE_INDEX or not collection_exists:
    # Only delete what is actually there: on a fresh cache directory neither the
    # collection nor the pickle exists, and deleting either would raise.
    if collection_exists:
        logger.info("Deleting existing ChromaDB collection...")
        db.delete_collection(COLLECTION)
    if os.path.exists(NODES_PERSIST_FP):
        logger.info(f"Deleting persisted nodes object at {NODES_PERSIST_FP}...")
        os.remove(NODES_PERSIST_FP)
else:
    logger.info("Use existing ChromaDB collection")
chroma_collection = db.get_or_create_collection(COLLECTION)

[32m2024-07-22 14:24:59.482[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mDeleting existing ChromaDB collection...[0m
[32m2024-07-22 14:24:59.521[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mDeleting persisted nodes object at data/001/nodes.pkl...[0m


In [22]:
# Wrap the Chroma collection as a LlamaIndex vector store and build the storage
# context that is later handed to VectorStoreIndex.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [23]:
if chroma_collection.count() > 0 and RECREATE_INDEX == False:
    logger.info(f"Loading index from existing ChromaDB...")
    with open(NODES_PERSIST_FP, 'rb') as f:
        nodes = pickle.load(f)
else:
    logger.info(f"Creating new ChromaDB index...")
    # Generate nodes
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/
    
    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline, IngestionCache
    
    # create the pipeline with transformations
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=512, chunk_overlap=10),
            TitleExtractor(),
            embedding,
        ],
        vector_store = vector_store
    )
    
    # Need to use await and arun here to run the pipeline else error
    # Ref: https://docs.llamaindex.ai/en/stable/examples/ingestion/async_ingestion_pipeline/
    # Ref: https://github.com/run-llama/llama_index/issues/13904#issuecomment-2145561710
    nodes = await pipeline.arun(documents=documents)
    with open(NODES_PERSIST_FP, 'wb') as f:
        pickle.dump(nodes, f)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)

[32m2024-07-22 14:25:00.002[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mCreating new ChromaDB index...[0m


> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
> Adding chunk: All parameters are required
        
        If...
> Adding chunk: print (agent.chat( "browse to the latest email ...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: As these technologies evolve, they will continu...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: Step 2: Create the PostgresML Managed Index Fir...
> Adding chunk: The PostgresML Managed Index is doing embedding...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.38s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  3.57it/s]


#### Inspect nodes

# Query engine

In [24]:
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [25]:
# Retrieval settings: how many nodes to fetch and the similarity cutoff handed
# to the post-processor.
RETRIEVAL_TOP_K = 2
RETRIEVAL_SIMILARITY_CUTOFF = 0.7

if LOG_TO_MLFLOW:
    for param_name, param_value in (
        ("RETRIEVAL_TOP_K", RETRIEVAL_TOP_K),
        ("RETRIEVAL_SIMILARITY_CUTOFF", RETRIEVAL_SIMILARITY_CUTOFF),
    ):
        mlflow.log_param(param_name, param_value)

In [26]:
# Retriever over the vector index, returning RETRIEVAL_TOP_K nodes per query.
retriever = VectorIndexRetriever(index=index, similarity_top_k=RETRIEVAL_TOP_K)

# Node post-processor configured with RETRIEVAL_SIMILARITY_CUTOFF.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=RETRIEVAL_SIMILARITY_CUTOFF)

# Response synthesizer with the library defaults.
response_synthesizer = get_response_synthesizer()

# Query engine wiring: retriever + similarity filter + response synthesizer.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[similarity_filter],
    response_synthesizer=response_synthesizer,
)

In [27]:
# Smoke test: run one question through the query engine and log the answer.
question = "What is MultiOn?"
response = query_engine.query(question)
logger.info(response)

> Top 2 nodes:
> [Node 9b9f091e-632a-463e-a178-3f56b407c0aa] [Similarity score: 0.7029416244989046] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node 6c5ed3b0-e702-43e5-abbc-be393ae04e19] [Similarity score: 0.6983769899484777] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> Top 2 nodes:
> [Node 9b9f091e-632a-463e-a178-3f56b407c0aa] [Similarity score:             0.702942] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node 6c5ed3b0-e702-43e5-abbc-be393ae04e19] [Similarity score:             0.698377] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...


[32m2024-07-22 14:25:12.170[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mMultiOn is the author of the document discussing the automation of tasks with MultiOn and LlamaIndex.[0m


# Evaluation

## Retrieval Evaluation

### Building synthetic evaluation dataset

In [28]:
# Reload the nodes persisted during indexing so the evaluation section can run
# from the cached file.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(NODES_PERSIST_FP, 'rb') as f:
    nodes = pickle.load(f)

In [29]:
from llama_index.core.evaluation import generate_question_context_pairs, EmbeddingQAFinetuneDataset

In [30]:
# Number of synthetic questions to generate per node.
RETRIEVAL_NUM_QUESTIONS_PER_CHUNK = 2
# When True, regenerate the dataset even if a saved copy exists.
RECREATE_RETRIEVAL_EVAL_DATASET = True
# Where the synthetic retrieval eval dataset is saved to / loaded from.
RETRIEVAL_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_retrieval_eval_dataset.json"

if LOG_TO_MLFLOW:
    mlflow.log_param("RETRIEVAL_NUM_QUESTIONS_PER_CHUNK", RETRIEVAL_NUM_QUESTIONS_PER_CHUNK)

In [31]:
# Load the cached dataset when allowed and present; otherwise generate
# question/context pairs from the nodes with the LLM and cache them.
if not RECREATE_RETRIEVAL_EVAL_DATASET and os.path.exists(RETRIEVAL_EVAL_DATASET_FP):
    logger.info(f"Loading existing synthetic retrieval eval dataset at {RETRIEVAL_EVAL_DATASET_FP}...")
    retrieval_eval_dataset = EmbeddingQAFinetuneDataset.from_json(RETRIEVAL_EVAL_DATASET_FP)
else:
    logger.info("Creating new synthetic retrieval eval dataset...")
    retrieval_eval_dataset = generate_question_context_pairs(
        nodes,
        llm=llm,
        num_questions_per_chunk=RETRIEVAL_NUM_QUESTIONS_PER_CHUNK,
    )
    retrieval_eval_dataset.save_json(RETRIEVAL_EVAL_DATASET_FP)

[32m2024-07-22 14:25:12.236[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mCreating new synthetic retrieval eval dataset...[0m
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.16s/it]


### Evaluate

In [32]:
from llama_index.core.evaluation import RetrieverEvaluator

In [33]:
# Metric names handed to RetrieverEvaluator; the same list later selects the
# result columns and the values logged to MLflow.
RETRIEVAL_METRICS = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    RETRIEVAL_METRICS, retriever=retriever
)

# Top-level await: evaluates every query of the synthetic dataset with the retriever.
retrieval_eval_results = await retriever_evaluator.aevaluate_dataset(retrieval_eval_dataset)

> Top 2 nodes:
> [Node 9b9f091e-632a-463e-a178-3f56b407c0aa] [Similarity score: 0.7601724451671024] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node 6c5ed3b0-e702-43e5-abbc-be393ae04e19] [Similarity score: 0.7320236136357314] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> Top 2 nodes:
> [Node 9b9f091e-632a-463e-a178-3f56b407c0aa] [Similarity score:             0.760172] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node 6c5ed3b0-e702-43e5-abbc-be393ae04e19] [Similarity score:             0.732024] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> Top 2 nodes:
> [Node ca7e2271-1946-4225-a666-869bba529af6] [Similarity score: 0.7686286115565645] print (agent.chat( "browse to the latest email from Julian and open the email" )) Added user mess...
> [Node 6c5ed3b0-e702

In [34]:
def display_results(name, eval_results, metrics=('hit_rate', 'mrr'), include_cohere_rerank=False):
    """Summarise per-query evaluation results as a one-row DataFrame of means.

    Args:
        name: Label stored in the ``retrievers`` column.
        eval_results: Iterable of objects exposing a ``metric_vals_dict``
            mapping (metric name -> value), one per evaluated query.
        metrics: Names of the metrics to average; each becomes a column, in
            the given order. The default is an immutable tuple so it cannot be
            mutated between calls.
        include_cohere_rerank: When True, also add the mean of the
            ``cohere_rerank_relevancy`` metric.

    Returns:
        A single-row ``pandas.DataFrame`` with the ``retrievers`` column
        followed by one mean-value column per requested metric.

    Raises:
        KeyError: If a requested metric is missing from the results.
    """
    # One row per evaluated query, one column per metric.
    full_df = pd.DataFrame([eval_result.metric_vals_dict for eval_result in eval_results])

    columns = {"retrievers": [name]}
    for metric in metrics:
        columns[metric] = [full_df[metric].mean()]

    if include_cohere_rerank:
        columns["cohere_rerank_relevancy"] = [full_df["cohere_rerank_relevancy"].mean()]

    return pd.DataFrame(columns)

In [35]:
# Prefix that encodes the top-k setting; used as the row label here and as the
# MLflow metric-name prefix in the next cell.
metric_prefix = f"top_{RETRIEVAL_TOP_K}_retrieval_eval"
retrieval_eval_results_df = display_results(metric_prefix, retrieval_eval_results, metrics=RETRIEVAL_METRICS)
retrieval_eval_results_df

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,top_2_retrieval_eval,0.833333,0.666667,0.416667,0.833333,0.666667,0.435525


In [36]:
if LOG_TO_MLFLOW:
    # The summary frame has a single row; log each metric column from it.
    retrieval_metric_row = retrieval_eval_results_df.to_dict(orient='records')[0]
    for metric, metric_value in retrieval_metric_row.items():
        if metric not in RETRIEVAL_METRICS:
            continue  # e.g. the 'retrievers' label column
        mlflow.log_metric(f"{metric_prefix}_{metric}", metric_value)

### Manually curated dataset
Ref: https://docs.llamaindex.ai/en/stable/module_guides/evaluating/usage_pattern_retrieval/

In [37]:
# Hand-written evaluation set: a list of (question, reference answer) tuples.
# It is turned into a labelled RAG dataset in the "Manually curated" cell of the
# Response Evaluation section.
MANUAL_EVAL_QA = [
("What are key features of llama-agents?",
"""
Key features of llama-agents are:
1. Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. Ease of deployment: launch, scale and monitor each agent and your control plane independently.
5. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service
"""
),
("What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?",
"""
Retrieval System and Response Generation.
"""
),
("What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?",
"""
Hit rate and Mean Reciprocal Rank (MRR)

Hit Rate: Hit rate calculates the fraction of queries where the correct answer is found within the top-k retrieved documents. In simpler terms, it’s about how often our system gets it right within the top few guesses.

Mean Reciprocal Rank (MRR): For each query, MRR evaluates the system’s accuracy by looking at the rank of the highest-placed relevant document. Specifically, it’s the average of the reciprocals of these ranks across all the queries. So, if the first relevant document is the top result, the reciprocal rank is 1; if it’s second, the reciprocal rank is 1/2, and so on.
"""
)
]

## Response Evaluation
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/downloading_llama_datasets/

In [38]:
async def aevaluate_labelled_rag_dataset(response_eval_dataset, query_engine, dataset_name="synthetic", batch_size=8, judge_model='gpt-3.5-turbo', cache_dp='.'):
    """Answer every example of a labelled RAG dataset and score the answers.

    The query engine produces a prediction per example; each prediction is then
    judged for correctness (against the reference answer), relevancy and
    faithfulness (both against the retrieved contexts) by LLM evaluators.

    Args:
        response_eval_dataset: Labelled RAG dataset providing ``examples`` and
            ``amake_predictions_with``.
        query_engine: Query engine used as the predictor.
        dataset_name: Prefix of the JSON file the raw evaluations are written to.
        batch_size: Batch size forwarded to ``amake_predictions_with``.
        judge_model: OpenAI model name used by all judges (temperature 0).
        cache_dp: Directory in which ``{dataset_name}_evaluations.json`` is written.

    Returns:
        Tuple ``(mean_scores_df, deep_eval_df)``: the mean score per metric, and
        a per-example frame with ``query``, ``answer`` and the three scores.

    Side effects:
        Writes ``{cache_dp}/{dataset_name}_evaluations.json``.
    """
    # Imported locally: the notebook only imports OpenAI at top level when
    # LLM_OPTION == 'openai', but the judges always use OpenAI.
    from llama_index.llms.openai import OpenAI

    # Make predictions with the dataset
    response_eval_prediction_dataset = await response_eval_dataset.amake_predictions_with(
        predictor=query_engine, batch_size=batch_size, show_progress=True
    )

    # One deterministic judge LLM shared by all evaluators.
    judge_llm = OpenAI(temperature=0, model=judge_model)
    judges = {
        "correctness": CorrectnessEvaluator(llm=judge_llm),
        "relevancy": RelevancyEvaluator(llm=judge_llm),
        "faithfulness": FaithfulnessEvaluator(llm=judge_llm),
    }

    # Evaluation results per metric, in example order.
    evals = {metric: [] for metric in judges}

    examples = response_eval_dataset.examples
    predictions = response_eval_prediction_dataset.predictions

    # zip() has no length, so give tqdm the total explicitly to get a real progress bar.
    for example, prediction in tqdm(
        zip(examples, predictions), total=min(len(examples), len(predictions))
    ):
        # Await the async evaluator API instead of calling the blocking
        # evaluate() from inside a coroutine.
        correctness_result = await judges["correctness"].aevaluate(
            query=example.query,
            response=prediction.response,
            reference=example.reference_answer,
        )
        relevancy_result = await judges["relevancy"].aevaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )
        faithfulness_result = await judges["faithfulness"].aevaluate(
            query=example.query,
            response=prediction.response,
            contexts=prediction.contexts,
        )

        evals["correctness"].append(correctness_result)
        evals["relevancy"].append(relevancy_result)
        evals["faithfulness"].append(faithfulness_result)

    # Save the raw evaluations to JSON (key order kept as before).
    evaluations_objects = {
        metric: [e.dict() for e in evals[metric]]
        for metric in ("correctness", "faithfulness", "relevancy")
    }
    with open(f"{cache_dp}/{dataset_name}_evaluations.json", "w") as json_file:
        json.dump(evaluations_objects, json_file)

    # Per-metric result frames: a detailed frame and a mean-score frame each.
    deep_eval_dfs = {}
    mean_dfs = {}
    for metric, results in evals.items():
        deep_eval_dfs[metric], mean_dfs[metric] = get_eval_results_df(
            ["base_rag"] * len(results),
            results,
            metric=metric,
        )

    mean_scores_df = pd.concat(
        [mean_dfs[metric].reset_index() for metric in ("correctness", "relevancy", "faithfulness")],
        axis=0,
        ignore_index=True,
    )
    mean_scores_df = mean_scores_df.set_index("index")
    mean_scores_df.index = mean_scores_df.index.set_names(["metrics"])

    # Query and answer come from the correctness frame; the score columns are
    # renamed per metric and placed side by side.
    deep_eval_df = pd.concat([
        deep_eval_dfs["correctness"][['query', 'answer']],
        deep_eval_dfs["relevancy"][['scores']].rename(columns={'scores': 'relevancy_score'}),
        deep_eval_dfs["correctness"][['scores']].rename(columns={'scores': 'correctness_score'}),
        deep_eval_dfs["faithfulness"][['scores']].rename(columns={'scores': 'faithfulness_score'}),
    ], axis=1)

    return mean_scores_df, deep_eval_df

### Generate synthetic Llama Dataset

In [39]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.llama_dataset import LabeledRagDataset
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    SemanticSimilarityEvaluator,
)
from llama_index.core.evaluation.notebook_utils import get_eval_results_df

In [40]:
# Number of synthetic questions generated per chunk for the response eval set.
SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK = 2
# When True, regenerate the dataset even if a saved copy exists.
RECREATE_SYNTHETIC_EVAL_DATASET = True
# OpenAI model name used for response evaluation (also passed as the judge model below).
RESPONSE_EVAL_LLM_MODEL = 'gpt-3.5-turbo'
# Extra keyword arguments for the OpenAI client built from RESPONSE_EVAL_LLM_MODEL.
RESPONSE_EVAL_LLM_MODEL_CONFIG = {
    "temperature": 0.3
}
# Where the synthetic response eval dataset is saved to / loaded from.
RESPONSE_EVAL_DATASET_FP = f"{NOTEBOOK_CACHE_DP}/llamaindex_blog_response_eval_dataset.json"

if LOG_TO_MLFLOW:
    mlflow.log_param("SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK", SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK)
    mlflow.log_param("RESPONSE_EVAL_LLM_MODEL", RESPONSE_EVAL_LLM_MODEL)
    # Each config entry is logged as its own parameter.
    for k, v in RESPONSE_EVAL_LLM_MODEL_CONFIG.items():
        mlflow.log_param(f"RESPONSE_EVAL_LLM_MODEL_CONFIG__{k}", v)

In [41]:
# Generate (or load from cache) the synthetic question/answer dataset used for
# response evaluation.
if RECREATE_SYNTHETIC_EVAL_DATASET or not os.path.exists(RESPONSE_EVAL_DATASET_FP):
    logger.info("Creating synthetic response eval dataset...")
    # Imported locally: the notebook only imports OpenAI at top level when
    # LLM_OPTION == 'openai'.
    from llama_index.llms.openai import OpenAI

    # LLM configured from RESPONSE_EVAL_LLM_MODEL / RESPONSE_EVAL_LLM_MODEL_CONFIG,
    # i.e. the settings that are logged to MLflow for this step.
    response_eval_llm = OpenAI(model=RESPONSE_EVAL_LLM_MODEL, **RESPONSE_EVAL_LLM_MODEL_CONFIG)

    # Use the dedicated LLM built above. It used to be constructed and then
    # ignored in favour of the global `llm`, so the logged config did not
    # describe the model that generated the dataset.
    response_dataset_generator = RagDatasetGenerator.from_documents(
        documents,
        llm=response_eval_llm,
        num_questions_per_chunk=SYNTHETIC_RESPONSE_NUM_QUESTIONS_PER_CHUNK,  # set the number of questions per nodes
        show_progress=True,
    )

    synthetic_response_eval_dataset = response_dataset_generator.generate_dataset_from_nodes()

    synthetic_response_eval_dataset.save_json(RESPONSE_EVAL_DATASET_FP)
else:
    logger.info(f"Loading existing synthetic response eval dataset at {RESPONSE_EVAL_DATASET_FP}...")
    synthetic_response_eval_dataset = LabeledRagDataset.from_json(RESPONSE_EVAL_DATASET_FP)

[32m2024-07-22 14:26:00.669[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mCreating synthetic response eval dataset...[0m


Parsing nodes:   0%|          | 0/2 [00:00<?, ?it/s]

> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
 
I have opened the la...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: documents = SimpleDirectoryReader( "data" ).loa...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
 
I have opened the la...
> Adding chunk: Simplify your RAG application architecture with...
> Adding chunk: documents = SimpleDirectoryReader( "data" ).loa...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.79it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.49it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.27s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [42]:
# Score the query engine on the synthetic dataset; raw evaluations are written
# to NOTEBOOK_CACHE_DP as "synthetic_evaluations.json".
synthetic_mean_scores_df, synthetic_deep_eval_df = await aevaluate_labelled_rag_dataset(
    synthetic_response_eval_dataset,
    query_engine,
    dataset_name="synthetic",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node 6c5ed3b0-e702-43e5-abbc-be393ae04e19] [Similarity score: 0.7596490551728567] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> [Node 9b9f091e-632a-463e-a178-3f56b407c0aa] [Similarity score: 0.7502007361666977] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> Top 2 nodes:
> [Node 6c5ed3b0-e702-43e5-abbc-be393ae04e19] [Similarity score:             0.759649] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> [Node 9b9f091e-632a-463e-a178-3f56b407c0aa] [Similarity score:             0.750201] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> Top 2 nodes:
> [Node 6f824aaa-0440-48f2-a4e1-9cd25a5bf407] [Similarity score: 0.8252573722447051] Simplify your RAG application architecture with LlamaIndex + PostgresML
We’re happy to announce t...
> [Node afc179af-5233

Batch processing of predictions:  62%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                          | 5/8 [00:04<00:01,  1.66it/s]

> Top 2 nodes:
> [Node 9b9f091e-632a-463e-a178-3f56b407c0aa] [Similarity score: 0.7821356009124653] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node 6c5ed3b0-e702-43e5-abbc-be393ae04e19] [Similarity score: 0.7497006863665036] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> Top 2 nodes:
> [Node 9b9f091e-632a-463e-a178-3f56b407c0aa] [Similarity score:             0.782136] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node 6c5ed3b0-e702-43e5-abbc-be393ae04e19] [Similarity score:             0.749701] Automate online tasks with MultiOn and LlamaIndex
Introduction MultiOn is an AI agents platform d...
> Top 2 nodes:
> [Node f8994407-be6c-4c46-bf83-e1c660951a6e] [Similarity score: 0.7996859571107091] Step 2: Create the PostgresML Managed Index First install Llama_index and the PostgresML Managed ...
> [Node 3f0dcda1-cb3c

Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:07<00:00,  1.06it/s]


0it [00:00, ?it/s]

> Adding chunk: As these technologies evolve, they will continu...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
> Adding chunk: As these technologies evolve, they will continu...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
> Adding chunk: As these technologies evolve, they will continu...
> Adding chunk: Automate online tasks with MultiOn and LlamaInd...
> Adding chunk: As these technologies evolve, they will continu...
> Adding chunk: print (agent.chat( "browse to the latest email ...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: print (agent.chat( "browse to the latest email ...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: The email was authenticated and passed SPF and ...
> Adding chunk: print (agent.chat( "browse to the latest email ...
> Adding chunk: The email was authenticated and passed SPF and

In [43]:
# Mean score per metric on the synthetic dataset.
synthetic_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,4.571429
mean_relevancy_score,1.0
mean_faithfulness_score,1.0


In [44]:
# Per-question answers and scores on the synthetic dataset.
synthetic_deep_eval_df

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score
0,How does MultiOn empower developers to automat...,MultiOn empowers developers to automate online...,1.0,5.0,1.0
1,What is the role of LlamaIndex in complementin...,LlamaIndex complements MultiOn by providing an...,1.0,4.5,1.0
2,How does the MultiOn agent summarize the email...,The MultiOn agent summarizes the email chain w...,1.0,4.5,1.0
3,How does the MultiOn agent facilitate the send...,The MultiOn agent facilitates the sending of t...,1.0,,1.0
4,How does the integration of LlamaIndex with Po...,The integration of LlamaIndex with PostgresML ...,1.0,4.5,1.0
5,Explain the challenges associated with typical...,The challenges associated with typical RAG wor...,1.0,4.5,1.0
6,How does the PostgresML Managed Index simplify...,The PostgresML Managed Index simplifies the RA...,1.0,4.5,1.0
7,Explain the process of querying using the Post...,The querying process using the PostgresML Inde...,1.0,4.5,1.0


In [45]:
if LOG_TO_MLFLOW:
    # Transposed, the mean-score frame is a single record: metric name -> value.
    synthetic_mean_scores = synthetic_mean_scores_df.T.to_dict(orient='records')[0]
    for k, v in synthetic_mean_scores.items():
        mlflow.log_metric(f"synthetic_response_eval__{k}", v)

    # Attach the per-question table to the run as an HTML artifact.
    synthetic_deep_eval_html_fp = f"{NOTEBOOK_CACHE_DP}/synthetic_deep_eval_df.html"
    synthetic_deep_eval_df.to_html(synthetic_deep_eval_html_fp)
    mlflow.log_artifact(synthetic_deep_eval_html_fp, "synthetic_deep_eval_df")

### Manually curated
Ref: https://docs.llamaindex.ai/en/stable/examples/llama_dataset/ragdataset_submission_template/#1c-creating-a-labelledragdataset-from-scratch-with-manually-constructed-examples

In [46]:
from llama_index.core.llama_dataset import LabelledRagDataset, LabelledRagDataExample, CreatedBy, CreatedByType

# Build a labelled RAG dataset from the hand-written question/answer pairs.
# Both the query and the reference answer are attributed to a human, and no
# reference contexts are provided.
examples = []
for question, reference_answer in MANUAL_EVAL_QA:
    examples.append(
        LabelledRagDataExample(
            query=question,
            query_by=CreatedBy(type=CreatedByType.HUMAN),
            reference_answer=reference_answer,
            reference_answer_by=CreatedBy(type=CreatedByType.HUMAN),
            reference_contexts=[],
        )
    )

curated_response_eval_dataset = LabelledRagDataset(examples=examples)

# save this dataset as it is required for the submission
curated_response_eval_dataset_fp = f"{NOTEBOOK_CACHE_DP}/curated_response_eval_dataset.json"
curated_response_eval_dataset.save_json(curated_response_eval_dataset_fp)

In [47]:
# Score the query engine on the manually curated dataset; raw evaluations are
# written to NOTEBOOK_CACHE_DP as "curated_evaluations.json".
curated_mean_scores_df, curated_deep_eval_df = await aevaluate_labelled_rag_dataset(
    curated_response_eval_dataset,
    query_engine,
    dataset_name="curated",
    judge_model=RESPONSE_EVAL_LLM_MODEL,
    cache_dp=NOTEBOOK_CACHE_DP
)

Batch processing of predictions:   0%|                                                                                                                                                                                                               | 0/3 [00:00<?, ?it/s]

> Top 2 nodes:
> [Node afc179af-5233-405c-b132-86094639742a] [Similarity score: 0.7102608983008766] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 6f824aaa-0440-48f2-a4e1-9cd25a5bf407] [Similarity score: 0.6991652827223744] Simplify your RAG application architecture with LlamaIndex + PostgresML
We’re happy to announce t...
> Top 2 nodes:
> [Node afc179af-5233-405c-b132-86094639742a] [Similarity score:             0.710261] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> [Node 6f824aaa-0440-48f2-a4e1-9cd25a5bf407] [Similarity score:             0.699165] Simplify your RAG application architecture with LlamaIndex + PostgresML
We’re happy to announce t...
> Top 2 nodes:
> [Node 9b9f091e-632a-463e-a178-3f56b407c0aa] [Similarity score: 0.6852199499977049] As these technologies evolve, they will continue to unlock new potentials in AI application, sign...
> [Node 6c5ed3b0-e702

Batch processing of predictions:  33%|██████████████████████████████████████████████████████████████████▎                                                                                                                                    | 1/3 [00:01<00:02,  1.46s/it]

> Top 2 nodes:
> [Node 6f824aaa-0440-48f2-a4e1-9cd25a5bf407] [Similarity score: 0.6402868722827324] Simplify your RAG application architecture with LlamaIndex + PostgresML
We’re happy to announce t...
> [Node afc179af-5233-405c-b132-86094639742a] [Similarity score: 0.6343866618327142] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...
> Top 2 nodes:
> [Node 6f824aaa-0440-48f2-a4e1-9cd25a5bf407] [Similarity score:             0.640287] Simplify your RAG application architecture with LlamaIndex + PostgresML
We’re happy to announce t...
> [Node afc179af-5233-405c-b132-86094639742a] [Similarity score:             0.634387] On the PostgresML cloud, you can perform vector operations, create embeddings, and generate real-...


Batch processing of predictions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.22it/s]


0it [00:00, ?it/s]

> Adding chunk: On the PostgresML cloud, you can perform vector...
> Adding chunk: On the PostgresML cloud, you can perform vector...


In [48]:
# Mean score per metric on the manually curated dataset.
curated_mean_scores_df

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,1.5
mean_relevancy_score,0.0
mean_faithfulness_score,0.0


In [49]:
# Per-question answers and scores on the manually curated dataset.
# NOTE(review): with TESTING = True only the first two posts are indexed, so the
# curated questions probably have no matching context in the index — confirm
# before reading much into low scores or "Empty Response" answers.
curated_deep_eval_df

Unnamed: 0,query,answer,relevancy_score,correctness_score,faithfulness_score
0,What are key features of llama-agents?,Empty Response,0.0,1.0,0.0
1,What are the two critical areas of RAG system ...,The two critical areas of RAG system performan...,0.0,2.5,0.0
2,What are the two main metrics used to evaluate...,Empty Response,0.0,1.0,0.0


In [50]:
if LOG_TO_MLFLOW:
    # Transposed, the mean-score frame is a single record: metric name -> value.
    curated_mean_scores = curated_mean_scores_df.T.to_dict(orient='records')[0]
    for k, v in curated_mean_scores.items():
        mlflow.log_metric(f"curated_response_eval__{k}", v)

    # Attach the per-question table to the run as an HTML artifact.
    curated_deep_eval_html_fp = f"{NOTEBOOK_CACHE_DP}/curated_deep_eval_df.html"
    curated_deep_eval_df.to_html(curated_deep_eval_html_fp)
    mlflow.log_artifact(curated_deep_eval_html_fp, "curated_deep_eval_df")

# Clean up

In [51]:
# Close the MLflow run opened in the set-up section.
# NOTE(review): called even when LOG_TO_MLFLOW is False; presumably harmless
# when no run is active — confirm.
mlflow.end_run()

# Archive