# Web scrapping data

The first step is to use Selenium and BeautifulSoup to scrape blog links from the LlamaIndex blog page. The script is designed to run in headless mode, meaning it does not open a browser window, making it suitable for automated environments. 

In [None]:
#%%
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import time

# Set up Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode to avoid opening a browser window
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
service = Service('/usr/local/bin/chromedriver')  # Update with the path to your chromedriver

driver = webdriver.Chrome(service=service, options=chrome_options)

# Block link

main_link = "https://www.llamaindex.ai/blog"

# Open the blog page
driver.get(main_link)

# Scroll to the bottom of the page to load all content
SCROLL_PAUSE_TIME = 2
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to the bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
    # Wait to load the page
    time.sleep(SCROLL_PAUSE_TIME)
    
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Get page source and parse with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Close the WebDriver
driver.quit()

# Extract all blog links
blog_links = []
for a_tag in soup.find_all('a', href=True):
    href = a_tag['href']
    if '/blog/' in href:
        blog_links.append(href)

# Remove duplicates and print the blog links
blog_links = list(set(blog_links))
for link in blog_links:
    print(link)


The next step is to go through all links extracted by the previous code, extract all the HTML links with relevant information, and store the data as a JSON file with the following structure:


```json
{
  "blog_posts": [
    {
      "date": "date",
      "author": "author",
      "title": "title",
      "tags": ["tag1", "tag2"],
      "content": "content_markdown",
      "related_posts": ["related_post1", "related_post2"],
      "link": "blog_link"
    }
    // Additional blog post objects can be added here
  ]
}
```


In [None]:
# Extract the content of the first blog post
def has_blogpost_date_class(tag):
    return tag.has_attr('class') and any('BlogPost_date' in cls for cls in tag['class'])

def has_blogpost_title_class(tag):
    return tag.has_attr('class') and any('BlogPost_title' in cls for cls in tag['class'])

def has_blogpost_tags_class(tag):
    return tag.has_attr('class') and any('BlogPost_tags' in cls for cls in tag['class'])

def has_blogpost_content_class(tag):
    return tag.has_attr('class') and any('BlogPost_htmlPost' in cls for cls in tag['class'])

def has_blogpost_relatedposts_class(tag):
    return tag.has_attr('class') and any('BlogPost_relatedPostsList' in cls for cls in tag['class'])


#%%

blog_contents = []

# parse the first blog post using beautifulsoup
for blog_link in blog_links:
    print(f"Extracting content for blog post: {blog_link}")
    blog_link = main_link + "/" + blog_link.split("/blog")[1]
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.get(blog_link)
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()

     # Find all elements that match the criteria
    elements_with_blogpost_date = soup.find_all(has_blogpost_date_class)
    elements_with_blogpost_title = soup.find_all(has_blogpost_title_class)
    elements_with_blogpost_tags = soup.find_all(has_blogpost_tags_class)
    elements_with_blogpost_content = soup.find_all(has_blogpost_content_class)
    elements_with_blogpost_relatedposts = soup.find_all(has_blogpost_relatedposts_class)


    # extract out the content of the first blog post
    try:
        date_author = elements_with_blogpost_date[0].get_text()
        try:
            date = date_author.split(' • ')[1].strip()
        except:
            date = date_author
            author = ""
    except:
        date = ""
        author = ""
    
    author = date_author.split(' • ')[0].strip()
    title = elements_with_blogpost_title[0].get_text()
    content = soup.find_all(has_blogpost_content_class)[0]
    html_string = str(content)
    content_markdown = md(html_string)
    # extract out the related base on the the text of the a tag
    try:
        tags = [tag['href'].split('/')[-1] for tag in elements_with_blogpost_tags[0].find_all('a')]
    except IndexError:
        tags = []
    try:
        related_posts = [post.get_text() for post in elements_with_blogpost_relatedposts[0].find_all('a')]
    except IndexError:
        related_posts = []

    blog_contents.append({
        'date': date,
        'author': author,
        'title': title,
        'tags': tags,
        'content': content_markdown,
        'related_posts': related_posts,
        'link': blog_link,
    })
    
    
    print(f"Extracted content for blog post: {title}")
#dump the blog contents to a json file
import json
with open('blog_contents.json', 'w') as f:
    json.dump({"blog_posts": blog_contents}, f, indent=4)


# Load and Index data


This step reads blog post data from a JSON file and converts each post into a `Document` object using the `llama_index.core` module. It performs the following steps:

1. **Load JSON Data**: Reads the `data/blog_contents.json` file and extracts the `blog_posts` list.
2. **Create Document Objects**: Defines a function `create_document(content)` that takes a blog post dictionary and returns a `Document` object with the post content and metadata.
3. **Generate Document List**: Uses a list comprehension to create a list of `Document` objects from the JSON data, storing the result in the `documents` variable.


In [2]:
from llama_index.core import Document
import json

with open('data/blog_contents.json') as f:
    data = json.load(f)['blog_posts']

def create_document(content):
    return Document(
        text=content["content"],
        metadata={
            "title": content["title"],
            "date": content["date"],
            "author": content["author"],
            "tags": content["tags"],
            "related_posts": content["related_posts"],
            "link": content["link"],
        }
    )

documents = [create_document(content) for content in data]

Next, set up and run an ingestion pipeline to process a list of `Document` objects by parsing and splitting them into manageable chunks using `MarkdownNodeParser` and `SentenceSplitter`. It uses `nest_asyncio` to enable nested asyncio event loops and processes the documents through an `IngestionPipeline`, resulting in a list of processed nodes. 

The `chunk_size` is set either to `512` with an overlap of `50`, or `1024` with an overlap of `10` for testing.

In [3]:
from llama_index.core.node_parser import MarkdownNodeParser, SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
import nest_asyncio

nest_asyncio.apply()
chunk_size = 1024
chunk_overlap = 10

parser = MarkdownNodeParser()
splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
pipeline = IngestionPipeline(transformations=[parser, splitter])
nodes = pipeline.run(documents=documents)


Next, load and initialize various embeddings and a language model using API keys for different services, including `OpenAI`, `Gemini`, `VoyageAI`, and `Cohere`. For the LLM model, I used only `gemini-1.5-flash`, and for the embedding model, I tried different embedding models to test which one is the best:

1. embed-english-v2.0 (Cohere)
2. text-embedding-3-small (OpenAI)
3. embedding-001 (Gemini)
4. voyage-2 (Voyage)
5. embed-english-v3.0 (Cohere)
6. text-embedding-ada-002 (OpenAI)

In [4]:
#load the model and the embedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.embeddings.voyageai import VoyageEmbedding
from llama_index.embeddings.cohere import CohereEmbedding

from llama_index.llms.gemini import Gemini
from llama_index.core import Settings


gemini_pro_api_key = "AIzaSyCnoLpXf-htTXwnVAN7_y0XC_5nhxzUtto"
openai_api_key = "sk-proj-4UvyN6hXPREr1vzH13gyT3BlbkFJy3BITZeZvinu7F9tPtpt"
voyage_api_key = "pa-K__iAI6LGPD1KuwJ8QA2p--ei41NcL14y1ZhNyyz0h8"
cohere_api_key = "KsgEqBsrUqxz8DuwV1qpE8eT0fhDBgcIvGMPDdF9"

model = "gemini-1.5-flash"
model_embedding = "embedding-001"


llm = Settings.llm = Gemini(api_key=gemini_pro_api_key, model=f"models/{model}")


The `embed_model_dict` defines the `Embedding` objects used to index data, depending on the name of the embedding model.

In [5]:
embed_model_dict = {'embed-english-v2.0': CohereEmbedding(model_name="embed-english-v2.0",
                                                                 api_key=cohere_api_key),
                    'text-embedding-3-small': OpenAIEmbedding(model_name="text-embedding-3-small",
                                                              api_key=openai_api_key),
                    'embedding-001': GeminiEmbedding(model_name="embedding-001",
                                                      api_key=gemini_pro_api_key),
                    'voyage-2': VoyageEmbedding(model_name="voyage-2",
                                                      voyage_api_key=voyage_api_key),
                    'embed-english-v3.0': CohereEmbedding(model_name="embed-english-v3.0",
                                                           api_key=cohere_api_key),
                    'text-embedding-ada-002': OpenAIEmbedding(model_name="text-embedding-ada-002",
                                                              api_key=openai_api_key),}

In [67]:
from llama_index.core import SummaryIndex, VectorStoreIndex
from llama_index.core import StorageContext, load_index_from_storage

# indexing 
for model_embedding, embed in embed_model_dict.items():

    print(f"Indexing with {model_embedding}...")

    embed_model = Settings.embedding = embed

    print(Settings.embedding)


    # Persist the index to disk
    PERSIST_DIR_VECTOR = f"./storage/chunk_size_{chunk_size}/{model_embedding}/vector_index"
    PERSIST_DIR_SUMMARY = f"./storage/chunk_size_{chunk_size}/{model_embedding}/summary_index"

    #check if PERSIS_DIR_VECTOR and PERSIST_DIR_SUMMARY have files

    try:
        # Rebuild storage context
        storage_context_vector = StorageContext.from_defaults(persist_dir=PERSIST_DIR_VECTOR)
        storage_context_summary = StorageContext.from_defaults(persist_dir=PERSIST_DIR_SUMMARY)

        # Load index from the storage context
        vector_index = load_index_from_storage(storage_context_vector, embed_model=embed_model)
        summary_index= load_index_from_storage(storage_context_summary, embed_model=embed_model)

    except FileNotFoundError:
        # create the indexes
        vector_index = VectorStoreIndex(nodes)
        summary_index = SummaryIndex(nodes)

        # store the indexes

        vector_index.storage_context.persist(persist_dir=PERSIST_DIR_VECTOR)
        summary_index.storage_context.persist(persist_dir=PERSIST_DIR_SUMMARY)

Indexing with embed-english-v2.0...
model_name='embed-english-v2.0' embed_batch_size=10 callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x2903c1280> num_workers=None api_key='KsgEqBsrUqxz8DuwV1qpE8eT0fhDBgcIvGMPDdF9' truncate='END' input_type=None embedding_type='float'
Indexing with text-embedding-3-small...
model_name='text-embedding-3-small' embed_batch_size=100 callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x2903c1280> num_workers=None additional_kwargs={} api_key='sk-proj-4UvyN6hXPREr1vzH13gyT3BlbkFJy3BITZeZvinu7F9tPtpt' api_base='https://api.openai.com/v1' api_version='' max_retries=10 timeout=60.0 default_headers=None reuse_client=True dimensions=None
Indexing with embedding-001...
model_name='embedding-001' embed_batch_size=10 callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x2903c1280> num_workers=None title=None task_type='retrieval_document' api_key='AIzaSyCnoLpXf-htTXwnVAN7_y0XC_5nhxzUtto'

# Retrieval 
This step aims to efficiently retrieve specific contexts and summaries from indexed documents using different embedding models and evaluate the performance of various retrievers and rerankers.

1. **Define Functions for Building and Retrieving Indices**:
    - **`build_retriever` Function**: Configure response synthesizers and create vector and summary retrievers. These retrievers are wrapped into retriever tools and combined into a `RouterRetriever` for flexible querying.
    - **`load_index` Function**: Rebuild storage contexts for both vector and summary indices from specified directories and load the indices for further use.

2. **Generate QA Dataset for Evaluation**:
    - **Create Question-Context Pairs**: Generate a QA dataset by creating question-context pairs from a set of documents using the `llama_index` library.
    - **Handle API Rate Limits**: Employ a retry mechanism with exponential backoff to manage potential API rate limits or transient errors.
    - **Compile and Save Dataset**: Compile the generated question-context pairs into an `EmbeddingQAFinetuneDataset` and save it as a JSON file for further use.
    - **Testing Parameters**: Use 60 questions for testing with `chunk_size = 1024` and 100 questions for `chunk_size = 512`

Next, define functions to build and retrieve indices for a document retrieval system using the `llama_index` library. The `build_retriever` function configures response synthesizers and creates vector and summary retrievers, which are then wrapped into retriever tools and combined into a `RouterRetriever` for flexible querying. The `load_index` function rebuilds storage contexts for both vector and summary indices from specified directories and loads the indices for further use. This setup allows for efficient retrieval of specific contexts and summaries from indexed documents using different embedding models.

In [6]:
# Retrive the index

from llama_index.core.retrievers import VectorIndexRetriever, SummaryIndexRetriever, RouterRetriever
from llama_index.core.selectors import LLMMultiSelector
from llama_index.core.tools import RetrieverTool
from llama_index.core import Settings

def build_retriver(vector_index, sumary_index, llm=Settings.llm):

    # configure response synthesizer
    response_synthesizer = get_response_synthesizer()

    # create the retrievers
    vector_retriever = VectorIndexRetriever(index=vector_index,
                                            similarity_top_k=10
                                            )
    summary_retriever = SummaryIndexRetriever(index=summary_index,
                                            similarity_top_k=10
                                            )


    # initialize tools
    vector_retriever_tool = RetrieverTool.from_defaults(
        retriever=vector_retriever,
        description="Useful for retrieving specific context from llamaindex blog posts.",
    )
    summary_retriever_tool = RetrieverTool.from_defaults(
        retriever=summary_retriever,
        description="Useful for retrieving summaries of llamaindex blog posts.",
    )

    # define retriever
    retriever = RouterRetriever(
        selector=LLMMultiSelector.from_defaults(llm=llm),
        retriever_tools=[
            vector_retriever_tool,
            summary_retriever_tool,
        ],
    )

    return vector_retriever, summary_retriever, retriever


def load_index(model_embedding):



    # Rebuild storage context
    storage_context_vector = StorageContext.from_defaults(persist_dir=f"./storage/chunk_size_{chunk_size}/{model_embedding}/vector_index")
    storage_context_summary = StorageContext.from_defaults(persist_dir=f"./storage/chunk_size_{chunk_size}/{model_embedding}/summary_index")

    # Load index from the storage context
    vector_index = load_index_from_storage(storage_context_vector, embed_model=embed_model_dict[model_embedding])
    summary_index= load_index_from_storage(storage_context_summary, embed_model=embed_model_dict[model_embedding])

    return vector_index, summary_index

### Generate QA dataset to evaluate the retriever with different embeddings model

Next, I generated a QA dataset by creating question-context pairs from a set of documents using the `llama_index` library. It employs a retry mechanism with exponential backoff to handle potential API rate limits or other transient errors. The `generate_question_context_pairs_with_backoff` function uses the OpenAI model to generate questions based on random document samples, and the process is repeated until a specified number of tests is reached. The generated question-context pairs are then compiled into an `EmbeddingQAFinetuneDataset` and saved as a JSON file for further use. I used 60 questions for testing in case of `chunk_size = 1024` and 100 questions for `chunk_size = 512`


In [75]:
#Dataset for evaluation

import random
import time
import pandas as pd
from tenacity import retry, wait_random_exponential, stop_after_attempt
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import (
    RetrieverEvaluator,
    generate_question_context_pairs
)
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

qa_generate_prompt_tmpl = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. The questions should not contain options, not start with Q1/ Q2. \
Restrict the questions to the context information provided.\

RETURN THE QUESTIONS ONLY.
"""


model_dataset = "gpt-3.5-turbo"
llm_dataset = OpenAI(model=model_dataset, api_key=openai_api_key)



#I don't understand how does this work works, what does it mean to backoff? what happen under the hood
# Retry decorator with exponential backoff
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def generate_question_context_pairs_with_backoff(nodes, llm, num_questions_per_chunk=1):
    return generate_question_context_pairs(random.sample(nodes, 10), 
                                           llm=llm, 
                                           num_questions_per_chunk=num_questions_per_chunk,
                                           qa_generate_prompt_tmpl=qa_generate_prompt_tmpl)

# Initialize QA dataset and counter
qa_dataset_list = []
num_tests = 0
max_tests = 100

while num_tests < max_tests:
    try:
        # Generate question-context pairs with retry and backoff
        question_context_pairs = generate_question_context_pairs_with_backoff(nodes, Settings.llm, num_questions_per_chunk=1)
        
        # Update the number of tests and append to the QA dataset
        num_tests += 10
        qa_dataset_list.append(question_context_pairs)

        # Pause for 30 seconds after each batch
        time.sleep(45)

        
    except Exception as e:
        print(f"Error generating question-context pairs: {e}")


qa_dataset = EmbeddingQAFinetuneDataset(
    queries = {k: v for dataset in qa_dataset_list for k, v in dataset.queries.items()},
    corpus =  {k: v for dataset in qa_dataset_list for k, v in dataset.corpus.items()},
    relevant_docs = {k: v for dataset in qa_dataset_list for k, v in dataset.relevant_docs.items()},
    mode='text')

qa_dataset.queries
qa_dataset.save_json("qa_dataset_chunk_size_{chunk_size}.json")

# Print the total number of QA pairs generated
print(f"Total QA pairs generated: {len(qa_dataset_list)}")


 50%|█████     | 5/10 [00:04<00:04,  1.16it/s]


KeyboardInterrupt: 

# Retrieval Evaluation

The aim of this step is to evaluate different embedding models and rerankers by:

1. **Loading Indices**: Load vector and summary indices for each embedding model.
2. **Building Retrievers**: Initialize vector and summary retrievers.
3. **Setting Up Rerankers**: Configure various rerankers, including a baseline with no reranking.
4. **Running Evaluations**: Evaluate the retrievers on a QA dataset using `RetrieverEvaluator`.
5. **Saving Results**: Save the evaluation metrics (hit rate, MRR, precision, recall, AP, NDCG) to CSV files for further analysis.

Additionally, define a custom retriever class, `CustomRetriever`, that performs both vector search and optional reranking of retrieved results, allowing for testing different reranker models.

In summary, there is no significant difference between the embedding models, which suggests that the code might not be running correctly. However, the `chunk_size` of `512` performs worse than the `chunk_size` of 1024, with hit rates of `0.78` and `0.89`, respectively. Based on the provided images, the best chunk size is `1024`, and the best rerank model is "`cohere`," as it consistently achieves the highest Mean Reciprocal Rank (MRR) across different embedding models.


In [88]:
#Define a helper function to extract the evaluatation results

def display_results(name, eval_results):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    columns = {
        "retrievers": [name],
        **{k: [full_df[k].mean()] for k in metrics},
    }

    metric_df = pd.DataFrame(columns)

    return metric_df

Define a custom retriever class, `CustomRetriever`, that extends the `BaseRetriever` class from the `llama_index` library. The `CustomRetriever` performs both vector search and optional reranking of retrieved results. This function allows for creating the retrieval that can have and test different reranker models; when `reranker=None`, it means that no reranker model is used.

It initializes with a `VectorIndexRetriever` and an optional reranker. The `_retrieve` method retrieves nodes based on a given query and applies the reranker if provided. The class also includes asynchronous methods `_aretrieve` and `aretrieve` for retrieving nodes asynchronously.

In [10]:
from typing import List
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever
from llama_index.core.schema import QueryBundle, QueryType, NodeWithScore
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker
from llama_index.postprocessor.cohere_rerank import CohereRerank


class CustomRetriever(BaseRetriever):
    """Custom retriever that performs both Vector search and Knowledge Graph search."""

    def __init__(self, vector_retriever: VectorIndexRetriever, reranker=None) -> None:
        """Initialize parameters."""
        self._vector_retriever = vector_retriever
        self._reranker = reranker

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""
        retrieved_nodes = self._vector_retriever.retrieve(query_bundle)

        if self._reranker is not None:
            retrieved_nodes = self._reranker.postprocess_nodes(retrieved_nodes, query_bundle)
        else:
            retrieved_nodes = retrieved_nodes
        
        return retrieved_nodes

    async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Asynchronously retrieve nodes given query."""
        return self._retrieve(query_bundle)

    async def aretrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
        """Asynchronously retrieve nodes given a string or QueryBundle."""
        if isinstance(str_or_query_bundle, str):
            str_or_query_bundle = QueryBundle(str_or_query_bundle)
        return await self._aretrieve(str_or_query_bundle)

Eevaluates different embedding models and rerankers by loading indices, building retrievers, and running evaluations on a QA dataset. It iterates through each embedding model in embed_model_dict, initializes vector and summary retrievers, and sets up various rerankers. For each reranker, including a baseline with no reranking, it checks if the results file already exists, runs the evaluation using RetrieverEvaluator, and saves the results to a CSV file. The evaluation metrics include hit rate, MRR, precision, recall, AP, and NDCG, and the results are saved for further analysis.

The metrics used to evaluate included: 

- **Hit Rate `hit_rate`**: Measures the proportion of relevant documents retrieved out of the total number of relevant documents available.
- **Mean Reciprocal Rank (MRR) `mrr`**: Calculates the average of the reciprocal ranks of the first relevant document retrieved for a set of queries.
- **Precision `precision`**: Indicates the ratio of relevant documents retrieved to the total number of documents retrieved.
- **Recall `recall`**: Represents the ratio of relevant documents retrieved to the total number of relevant documents in the dataset.
- **Average Precision (AP) `ap`**: Combines precision and recall by averaging precision values at ranks where relevant documents are found.
- **Normalized Discounted Cumulative Gain (NDCG) `ndcg`**: Assesses ranking quality by considering both the relevance and the position of retrieved documents.

In [58]:
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
import json
with open('qa_dataset_chunk_size_1024.json') as f:
    qa_dataset = json.load(f)
qa_dataset = EmbeddingQAFinetuneDataset(
queries = qa_dataset['queries'],
corpus = qa_dataset['corpus'],
relevant_docs = qa_dataset['relevant_docs'],
mode= 'text')

In [89]:
import os
from llama_index.core.evaluation import RetrieverEvaluator
from tenacity import retry, stop_after_attempt, wait_fixed

# @retry(wait=wait_random_exponential(min=10, max=90), stop=stop_after_attempt(6))
async def evaluate_retriever(model_embedding, rerank_name, reranker, vector_retriever, qa_dataset):
    if os.path.exists(f"{model_embedding}_{rerank_name}_results.csv"):
        print(f"{model_embedding}_{rerank_name}_results.csv already exists")
        return

    custom_retriever = CustomRetriever(vector_retriever=vector_retriever, reranker=reranker)
    metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]
    retriever_evaluator = RetrieverEvaluator.from_metric_names(metrics, retriever=custom_retriever)
    eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset, show_progress=True)
    result_df = display_results(rerank_name, eval_results)

    print(result_df)
    result_df.to_csv(f"{model_embedding}_{rerank_name}_results.csv")

for model_embedding in embed_model_dict.keys():
    print(f"Running evaluation for {model_embedding}...")

    Settings.embedding = embed

    vector_index = VectorStoreIndex(nodes)
    summary_index = SummaryIndex(nodes)

    vector_retriever, summary_retriever, _ = build_retriver(vector_index, summary_index, llm=llm)


    # Initialize the rerankers
    # ≈ = FlagEmbeddingReranker(model="BAAI/bge-reranker-v2-m3", top_n=10)
    # bge_large_reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=10)
    cohere_rerank = CohereRerank(api_key=cohere_api_key, top_n=10)

    bge_large_reranker = None
    bge_v2_reranker = None

    for rerank_name, reranker in zip(["base", "bge_small", "bge_large", "cohere"], 
                                     [None, bge_v2_reranker, bge_large_reranker, cohere_rerank]):

        print(f"Reranking with {rerank_name}...")

        if (bge_v2_reranker is None) or (bge_large_reranker is None):
            continue
        
        await evaluate_retriever(model_embedding, rerank_name, reranker, vector_retriever, qa_dataset)

Running evaluation for embed-english-v2.0...
Reranking with base...


100%|██████████| 100/100 [00:25<00:00,  3.93it/s]


  retrievers  hit_rate      mrr  precision  recall       ap      ndcg
0       base      0.89  0.65581      0.089    0.89  0.65581  0.156697
Reranking with bge_small...


100%|██████████| 100/100 [14:11<00:00,  8.52s/it]  


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_small      0.89  0.718024      0.089    0.89  0.718024  0.167217
Reranking with bge_large...


100%|██████████| 100/100 [12:42<00:00,  7.63s/it]  


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_large      0.89  0.727107      0.089    0.89  0.727107  0.168697
Reranking with cohere...


100%|██████████| 100/100 [00:52<00:00,  1.89it/s]


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0     cohere      0.89  0.752333      0.089    0.89  0.752333  0.173226
Running evaluation for text-embedding-3-small...


Task exception was never retrieved
future: <Task finished name='Task-1402' coro=<tqdm_asyncio.gather.<locals>.wrap_awaitable() done, defined at /Users/giahuyhoangle/anaconda3/envs/ragenv/lib/python3.12/site-packages/tqdm/asyncio.py:75> exception=ValueError('shapes (4096,) and (768,) not aligned: 4096 (dim 0) != 768 (dim 0)')>
Traceback (most recent call last):
  File "/Users/giahuyhoangle/anaconda3/envs/ragenv/lib/python3.12/asyncio/tasks.py", line 314, in __step_run_and_handle_result
    result = coro.send(None)
             ^^^^^^^^^^^^^^^
  File "/Users/giahuyhoangle/anaconda3/envs/ragenv/lib/python3.12/site-packages/tqdm/asyncio.py", line 76, in wrap_awaitable
    return i, await f
              ^^^^^^^
  File "/Users/giahuyhoangle/anaconda3/envs/ragenv/lib/python3.12/site-packages/llama_index/core/evaluation/retrieval/base.py", line 186, in eval_worker
    return await self.aevaluate(query, expected_ids=expected_ids, mode=mode)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Reranking with base...


100%|██████████| 100/100 [00:25<00:00,  3.96it/s]


  retrievers  hit_rate      mrr  precision  recall       ap      ndcg
0       base      0.89  0.65581      0.089    0.89  0.65581  0.156697
Reranking with bge_small...


100%|██████████| 100/100 [10:46<00:00,  6.46s/it]  


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_small      0.89  0.718024      0.089    0.89  0.718024  0.167217
Reranking with bge_large...


100%|██████████| 100/100 [12:15<00:00,  7.35s/it]  


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_large      0.89  0.727107      0.089    0.89  0.727107  0.168697
Reranking with cohere...


100%|██████████| 100/100 [00:51<00:00,  1.94it/s]


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0     cohere      0.89  0.752333      0.089    0.89  0.752333  0.173226
Running evaluation for embedding-001...
Reranking with base...


100%|██████████| 100/100 [00:25<00:00,  3.87it/s]


  retrievers  hit_rate      mrr  precision  recall       ap      ndcg
0       base      0.89  0.65581      0.089    0.89  0.65581  0.156697
Reranking with bge_small...


100%|██████████| 100/100 [14:38<00:00,  8.79s/it]  


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_small      0.89  0.718024      0.089    0.89  0.718024  0.167217
Reranking with bge_large...


100%|██████████| 100/100 [12:22<00:00,  7.43s/it]  


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_large      0.89  0.727107      0.089    0.89  0.727107  0.168697
Reranking with cohere...


100%|██████████| 100/100 [01:02<00:00,  1.60it/s]


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0     cohere      0.89  0.752333      0.089    0.89  0.752333  0.173226
Running evaluation for voyage-2...
Reranking with base...


100%|██████████| 100/100 [00:25<00:00,  3.92it/s]


  retrievers  hit_rate      mrr  precision  recall       ap      ndcg
0       base      0.89  0.65581      0.089    0.89  0.65581  0.156697
Reranking with bge_small...


100%|██████████| 100/100 [11:39<00:00,  7.00s/it]  


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_small      0.89  0.718024      0.089    0.89  0.718024  0.167217
Reranking with bge_large...


100%|██████████| 100/100 [10:33<00:00,  6.33s/it]  


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_large      0.89  0.727107      0.089    0.89  0.727107  0.168697
Reranking with cohere...


100%|██████████| 100/100 [00:57<00:00,  1.75it/s]


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0     cohere      0.89  0.752333      0.089    0.89  0.752333  0.173226
Running evaluation for embed-english-v3.0...
Reranking with base...


100%|██████████| 100/100 [00:25<00:00,  3.89it/s]


  retrievers  hit_rate      mrr  precision  recall       ap      ndcg
0       base      0.89  0.65581      0.089    0.89  0.65581  0.156697
Reranking with bge_small...


100%|██████████| 100/100 [12:12<00:00,  7.32s/it]  


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_small      0.89  0.718024      0.089    0.89  0.718024  0.167217
Reranking with bge_large...


100%|██████████| 100/100 [18:00<00:00, 10.80s/it]   


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_large      0.89  0.727107      0.089    0.89  0.727107  0.168697
Reranking with cohere...


100%|██████████| 100/100 [00:47<00:00,  2.13it/s]


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0     cohere      0.89  0.752333      0.089    0.89  0.752333  0.173226
Running evaluation for text-embedding-ada-002...
Reranking with base...


100%|██████████| 100/100 [00:26<00:00,  3.83it/s]


  retrievers  hit_rate      mrr  precision  recall       ap      ndcg
0       base      0.89  0.65581      0.089    0.89  0.65581  0.156697
Reranking with bge_small...


100%|██████████| 100/100 [11:27<00:00,  6.88s/it]  


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_small      0.89  0.718024      0.089    0.89  0.718024  0.167217
Reranking with bge_large...


100%|██████████| 100/100 [12:32<00:00,  7.52s/it]  


  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0  bge_large      0.89  0.727107      0.089    0.89  0.727107  0.168697
Reranking with cohere...


100%|██████████| 100/100 [00:59<00:00,  1.67it/s]

  retrievers  hit_rate       mrr  precision  recall        ap      ndcg
0     cohere      0.89  0.752333      0.089    0.89  0.752333  0.173226





In [106]:
#load all the results from the csv files in the results_1024 folder and concatenate them into a single dataframe and show the results, the index is embed_model
# and the columns are reranker(level1)  hit_rate, mrr level2

import pandas as pd
import os
import glob



def display_summary_results(path = r'results_1024'):
    
    all_files = glob.glob(os.path.join(path, "*.csv"))

    df_from_each_file = []

    for f in all_files:

        df = pd.read_csv(f)[['retrievers','hit_rate', 'mrr']]

        #change retrievers column name to reranker
        df.rename(columns={'retrievers':'reranker'}, inplace=True)
        df['embed_model'] = f.split('/')[-1].split('_')[0]
        df.set_index(['embed_model','reranker'], inplace=True)

        # print(df)

        # #set order the reranker index to base, bge_small, bge_large, cohere
        # df = df.reindex(['base', 'bge_small', 'bge_large', 'cohere'])


        df_from_each_file.append(df)

    # concatenate all the dataframes
    df = pd.concat(df_from_each_file, axis=0)

    rerankers = ['base', 'bge_small', 'bge_large', 'cohere']

    df_rankers = []

    for reranker in rerankers:
        print(f"Results for {reranker}")

        df_ranker = df.xs(reranker, level='reranker')

        print(df_ranker)
        
        #reorder the rows base on the index
        
        reranker_T = df_ranker.T
        reranker_T['reranker'] = reranker

        reranker_T.set_index(['reranker'], append=True, inplace=True)
        df_rankers.append(reranker_T)
        
    # concatenate all the dataframes horizontall

    df = pd.concat(df_rankers, axis=0)
    return df



In [107]:
df_1024 = display_summary_results()

df_512 = display_summary_results('results_512')

Results for base
                        hit_rate      mrr
embed_model                              
embed-english-v2.0          0.89  0.65581
text-embedding-ada-002      0.89  0.65581
embedding-001               0.89  0.65581
text-embedding-3-small      0.89  0.65581
voyage-2                    0.89  0.65581
embed-english-v3.0          0.89  0.65581
Results for bge_small
                        hit_rate       mrr
embed_model                               
text-embedding-3-small      0.89  0.718024
voyage-2                    0.89  0.718024
embed-english-v2.0          0.89  0.718024
embedding-001               0.89  0.718024
text-embedding-ada-002      0.89  0.718024
embed-english-v3.0          0.89  0.718024
Results for bge_large
                        hit_rate       mrr
embed_model                               
voyage-2                    0.89  0.727107
text-embedding-3-small      0.89  0.727107
embedding-001               0.89  0.727107
embed-english-v2.0          0.89  0.727107
t

In [108]:
df_1024.head(1000)

Unnamed: 0_level_0,embed_model,embed-english-v2.0,text-embedding-ada-002,embedding-001,text-embedding-3-small,voyage-2,embed-english-v3.0
Unnamed: 0_level_1,reranker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
hit_rate,base,0.89,0.89,0.89,0.89,0.89,0.89
mrr,base,0.65581,0.65581,0.65581,0.65581,0.65581,0.65581
hit_rate,bge_small,0.89,0.89,0.89,0.89,0.89,0.89
mrr,bge_small,0.718024,0.718024,0.718024,0.718024,0.718024,0.718024
hit_rate,bge_large,0.89,0.89,0.89,0.89,0.89,0.89
mrr,bge_large,0.727107,0.727107,0.727107,0.727107,0.727107,0.727107
hit_rate,cohere,0.89,0.89,0.89,0.89,0.89,0.89
mrr,cohere,0.752333,0.752333,0.752333,0.752333,0.752333,0.752333


In [109]:
df_512.head(1000)

Unnamed: 0_level_0,embed_model,embed-english-v2.0,text-embedding-ada-002,embedding-001,text-embedding-3-small,voyage-2,embed-english-v3.0
Unnamed: 0_level_1,reranker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
hit_rate,base,0.78,0.78,0.78,0.78,0.78,0.78
mrr,base,0.515036,0.515036,0.515036,0.515036,0.515036,0.515036
hit_rate,bge_small,0.78,0.78,0.78,0.78,0.78,0.78
mrr,bge_small,0.678429,0.678429,0.678429,0.678429,0.678429,0.678429
hit_rate,bge_large,0.78,0.78,0.78,0.78,0.78,0.78
mrr,bge_large,0.66694,0.66694,0.66694,0.66694,0.66694,0.66694
hit_rate,cohere,0.78,0.78,0.78,0.78,0.78,0.78
mrr,cohere,0.673583,0.673583,0.673583,0.673583,0.673583,0.673583


In summary, there is no significant difference between the embedding models, which suggests that the code might not be running correctly. However, the `chunk_size` of `512` performs worse than the `chunk_size` of 1024, with hit rates of `0.78` and `0.89`, respectively. Based on the provided images, the best chunk size is `1024`, and the best rerank model is "`cohere`," as it consistently achieves the highest Mean Reciprocal Rank (MRR) across different embedding models.


# Query and Response Evaluation

Based on the previous sessions, it turns out that the embedding model doesn't matter much (or the code I wrote might be wrong), so the final RAG pipeline will use the Gemini embedding-001 model and the "gemini-1-5-flash" model as the base LLM model, with Cohere as the reranker. As the performance of RAG heavily depends on the retrieval step rather than the response synthesizer step, I did not perform evaluation on the latter. Instead, I wrote a helper function `query_and_print_sources` that handles backoff limits using the Tenacity library and prints out the most relevant sources. I also used node preprocessing, retaining only nodes that have a similarity with the question larger than 0.7. The outputs show that the RAG model correctly retrieves relevant information and answers the example question successfully.



In [50]:
#import 

from llama_index.core import VectorStoreIndex, SummaryIndex
from llama_index.core.retrievers import VectorIndexRetriever, SummaryIndexRetriever, RouterRetriever
from llama_index.core.query_engine import RetrieverQueryEngine, RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core.tools import RetrieverTool, QueryEngineTool
from llama_index.core import get_response_synthesizer
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.core.postprocessor import SimilarityPostprocessor

# #load the best model
# model = "gemini-1.5-flash"
# model_embedding = "embedding-001"

# gemini_pro_api_key = "AIzaSyCnoLpXf-htTXwnVAN7_y0XC_5nhxzUtto"

# llm = Settings.llm = Gemini(api_key=gemini_pro_api_key, model=f"models/{model}")
# embed_model = Settings.embed_model = GeminiEmbedding(model_name=f"models/{model_embedding}",
#                                        api_key=gemini_pro_api_key, )




# # create the indexes
# vector_index = VectorStoreIndex(nodes)
# summary_index = SummaryIndex(nodes)


# create the retrievers
vector_retriever, summary_retriever, _ = build_retriver(vector_index, summary_index)
reranker = CohereRerank(api_key=cohere_api_key, top_n=5)
vector_with_rerank_retriever = CustomRetriever(vector_retriever=vector_retriever,reranker=reranker)


# configure response synthesizer
summary_response_synthesizer = get_response_synthesizer(response_mode="tree_summarize",
                                                        use_async=True)

vector_response_synthesizer = get_response_synthesizer()


# create the query engines
list_query_engine = RetrieverQueryEngine(retriever=summary_retriever,
                                         response_synthesizer=summary_response_synthesizer,
                                         node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

vector_query_engine = RetrieverQueryEngine(retriever=vector_retriever,
                                            response_synthesizer=vector_response_synthesizer,
                                            node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
    )

vector_query_engine_with_rerank = RetrieverQueryEngine(retriever=vector_with_rerank_retriever,
                                         response_synthesizer=vector_response_synthesizer,
                                         node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

list_tool = QueryEngineTool.from_defaults(
    query_engine=list_query_engine,
    description="Useful for questions asking summary questions (only if the question specifies summary or summarize) about content the blogs of llama-index.",
)
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=(
        "Useful for retrieving specific snippets from the blog post of llama-index."
    ),
)

vector_tool_with_rerank = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine_with_rerank,
    description=(
        "Useful for retrieving specific snippets from the blog post of llama-index."
    ),
)


# create the query engines
query_engine = RouterQueryEngine(
    verbose=True,
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=[
        list_tool,
        vector_tool,
    ],
)

query_engine_with_rerank = RouterQueryEngine(
    verbose=True,
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=[
        list_tool,
        vector_tool_with_rerank,
    ],
)

In [60]:
from tenacity import retry, wait_exponential, stop_after_attempt

def query_and_print_sources(question, query_engine):
    @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(5))
    def make_query():
        response = query_engine.query(question)
        return response

    try:
        # Query the vector query engine
        response = make_query()
        
        # Print the response
        print('Response:', response)

        # Format and print the source information
        print("""
        TOP. 1 SOURCE:
        ---------------------
        Title: {title}
        Author: {author}
        Date: {date}
        Tags: {tags}
        Related Posts: {related_posts}
        Link: {link}
        Source Text: {text}
        """.format(
            title=response.source_nodes[0].metadata['title'],
            author=response.source_nodes[0].metadata['author'],
            date=response.source_nodes[0].metadata['date'],
            tags=', '.join(response.source_nodes[0].metadata['tags']),
            related_posts=', '.join(response.source_nodes[0].metadata['related_posts']),
            link=response.source_nodes[0].metadata['link'],
            text=response.source_nodes[0].text,
        ))
    except Exception as e:
        print(f"Failed to get response after several retries: {e}")


In [56]:
question_1 = "What are key features of llama-agents?"

query_and_print_sources(question_1, query_engine=query_engine_with_rerank)

[1;3;38;5;200mSelecting query engine 1: The question asks for specific features of llama-agents, suggesting a need for retrieval of specific information, which aligns with choice 2's description of retrieving snippets..
[0mResponse: llama-agents is a framework that simplifies the process of building, iterating, and deploying multi-agent AI systems. It offers a distributed service-oriented architecture, communication via standardized API interfaces, the ability to define agentic and explicit orchestration flows, ease of deployment, and scalability and resource management. 


          
        SOURCES
        
        ---------------------
        Title: Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems
        Author: LlamaIndex
        Date: Jun 26, 2024
        Tags: agents
        Related Posts: Building a multi-agent concierge system, Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems, Automate

In [61]:
question_2 = 'What are the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?'
query_and_print_sources(question_2, query_engine=query_engine_with_rerank)

[1;3;38;5;200mSelecting query engine 1: The question asks for specific snippets from the 'Evaluating RAG with LlamaIndex' section, indicating a need for retrieval of specific information..
[0mResponse: The two critical areas of RAG system performance assessed are the Retrieval System and Response Generation. 


        TOP. 1 SOURCE:
        ---------------------
        Title: OpenAI Cookbook: Evaluating RAG systems
        Author: Ravi Theja
        Date: Nov 28, 2023
        Tags: llamaindex, openai, llm, retrieval-augmented
        Related Posts: LlamaIndex Newsletter 2024-04-02, LlamaIndex Newsletter 2024-03-26, LlamaIndex Newsletter 2024-03-19, One-click Open Source RAG Observability with Langfuse
        Link: https://www.llamaindex.ai/blog//openai-cookbook-evaluating-rag-systems-fe393c61fb93
        Source Text: We’re excited to unveil our [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/evaluation/Evaluate_RAG_with_LlamaIndex.ipynb), a guide to 

In [62]:
question_3 = 'What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?'
query_and_print_sources(question_3, query_engine=query_engine_with_rerank)


[1;3;38;5;200mSelecting query engine 1: The question asks for specific information about rerankers in a RAG system, which suggests retrieving snippets from a relevant blog post..
[0mResponse: The two main metrics used to evaluate the performance of the different rerankers in the RAG system are Hit Rate and Mean Reciprocal Rank (MRR). 


        TOP. 1 SOURCE:
        ---------------------
        Title: Boosting RAG: Picking the Best Embedding & Reranker models
        Author: Ravi Theja
        Date: Nov 3, 2023
        Tags: embedding, llm, openai, search, llamaindex
        Related Posts: LlamaIndex Newsletter 2024-04-02, LlamaIndex Newsletter 2024-03-26, LlamaIndex Newsletter 2024-03-19, One-click Open Source RAG Observability with Langfuse
        Link: https://www.llamaindex.ai/blog//boosting-rag-picking-the-best-embedding-reranker-models-42d079022e83
        Source Text: > It’s worth mentioning that these results provide a solid insight into performance for this particular dat

# Conclusion

In this assignment, I experimented with different chunk sizes (chunk_size = 1024, overlap = 10, and chunk_size = 512, overlap = 50) and various embedding models, including:
1. `embed-english-v2.0` (Cohere)
2. `text-embedding-3-small` (OpenAI)
3. `embedding-001` (Gemini)
4. `voyage-2` (Voyage)
5. `embed-english-v3.0` (Cohere)
6. `text-embedding-ada-002` (OpenAI)

However, due to some hidden errors that I haven't figured out, the hit rate did not improve with changing models. The chunk size of 1024 performed better than 512, with hit rates of 0.89 and 0.78, respectively. The final RAG pipeline includes both vector index and summary index. The router engine uses both summary and vector retrieval, with the vector retriever and the Cohere reranker model. The router uses LLMSingleSelector to choose the appropriate query engine. The results show that the model retrieves the relevant nodes and answers the questions successfully.

