# RAG Evaluation


Install packages


In [48]:
!uv pip install -q \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.2 \
    matplotlib==3.10.6 \
    seaborn==0.13.2 \
    requests==2.32.5 \
    python-dotenv==1.2.1 \
    tqdm==4.67.1 \
    litellm==1.78.5 \
    elasticsearch==8.19.3 \
    sentence-transformers==5.2.2

Import packages


In [None]:
import hashlib
import json
import random
import time
import uuid
from collections import defaultdict
from pathlib import Path

import litellm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

load_dotenv()

True

## Ground Truth Dataset


Download documents


In [None]:
docs_url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course["course"]
    for doc in course["documents"]:
        doc["course"] = course_name
        documents.append(doc)

documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

Generate document ids based on it's content


In [None]:
def generate_document_id(doc):
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]

    return document_id

Apply ids


In [None]:
for doc in documents:
    doc["id"] = generate_document_id(doc)
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '0bbf41ec'}

Check for duplicates


In [None]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc["id"]
    hashes[doc_id].append(doc)

len(hashes), len(documents)

(947, 948)

Duplicated ids


In [None]:
for k, v in hashes.items():
    if len(v) > 1:
        print(k, len(v))

593f7569 2


In [None]:
hashes["593f7569"]

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

Save documents with ids


In [None]:
with open("documents-with-ids.json", "wt") as f_out:
    json.dump(documents, f_out, indent=2)

In [11]:
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [None]:
# !docker exec -it ollama ollama pull qwen3:0.6b

In [None]:
# !docker exec -it ollama ollama list

Lite llm with open router


In [None]:
response = litellm.completion(
    model="openrouter/meta-llama/llama-3.3-70b-instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "Explain transformers llm architecture in one paragraph.",
        },
    ],
    # api_base="http://localhost:11434",
    # api_key="ollama",
    # format="json",
    # custom_llm_provider="ollama",
)

print(response.choices[0].message["content"])

The Transformer LLM (Large Language Model) architecture is a type of neural network designed primarily for natural language processing tasks. It relies on self-attention mechanisms to analyze input sequences, such as text, in parallel, allowing for more efficient processing than traditional recurrent neural networks (RNNs). The architecture consists of an encoder and a decoder, with the encoder taking in a sequence of tokens (e.g., words or characters) and generating a continuous representation of the input sequence. The decoder then generates output sequences, one token at a time, based on this representation. The key components include self-attention layers, which allow the model to weigh the importance of different input tokens relative to each other, and feed-forward neural network (FNN) layers, which transform the output of the self-attention mechanism. This design enables the model to capture complex contextual relationships in input data and generate coherent and contextually re

Prompt template


In [None]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record.

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [None]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = litellm.completion(
        model="openrouter/meta-llama/llama-3.3-70b-instruct",
        messages=[
            {"role": "user", "content": prompt},
        ],
        format="json",
        # api_base="http://localhost:11434",
        # api_key="ollama",
        # custom_llm_provider="ollama",
    )

    return response.choices[0].message.content

In [None]:
OUTPUT_PATH = Path("generated_questions.json")

if OUTPUT_PATH.exists():
    with OUTPUT_PATH.open("r", encoding="utf-8") as f:
        results = json.load(f)
else:
    results = {}

In [None]:
def save_results(data, path=OUTPUT_PATH):
    tmp_path = path.with_suffix(".tmp")
    with tmp_path.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    tmp_path.replace(path)

In [None]:
for doc in tqdm(documents):
    doc_id = str(doc["id"])

    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

    save_results(results)

100%|██████████| 948/948 [00:00<00:00, 584048.21it/s]


In [None]:
def extract_json(text):
    start_idx = text.find("[") if "[" in text else len(text)

    if start_idx == len(text):
        return None

    for end_idx in range(len(text), start_idx, -1):
        try:
            return json.loads(text[start_idx:end_idx])
        except:
            continue
    return None

In [None]:
parsed_results = {}

for doc_id, questions in results.items():
    try:
        parsed_results[doc_id] = extract_json(questions)
    except Exception as error:
        print(error)
        print(questions)
        break

In [None]:
doc_index = {d["id"]: d for d in documents}

In [None]:
final_results = []

for doc_id, questions in parsed_results.items():
    course = doc_index[doc_id]["course"]
    for q in questions:
        final_results.append((q, course, doc_id))

In [None]:
df = pd.DataFrame(final_results, columns=["question", "course", "document"])
df.head()

Unnamed: 0,question,course,document
0,What is the exact date and time when our cours...,data-engineering-zoomcamp,c02e79ef
1,How can I stay updated about the course schedu...,data-engineering-zoomcamp,c02e79ef
2,What are the necessary steps I need to take be...,data-engineering-zoomcamp,c02e79ef
3,Where can I find the course calendar and how d...,data-engineering-zoomcamp,c02e79ef
4,What are the different platforms I need to joi...,data-engineering-zoomcamp,c02e79ef


In [None]:
df.to_csv("ground-truth-data.csv", index=False)

In [26]:
!head ground-truth-data.csv

question,course,document
What is the exact date and time when our course is scheduled to begin,data-engineering-zoomcamp,c02e79ef
How can I stay updated about the course schedule and important announcements,data-engineering-zoomcamp,c02e79ef
What are the necessary steps I need to take before the course starts,data-engineering-zoomcamp,c02e79ef
Where can I find the course calendar and how do I access it,data-engineering-zoomcamp,c02e79ef
What are the different platforms I need to join to be fully registered for the course,data-engineering-zoomcamp,c02e79ef
What do I need to know before enrolling in this course,data-engineering-zoomcamp,1f6520ca
Are there any specific requirements to join this course,data-engineering-zoomcamp,1f6520ca
Do I need prior experience to take this course,data-engineering-zoomcamp,1f6520ca
What are the necessary skills to succeed in this course,data-engineering-zoomcamp,1f6520ca


## Retrieval evaluation

How to know which parameters in search function are the best?

Metrics:

- Hit Rate (HR) or Recall at K (R@K)
- Mean Reciprocal Rank (MRR)


Run Elasticsearch:

```sh
docker run --rm -it \
  --name elasticsearch \
  -p 9200:9200 \
  -p 9300:9300 \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -e "xpack.security.http.ssl.enabled=false" \
  -e "xpack.security.transport.ssl.enabled=false" \
  -e "ES_JAVA_OPTS=-Xms2g -Xmx2g" \
  docker.elastic.co/elasticsearch/elasticsearch:8.5.1
```


Create an Elasticsearch client instance


In [None]:
es_client = Elasticsearch(
    "http://localhost:9200",
)

Check connection


In [None]:
es_client.info()

ObjectApiResponse({'name': '6e0b5fefdbf2', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'lmeNs05iSlO7hyTYz4uYZQ', 'version': {'number': '8.5.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'c1310c45fc534583afe2c1c03046491efba2bba2', 'build_date': '2022-11-09T21:02:20.169855900Z', 'build_snapshot': False, 'lucene_version': '9.4.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

Create an index in Elasticsearch


In [None]:
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    },
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [None]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:26<00:00, 35.24it/s]


In [None]:
def elastic_search(query, course):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                "filter": {"term": {"course": course}},
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []

    for hit in response["hits"]["hits"]:
        result_docs.append(hit["_source"])

    return result_docs

In [None]:
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp",
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [None]:
df_ground_truth = pd.read_csv("ground-truth-data.csv")
ground_truth = df_ground_truth.to_dict(orient="records")

In [None]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q["document"]
    results = elastic_search(query=q["question"], course=q["course"])
    relevance = [d["id"] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 4735/4735 [00:47<00:00, 99.44it/s] 


In [None]:
relevance_total[:10]

[[False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False]]

In [None]:
def hit_rate(relevance_total):
    counter = 0

    for row in relevance_total:
        if True in row:
            counter += 1

    return counter / len(relevance_total)

In [None]:
hit_rate(relevance_total)

0.7074973600844773

In [None]:
def mrr(relevance_total):
    total_score = 0

    for row in relevance_total:
        for rank in range(len(row)):
            if row[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [None]:
mrr(relevance_total)

0.574255543822597

## RAG Evaluation


In [None]:
model_name = "multi-qa-MiniLM-L6-cos-v1"
model = SentenceTransformer(model_name)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/multi-qa-MiniLM-L6-cos-v1
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [None]:
es_client = Elasticsearch(
    "http://localhost:9200",
    request_timeout=120,
)

In [None]:
es_client.info()

ObjectApiResponse({'name': '8086797ecf26', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'e9NMVjKpQg6tKR2VL1o0wg', 'version': {'number': '8.5.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'c1310c45fc534583afe2c1c03046491efba2bba2', 'build_date': '2022-11-09T21:02:20.169855900Z', 'build_snapshot': False, 'lucene_version': '9.4.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [None]:
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
            },
        }
    },
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(
    index=index_name, body=index_settings, wait_for_active_shards=1
)
es_client.cluster.health(wait_for_status="yellow", timeout="30s")

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: ProtocolError(('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))))

In [None]:
for doc in tqdm(documents):
    question = doc["question"]
    text = doc["text"]
    doc["question_text_vector"] = model.encode(question + " " + text)

    es_client.index(index=index_name, document=doc, timeout="60s")

  0%|          | 0/948 [04:01<?, ?it/s]


ApiError: ApiError(503, 'unavailable_shards_exception', '[course-questions][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest [[course-questions][0]] containing [index {[course-questions][mU0eFZwB3w1105dQN853], source[n/a, actual length: [8.4kb], max length: 2kb]}]]')