# Evaluate the RAG system

To evaluate the LLM answers we will follow the steps below:
- In the ground truth dataset, each record holds a user question along with the actual answer to that question from the FAQ document
- Pass each question of the ground truth dataset to the RAG system to get the LLM answer to that question
- Create the embeddings of the actual answer and the LLM answer
- Compute the cosine similarity and other metrics between the two answers to evaluate the RAG system

In [72]:
import requests 
import pandas as pd
from openai import OpenAI
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import os

In [5]:
# Read the environment variables defined in the local .envrc file;
# the call's return value is shown as the cell output
load_dotenv(dotenv_path=".envrc")

True

## Load all the necessary documents

In [6]:
# Load the FAQ records together with their ids
# Build the GitHub URL of the JSON file
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# Request the JSON data; the timeout keeps a stalled connection from hanging the notebook
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON
docs_response.raise_for_status()
# Parse the JSON payload into a list of records
documents = docs_response.json()
# Verify the first record
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [24]:
# Lookup table: document id -> full FAQ record
doc_idx = dict((record['id'], record) for record in documents)

In [17]:
# Load the ground truth dataset and keep only the machine-learning-zoomcamp rows
df_ground_truth = pd.read_csv('ground-truth-data.csv')
df_ground_truth = df_ground_truth.loc[df_ground_truth['course'] == 'machine-learning-zoomcamp']
# One dict per row, which is the shape the RAG functions expect
ground_truth = df_ground_truth.to_dict(orient='records')

## Index the necessary documents

In [8]:
# Initialize the client against the local Elasticsearch node
# (port 9200 is the one exposed after starting the Docker container)
es_client = Elasticsearch('http://localhost:9200')

# Schema of the Elasticsearch index
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,            # must equal the length of the embeddings stored in this field
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

# Name of the index
index_name = "vector_questions"
# Drop an index left over from a previous run so this cell can be re-executed;
# ignore_unavailable=True makes the call a no-op when the index does not exist yet
es_client.indices.delete(index=index_name, ignore_unavailable=True)
# Create the index with the schema above
response = es_client.indices.create(index=index_name, body=index_settings)
# Show the creation response to verify the index was acknowledged
response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vector_questions'})

In [9]:
# Load the sentence-transformers model used for the embeddings in this notebook
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Attach an embedding of "question + answer text" to every FAQ record
for faq_doc in tqdm(documents):
    # Embed the question and the answer text together as one string
    combined_text = ' '.join((faq_doc['question'], faq_doc['text']))
    # Store the vector on the record itself so it is indexed with the other fields
    faq_doc['question_text_vector'] = model.encode(combined_text)



  0%|          | 0/948 [00:00<?, ?it/s]

In [10]:
# Write every FAQ record into the Elasticsearch index, one request per record
for faq_doc in tqdm(documents):
    es_client.index(index=index_name, document=faq_doc)

  0%|          | 0/948 [00:00<?, ?it/s]

## Create the RAG system

In [11]:
# Create the retrieval function for the elastic search
def elastic_search_knn(field, vector, course, *, k=5, num_candidates=10000):
    """Run a course-filtered kNN vector search and return the matching records.

    Args:
        field: Name of the vector field to search.
        vector: Query embedding compared against `field`.
        course: Only documents whose `course` equals this value are considered.
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that used to be hard-coded.
        num_candidates: Size of the candidate pool passed to Elasticsearch.
            Defaults to 10000, the value that used to be hard-coded.

    Returns:
        A list with the `_source` dict of each hit, in the order returned by
        Elasticsearch. Only text, section, question, course and id are
        included; the stored vector is not requested.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    # Keep only the stored document of each hit, dropping scores and metadata
    return [hit['_source'] for hit in es_results['hits']['hits']]

# Create the retrieval part of rag with the optimal results
def question_text_vector_knn(q):
    """Retrieve FAQ records for one query dict via its question embedding.

    `q` must provide the keys 'question' (text to embed) and 'course'
    (used to filter the search). Returns whatever `elastic_search_knn`
    returns for the 'question_text_vector' field.
    """
    query_text, course_name = q['question'], q['course']
    query_embedding = model.encode(query_text)
    return elastic_search_knn('question_text_vector', query_embedding, course_name)

In [12]:
# Create the function to build the prompt
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and the retrieved records.

    Args:
        query: The user question, inserted after "QUESTION:".
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys; each becomes one entry of the CONTEXT.

    Returns:
        The formatted prompt with leading and trailing whitespace removed.
    """
    # Prompt template, written out line by line; the four-space indent on the
    # continuation lines is part of the text sent to the model
    prompt_template = (
        "You are a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "    Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "    If the CONTEXT doesn't contain the answer, output NONE.\n"
        "    \n"
        "    QUESTION: {question}\n"
        "    \n"
        "    CONTEXT:\n"
        "    {context}"
    )

    # One "section / question / answer" entry per retrieved record
    context_entries = (
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )
    context = ''.join(context_entries)

    # Fill in the template; strip() drops the blank lines left after the last entry
    return prompt_template.format(question=query, context=context).strip()

In [13]:
# Initialize the OpenAI client used by `llm` below.
# NOTE(review): no API key is passed here — presumably it comes from the
# environment loaded from .envrc earlier in the notebook; confirm.
client = OpenAI()

def llm(prompt, model='gpt-4o'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text for the chat model.
        model: Name of the OpenAI chat model to call.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [14]:
# Create the rag system function
def rag(query: dict, model='gpt-4o') -> str:
    """Answer one query with the retrieve -> prompt -> generate pipeline.

    Args:
        query: Dict handed to `question_text_vector_knn` for retrieval; its
            'question' entry is also used as the prompt's question.
        model: Name of the chat model forwarded to `llm`.

    Returns:
        The answer produced by `llm`.
    """
    # Retrieve the most similar FAQ records from the knowledge base
    retrieved_docs = question_text_vector_knn(query)
    # Build the prompt from the question and the retrieved context, then generate
    return llm(build_prompt(query['question'], retrieved_docs), model=model)

## Create LLM answers based on ground truth questions

In [22]:
# Work on the first 5 ground-truth entries only
sample = ground_truth[:5]
# Generated answers keyed by position in `sample`; re-running this cell empties it
answers = dict()


In [26]:
# Generate gpt-4o answers for the sample with the RAG system
for idx, gt_record in enumerate(tqdm(sample)):
    # `answers` doubles as a cache: positions already answered are skipped, so
    # the loop can be re-run after a failure without repeating finished LLM calls
    if idx not in answers:
        answers[idx] = {
            # Answer generated by the LLM
            'answer_llm': rag(gt_record),
            # Original FAQ answer of the document this question was generated from
            'answer_orig': doc_idx[gt_record['document']]['text'],
            'document': gt_record['document'],
            'question': gt_record['question'],
            'course': gt_record['course'],
        }

  0%|          | 0/5 [00:00<?, ?it/s]

In [30]:
# Show the gpt-4o answers as a table, one row per sampled question
pd.DataFrame(list(answers.values()))

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,NONE,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up for the course using the link ...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,NONE,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,NONE,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [35]:
# Generated gpt-3.5-turbo answers keyed by position in `sample`; re-running this cell empties it
answers_35 = dict()

In [36]:
# Generate gpt-3.5-turbo answers for the sample with the RAG system
for idx, gt_record in enumerate(tqdm(sample)):
    # `answers_35` doubles as a cache: positions already answered are skipped, so
    # the loop can be re-run after a failure without repeating finished LLM calls
    if idx not in answers_35:
        answers_35[idx] = {
            # Answer generated by the LLM
            'answer_llm': rag(gt_record, model='gpt-3.5-turbo'),
            # Original FAQ answer of the document this question was generated from
            'answer_orig': doc_idx[gt_record['document']]['text'],
            'document': gt_record['document'],
            'question': gt_record['question'],
            'course': gt_record['course'],
        }

  0%|          | 0/5 [00:00<?, ?it/s]

In [37]:
# Show the gpt-3.5-turbo answers as a table, one row per sampled question
pd.DataFrame(list(answers_35.values()))

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,NONE,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,NONE,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,NONE,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,NONE,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,You can structure your questions and answers f...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [38]:
# Inspect the raw gpt-3.5-turbo entries, whose text the DataFrame view truncates
answers_35

{0: {'answer_llm': 'NONE',
  'answer_orig': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
  'document': '0227b872',
  'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp'},
 1: {'answer_llm': 'NONE',
  'answer_orig': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shr

In [42]:
# Download the full results with gpt-4o
!wget r'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/04-monitoring/data/results-gpt4o.csv'
# Download the full results with gpt-3.5
!wget r'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/04-monitoring/data/results-gpt35.csv'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


zsh:1: command not found: wget


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


zsh:1: command not found: wget


In [52]:
# Repository location of the full result files (LLM answer next to the original answer)
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'

# gpt-4o results: read the CSV straight from the URL, then convert to one dict per row
relative_url = '04-monitoring/data/results-gpt4o.csv'
results_gpt4o_url = base_url + '/' + relative_url + '?raw=1'
df_results_gpt4o = pd.read_csv(results_gpt4o_url)
results_gpt4o = df_results_gpt4o.to_dict(orient='records')

# gpt-3.5 results: same steps for the second file
relative_url = '04-monitoring/data/results-gpt35.csv'
results_gpt35_url = base_url + '/' + relative_url + '?raw=1'
df_results_gpt35 = pd.read_csv(results_gpt35_url)
results_gpt35 = df_results_gpt35.to_dict(orient='records')

In [61]:
# Worked example: similarity between the two answers of a single gpt-4o record
record = results_gpt4o[10]
# The two answers to compare
answer_llm, answer_orig = record['answer_llm'], record['answer_orig']
# Embed both answers with the same model
v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)
# Show the record, then the dot product of the two embeddings.
# NOTE(review): the dot product equals the cosine similarity only for unit-length
# vectors — presumably this model returns normalized embeddings; confirm.
print(record)
v_llm.dot(v_orig)

{'answer_llm': 'Yes, sessions are recorded, so you won’t miss anything if you miss a session. You can view the recordings later. Additionally, you can ask questions for office hours in advance or on Slack, and they will be addressed during the live stream or in the Slack channel.', 'answer_orig': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.', 'document': '5170565b', 'question': 'Are sessions recorded if I miss one?', 'course': 'machine-learning-zoomcamp'}


0.7962799

In [62]:
# Create the function to calculate cosine similarity
def compute_similarity(record):
    """Return the dot product between the embeddings of a record's two answers.

    Args:
        record: Dict with the keys 'answer_llm' and 'answer_orig'.

    Returns:
        The dot product of the two embeddings produced by the notebook's `model`.
        NOTE(review): this is the cosine similarity only if the embeddings
        are unit-length — presumably true for this model; confirm.
    """
    llm_text, orig_text = record['answer_llm'], record['answer_orig']
    llm_vector = model.encode(llm_text)
    orig_vector = model.encode(orig_text)
    return llm_vector.dot(orig_vector)

In [67]:
# Compute the similarity of every record in the gpt-4o results
similarities_4o = list(map(compute_similarity, results_gpt4o))
# Store the similarities as a new column next to the answers
df_results_gpt4o['cosine'] = similarities_4o

In [69]:
# Calculate the descriptive statistics of the gpt-4o similarities
df_results_gpt4o.loc[:, 'cosine'].describe()

count    1830.000000
mean        0.679129
std         0.217995
min        -0.153425
25%         0.591460
50%         0.734788
75%         0.835390
max         0.995338
Name: cosine, dtype: float64

In [73]:
# Compute the similarity of every record in the gpt-3.5 results
similarities_35 = list(map(compute_similarity, results_gpt35))
# Store the similarities as a new column next to the answers
df_results_gpt35['cosine'] = similarities_35

In [74]:
# Calculate the descriptive statistics of the gpt-3.5 similarities
df_results_gpt35.loc[:, 'cosine'].describe()

count    1830.000000
mean        0.657599
std         0.226062
min        -0.168921
25%         0.546504
50%         0.714783
75%         0.817262
max         1.000000
Name: cosine, dtype: float64