This project analyzes how different Large Language Models (LLMs) respond to Retrieval-Augmented Generation (RAG). We based our implementation on the paper "RAGAS: Automated Evaluation of Retrieval Augmented Generation" by Shahul Es, Jithin James, Luis Espinosa-Anke, and Steven Schockaert. To evaluate the RAG performance of the different models, we implemented the three different scores mentioned in the paper: faithfulness, answer relevance, and context relevance. For instructions on how to run the project see the README.md file.

### Dependencies

In [1]:
!pip install torch --index-url https://download.pytorch.org/whl/cu128 --quiet 
!pip install ipywidgets sentence-transformers faiss-cpu transformers datasets transformers jupyterlab_widgets pandas accelerate numpy hf_xet tqdm --quiet


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import sys
import re
#------------------------------------------------------------------------------------------------
#os.environ["HF_HOME"] = "D:/AI_Models" #-> only for my computer delete on others!!!
#------------------------------------------------------------------------------------------------
import torch
import numpy as np
from numpy.linalg import norm
import faiss
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from accelerate import Accelerator
from tqdm.notebook import tqdm

Checks if CUDA is available and sets the accelerator for hardware-agnostic execution:

In [3]:
print(f"CUDA verfügbar: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    
    accelerator = Accelerator()
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"Anzahl GPUs: {torch.cuda.device_count()}")
    print(f"CUDA Version (PyTorch): {torch.version.cuda}")
    print(f"Accelerator device: {accelerator.device}")
else:
    accelerator = Accelerator(cpu=True) 
    print(f"Accelerator device: {accelerator.device}")

CUDA verfügbar: True
GPU Name: NVIDIA GeForce RTX 2060
Anzahl GPUs: 1
CUDA Version (PyTorch): 12.8
Accelerator device: cuda


### Import the models

Due to hardware limitations, we were unable to run all models directly one afer the other, so we always hat to comment out all models except the one we were currently using.

In [None]:
"""
model_name_mistral = "mistralai/Mistral-7B-v0.1"
model_mistral = AutoModelForCausalLM.from_pretrained(
    model_name_mistral,
    dtype="auto"
    )
model_mistral = accelerator.prepare(model_mistral)
tokenizer_mistral = AutoTokenizer.from_pretrained(model_name_mistral)

model_name_deepseek = "deepseek-ai/deepseek-llm-7b-base"
model_deepseek = AutoModelForCausalLM.from_pretrained(
    model_name_deepseek,
    dtype="auto"
    )
model_deepseek = accelerator.prepare(model_deepseek)
tokenizer_deepseek = AutoTokenizer.from_pretrained(model_name_deepseek)

model_name_llama = "meta-llama/Llama-2-7b"
model_llama = AutoModelForCausalLM.from_pretrained(
    model_name_llama,
    dtype="auto"
    )
model_llama = accelerator.prepare(model_llama)
tokenizer_llama = AutoTokenizer.from_pretrained(model_name_llama)
"""

#-> "Qwen/Qwen3-8B"
model_name_qwen = "Qwen/Qwen3-1.7B"

model_qwen = AutoModelForCausalLM.from_pretrained(
    model_name_qwen,
    dtype=torch.float16, 
    device_map="auto",
    attn_implementation="sdpa" 
)
model_qwen = accelerator.prepare(model_qwen) 
tokenizer_qwen = AutoTokenizer.from_pretrained(model_name_qwen)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Import the dataset

In [40]:
"""
trivia_qa = datasets.load_dataset("mandarjoshi/trivia_qa", "rc", split="train")

documents_trivia_qa = [item["search_results"]["context"][0] for item in trivia_qa if item["search_results"]["context"]]
len(documents_trivia_qa)
"""

#TODO change split
rag_dataset_1200 = datasets.load_dataset("neural-bridge/rag-dataset-1200", split="all")

documents_rag_1200 = [item["context"] for item in rag_dataset_1200]
len(documents_rag_1200)

1200

In [6]:
documents = documents_rag_1200

def chunk_text(text, chunk_size=500, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

chunks = []
for doc in documents:
    chunks.extend(chunk_text(doc))

len(chunks)

8732

In [7]:
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = embed_model.encode(
    chunks,
    show_progress_bar=True
)

embeddings = np.array(embeddings)
embeddings.shape

Batches:   0%|          | 0/273 [00:00<?, ?it/s]

(8732, 384)

We use the FlatL2 index to measure the similarity of the embeddings. FlatL2 ueses the euclidian distance for that.

In [8]:
"""def retrieve_context(question, k=5):
    q_emb = embed_model.encode([question])
    distances, indices = index.search(q_emb, len(chunks))
    
    # Filter: nur Chunks mit mehr als 50 Zeichen UND Frage-Keywords enthalten
    keywords = [w.lower() for w in question.split()]
    selected_chunks = []
    for i in indices[0]:
        chunk = chunks[i].strip()
        if len(chunk) > 50 and any(kw in chunk.lower() for kw in keywords):
            selected_chunks.append(chunk)
        if len(selected_chunks) >= k:
            break
    return "\n".join(selected_chunks)"""


'def retrieve_context(question, k=5):\n    q_emb = embed_model.encode([question])\n    distances, indices = index.search(q_emb, len(chunks))\n\n    # Filter: nur Chunks mit mehr als 50 Zeichen UND Frage-Keywords enthalten\n    keywords = [w.lower() for w in question.split()]\n    selected_chunks = []\n    for i in indices[0]:\n        chunk = chunks[i].strip()\n        if len(chunk) > 50 and any(kw in chunk.lower() for kw in keywords):\n            selected_chunks.append(chunk)\n        if len(selected_chunks) >= k:\n            break\n    return "\n".join(selected_chunks)'

Prompt similar the the one from the Ragas paper

In [9]:
def build_rag_prompt(context, question):
    return f"""Answer the question using ONLY the information from the context.
If the answer is not explicitly stated, reply with: I don't know.

Context:
{context}

Question:
{question}

Answer:
"""


In [10]:
#TODO entscheide bei jedem max_tokens=1000 ob 1000 angebracht sind
def answer_without_context(model, tokenizer, prompt, max_tokens=1000, do_sample=True):
    messages = [
        {"role": "user", "content": prompt}
    ]
    
    text_input = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True,
        #-----------------------------------------------------------
        enable_thinking=False #-> falls kein thinking model löschen!
        #-----------------------------------------------------------
    )
    inputs = tokenizer(text_input, return_tensors="pt").to(accelerator.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=do_sample,
            temperature=0.7,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id
        )

    generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

In [41]:

def answer_with_context(model, tokenizer, prompt, context, max_tokens=1000, do_sample=True):
    rag_instruction = (
        "Answer using the information from the given context.\n"
        f"context: {context}\n\n"
        f"{prompt}"
    )

    messages = [
        {"role": "user", "content": rag_instruction}
    ]
    
    text_input = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True,
        #-----------------------------------------------------------
        enable_thinking=False #-> falls kein thinking model löschen!
        #-----------------------------------------------------------
    )
    
    inputs = tokenizer(text_input, return_tensors="pt").to(accelerator.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=do_sample,
            temperature=0.7,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id
        )
    generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

### The three scores mentioned in the Ragas paper
In this section we implement all three scores mentioned in the Ragas paper.
We will use these to evaluate the models rag performance

$$F = \frac{|F|}{|V|}$$

In [12]:
def build_statement_prompt(question: str, answer: str):
    return f"""Given a question and answer, create one or more statements from each sentence in the given answer.
question: {question}
answer: {answer}"""

In [13]:
def extract_statements(question: str, answer: str, model, tokenizer, max_tokens=1000):
    #Extracts statements out of the model generated answer 
    prompt = build_statement_prompt(question, answer)

    answer = answer_without_context(model, tokenizer, prompt, max_tokens, do_sample=False)

    statements = [line.strip() for line in answer.split("\n") if len(line.strip()) > 3]
    return statements

Changed the prompt for better understanding

In [14]:
def build_faithfullness_prompt(statements):
    prompt = f"""Consider the given context and following statements, the determine whether they are supported by the information presente in the context. 
Provide a brief explanation for each statement before arriving at the verdict (Yes/No). 
Answer in the format:
Statement: ...
Explanation: ...
Verdict: (Yes/No)
Do not deviate from the specified format.
These are the statements
"""
    for s in statements:
        prompt += f"Statement: {s}\n"
        
    return prompt

In [29]:
def count_supported(answer: str):
    yes_matches = re.findall(r'Verdict:\s*(Yes)', answer, re.IGNORECASE)
    yes_count = len(yes_matches)
    return yes_count

In [16]:
def calculate_faithfulness_score(total_statements: int, suported_statements: int):
    if suported_statements == 0.0:
        return 0.0

    return suported_statements / total_statements

$$AR = \frac{1}{n} \sum\limits^n_{i=1} sim(q,q_i)$$

In [17]:
def build_answer_relevance_prompt(answer: str):
    prompt = f"""Generate a question for the given answer.
    answer: {answer}"""
    return prompt

In [18]:
def extract_questions(text: str) -> list:
    text.strip()
    list = text.split("?")
    return [item + "?" for item in list]

In [19]:
def calculate_question_similarity(original_question, generated_questions):
    q_embedding = embed_model.encode([original_question])
    gen_q_embeddings = embed_model.encode(generated_questions)

    q_embedding = np.array(q_embedding).astype('float32')
    gen_q_embeddings = np.array(gen_q_embeddings).astype('float32')

    faiss.normalize_L2(q_embedding)
    faiss.normalize_L2(gen_q_embeddings)

    d = q_embedding.shape[1]
    index = faiss.IndexFlatIP(d)
    
    index.add(gen_q_embeddings)
    
    k = len(generated_questions)
    D, I = index.search(q_embedding, k=k)
    
    scores = D[0]
    return scores

In [20]:
#TODO
def calculate_answer_relevance_score(similatity: list):
    return float(np.mean(similatity))

$$CR = \frac{\text{number of extracted sentences}}{\text{total number of senctences in }c(q)}$$

In [21]:
def build_context_relevance_prompt(question: str):
    prompt = f"""Please extract relevant sentences from the provided context that can potentially help ansewr the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phares "Insufficient Information".While extracting candidate sentences you're not allowed to make any changes to senctences from the given context.
    Question: {question}"""
    return prompt

In [22]:
#TODO vielleicht erweitern sehr simple satz zählung -> entscheiden ob auch ! und ? zählen
def count_sentences(sentence: str):
    num = sentence.count(". ")
    num += sentence.count("! ")
    num += sentence.count("? ")
    return num

In [23]:
def calculate_context_relevance_score(num_extracted, num_sentences):
    if num_sentences == 0.0:
        return 0.0
    
    return num_extracted / num_sentences

### RAG analysis

In [24]:
def analyse_faithfullness(model, tokenizer, context, question, answer, do_sample=True):
    #extracts statements out of the answer and generates a statements list
    statements = extract_statements(question, answer, model_qwen, tokenizer_qwen)

    #lets the model evaluate the faithfullness
    faithfullness_prompt = build_faithfullness_prompt(statements)
    #do_sample has to be false
    eval = answer_with_context(model_qwen, tokenizer_qwen, context, faithfullness_prompt, do_sample=False)
    num_supported = count_supported(eval)

    return [statements, num_supported]

In [25]:
#TODO entscheiden wieviele fragen die funkition bearbeiten soll
def analyse_answer_relevance(model, tokenizer, question, answer, do_sample=True):
    prompt = build_answer_relevance_prompt(answer)
    questions_generated = answer_without_context(model, tokenizer, prompt, do_sample=do_sample)
    questions_generated = extract_questions(questions_generated)

    sim = calculate_question_similarity(question, questions_generated)

    return sim

In [26]:
def analyse_context_relevance(model, tokenizer, context, question):
    prompt = build_context_relevance_prompt(question)
    answer = answer_with_context(model, tokenizer, prompt, context, do_sample=False)

    num_extracted = count_sentences(answer)
    num_sentences = count_sentences(context)
    
    return [num_extracted, num_sentences]

In [None]:
def analyse_rag(model, tokenizer, do_sample=True):
    scores = {
        "faithfullness score" : 0,
        "answer relevance score" : 0,
        "context relevance score" : 0
    }
    #for faithfullness
    statements = []
    supported_statements = 0

    #for answer relevance
    similatity = []

    #for context relevance
    num_extracted = 0
    num_sentences = 0

    for i in tqdm(range(5)):
        #generates a answer with the context for the current question
        question = rag_dataset_1200[100+i]["question"]
        context = rag_dataset_1200[100+i]["context"]
        answer = answer_with_context(model_qwen, tokenizer_qwen, context, question, do_sample=do_sample)

        faithfullness = analyse_faithfullness(model, tokenizer, context, question, answer, do_sample)
        statements.extend(faithfullness[0])
        supported_statements += faithfullness[1]

        answer_relevance = analyse_answer_relevance(model, tokenizer, question, answer, do_sample=do_sample)
        similatity.extend(answer_relevance)

        context_relevance = analyse_context_relevance(model, tokenizer, context, question)
        num_extracted += context_relevance[0]
        num_sentences += context_relevance[1]

    scores["faithfullness score"] = calculate_faithfulness_score(len(statements), supported_statements)
    scores["answer relevance score"] = calculate_answer_relevance_score(similatity)
    scores["context relevance score"] = calculate_context_relevance_score(num_extracted, num_sentences)

    return scores



### Conclusion

Calculate scores for Qwen without temperature:

In [36]:
analyse_rag(model_qwen, tokenizer_qwen, do_sample=False)

  0%|          | 0/1 [00:00<?, ?it/s]

{'faithfullness score': 1.0,
 'answer relevance score': 0.5071002244949341,
 'context relevance score': 0.1}

Calculate scores for Qwen with temperature:

In [37]:
analyse_rag(model_qwen, tokenizer_qwen, do_sample=True)

  0%|          | 0/1 [00:00<?, ?it/s]

{'faithfullness score': 1.0,
 'answer relevance score': 0.5071002244949341,
 'context relevance score': 0.1}