# Comparing The Effectiveness Of RAG Between Models


### Dependencies

In [None]:
!pip install sentence-transformers faiss-cpu transformers torch datasets transformers jupyterlab_widgets pandas numpy --quiet


In [1]:
import os
import sys
import torch
import numpy as np
from numpy.linalg import norm
import faiss
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


### Import the models

In [2]:
"""
model_name_mistral = "mistralai/Mistral-7B-v0.1"
model_mistral = AutoModelForCausalLM.from_pretrained(model_name_mistral)
tokenizer_mistral = AutoTokenizer.from_pretrained(model_name_mistral)

model_name_deepseek = "deepseek-ai/deepseek-llm-7b-base"
model_deepseek = AutoModelForCausalLM.from_pretrained(model_name_deepseek)
tokenizer_deepseek = AutoTokenizer.from_pretrained(model_name_deepseek)

model_name_llama = "meta-llama/Llama-2-7b"
model_llama = AutoModelForCausalLM.from_pretrained(model_name_llama)
tokenizer_llama = AutoTokenizer.from_pretrained(model_name_llama)
"""


model_name_qwen = "Qwen/Qwen2.5-0.5B"
model_qwen = AutoModelForCausalLM.from_pretrained(model_name_qwen)
tokenizer_qwen = AutoTokenizer.from_pretrained(model_name_qwen)

### Import the dataset

In [3]:
"""
trivia_qa = datasets.load_dataset("mandarjoshi/trivia_qa", "rc", split="train")

documents_trivia_qa = [item["search_results"]["context"][0] for item in trivia_qa if item["search_results"]["context"]]
len(documents_trivia_qa)
"""

rag_dataset_1200 = datasets.load_dataset("neural-bridge/rag-dataset-1200", split="train")

documents_rag_1200 = [item["context"] for item in rag_dataset_1200]
len(documents_rag_1200)


960

In [4]:
documents = documents_rag_1200

def chunk_text(text, chunk_size=500, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

chunks = []
for doc in documents:
    chunks.extend(chunk_text(doc))

len(chunks)

8732

In [5]:
embed_model = SentenceTransformer("all-MiniLM-L6-v2")



embeddings = embed_model.encode(
    chunks,
    show_progress_bar=True
)

embeddings = np.array(embeddings)
embeddings.shape


Batches: 100%|██████████| 273/273 [02:28<00:00,  1.84it/s]


(8732, 384)

We use the FlatL2 index to measure the similarity of the embeddings. FlatL2 ueses the euclidian distance for that.

In [6]:
print(type(embeddings))
print(len(embeddings))  
if len(embeddings) > 0:
    print(np.array(embeddings).shape)

dimension = embeddings.shape[1]


index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

index.ntotal

<class 'numpy.ndarray'>
8732
(8732, 384)


8732

In [7]:
def retrieve_context(question, k=10):
    q_emb = embed_model.encode([question])
    distances, indices = index.search(q_emb, len(chunks))  
    selected_chunks = [chunks[i] for i in indices[0] if len(chunks[i].strip()) > 50]
    return "\n".join(selected_chunks[:k])  


Prompt similar the the one from the Ragas paper

In [8]:
def build_rag_prompt(context, question):
    return f"""
Answer the question using ONLY the information from the context.
Return ONLY complete sentences.
Do NOT include explanations, commentary, or unrelated text.
If the answer is not explicitly stated, reply with: I don't know.

Context:
{context}

Question:
{question}

Answer:
"""


In [9]:
def answer_without_context(model, tokenizer, prompt, max_tokens=200):
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id
        )
    generated = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()


In [10]:
def answer_with_context(model, tokenizer, question, max_tokens=400, k=10):
    context = retrieve_context(question, k=k)
    prompt = build_rag_prompt(context, question)
    inputs = tokenizer(prompt, return_tensors="pt")
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,       
            temperature=0.7,    
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id
        )

    generated = output_ids[0][prompt_len:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()


### The three scores mentioned in the Ragas paper
In this section we implement all three scores mentioned in the Ragas paper.
We will use these to evaluate the models rag performance

$$F = \frac{|F|}{|V|}$$

$$AR = \frac{1}{n} \sum\limits^n_{i=1} sim(q,q_i)$$

In [11]:
def eval_similarity(q: str, q_answer: str) -> float:
    emb = embed_model.encode([q, q_answer])
    sim = np.dot(emb[0], emb[1]) / (norm(emb[0]) * norm(emb[1]))
    return (sim + 1) / 2

$$CR = \frac{\text{number of extracted sentences}}{\text{total number of senctences in }c(q)}$$

In [12]:
def extract_statements(answer: str, model, tokenizer, max_tokens=150):
    """
    Splits a model-generated answer into complete factual statements.
    Each statement will be one line. Partial words or fragments are avoided.
    """

    prompt = f"""
Split the following answer into complete factual statements.
Return one complete sentence per line.
Do NOT add explanations, bullet points, or partial words.

Answer:
{answer}
"""


    inputs = tokenizer(prompt, return_tensors="pt")

    
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,      
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )

    
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    
    statements = [line.strip() for line in output_text.split("\n") if len(line.strip()) > 3]

    return statements


In [13]:
def evaluate_faithfulness(statements, context, model, tokenizer):
    if len(statements) == 0:
        return 0.0

    prompt = build_faithfullness_prompt(statements, context)
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=False,      
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)


    answer_lines = [line.split("Answer:")[-1].strip() for line in output.split("\n") if "Answer:" in line]

    supported = sum(1 for line in answer_lines if line.lower().startswith("yes"))

    return supported / len(statements)


In [14]:
def generate_questions_from_answer(answer: str, model, tokenizer, max_tokens=150):
    prompt = f"""
Generate up to 3 questions that could be answered by the following answer.
Return one question per line.

Answer:
{answer}
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return [q.strip() for q in output.split("\n") if q.strip().endswith("?")]


In [40]:
def evaluate_faithfulness_given_answer(question, context, answer, model, tokenizer, max_tokens=200):
    """
    Prüft, ob die gegebene Antwort vollständig durch den Context gestützt wird.
    Rückgabe: Float zwischen 0 und 1 (1 = vollständig unterstützt)
    """

    prompt = f"""
You are an evaluator. Given a context and a candidate answer to a question,
assess whether the answer is fully supported by the context.

Context:
{context}

Question:
{question}

Answer:
{answer}

Is the answer fully supported by the context? Answer only 'Yes' or 'No'.
"""

    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip().lower()

    # Konvertiere Yes/No in 1/0
    if "yes" in output_text:
        return 1.0
    else:
        return 0.0


In [15]:
def calculate_answer_relevance_score(question: str, generated_questions: list):
    if len(generated_questions) == 0:
        return 0.0

    score = 0
    for q in generated_questions:
        score += eval_similarity(question, q)

    return score / len(generated_questions)


In [52]:
def calculate_answer_relevance_direct(ground_truth: str, llm_answer: str, embed_model=None) -> float:
    """
    Misst die semantische Ähnlichkeit zwischen Ground-Truth und LLM-Antwort (0-1).
    Rückgabe: 1 = sehr ähnlich, 0 = keine Ähnlichkeit
    """
    if not llm_answer or llm_answer.lower() in ["i don't know", "unknown"]:
        return 0.0
    
    emb = embed_model.encode([ground_truth, llm_answer])
    sim = np.dot(emb[0], emb[1]) / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1]))
    return (sim + 1) / 2


In [54]:
def detect_hallucination(faithfulness: float, answer_relevance: float, threshold: float = 0.5) -> bool:
    """
    Prüft, ob die Antwort halluziniert ist.
    True = Halluzination, False = keine Halluzination
    """
    return faithfulness < threshold or answer_relevance < threshold


In [55]:
def evaluate_rag_answer_given_answer(ground_truth: str, llm_answer: str, embed_model=None, threshold: float = 0.5) -> dict:
    """
    Kombiniert alle Scores für eine einzelne LLM-Antwort.
    Rückgabe: dict mit 'faithfulness', 'answer_relevance', 'hallucination'
    """
    answer_relevance = calculate_answer_relevance_direct(ground_truth, llm_answer, embed_model=embed_model)
    faithfulness = evaluate_faithfulness_vs_groundtruth(ground_truth, llm_answer, embed_model=embed_model)
    hallucination = detect_hallucination(faithfulness, answer_relevance, threshold=threshold)
    
    return {
        "faithfulness": faithfulness,
        "answer_relevance": answer_relevance,
        "hallucination": hallucination
    }


In [53]:
def evaluate_faithfulness_vs_groundtruth(ground_truth: str, llm_answer: str, embed_model=None) -> float:
    """
    Prüft, wie stark die LLM-Antwort durch die Ground-Truth gestützt wird.
    Hier gleichgesetzt mit der semantischen Ähnlichkeit.
    """
    return calculate_answer_relevance_direct(ground_truth, llm_answer, embed_model=embed_model)


In [38]:
def evaluate_rag_answer(question, answer, context, model, tokenizer):
    if answer.lower() in ["i don't know", ""]:
        return {
            "faithfulness": 0.0,
            "answer_relevance": 0.0,
            "hallucination": True
        }

    # Anpassung: kurze Antworten nicht splitten
    if len(answer.strip()) < 100:
        statements = [answer.strip()]
    else:
        statements = extract_statements(answer, model, tokenizer)[:10]

    if not statements:
        faithfulness = 0.0
    else:
        faithfulness = evaluate_faithfulness(statements, context, model, tokenizer)

    answer_relevance = calculate_answer_relevance_direct(question, answer)

    # Hallucination nur, wenn keine Aussage gestützt wird
    hallucination = faithfulness == 0.0

    return {
        "faithfulness": faithfulness,
        "answer_relevance": answer_relevance,
        "hallucination":bool((faithfulness < 0.5) or (answer_relevance < 0.5))
    }


In [18]:
def build_faithfullness_prompt(statements, context):
    prompt = f"""
Context:
{context}

For each of the statements below, decide if it is fully supported by the context.
Respond with ONLY 'Yes' or 'No'.
Do not add any explanations.

Examples:
Statement: "Grasse Cathedral is the town's most notable landmark."
Answer: Yes

Statement: "Grasse is the capital of France."
Answer: No

Now evaluate the following statements:
"""
    for s in statements:
        prompt += f'Statement: "{s}"\nAnswer: '

    return prompt


In [48]:
def evaluate_rag_answer_vs_groundtruth(question, llm_answer, ground_truth, threshold=0.5):
    """
    Vergleicht die LLM-Antwort mit der Ground-Truth-Antwort.
    
    Args:
        question: str, die Frage
        llm_answer: str, vom Modell generierte Antwort
        ground_truth: str, richtige Antwort aus dem Dataset
        threshold: float, Schwellenwert für semantische Ähnlichkeit
    
    Returns:
        dict mit faithfulness, answer_relevance, hallucination
    """
    # 1. Answer Relevance: semantische Ähnlichkeit zwischen LLM-Antwort und Ground-Truth
    answer_relevance = calculate_answer_relevance_direct(ground_truth, llm_answer)

    # 2. Faithfulness: wie stark die LLM-Antwort durch die Ground-Truth gestützt wird
    # = 1 wenn sehr ähnlich, sonst 0
    faithfulness = answer_relevance

    # 3. Hallucination: True wenn die Antwort nicht der Ground-Truth ähnelt
    hallucination = answer_relevance < threshold

    return {
        "faithfulness": faithfulness,
        "answer_relevance": answer_relevance,
        "hallucination": hallucination
    }


In [60]:
def analyse_with_groundtruth(questions, ground_truths, model, tokenizer, embed_model, k=5, max_tokens=200, threshold=0.5):
    
    results = []

    for question, gt_answer in zip(questions, ground_truths):
        # 1. Kontext abrufen (optional, für RAG)
        context = retrieve_context(question, k=k)

        # 2. LLM generiert Antwort unter Verwendung des Kontextes
        llm_answer = answer_with_context(
            model=model,
            tokenizer=tokenizer,
            question=question,
            max_tokens=max_tokens,
            k=k
        )

        # 3. Scores berechnen im Vergleich zur Ground-Truth
        scores = evaluate_rag_answer_vs_groundtruth(
            question=question,
            llm_answer=llm_answer,
            ground_truth=gt_answer,
            threshold=threshold
        )

        # 4. Ergebnisse speichern
        results.append({
            "question": question,
            "context": context,
            "ground_truth": gt_answer,
            "llm_answer": llm_answer,
            **scores
        })

    return results


### RAG analysis

In [46]:
def analyse_with_model(questions, model, tokenizer, k=5, max_tokens=200):
    """
    Für jede Frage:
    1. Kontext aus Dokumenten abrufen
    2. LLM generiert eine Antwort
    3. Faithfulness, Answer Relevance und Hallucination berechnen
    4. Ergebnisse zurückgeben
    """
    results = []

    for question in questions:
        # 1. Kontext abrufen
        context = retrieve_context(question, k=k)

        # 2. Antwort vom Modell generieren
        answer = answer_with_context(
            model=model,
            tokenizer=tokenizer,
            question=question,
            max_tokens=max_tokens,
            k=k
        )

        # 3. Scores berechnen
        scores = evaluate_rag_answer_given_answer(
            question=question,
            answer=answer,
            context=context,
            model=model,
            tokenizer=tokenizer
        )

        # 4. Ergebnisse speichern
        results.append({
            "question": question,
            "context": context,
            "answer": answer,
            **scores
        })

    return results


In [62]:
questions = ["Who were the three stars in the NHL game between Buffalo Sabres and Edmonton Oilers?"]
ground_truths = ["Ryan O’Reilly, Brian Gionta, and Leon Draisaitl were the three stars."]

results = analyse_with_groundtruth(
    questions=questions,
    ground_truths=ground_truths,
    model=model_qwen,
    tokenizer=tokenizer_qwen,
    embed_model=embed_model,  # <- wichtig!
    k=5,
    max_tokens=200,
    threshold=0.5
)

for res in results:
    print("Question:", res["question"])
    print("LLM Answer:", res["llm_answer"])
    print("Faithfulness:", res["faithfulness"])
    print("Answer Relevance:", res["answer_relevance"])
    print("Hallucination:", res["hallucination"])
    print("\n---\n")


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


AttributeError: 'NoneType' object has no attribute 'encode'