# Comparing The Effectiveness Of RAG Between Models


### Dependencies

In [None]:
!pip install sentence-transformers faiss-cpu transformers torch datasets transformers jupyterlab_widgets pandas numpy --quiet


In [1]:
import os
import sys
import torch
import numpy as np
from numpy.linalg import norm
import faiss
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


### Import the models

In [2]:
"""
model_name_mistral = "mistralai/Mistral-7B-v0.1"
model_mistral = AutoModelForCausalLM.from_pretrained(model_name_mistral)
tokenizer_mistral = AutoTokenizer.from_pretrained(model_name_mistral)

model_name_deepseek = "deepseek-ai/deepseek-llm-7b-base"
model_deepseek = AutoModelForCausalLM.from_pretrained(model_name_deepseek)
tokenizer_deepseek = AutoTokenizer.from_pretrained(model_name_deepseek)

model_name_llama = "meta-llama/Llama-2-7b"
model_llama = AutoModelForCausalLM.from_pretrained(model_name_llama)
tokenizer_llama = AutoTokenizer.from_pretrained(model_name_llama)
"""


model_name_qwen = "Qwen/Qwen2.5-0.5B"
model_qwen = AutoModelForCausalLM.from_pretrained(model_name_qwen)
tokenizer_qwen = AutoTokenizer.from_pretrained(model_name_qwen)

### Import the dataset

In [3]:
"""
trivia_qa = datasets.load_dataset("mandarjoshi/trivia_qa", "rc", split="train")

documents_trivia_qa = [item["search_results"]["context"][0] for item in trivia_qa if item["search_results"]["context"]]
len(documents_trivia_qa)
"""

rag_dataset_1200 = datasets.load_dataset("neural-bridge/rag-dataset-1200", split="train")

documents_rag_1200 = [item["context"] for item in rag_dataset_1200]
len(documents_rag_1200)


960

In [4]:
documents = documents_rag_1200

def chunk_text(text, chunk_size=500, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

chunks = []
for doc in documents:
    chunks.extend(chunk_text(doc))

len(chunks)

8732

In [5]:
embed_model = SentenceTransformer("all-MiniLM-L6-v2")



embeddings = embed_model.encode(
    chunks,
    show_progress_bar=True
)

embeddings = np.array(embeddings)
embeddings.shape


Batches: 100%|██████████| 273/273 [02:26<00:00,  1.86it/s]


(8732, 384)

We use the FlatL2 index to measure the similarity of the embeddings. FlatL2 ueses the euclidian distance for that.

In [6]:
print(type(embeddings))
print(len(embeddings))  
if len(embeddings) > 0:
    print(np.array(embeddings).shape)

dimension = embeddings.shape[1]


index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

index.ntotal

<class 'numpy.ndarray'>
8732
(8732, 384)


8732

In [24]:
def retrieve_context(question, k=10):
    q_emb = embed_model.encode([question])
    distances, indices = index.search(q_emb, len(chunks))  
    selected_chunks = [chunks[i] for i in indices[0] if len(chunks[i].strip()) > 50]
    return "\n".join(selected_chunks[:k])  


Prompt similar the the one from the Ragas paper

In [25]:
def build_rag_prompt(context, question):
    return f"""
Answer the question using ONLY the information from the context.
If the answer is not explicitly stated, reply with: I don't know.

Context:
{context}

Question:
{question}

Answer:
"""


In [26]:
def answer_without_context(model, tokenizer, prompt, max_tokens=800):
    tokanized_input = tokenizer(prompt, return_tensors="pt")
    input_ids = tokanized_input["input_ids"]

    prompt_length = input_ids.shape[1]

    for _ in range(max_tokens):
        with torch.no_grad():
            output = model(input_ids).logits
        output = output.squeeze(dim=0)
        next_token_scores = output[-1]
        next_token_id = next_token_scores.argmax(dim=-1)
        input_ids = torch.cat((input_ids, torch.LongTensor([next_token_id]).reshape(1, -1)), dim=-1)

    
    generated_ids = input_ids[0, prompt_length:]

    return tokenizer.decode(generated_ids)

In [27]:
def answer_with_context(model, tokenizer, question, max_tokens=400, k=10):
    context = retrieve_context(question, k=k)
    prompt = build_rag_prompt(context, question)
    inputs = tokenizer(prompt, return_tensors="pt")
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,       # sampling aktivieren
            temperature=0.7,     # kreative Antworten
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id
        )

    generated = output_ids[0][prompt_len:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()


### The three scores mentioned in the Ragas paper
In this section we implement all three scores mentioned in the Ragas paper.
We will use these to evaluate the models rag performance

$$F = \frac{|F|}{|V|}$$

$$AR = \frac{1}{n} \sum\limits^n_{i=1} sim(q,q_i)$$

In [28]:
def eval_similarity(q: str, q_answer: str) -> float:
    emb = embed_model.encode([q, q_answer])
    sim = np.dot(emb[0], emb[1]) / (norm(emb[0]) * norm(emb[1]))
    return (sim + 1) / 2

$$CR = \frac{\text{number of extracted sentences}}{\text{total number of senctences in }c(q)}$$

In [29]:
def extract_statements(answer: str, model, tokenizer, max_tokens=150):
    prompt = f"""
Split the following answer into simple factual statements.
Return one statement per line. Do not add anything else.

Answer:
{answer}
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return [s.strip("- ").strip() for s in output.split("\n") if len(s.strip()) > 3]


In [30]:
def evaluate_faithfulness(statements, context, model, tokenizer):
    if len(statements) == 0:
        return 0.0

    prompt = build_faithfullness_prompt(statements, context)
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=False,      
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)


    answer_lines = [line.split("Answer:")[-1].strip() for line in output.split("\n") if "Answer:" in line]

    supported = sum(1 for line in answer_lines if line.lower().startswith("yes"))

    return supported / len(statements)


In [31]:
def generate_questions_from_answer(answer: str, model, tokenizer, max_tokens=150):
    prompt = f"""
Generate up to 3 questions that could be answered by the following answer.
Return one question per line.

Answer:
{answer}
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return [q.strip() for q in output.split("\n") if q.strip().endswith("?")]


In [32]:
def calculate_answer_relevance_score(question: str, generated_questions: list):
    if len(generated_questions) == 0:
        return 0.0

    score = 0
    for q in generated_questions:
        score += eval_similarity(question, q)

    return score / len(generated_questions)


In [33]:
def calculate_answer_relevance_direct(question: str, answer: str):
    """
    Misst die semantische Ähnlichkeit zwischen Frage und Antwort direkt
    und skaliert sie auf [0,1].
    """
    if not answer or answer.lower() in ["i don't know", "unknown"]:
        return 0.0


    emb = embed_model.encode([question, answer])
    
  
    sim = np.dot(emb[0], emb[1]) / (norm(emb[0]) * norm(emb[1]))
    
 
    return (sim + 1) / 2


In [34]:
def evaluate_rag_answer(question, answer, context, model, tokenizer):
 
    if answer.lower() in ["i don't know", ""]:
        return {
            "faithfulness": 0.0,
            "answer_relevance": 0.0,
            "hallucination": True
        }

    statements = extract_statements(answer, model, tokenizer)[:10]

    


    if not statements:
        faithfulness = 0.0
    else:
        faithfulness = evaluate_faithfulness(statements, context, model, tokenizer)


    answer_relevance = calculate_answer_relevance_direct(question, answer)


    return {
        "faithfulness": faithfulness,
        "answer_relevance": answer_relevance,
        "hallucination": faithfulness < 1.0
    }


In [35]:
def build_faithfullness_prompt(statements, context):
    prompt = f"""
Context:
{context}

For each of the statements below, decide if it is fully supported by the context.
Respond with ONLY 'Yes' or 'No'.
Do not add any explanations.

Examples:
Statement: "Grasse Cathedral is the town's most notable landmark."
Answer: Yes

Statement: "Grasse is the capital of France."
Answer: No

Now evaluate the following statements:
"""
    for s in statements:
        prompt += f'Statement: "{s}"\nAnswer: '

    return prompt


### RAG analysis

In [36]:
def analyse():
    results = []
    for i in range(2):  
        question = rag_dataset_1200[i]["question"]
        context = rag_dataset_1200[i]["context"]
        answer = answer_with_context(model_qwen, tokenizer_qwen, question, max_tokens=50, k=5)

        scores = evaluate_rag_answer(
            question, answer, context, model_qwen, tokenizer_qwen
        )

        results.append({
            "question": question,
            "answer": answer,
            **scores
        })
    return results


In [37]:

question = "Who found the answer to a search query collar george herbert essay?"


context = retrieve_context(question, k=5)

print("RETRIEVED CONTEXT:")
print(context)
print("\n---\n")



answer = answer_with_context(
    model=model_qwen, 
    tokenizer=tokenizer_qwen, 
    question=question,  
    max_tokens=200,
    k=5
)

print("ANSWER:")
print(answer)
print("\n---\n")


scores = evaluate_rag_answer(
    question=question,
    answer=answer,
    context=context,
    model=model_qwen,
    tokenizer=tokenizer_qwen
)

print("EVALUATION SCORES:")
print(scores)


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


RETRIEVED CONTEXT:
Francisco Rogers found the answer to a search query collar george herbert essay
Link ----> collar george herbert essay
Write my essay ESSAYERUDITE.COM
constitution research paper ideas
definition essay humility
business strategy case study solution
corporals course essay
decisions in paradise essays
college essay word count
credit cart terminal paper
byron don juan essay
democratic party essays
coursework language learning material teaching
christmas commercialized essay
dahrendorf essays theory
 golconda fort the history of the maghrib an interpretive essay essay of the story of an hour best essays for high school students classical argument essay format essay question writing ncea level 2 english essay structure essays on the delian league cause and effect essay rubrics deloitte case studies essay on fashion among youth fine art dissertation examples thesis about arthritis buy term paper now why i want to be a cop essay referencing chapters in an essay thesis state

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


ANSWER:
I don't know.

---



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


EVALUATION SCORES:
{'faithfulness': 0.1, 'answer_relevance': np.float32(0.5739031), 'hallucination': True}
