# Comparing The Effectiveness Of RAG Between Models


### Dependencies

In [1]:
!pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128 --quiet 
!pip install ipywidgets sentence-transformers faiss-cpu transformers datasets transformers jupyterlab_widgets pandas accelerate numpy hf_xet tqdm --quiet


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import sys
#------------------------------------------------------------------------------------------------
#os.environ["HF_HOME"] = "D:/AI_Models" #-> only for my computer delete on others!!!
#------------------------------------------------------------------------------------------------
import torch
import numpy as np
from numpy.linalg import norm
import faiss
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from accelerate import Accelerator
from tqdm.notebook import tqdm

In [3]:
print(f"CUDA verfügbar: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    
    accelerator = Accelerator()
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"Anzahl GPUs: {torch.cuda.device_count()}")
    print(f"CUDA Version (PyTorch): {torch.version.cuda}")
    print(f"Accelerator device: {accelerator.device}")
else:
    accelerator = Accelerator(cpu=True) 
    print(f"Accelerator device: {accelerator.device}")

CUDA verfügbar: True
GPU Name: NVIDIA GeForce RTX 2060
Anzahl GPUs: 1
CUDA Version (PyTorch): 12.8
Accelerator device: cuda


### Import the models

In [4]:
"""
model_name_mistral = "mistralai/Mistral-7B-v0.1"
model_mistral = AutoModelForCausalLM.from_pretrained(
    model_name_mistral,
    dtype="auto"
    )
model_mistral = accelerator.prepare(model_mistral)
tokenizer_mistral = AutoTokenizer.from_pretrained(model_name_mistral)

model_name_deepseek = "deepseek-ai/deepseek-llm-7b-base"
model_deepseek = AutoModelForCausalLM.from_pretrained(
    model_name_deepseek,
    dtype="auto"
    )
model_deepseek = accelerator.prepare(model_deepseek)
tokenizer_deepseek = AutoTokenizer.from_pretrained(model_name_deepseek)

model_name_llama = "meta-llama/Llama-2-7b"
model_llama = AutoModelForCausalLM.from_pretrained(
    model_name_llama,
    dtype="auto"
    )
model_llama = accelerator.prepare(model_llama)
tokenizer_llama = AutoTokenizer.from_pretrained(model_name_llama)
"""

#"Qwen/Qwen2.5-0.5B" -> "Qwen/Qwen2-7B"
model_name_qwen = "Qwen/Qwen3-1.7B"
model_qwen = AutoModelForCausalLM.from_pretrained(
    model_name_qwen,
    dtype="auto"
)
model_qwen = accelerator.prepare(model_qwen) 
tokenizer_qwen = AutoTokenizer.from_pretrained(model_name_qwen)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Import the dataset

In [5]:
"""
trivia_qa = datasets.load_dataset("mandarjoshi/trivia_qa", "rc", split="train")

documents_trivia_qa = [item["search_results"]["context"][0] for item in trivia_qa if item["search_results"]["context"]]
len(documents_trivia_qa)
"""

rag_dataset_1200 = datasets.load_dataset("neural-bridge/rag-dataset-1200", split="train")

documents_rag_1200 = [item["context"] for item in rag_dataset_1200]
len(documents_rag_1200)

960

In [6]:
documents = documents_rag_1200

def chunk_text(text, chunk_size=500, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

chunks = []
for doc in documents:
    chunks.extend(chunk_text(doc))

len(chunks)

8732

In [7]:
embed_model = SentenceTransformer("all-MiniLM-L6-v2")



embeddings = embed_model.encode(
    chunks,
    show_progress_bar=True
)

embeddings = np.array(embeddings)
embeddings.shape


Batches:   0%|          | 0/273 [00:00<?, ?it/s]

(8732, 384)

We use the FlatL2 index to measure the similarity of the embeddings. FlatL2 ueses the euclidian distance for that.

In [8]:
print(type(embeddings))
print(len(embeddings))  
if len(embeddings) > 0:
    print(np.array(embeddings).shape)

dimension = embeddings.shape[1]


index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

index.ntotal

<class 'numpy.ndarray'>
8732
(8732, 384)


8732

In [9]:
"""def retrieve_context(question, k=5):
    q_emb = embed_model.encode([question])
    distances, indices = index.search(q_emb, len(chunks))
    
    # Filter: nur Chunks mit mehr als 50 Zeichen UND Frage-Keywords enthalten
    keywords = [w.lower() for w in question.split()]
    selected_chunks = []
    for i in indices[0]:
        chunk = chunks[i].strip()
        if len(chunk) > 50 and any(kw in chunk.lower() for kw in keywords):
            selected_chunks.append(chunk)
        if len(selected_chunks) >= k:
            break
    return "\n".join(selected_chunks)"""


'def retrieve_context(question, k=5):\n    q_emb = embed_model.encode([question])\n    distances, indices = index.search(q_emb, len(chunks))\n\n    # Filter: nur Chunks mit mehr als 50 Zeichen UND Frage-Keywords enthalten\n    keywords = [w.lower() for w in question.split()]\n    selected_chunks = []\n    for i in indices[0]:\n        chunk = chunks[i].strip()\n        if len(chunk) > 50 and any(kw in chunk.lower() for kw in keywords):\n            selected_chunks.append(chunk)\n        if len(selected_chunks) >= k:\n            break\n    return "\n".join(selected_chunks)'

Prompt similar the the one from the Ragas paper

In [10]:
def build_rag_prompt(context, question):
    return f"""Answer the question using ONLY the information from the context.
If the answer is not explicitly stated, reply with: I don't know.

Context:
{context}

Question:
{question}

Answer:
"""


In [11]:
#TODO entscheide bei jedem max_tokens=1000 ob 1000 angebracht sind
def answer_without_context(model, tokenizer, question, max_tokens=1000, do_sample=True):
    #TODO entscheiden ob der system prompt bleibt für die RAG analyse
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": question}
    ]
    
    text_input = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True,
        #-----------------------------------------------------------
        enable_thinking=False #-> falls kein thinking model löschen!
        #-----------------------------------------------------------
    )
    inputs = tokenizer(text_input, return_tensors="pt").to(accelerator.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=do_sample,
            temperature=0.7,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id
        )

    generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

In [12]:

def answer_with_context(model, tokenizer, question, context, max_tokens=1000, do_sample=True):
    rag_instruction = (
        "Answer the question using the information from the given context.\n"
        f"question: {question}\n"
        f"context: {context}"
    )

    #TODO entscheiden ob der system prompt bleibt für die RAG analyse
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": rag_instruction}
    ]
    
    text_input = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True,
        #-----------------------------------------------------------
        enable_thinking=False #-> falls kein thinking model löschen!
        #-----------------------------------------------------------
    )
    
    inputs = tokenizer(text_input, return_tensors="pt").to(accelerator.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=do_sample,
            temperature=0.7,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id
        )
    generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

In [13]:
def _answer_with_context(model, tokenizer, question, max_tokens=50, k=5):
    context = retrieve_context(question, k=k)
    prompt = build_rag_prompt(context, question)
    inputs = tokenizer(prompt, return_tensors="pt")
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,        # deterministisch
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )

    generated = output_ids[0][prompt_len:]
    answer = tokenizer.decode(generated, skip_special_tokens=True).strip()

    # Nur erster Satz behalten
    for sep in [".", "!", "?"]:
        if sep in answer:
            answer = answer.split(sep)[0] + sep
            break

    return answer


### The three scores mentioned in the Ragas paper
In this section we implement all three scores mentioned in the Ragas paper.
We will use these to evaluate the models rag performance

$$F = \frac{|F|}{|V|}$$

In [14]:
def build_statement_prompt(question: str, answer: str):
    return f"""Given a question and answer, create one or more statements from each sentence in the given answer.
question: {question}
answer: {answer}"""

In [15]:
def extract_statements(question: str, answer: str, model, tokenizer, max_tokens=1000):
    #Extracts statements out of the model generated answer 
    prompt = build_statement_prompt(question, answer)

    answer = answer_without_context(model, tokenizer, prompt, max_tokens, do_sample=False)

    statements = [line.strip() for line in answer.split("\n") if len(line.strip()) > 3]
    return statements

Changed the prompt for better understanding

In [34]:
def build_faithfullness_prompt(statements):
    prompt = f"""Consider the given context and following statements, the determine whether they are supported by the information presente in the context. 
Provide a brief explanation for each statement before arriving at the verdict (Yes/No). 
Answer in the format:
Statement: ...
Explanation: ...
Verdict: (Yes/No)
Do not deviate from the specified format.
These are the statements
"""
    for s in statements:
        prompt += f"Statement: {s}\n"
        
    return prompt

In [17]:
#TODO zählt aktuell die anzahl von "Yes" soll aber richtig die anzahl der supported statements zählen -> vlt zu get_statements()
def count_supported(prompt: str):
    return prompt.count("Yes")

In [18]:
def calculate_faithfulness_score(total_statements: int, suported_statements: int):
    if suported_statements == 0.0:
        return 0.0

    return suported_statements / total_statements

$$AR = \frac{1}{n} \sum\limits^n_{i=1} sim(q,q_i)$$

In [36]:
def build_answer_relevance_prompt(answer: str):
    prompt = f"""Generate a question for the given answer.
    answer: {answer}"""
    return prompt

In [19]:
#TODO
def calculate_similarity(q: str, q_answer: str) -> float:
   pass

In [20]:
#TODO
def calculate_answer_relevance_score(question: str, generated_questions: list):
    if len(generated_questions) == 0:
        return 0.0

    score = 0
    for q in generated_questions:
        score += eval_similarity(question, q)

    return score / len(generated_questions)


$$CR = \frac{\text{number of extracted sentences}}{\text{total number of senctences in }c(q)}$$

In [None]:
def build_context_relevance_prompt(question: str):
    prompt = f"""Please extract relevant sentences from the provided context that can potentially help ansewr the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phares "Insufficient Information".While extracting candidate sentences you're not allowed to make any changes to senctences from the given context.
    Question: {question}"""
    return prompt

In [22]:
#TODO vielleicht erweitern sehr simple satz zählung
def count_sentences(sentence: str):
    return sentence.count(". ")

In [23]:
def calculate_context_relevance_score(num_extracted, num_sentences):
    if num_sentences == 0.0:
        return 0.0
    
    return num_extracted / num_sentences

### RAG analysis

In [42]:
def analyse_faithfullness(model, tokenizer, context, question, answer, do_sample=True):
    #extracts statements out of the answer and generates a statements list
    statements = extract_statements(question, answer, model_qwen, tokenizer_qwen)

    #lets the model evaluate the faithfullness
    faithfullness_prompt = build_faithfullness_prompt(statements)
    #do_sample has to be false
    eval = answer_with_context(model_qwen, tokenizer_qwen, context, faithfullness_prompt, do_sample=False)
    num_supported = count_supported(eval)

    return [statements, num_supported]

In [None]:
#TODO entscheiden wieviele fragen die funkition bearbeiten soll
def analyse_answer_relevance(model, tokenizer, answer, do_sample=True):
    prompt = build_answer_relevance_prompt(answer)
    llm_question = answer_without_context(model, tokenizer, prompt, do_sample=do_sample)


In [39]:
def analyse_context_relevance(model, tokenizer, context, question):
    prompt = build_context_relevance_prompt(question)
    print(prompt)
    answer = answer_with_context(model, tokenizer, context, prompt, do_sample=False)

    num_extracted = count_sentences(answer)
    num_sentences = count_sentences(context)
    
    return [num_extracted, num_sentences]

In [40]:
def analyse_rag(model, tokenizer, do_sample=True):
    scores = {
        "faithfullness score" : 0,
        "answer relevance score" : 0,
        "context relevance score" : 0
    }
    #for faithfullness
    statements = []
    supported_statements = 0

    #for answer relevance

    #for context relevance
    num_extracted = 0
    num_sentences = 0

    for i in tqdm(range(1)):
        #generates a answer with the context for the current question
        question = rag_dataset_1200[90+i]["question"]
        context = rag_dataset_1200[90+i]["context"]
        answer = answer_with_context(model_qwen, tokenizer_qwen, context, question, do_sample=do_sample)

        faithfullness = analyse_faithfullness(model, tokenizer, context, question, answer, do_sample)
        statements.extend(faithfullness[0])
        supported_statements += faithfullness[1]


        #TODO
        #answer_relevance = analyse_answer_relevance()

        context_relevance = analyse_context_relevance(model, tokenizer, context, question)
        num_extracted += context_relevance[0]
        num_sentences += context_relevance[1]

    scores["faithfullness score"] = calculate_faithfulness_score(len(statements), supported_statements)
    scores["context relevance score"] = calculate_context_relevance_score(num_extracted, num_sentences)

    return scores

print(analyse_rag(model_qwen, tokenizer_qwen, do_sample=False))

  0%|          | 0/1 [00:00<?, ?it/s]

Statement: 1. The new website for the Amsterdamse Kunstraad was designed to offer users a clear and functional overview of all the content.  
Explanation: The context states that the main goal was to give the user a clear and functional overview of all the content.  
Verdict: Yes  

Statement: 2. The primary purpose of the new website was to provide a comprehensive and organized view of the content available on the Amsterdamse Kunstraad.  
Explanation: The context mentions that the goal was to provide a clear and functional overview of all the content, which implies a comprehensive and organized view.  
Verdict: Yes
Please extract relevant sentences from the provided context that can potentially help ansewr the following question.If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phares "Insufficient Information".While extracting candidate sentences you're not allowed to make any changes to senctences from the given 