# Comparing The Effectiveness Of RAG Between Models


### Dependencies

In [11]:
!pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128 --quiet
!pip install sentence-transformers faiss-cpu transformers datasets transformers jupyterlab_widgets pandas accelerate numpy hf_xet --quiet


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: C:\Users\juliu\Documents\karl_info\Large-Language-Modell-Project_\venv\Scripts\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: C:\Users\juliu\Documents\karl_info\Large-Language-Modell-Project_\venv\Scripts\python.exe -m pip install --upgrade pip


In [12]:
import os
import sys
#------------------------------------------------------------------------------------------------
os.environ["HF_HOME"] = "D:/AI_Models" #-> only fpr my computer delete on others!!!
#------------------------------------------------------------------------------------------------
import torch
import numpy as np
from numpy.linalg import norm
import faiss
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from accelerate import Accelerator

Test if CUDA is available and set the accelerator

In [None]:

print(f"CUDA verfügbar: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    accelerator = Accelerator()

    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"Anzahl GPUs: {torch.cuda.device_count()}")
    print(f"CUDA Version (PyTorch): {torch.version.cuda}")
    print(f"Accelerator device: {accelerator.device}")
else:
    accelerator = Accelerator(cpu=True) 


CUDA verfügbar: True
GPU Name: NVIDIA GeForce RTX 5060 Ti
Anzahl GPUs: 1
CUDA Version (PyTorch): 12.8
Accelerator device: cuda


: 

### Import the models

In [None]:
"""
model_name_mistral = "mistralai/Mistral-7B-v0.1"
model_mistral = AutoModelForCausalLM.from_pretrained(model_name_mistral)
tokenizer_mistral = AutoTokenizer.from_pretrained(model_name_mistral)

model_name_deepseek = "deepseek-ai/deepseek-llm-7b-base"
model_deepseek = AutoModelForCausalLM.from_pretrained(model_name_deepseek)
tokenizer_deepseek = AutoTokenizer.from_pretrained(model_name_deepseek)

model_name_llama = "meta-llama/Llama-2-7b"
model_llama = AutoModelForCausalLM.from_pretrained(model_name_llama)
tokenizer_llama = AutoTokenizer.from_pretrained(model_name_llama)
"""

#"Qwen/Qwen2.5-0.5B" -> "Qwen/Qwen2-7B"
model_name_qwen = "Qwen/Qwen2-7B"
model_qwen = AutoModelForCausalLM.from_pretrained(
    model_name_qwen,
    dtype="auto"
)
model_qwen = accelerator.prepare(model_qwen) 
tokenizer_qwen = AutoTokenizer.from_pretrained(model_name_qwen)

Downloading model to: D:/AI_Models


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Import the dataset

In [None]:
"""
trivia_qa = datasets.load_dataset("mandarjoshi/trivia_qa", "rc", split="train")

documents_trivia_qa = [item["search_results"]["context"][0] for item in trivia_qa if item["search_results"]["context"]]
len(documents_trivia_qa)
"""

rag_dataset_1200 = datasets.load_dataset("neural-bridge/rag-dataset-1200", split="train")

documents_rag_1200 = [item["context"] for item in rag_dataset_1200]
len(documents_rag_1200)


Generating train split: 100%|██████████| 960/960 [00:00<00:00, 48082.59 examples/s]
Generating test split: 100%|██████████| 240/240 [00:00<00:00, 42372.06 examples/s]


960

In [None]:
documents = documents_rag_1200

def chunk_text(text, chunk_size=500, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

chunks = []
for doc in documents:
    chunks.extend(chunk_text(doc))

len(chunks)

8732

In [None]:
embed_model = SentenceTransformer("all-MiniLM-L6-v2", device=accelerator.device)

embeddings = embed_model.encode(
    chunks,
    show_progress_bar=True
)

embeddings = np.array(embeddings)
embeddings.shape


Batches: 100%|██████████| 273/273 [00:08<00:00, 32.87it/s]


(8732, 384)

We use the FlatL2 index to measure the similarity of the embeddings. FlatL2 ueses the euclidian distance for that.

In [None]:
print(type(embeddings))
print(len(embeddings))  
if len(embeddings) > 0:
    print(np.array(embeddings).shape)

dimension = embeddings.shape[1]


index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

index.ntotal

<class 'numpy.ndarray'>
8732
(8732, 384)


8732

In [None]:
def retrieve_context(question, k=10):
    q_emb = embed_model.encode([question])
    distances, indices = index.search(q_emb, len(chunks))  
    selected_chunks = [chunks[i] for i in indices[0] if len(chunks[i].strip()) > 50]
    return "\n".join(selected_chunks[:k])  

Prompt similar the the one from the Ragas paper

In [None]:
def build_rag_prompt(context, question):
    return f"""
Answer the question using ONLY the information from the context.
Return ONLY complete sentences.
Do NOT include explanations, commentary, or unrelated text.
If the answer is not explicitly stated, reply with: I don't know.

Context:
{context}

Question:
{question}

Answer:
"""


In [None]:
def answer_without_context(model, tokenizer, prompt, max_tokens=200):
    #inputs = tokenizer(prompt, return_tensors="pt") -> falls keine CUDA fähige grafikkarte verwendet wird entkommentieren
    inputs = tokenizer(prompt, return_tensors="pt").to(accelerator.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True, #-> evtl do_sample auf false
            temperature=0.01,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id
        )
    generated = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()


In [None]:
a = answer_without_context(model_qwen, tokenizer_qwen, "Code a function in Python that adds two numbers together!", max_tokens=1000)

print(a)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


def add_numbers(a, b):
    return a + b


In [None]:
def answer_with_context(model, tokenizer, question, max_tokens=400, k=10):
    context = retrieve_context(question, k=k)
    prompt = build_rag_prompt(context, question)
    #inputs = tokenizer(prompt, return_tensors="pt") -> falls keine CUDA fähige grafikkarte verwendet wird entkommentieren
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True, #-> evtl do_sample auf false
            temperature=0.01,    
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id
        )

    generated = output_ids[0][prompt_len:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()


### The three scores mentioned in the Ragas paper
In this section we implement all three scores mentioned in the Ragas paper.
We will use these to evaluate the models rag performance

$$F = \frac{|F|}{|V|}$$

$$AR = \frac{1}{n} \sum\limits^n_{i=1} sim(q,q_i)$$

In [None]:
def eval_similarity(q: str, q_answer: str) -> float:
    emb = embed_model.encode([q, q_answer])
    sim = np.dot(emb[0], emb[1]) / (norm(emb[0]) * norm(emb[1]))
    return (sim + 1) / 2

$$CR = \frac{\text{number of extracted sentences}}{\text{total number of senctences in }c(q)}$$

In [None]:
def extract_statements(answer: str, model, tokenizer, max_tokens=150):
    """
    Splits a model-generated answer into complete factual statements.
    Each statement will be one line. Partial words or fragments are avoided.
    """

    prompt = f"""
Split the following answer into complete factual statements.
Return one complete sentence per line.
Do NOT add explanations, bullet points, or partial words.

Answer:
{answer}
"""


    inputs = tokenizer(prompt, return_tensors="pt")

    
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,      
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )

    
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    
    statements = [line.strip() for line in output_text.split("\n") if len(line.strip()) > 3]

    return statements


In [None]:
def evaluate_faithfulness(statements, context, model, tokenizer):
    if len(statements) == 0:
        return 0.0

    prompt = build_faithfullness_prompt(statements, context)
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=False,      
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)


    answer_lines = [line.split("Answer:")[-1].strip() for line in output.split("\n") if "Answer:" in line]

    supported = sum(1 for line in answer_lines if line.lower().startswith("yes"))

    return supported / len(statements)


In [None]:
def generate_questions_from_answer(answer: str, model, tokenizer, max_tokens=150):
    prompt = f"""
Generate up to 3 questions that could be answered by the following answer.
Return one question per line.

Answer:
{answer}
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return [q.strip() for q in output.split("\n") if q.strip().endswith("?")]


In [None]:
def calculate_answer_relevance_score(question: str, generated_questions: list):
    if len(generated_questions) == 0:
        return 0.0

    score = 0
    for q in generated_questions:
        score += eval_similarity(question, q)

    return score / len(generated_questions)


In [None]:
def calculate_answer_relevance_direct(question: str, answer: str):
    """
    Misst die semantische Ähnlichkeit zwischen Frage und Antwort direkt
    und skaliert sie auf [0,1].
    """
    if not answer or answer.lower() in ["i don't know", "unknown"]:
        return 0.0


    emb = embed_model.encode([question, answer])
    
  
    sim = np.dot(emb[0], emb[1]) / (norm(emb[0]) * norm(emb[1]))
    
 
    return (sim + 1) / 2


In [None]:
def evaluate_rag_answer(question, answer, context, model, tokenizer):
 
    if answer.lower() in ["i don't know", ""]:
        return {
            "faithfulness": 0.0,
            "answer_relevance": 0.0,
            "hallucination": True
        }

    statements = extract_statements(answer, model, tokenizer)[:10]

    


    if not statements:
        faithfulness = 0.0
    else:
        faithfulness = evaluate_faithfulness(statements, context, model, tokenizer)


    answer_relevance = calculate_answer_relevance_direct(question, answer)


    return {
        "faithfulness": faithfulness,
        "answer_relevance": answer_relevance,
        "hallucination": faithfulness < 1.0
    }


In [None]:
def build_faithfullness_prompt(statements, context):
    prompt = f"""
Context:
{context}

For each of the statements below, decide if it is fully supported by the context.
Respond with ONLY 'Yes' or 'No'.
Do not add any explanations.

Examples:
Statement: "Grasse Cathedral is the town's most notable landmark."
Answer: Yes

Statement: "Grasse is the capital of France."
Answer: No

Now evaluate the following statements:
"""
    for s in statements:
        prompt += f'Statement: "{s}"\nAnswer: '

    return prompt


### RAG analysis

In [None]:
def analyse():
    results = []
    for i in range(2):  
        question = rag_dataset_1200[i]["question"]
        context = rag_dataset_1200[i]["context"]
        answer = answer_with_context(model_qwen, tokenizer_qwen, question, max_tokens=50, k=5)

        scores = evaluate_rag_answer(
            question, answer, context, model_qwen, tokenizer_qwen
        )

        results.append({
            "question": question,
            "answer": answer,
            **scores
        })
    return results


In [None]:
"""
question = "Who were the three stars in the NHL game between Buffalo Sabres and Edmonton Oilers?"


context = retrieve_context(question, k=5)

print("RETRIEVED CONTEXT:")
print(context)
print("\n---\n")



answer = answer_with_context(
    model=model_qwen, 
    tokenizer=tokenizer_qwen, 
    question=question,  
    max_tokens=200,
    k=5
)

print("ANSWER:")
print(answer)
print("\n---\n")


scores = evaluate_rag_answer(
    question=question,
    answer=answer,
    context=context,
    model=model_qwen,
    tokenizer=tokenizer_qwen
)

print("EVALUATION SCORES:")
print(scores)
"""

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


RETRIEVED CONTEXT:
d on all of them (and by my count, scored on three that weren’t really chances at all!)
- Make no mistake, these were two world-class tire fires going at it, and the Oilers have a lot to clean up, just as they did after the Flames victories. But the Sabres were by far the larger and stinkier of the two tire fires. Rewatch the game if you don’t believe me (I’m doing that right now to confirm my post game comments, and yes, it hurts a lot).
- The truth is, we see this kind of thing regularly – goa
Game Notes EDM vs BUF
Buffalo Sabres (Head Coach: Dan Bylsma) at Edmonton Oilers (Head Coach: Todd McLellan)
NHL Game #31, Rogers Place, 2016-10-16 05:00:00PM (GMT -0600)
Sabres 6-2
Referees: Tim Peel, Brad Watson
Linesmen: Ryan Gibbons, Devin Berg
Three Stars: Ryan O’Reilly; Brian Gionta; Leon Draisaitl
- OK, look, I know we’re all disgusted at a big loss to a badly injured bottom feeder like Buffalo. But it’s important not to get overly caught up in the post facto narrative

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


ANSWER:
The three stars in the NHL game between Buffalo Sabres and Edmonton Oilers were Ryan O’Reilly, Brian Gionta, and Leon Draisaitl.

---





RuntimeError: Expected all tensors to be on the same device, but got index is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA__index_select)