# Comparing The Effectiveness Of RAG Between Models


### Dependencies

In [None]:
!pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128 --quiet #!!!! nicht entfernen nur auskommentieren
!pip install sentence-transformers faiss-cpu transformers datasets transformers jupyterlab_widgets pandas numpy accelerate --quiet

In [None]:
import os
import sys
#------------------------------------------------------------------------------------------------
#os.environ["HF_HOME"] = "D:/AI_Models" #-> only for my computer delete on others!!!
#------------------------------------------------------------------------------------------------
import torch
import numpy as np
from numpy.linalg import norm
import faiss
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from accelerate import Accelerator

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
print(f"CUDA verfügbar: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    accelerator = Accelerator()

    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"Anzahl GPUs: {torch.cuda.device_count()}")
    print(f"CUDA Version (PyTorch): {torch.version.cuda}")
    print(f"Accelerator device: {accelerator.device}")
else:
    accelerator = Accelerator(cpu=True) 
    print(f"Accelerator device: {accelerator.device}")

### Import the models

In [None]:
"""
model_name_mistral = "mistralai/Mistral-7B-v0.1"
model_mistral = AutoModelForCausalLM.from_pretrained(
    model_name_mistral,
    dtype="auto"
    )
model_mistral = accelerator.prepare(model_mistral)
tokenizer_mistral = AutoTokenizer.from_pretrained(model_name_mistral)

model_name_deepseek = "deepseek-ai/deepseek-llm-7b-base"
model_deepseek = AutoModelForCausalLM.from_pretrained(
    model_name_deepseek,
    dtype="auto"
    )
model_deepseek = accelerator.prepare(model_deepseek)
tokenizer_deepseek = AutoTokenizer.from_pretrained(model_name_deepseek)

model_name_llama = "meta-llama/Llama-2-7b"
model_llama = AutoModelForCausalLM.from_pretrained(
    model_name_llama,
    dtype="auto"
    )
model_llama = accelerator.prepare(model_llama)
tokenizer_llama = AutoTokenizer.from_pretrained(model_name_llama)
"""

#"Qwen/Qwen2.5-0.5B" -> "Qwen/Qwen2-7B"
model_name_qwen = "Qwen/Qwen2-7B"
model_qwen = AutoModelForCausalLM.from_pretrained(
    model_name_qwen,
    dtype="auto"
)
model_qwen = accelerator.prepare(model_qwen) 
tokenizer_qwen = AutoTokenizer.from_pretrained(model_name_qwen)

### Import the dataset

In [None]:
"""
trivia_qa = datasets.load_dataset("mandarjoshi/trivia_qa", "rc", split="train")

documents_trivia_qa = [item["search_results"]["context"][0] for item in trivia_qa if item["search_results"]["context"]]
len(documents_trivia_qa)
"""

rag_dataset_1200 = datasets.load_dataset("neural-bridge/rag-dataset-1200", split="train")

documents_rag_1200 = [item["context"] for item in rag_dataset_1200]
len(documents_rag_1200)

960

In [4]:
documents = documents_rag_1200

def chunk_text(text, chunk_size=500, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

chunks = []
for doc in documents:
    chunks.extend(chunk_text(doc))

len(chunks)

8732

In [5]:
embed_model = SentenceTransformer("all-MiniLM-L6-v2")



embeddings = embed_model.encode(
    chunks,
    show_progress_bar=True
)

embeddings = np.array(embeddings)
embeddings.shape


Batches: 100%|██████████| 273/273 [02:36<00:00,  1.74it/s]


(8732, 384)

We use the FlatL2 index to measure the similarity of the embeddings. FlatL2 ueses the euclidian distance for that.

In [6]:
print(type(embeddings))
print(len(embeddings))  
if len(embeddings) > 0:
    print(np.array(embeddings).shape)

dimension = embeddings.shape[1]


index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

index.ntotal

<class 'numpy.ndarray'>
8732
(8732, 384)


8732

In [37]:
def retrieve_context(question, k=5):
    q_emb = embed_model.encode([question])
    distances, indices = index.search(q_emb, len(chunks))
    
    # Filter: nur Chunks mit mehr als 50 Zeichen UND Frage-Keywords enthalten
    keywords = [w.lower() for w in question.split()]
    selected_chunks = []
    for i in indices[0]:
        chunk = chunks[i].strip()
        if len(chunk) > 50 and any(kw in chunk.lower() for kw in keywords):
            selected_chunks.append(chunk)
        if len(selected_chunks) >= k:
            break
    return "\n".join(selected_chunks)


Prompt similar the the one from the Ragas paper

In [8]:
def build_rag_prompt(context, question):
    return f"""
Answer the question using ONLY the information from the context.
If the answer is not explicitly stated, reply with: I don't know.

Context:
{context}

Question:
{question}

Answer:
"""


In [9]:
def answer_without_context(model, tokenizer, prompt, max_tokens=800):
    tokanized_input = tokenizer(prompt, return_tensors="pt")
    input_ids = tokanized_input["input_ids"]

    prompt_length = input_ids.shape[1]

    for _ in range(max_tokens):
        with torch.no_grad():
            output = model(input_ids).logits
        output = output.squeeze(dim=0)
        next_token_scores = output[-1]
        next_token_id = next_token_scores.argmax(dim=-1)
        input_ids = torch.cat((input_ids, torch.LongTensor([next_token_id]).reshape(1, -1)), dim=-1)

    
    generated_ids = input_ids[0, prompt_length:]

    return tokenizer.decode(generated_ids)

In [36]:
def answer_with_context(model, tokenizer, question, max_tokens=50, k=5):
    context = retrieve_context(question, k=k)
    prompt = build_rag_prompt(context, question)
    inputs = tokenizer(prompt, return_tensors="pt")
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,        # deterministisch
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )

    generated = output_ids[0][prompt_len:]
    answer = tokenizer.decode(generated, skip_special_tokens=True).strip()

    # Nur erster Satz behalten
    for sep in [".", "!", "?"]:
        if sep in answer:
            answer = answer.split(sep)[0] + sep
            break

    return answer


### The three scores mentioned in the Ragas paper
In this section we implement all three scores mentioned in the Ragas paper.
We will use these to evaluate the models rag performance

$$F = \frac{|F|}{|V|}$$

$$AR = \frac{1}{n} \sum\limits^n_{i=1} sim(q,q_i)$$

In [11]:
def eval_similarity(q: str, q_answer: str) -> float:
    emb = embed_model.encode([q, q_answer])
    sim = np.dot(emb[0], emb[1]) / (norm(emb[0]) * norm(emb[1]))
    return (sim + 1) / 2

$$CR = \frac{\text{number of extracted sentences}}{\text{total number of senctences in }c(q)}$$

In [12]:
def extract_statements(answer: str, model, tokenizer, max_tokens=150):
    prompt = f"""
Split the following answer into simple factual statements.
Return one statement per line. Do not add anything else.

Answer:
{answer}
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return [s.strip("- ").strip() for s in output.split("\n") if len(s.strip()) > 3]


In [13]:
def evaluate_faithfulness(statements, context, model, tokenizer):
    if len(statements) == 0:
        return 0.0

    prompt = build_faithfullness_prompt(statements, context)
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=False,      
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)


    answer_lines = [line.split("Answer:")[-1].strip() for line in output.split("\n") if "Answer:" in line]

    supported = sum(1 for line in answer_lines if line.lower().startswith("yes"))

    return supported / len(statements)


In [14]:
def generate_questions_from_answer(answer: str, model, tokenizer, max_tokens=150):
    prompt = f"""
Generate up to 3 questions that could be answered by the following answer.
Return one question per line.

Answer:
{answer}
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id
        )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return [q.strip() for q in output.split("\n") if q.strip().endswith("?")]


In [15]:
def calculate_answer_relevance_score(question: str, generated_questions: list):
    if len(generated_questions) == 0:
        return 0.0

    score = 0
    for q in generated_questions:
        score += eval_similarity(question, q)

    return score / len(generated_questions)


In [16]:
def calculate_answer_relevance_direct(question: str, answer: str):
    """
    Misst die semantische Ähnlichkeit zwischen Frage und Antwort direkt
    und skaliert sie auf [0,1].
    """
    if not answer or answer.lower() in ["i don't know", "unknown"]:
        return 0.0


    emb = embed_model.encode([question, answer])
    
  
    sim = np.dot(emb[0], emb[1]) / (norm(emb[0]) * norm(emb[1]))
    
 
    return (sim + 1) / 2


In [17]:
def evaluate_rag_answer(question, answer, context, model, tokenizer):
 
    if answer.lower() in ["i don't know", ""]:
        return {
            "faithfulness": 0.0,
            "answer_relevance": 0.0,
            "hallucination": True
        }

    statements = extract_statements(answer, model, tokenizer)[:10]

    


    if not statements:
        faithfulness = 0.0
    else:
        faithfulness = evaluate_faithfulness(statements, context, model, tokenizer)


    answer_relevance = calculate_answer_relevance_direct(question, answer)


    return {
        "faithfulness": faithfulness,
        "answer_relevance": answer_relevance,
        "hallucination": faithfulness < 1.0
    }


In [18]:
def build_faithfullness_prompt(statements, context):
    prompt = f"""
Context:
{context}

For each of the statements below, decide if it is fully supported by the context.
Respond with ONLY 'Yes' or 'No'.
Do not add any explanations.

Examples:
Statement: "Grasse Cathedral is the town's most notable landmark."
Answer: Yes

Statement: "Grasse is the capital of France."
Answer: No

Now evaluate the following statements:
"""
    for s in statements:
        prompt += f'Statement: "{s}"\nAnswer: '

    return prompt


In [52]:
def evaluate_generated_vs_reference(question, generated_answer, reference_answer, context, model, tokenizer):
    # 1. Answer Relevance berechnen (semantische Ähnlichkeit)
    answer_relevance = calculate_answer_relevance_direct(reference_answer, generated_answer)

    # 2. Faithfulness prüfen (bleibt wie bisher)
    statements = extract_statements(generated_answer, model, tokenizer)[:10]
    if not statements:
        faithfulness = 0.0
    else:
        faithfulness = evaluate_faithfulness(statements, context, model, tokenizer)

    # 3. Hallucination bestimmen
    # True, wenn Antwort komplett falsch ist oder sehr weit von Referenz entfernt
    if not generated_answer or generated_answer.lower() in ["i don't know", "unknown"]:
        hallucination = True
    elif answer_relevance < 0.5:  # Grenze, ab wann Antwort als zu anders gilt
        hallucination = True
    else:
        hallucination = False

    return {
        "faithfulness": faithfulness,
        "answer_relevance": answer_relevance,
        "hallucination": hallucination
    }


### RAG analysis

In [43]:
def analyse():
    results = []
    for i in range(2):  # oder len(rag_dataset_1200)
        question = rag_dataset_1200[i]["question"]
        context = rag_dataset_1200[i]["context"]
        reference_answer = rag_dataset_1200[i]["answer"]  # die gegebene korrekte Antwort

        # Modell generiert eigene Antwort
        generated_answer = answer_with_context(
            model=model_qwen,
            tokenizer=tokenizer_qwen,
            question=question,
            max_tokens=50,
            k=5
        )

        # Scores basierend auf Referenz-Antwort berechnen
        scores = evaluate_generated_vs_reference(
            question,
            generated_answer,
            reference_answer,
            context,
            model_qwen,
            tokenizer_qwen
        )

        results.append({
            "question": question,
            "reference_answer": reference_answer,
            "generated_answer": generated_answer,
            **scores
        })
    return results


In [53]:
# Kontext über deinen Retriever abrufen
context = retrieve_context(question, k=5)

# Abgerufenen Kontext anzeigen
print("Retrieved Context:")
print(context)
print("\n---\n")
question="What is the person seeking in the context?"
# Die gegebene, korrekte Antwort (Referenz)
reference_answer = "The person is seeking a woman who wants to watch him on cam, and a real lady in his life, not a little girl."

# Prompt wird intern in answer_with_context erstellt
generated_answer = answer_with_context(
    model=model_qwen,
    tokenizer=tokenizer_qwen,
    question=question,
    max_tokens=50,
    k=5
)

print("Generated Answer by LLM:")
print(generated_answer)
print("\n---\n")

# Scores basierend auf der Referenz-Antwort berechnen
scores = evaluate_generated_vs_reference(
    question=question,
    generated_answer=generated_answer,
    reference_answer=reference_answer,
    context=context,
    model=model_qwen,
    tokenizer=tokenizer_qwen
)

print("Evaluation Scores:")
print(scores)


Retrieved Context:
lso wildly wondered if I was perhaps part of an experiment; "The seemingly better dressed--and cleaner--of the two asks the customers for money (a specific amount, no less)...what will be the result? How will the customers react? What about the other man? Will he be acknowledged and rewarded? Or will he be 'punished' for being quiet and not asking? "No food for you 'cause you didn't ask--just as well, saved me a buck..."
Was the one who approached me even homeless? Or just testing society?
Inter
of their inner self. Often times, we are creatures quick to judge and we start making our own conclusions about them; but instead, remember that these people have their struggles too and show some kindness because you don’t know who might need it. Like the quote says “Don’t judge people, you never know what kind of battles they are fighting”.
Duplicate your love
We all have a little love to give someone and showing a little TLC can never hurt anybody. Instead of keeping all t

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Generated Answer by LLM:
The person seeking is seeking or requiring.

---



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Evaluation Scores:
{'faithfulness': 0.2, 'answer_relevance': np.float32(0.7127337), 'hallucination': False}
