In [2]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
from sklearn.metrics.pairwise import cosine_similarity



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
BASE_PATH = r"E:\Project2\data\en"

documents_path = os.path.join(BASE_PATH, "documents.csv")
queries_path = os.path.join(BASE_PATH, "queries.csv")
answers_path = os.path.join(BASE_PATH, "answers.csv")

print(documents_path)


E:\Project2\data\en\documents.csv


In [4]:
documents_df = pd.read_csv(documents_path)
queries_df = pd.read_csv(queries_path)
answers_df = pd.read_csv(answers_path)

print("Documents:", documents_df.shape)
print("Queries:", queries_df.shape)
print("Answers:", answers_df.shape)

documents_df.head()


Documents: (50, 2)
Queries: (500, 4)
Answers: (500, 4)


Unnamed: 0,doc_id,document_text
0,doc_1,Albert Einstein was a German-born theoretical ...
1,doc_2,Asia is the largest continent in the world by ...
2,doc_3,Bengaluru also known as Bangalore is the capit...
3,doc_4,Biology is the scientific study of life and li...
4,doc_5,Buddhism also known as Buddha-dharma and Dharm...


In [5]:
def clean_unicode(text):
    if pd.isna(text):
        return ""
    
    # Normalize to NFC
    text = unicodedata.normalize("NFC", str(text))
    
    # Remove zero-width characters
    text = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", text)
    
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    
    return text


In [6]:
documents_df["document_text"] = documents_df["document_text"].apply(clean_unicode)
queries_df["question"] = queries_df["question"].apply(clean_unicode)
answers_df["answer"] = answers_df["answer"].apply(clean_unicode)
answers_df["evidence_span"] = answers_df["evidence_span"].apply(clean_unicode)


In [7]:
# 1. Check ≤5 word answers
answers_df["word_count"] = answers_df["answer"].apply(lambda x: len(x.split()) if x != "NA" else 0)
print("Answers >5 words:")
print(answers_df[answers_df["word_count"] > 5])

# 2. Evidence span exists in document
merged = answers_df.merge(documents_df, on="doc_id")

def evidence_check(row):
    if row["answer"] == "NA":
        return True
    return row["evidence_span"] in row["document_text"]

merged["evidence_valid"] = merged.apply(evidence_check, axis=1)
print("Invalid evidence spans:")
print(merged[merged["evidence_valid"] == False])


Answers >5 words:
         query_id  doc_id                                             answer  \
180  doc_19_q0181  doc_19                      get the ball over a goal line   
215  doc_22_q0216  doc_22     Sankhya Yoga Nyaya Vaisheshika Mimamsa Vedanta   
245  doc_25_q0246  doc_25  Internet Corporation for Assigned Names and Nu...   
371  doc_38_q0372  doc_38             Matter, energy, force, space, and time   
456  doc_46_q0457  doc_46  What percentage of households owned a TV in 2013?   

                                         evidence_span  word_count  
180  the object is to get the ball over a goal line...           7  
215  The six stika schools of Hindu philosophy that...           6  
245  the Domain Name System are directed by a maint...           7  
371  hysics is the scientific study of matter its f...           6  
456  In 2013 79% of the world's households owned a ...           9  
Invalid evidence spans:
         query_id  doc_id                         answer  \
13 

In [8]:
# Locate the row first (sanity check)
answers_df[answers_df["query_id"] == "doc_46_q0457"]


Unnamed: 0,query_id,doc_id,answer,evidence_span,word_count
456,doc_46_q0457,doc_46,What percentage of households owned a TV in 2013?,In 2013 79% of the world's households owned a ...,9


In [9]:
answers_df.loc[
    answers_df["query_id"] == "doc_46_q0457",
    "answer"
] = "79%"


In [10]:
answers_df[answers_df["query_id"] == "doc_46_q0457"]


Unnamed: 0,query_id,doc_id,answer,evidence_span,word_count
456,doc_46_q0457,doc_46,79%,In 2013 79% of the world's households owned a ...,9


In [11]:
answers_df["word_count"] = answers_df["answer"].apply(
    lambda x: len(x.split()) if x != "NA" else 0
)

print("Answers >5 words:")
print(answers_df[answers_df["word_count"] > 5])


Answers >5 words:
         query_id  doc_id                                             answer  \
180  doc_19_q0181  doc_19                      get the ball over a goal line   
215  doc_22_q0216  doc_22     Sankhya Yoga Nyaya Vaisheshika Mimamsa Vedanta   
245  doc_25_q0246  doc_25  Internet Corporation for Assigned Names and Nu...   
371  doc_38_q0372  doc_38             Matter, energy, force, space, and time   

                                         evidence_span  word_count  
180  the object is to get the ball over a goal line...           7  
215  The six stika schools of Hindu philosophy that...           6  
245  the Domain Name System are directed by a maint...           7  
371  hysics is the scientific study of matter its f...           6  


In [12]:
answers_df.drop(columns=["word_count"], errors="ignore").to_csv(
    answers_path,
    index=False
)


In [18]:
from sentence_transformers import SentenceTransformer

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

embedder = SentenceTransformer(MODEL_NAME)

print("MiniLM embedding model loaded successfully.")


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 768.71it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


MiniLM embedding model loaded successfully.


In [21]:
def embed_texts(texts):
    return embedder.encode(
        texts,
        normalize_embeddings=True,
        convert_to_numpy=True
    )


In [25]:
CHUNK_SIZE = 150
OVERLAP = 30

def chunk_text(text):
    words = text.split()
    chunks = []
    start = 0
    
    while start < len(words):
        end = start + CHUNK_SIZE
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start += CHUNK_SIZE - OVERLAP
        
    return chunks


In [26]:
chunk_records = []

for _, row in documents_df.iterrows():
    doc_id = row["doc_id"]
    text = row["document_text"]
    
    chunks = chunk_text(text)
    
    for i, chunk in enumerate(chunks):
        chunk_records.append({
            "doc_id": doc_id,
            "chunk_id": f"{doc_id}_chunk_{i}",
            "chunk_text": chunk
        })

import pandas as pd
chunks_df = pd.DataFrame(chunk_records)

print("Total chunks:", chunks_df.shape)
chunks_df.head()


Total chunks: (449, 3)


Unnamed: 0,doc_id,chunk_id,chunk_text
0,doc_1,doc_1_chunk_0,Albert Einstein was a German-born theoretical ...
1,doc_1,doc_1_chunk_1,year later which he kept for the rest of his l...
2,doc_1,doc_1_chunk_2,he endorsed a letter to President Franklin D. ...
3,doc_1,doc_1_chunk_3,structure and evolution of the universe as a w...
4,doc_1,doc_1_chunk_4,physicist Satyendra Nath Bose he laid the grou...


In [27]:
def embed_texts(texts):
    return embedder.encode(
        texts,
        normalize_embeddings=True,
        convert_to_numpy=True
    )


In [28]:
chunk_embeddings = embed_texts(
    chunks_df["chunk_text"].tolist()
)

print("Chunk embeddings shape:", chunk_embeddings.shape)


Chunk embeddings shape: (449, 384)


In [29]:
import faiss

dimension = chunk_embeddings.shape[1]  # should be 384
index = faiss.IndexFlatIP(dimension)   # cosine similarity (since normalized)
index.add(chunk_embeddings)

print("FAISS index built successfully.")


FAISS index built successfully.


In [30]:
def retrieve_top_k(query, k=3):
    query_embedding = embed_texts([query])
    scores, indices = index.search(query_embedding, k)
    return chunks_df.iloc[indices[0]]["chunk_text"].tolist()


In [31]:
test_question = queries_df.iloc[0]["question"]
retrieved = retrieve_top_k(test_question)

for i, chunk in enumerate(retrieved):
    print(f"\n--- Chunk {i+1} ---\n")
    print(chunk[:400])



--- Chunk 1 ---

physicist Satyendra Nath Bose he laid the groundwork for BoseEinstein statistics. For much of the last phase of his academic life Einstein worked on two endeavors that ultimately proved unsuccessful. First he advocated against quantum theory's introduction of fundamental randomness into science's picture of the world objecting that "God does not play dice". Second he attempted to devise a unified 

--- Chunk 2 ---

Albert Einstein was a German-born theoretical physicist best known for developing the theory of relativity. Einstein also made important contributions to quantum theory. His massenergy equivalence formula E = mc2 which arises from special relativity has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics for "his services to theoretical physics and especiall

--- Chunk 3 ---

Wrttemberg in the German Empire on 14 March 1879. His parents secular Ashkenazi Jews were Hermann Einstein a salesman and engineer and Pauline K

In [32]:
import requests


In [36]:
import requests

MODEL_NAME = "llama3:8b"

PROMPT_TEMPLATE = """
You must answer using ONLY the exact phrase from the context.

Rules:
- Maximum 5 words.
- Do NOT rephrase.
- Do NOT explain.
- Do NOT add extra words.
- If the answer is not explicitly written in the context, output: NA

Context:
{retrieved_chunks}

Question:
{question}

Answer:
"""

def build_prompt(question, k=3):
    chunks = retrieve_top_k(question, k=k)
    context = "\n".join(chunks)
    return PROMPT_TEMPLATE.format(
        retrieved_chunks=context,
        question=question
    )

def generate_with_ollama(prompt):
    url = "http://localhost:11434/api/generate"
    
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0,
            "top_p": 1,
            "num_predict": 10
        }
    }
    
    response = requests.post(url, json=payload)
    result = response.json()
    
    return result["response"].strip()

# Test one question only
row = queries_df.iloc[0]
question = row["question"]

prompt = build_prompt(question)
prediction = generate_with_ollama(prompt)

print("Q:", question)
print("Prediction:", prediction)


Q: Where was Einstein born?
Prediction: Ulm in the Kingdom of Wrttemberg


In [37]:
import requests

# --------------------------------------------------
# 1️⃣ Correct Ollama Model Name
# --------------------------------------------------
MODEL_NAME = "llama3:8b"

# --------------------------------------------------
# 2️⃣ Stronger Extraction Prompt
# --------------------------------------------------
PROMPT_TEMPLATE = """
Extract the shortest exact answer from the context.

Strict Rules:
- Copy the answer exactly from the context.
- Use the minimum number of words possible.
- Maximum 5 words.
- Do NOT include surrounding phrases.
- Do NOT explain.
- Do NOT add punctuation.
- Output only the answer text.
- If the answer is not explicitly written in the context, output: NA

Context:
{retrieved_chunks}

Question:
{question}

Answer:
"""

# --------------------------------------------------
# 3️⃣ Build RAG Prompt
# --------------------------------------------------
def build_prompt(question, k=3):
    chunks = retrieve_top_k(question, k=k)
    context = "\n".join(chunks)
    return PROMPT_TEMPLATE.format(
        retrieved_chunks=context,
        question=question
    )

# --------------------------------------------------
# 4️⃣ Ollama Generation Function
# --------------------------------------------------
def generate_with_ollama(prompt):
    url = "http://localhost:11434/api/generate"
    
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0,
            "top_p": 1,
            "num_predict": 8
        }
    }
    
    response = requests.post(url, json=payload)
    result = response.json()
    
    return result["response"].strip()

# --------------------------------------------------
# 5️⃣ Test One Question
# --------------------------------------------------
row = queries_df.iloc[0]
question = row["question"]

prompt = build_prompt(question)
prediction = generate_with_ollama(prompt)

print("Q:", question)
print("Prediction:", prediction)


Q: Where was Einstein born?
Prediction: Ulm


In [38]:
import requests
import pandas as pd
from tqdm import tqdm

# --------------------------------------------------
# 1️⃣ Model Name
# --------------------------------------------------
MODEL_NAME = "llama3:8b"

# --------------------------------------------------
# 2️⃣ Strict Prompt
# --------------------------------------------------
PROMPT_TEMPLATE = """
Extract the shortest exact answer from the context.

Strict Rules:
- Copy the answer exactly from the context.
- Use the minimum number of words possible.
- Maximum 5 words.
- Do NOT include surrounding phrases.
- Do NOT explain.
- Do NOT add punctuation.
- Output only the answer text.
- If the answer is not explicitly written in the context, output: NA

Context:
{retrieved_chunks}

Question:
{question}

Answer:
"""

# --------------------------------------------------
# 3️⃣ Build Prompt
# --------------------------------------------------
def build_prompt(question, k=3):
    chunks = retrieve_top_k(question, k=k)
    context = "\n".join(chunks)
    return PROMPT_TEMPLATE.format(
        retrieved_chunks=context,
        question=question
    )

# --------------------------------------------------
# 4️⃣ Ollama Generation
# --------------------------------------------------
def generate_with_ollama(prompt):
    url = "http://localhost:11434/api/generate"
    
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0,
            "top_p": 1,
            "num_predict": 8
        }
    }
    
    response = requests.post(url, json=payload)
    result = response.json()
    
    return result["response"].strip()

# --------------------------------------------------
# 5️⃣ Run Over ALL Queries
# --------------------------------------------------
results = []

for _, row in tqdm(queries_df.iterrows(), total=len(queries_df)):
    
    query_id = row["query_id"]
    doc_id = row["doc_id"]
    question = row["question"]
    
    # Get gold answer
    gold = answers_df[
        answers_df["query_id"] == query_id
    ]["answer"].values[0]
    
    # Build prompt
    prompt = build_prompt(question)
    
    # Generate answer
    prediction = generate_with_ollama(prompt)
    
    results.append({
        "query_id": query_id,
        "doc_id": doc_id,
        "question": question,
        "gold_answer": gold,
        "model_answer": prediction
    })

results_df = pd.DataFrame(results)

print("Inference complete.")


100%|██████████| 500/500 [22:12<00:00,  2.66s/it]

Inference complete.





In [39]:
output_path = r"E:\Project2\data\en\llama3_results.csv"
results_df.to_csv(output_path, index=False, encoding="utf-8")

print("Results saved to:", output_path)


Results saved to: E:\Project2\data\en\llama3_results.csv


In [40]:
# Replace empty or NaN gold answers with "NA"
results_df["gold_answer"] = results_df["gold_answer"].fillna("NA")
results_df["gold_answer"] = results_df["gold_answer"].replace("", "NA")


In [41]:
results_df["gold_answer"].isna().sum()


np.int64(0)

In [42]:
import requests

# ----------------------------------------
# 1️⃣ Model Name
# ----------------------------------------
MODEL_NAME = "llama3:8b"

# ----------------------------------------
# 2️⃣ Strict NO-RAG Prompt
# ----------------------------------------
NO_RAG_PROMPT = """
Answer the question in at most 5 words.

Strict Rules:
- Maximum 5 words.
- Do NOT explain.
- Do NOT add extra text.
- Do NOT add punctuation.
- Output only the answer.
- If unsure, output: NA

Question:
{question}

Answer:
"""

# ----------------------------------------
# 3️⃣ Ollama Generation Function
# ----------------------------------------
def generate_with_ollama(prompt):
    url = "http://localhost:11434/api/generate"
    
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0,
            "top_p": 1,
            "num_predict": 8
        }
    }
    
    response = requests.post(url, json=payload)
    result = response.json()
    
    return result["response"].strip()

# ----------------------------------------
# 4️⃣ Build NO-RAG Prompt
# ----------------------------------------
def build_no_rag_prompt(question):
    return NO_RAG_PROMPT.format(question=question)

# ----------------------------------------
# 5️⃣ Generate NO-RAG Answers
# ----------------------------------------
no_rag_predictions = []

for _, row in results_df.iterrows():
    question = row["question"]
    
    prompt = build_no_rag_prompt(question)
    prediction = generate_with_ollama(prompt)
    
    no_rag_predictions.append(prediction)

# Save in NEW column
results_df["llama_no_rag"] = no_rag_predictions

print("LLaMA NO-RAG generation complete.")


LLaMA NO-RAG generation complete.


In [43]:
output_path = r"E:\Project2\data\en\llama3_results_full.csv"

results_df.to_csv(output_path, index=False, encoding="utf-8")

print("Updated CSV saved successfully.")


Updated CSV saved successfully.


In [44]:
def normalize_text(text):
    if pd.isna(text):
        return "NA"
    return str(text).strip().lower()


In [45]:
results_df["gold_norm"] = results_df["gold_answer"].apply(normalize_text)
results_df["rag_norm"] = results_df["model_answer"].apply(normalize_text)
results_df["no_rag_norm"] = results_df["llama_no_rag"].apply(normalize_text)


In [56]:
results_df["rag_exact"] = (
    results_df["gold_norm"] == results_df["rag_norm"]
).astype(int)

results_df["no_rag_exact"] = (
    results_df["gold_norm"] == results_df["no_rag_norm"]
).astype(int)


In [57]:
def substring_match(gold, pred):
    if gold == "na" or pred == "na":
        return 0
    return int(gold in pred or pred in gold)

results_df["rag_substring"] = results_df.apply(
    lambda x: substring_match(x["gold_norm"], x["rag_norm"]),
    axis=1
)

results_df["no_rag_substring"] = results_df.apply(
    lambda x: substring_match(x["gold_norm"], x["no_rag_norm"]),
    axis=1
)


In [59]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def semantic_score(gold, pred):
    if gold == "na" or pred == "na":
        return 0.0
    g_vec = embed_texts([gold])
    p_vec = embed_texts([pred])
    return cosine_similarity(g_vec, p_vec)[0][0]


In [60]:
results_df["rag_semantic"] = results_df.apply(
    lambda x: semantic_score(x["gold_norm"], x["rag_norm"]),
    axis=1
)

results_df["no_rag_semantic"] = results_df.apply(
    lambda x: semantic_score(x["gold_norm"], x["no_rag_norm"]),
    axis=1
)


In [75]:
results_df["rag_semantic_match"] = (
    results_df["rag_semantic"] >= 0.50
).astype(int)

results_df["no_rag_semantic_match"] = (
    results_df["no_rag_semantic"] >= 0.50
).astype(int)


In [76]:
print("===== LLaMA PERFORMANCE =====")

print("\nExact Match:")
print("RAG:", results_df["rag_exact"].mean())
print("No-RAG:", results_df["no_rag_exact"].mean())

print("\nSubstring Match:")
print("RAG:", results_df["rag_substring"].mean())
print("No-RAG:", results_df["no_rag_substring"].mean())

print("\nSemantic Match:")
print("RAG:", results_df["rag_semantic_match"].mean())
print("No-RAG:", results_df["no_rag_semantic_match"].mean())


===== LLaMA PERFORMANCE =====

Exact Match:
RAG: 0.624
No-RAG: 0.228

Substring Match:
RAG: 0.536
No-RAG: 0.268

Semantic Match:
RAG: 0.596
No-RAG: 0.458


In [71]:
print("\nSemantic Match (Threshold = 0.75):")
print("RAG:", results_df["rag_semantic_match"].mean())
print("No-RAG:", results_df["no_rag_semantic_match"].mean())



Semantic Match (Threshold = 0.75):
RAG: 0.54
No-RAG: 0.348


In [72]:
print(results_df[["gold_norm", "rag_norm", "rag_semantic"]].head(15))


                   gold_norm                              rag_norm  \
0              german empire                                   ulm   
1                       1921                                  1921   
2                    e = mc2                               e = mc2   
3                       1895                                  1895   
4          swiss citizenship                                 swiss   
5                       1940                                  1940   
6   kaiser wilhelm institute  kaiser wilhelm institute for physics   
7                         na                                    na   
8                         na                                    na   
9                         na                                    na   
10                      asia                                  asia   
11                       30%                                   30%   
12               4.7 billion                           4.7 billion   
13              indi

In [73]:
print(results_df["rag_semantic"].describe())


count    500.000000
mean       0.570618
std        0.443530
min        0.000000
25%        0.000000
50%        0.788177
75%        1.000000
max        1.000000
Name: rag_semantic, dtype: float64


In [74]:
print(results_df["rag_semantic"].describe())


count    500.000000
mean       0.570618
std        0.443530
min        0.000000
25%        0.000000
50%        0.788177
75%        1.000000
max        1.000000
Name: rag_semantic, dtype: float64


In [79]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# -----------------------------------------
# 1️⃣ Normalize Text
# -----------------------------------------
def normalize_text(text):
    if pd.isna(text):
        return "na"
    return str(text).strip().lower()

results_df["gold_norm"] = results_df["gold_answer"].apply(normalize_text)
results_df["rag_norm"] = results_df["model_answer"].apply(normalize_text)
results_df["no_rag_norm"] = results_df["llama_no_rag"].apply(normalize_text)

# -----------------------------------------
# 2️⃣ Semantic Similarity Function
# -----------------------------------------
def semantic_score(gold, pred):
    if gold == "na" or pred == "na":
        return 0.0
    
    g_vec = embed_texts([gold])
    p_vec = embed_texts([pred])
    
    return cosine_similarity(g_vec, p_vec)[0][0]

# -----------------------------------------
# 3️⃣ Compute Semantic Scores
# -----------------------------------------
results_df["rag_semantic"] = results_df.apply(
    lambda x: semantic_score(x["gold_norm"], x["rag_norm"]),
    axis=1
)

results_df["no_rag_semantic"] = results_df.apply(
    lambda x: semantic_score(x["gold_norm"], x["no_rag_norm"]),
    axis=1
)

# -----------------------------------------
# 4️⃣ Apply Threshold = 0.50
# -----------------------------------------
THRESHOLD = 0.75

results_df["rag_semantic_match"] = (
    results_df["rag_semantic"] >= THRESHOLD
).astype(int)

results_df["no_rag_semantic_match"] = (
    results_df["no_rag_semantic"] >= THRESHOLD
).astype(int)

# -----------------------------------------
# 5️⃣ Print Final Results
# -----------------------------------------
print("===== SEMANTIC MATCH (Threshold = 0.75) =====")

print("RAG Semantic Match:", results_df["rag_semantic_match"].mean())
print("No-RAG Semantic Match:", results_df["no_rag_semantic_match"].mean())

print("\nSemantic Score Distribution (RAG):")
print(results_df["rag_semantic"].describe())


===== SEMANTIC MATCH (Threshold = 0.75) =====
RAG Semantic Match: 0.512
No-RAG Semantic Match: 0.242

Semantic Score Distribution (RAG):
count    500.000000
mean       0.570618
std        0.443530
min        0.000000
25%        0.000000
50%        0.788177
75%        1.000000
max        1.000000
Name: rag_semantic, dtype: float64


In [80]:
import pandas as pd

summary_table = pd.DataFrame({
    "Metric": [
        "Exact Match",
        "Substring Match",
        "Semantic Match (0.75)"
    ],
    "RAG": [
        results_df["rag_exact"].mean(),
        results_df["rag_substring"].mean(),
        results_df["rag_semantic_match"].mean()
    ],
    "No-RAG": [
        results_df["no_rag_exact"].mean(),
        results_df["no_rag_substring"].mean(),
        results_df["no_rag_semantic_match"].mean()
    ]
})

summary_table


Unnamed: 0,Metric,RAG,No-RAG
0,Exact Match,0.624,0.228
1,Substring Match,0.536,0.268
2,Semantic Match (0.75),0.512,0.242


In [81]:
summary_table_rounded = summary_table.copy()
summary_table_rounded["RAG"] = summary_table_rounded["RAG"].round(3)
summary_table_rounded["No-RAG"] = summary_table_rounded["No-RAG"].round(3)

summary_table_rounded


Unnamed: 0,Metric,RAG,No-RAG
0,Exact Match,0.624,0.228
1,Substring Match,0.536,0.268
2,Semantic Match (0.75),0.512,0.242


In [82]:
summary_table = pd.DataFrame({
    "Metric": [
        "Exact Match",
        "Substring Match",
        "Semantic Match (0.75)",
        "Semantic Score Mean",
        "Semantic Score Median",
        "Semantic Score Std",
        "Semantic Score Min",
        "Semantic Score Max"
    ],
    "RAG": [
        results_df["rag_exact"].mean(),
        results_df["rag_substring"].mean(),
        results_df["rag_semantic_match"].mean(),
        results_df["rag_semantic"].mean(),
        results_df["rag_semantic"].median(),
        results_df["rag_semantic"].std(),
        results_df["rag_semantic"].min(),
        results_df["rag_semantic"].max()
    ],
    "No-RAG": [
        results_df["no_rag_exact"].mean(),
        results_df["no_rag_substring"].mean(),
        results_df["no_rag_semantic_match"].mean(),
        results_df["no_rag_semantic"].mean(),
        results_df["no_rag_semantic"].median(),
        results_df["no_rag_semantic"].std(),
        results_df["no_rag_semantic"].min(),
        results_df["no_rag_semantic"].max()
    ]
})

# Round for cleaner display
summary_table_rounded = summary_table.copy()
summary_table_rounded["RAG"] = summary_table_rounded["RAG"].round(3)
summary_table_rounded["No-RAG"] = summary_table_rounded["No-RAG"].round(3)

summary_table_rounded


Unnamed: 0,Metric,RAG,No-RAG
0,Exact Match,0.624,0.228
1,Substring Match,0.536,0.268
2,Semantic Match (0.75),0.512,0.242
3,Semantic Score Mean,0.571,0.414
4,Semantic Score Median,0.788,0.444
5,Semantic Score Std,0.444,0.365
6,Semantic Score Min,0.0,0.0
7,Semantic Score Max,1.0,1.0


In [83]:
import pandas as pd

quartile_table = pd.DataFrame({
    "Percentile": ["25%", "50% (Median)", "75%"],
    "RAG": [
        results_df["rag_semantic"].quantile(0.25),
        results_df["rag_semantic"].quantile(0.50),
        results_df["rag_semantic"].quantile(0.75)
    ],
    "No-RAG": [
        results_df["no_rag_semantic"].quantile(0.25),
        results_df["no_rag_semantic"].quantile(0.50),
        results_df["no_rag_semantic"].quantile(0.75)
    ]
})

quartile_table = quartile_table.round(3)

quartile_table


Unnamed: 0,Percentile,RAG,No-RAG
0,25%,0.0,0.0
1,50% (Median),0.788,0.444
2,75%,1.0,0.739


In [84]:
import pandas as pd

THRESHOLD = 0.75

master_table = pd.DataFrame({
    "Metric": [
        "Exact Match",
        "Substring Match",
        f"Semantic Match (≥ {THRESHOLD})",
        "Semantic Mean",
        "Semantic Std",
        "Semantic 25%",
        "Semantic 50% (Median)",
        "Semantic 75%",
        "Semantic Min",
        "Semantic Max"
    ],
    "RAG": [
        results_df["rag_exact"].mean(),
        results_df["rag_substring"].mean(),
        results_df["rag_semantic_match"].mean(),
        results_df["rag_semantic"].mean(),
        results_df["rag_semantic"].std(),
        results_df["rag_semantic"].quantile(0.25),
        results_df["rag_semantic"].quantile(0.50),
        results_df["rag_semantic"].quantile(0.75),
        results_df["rag_semantic"].min(),
        results_df["rag_semantic"].max()
    ],
    "No-RAG": [
        results_df["no_rag_exact"].mean(),
        results_df["no_rag_substring"].mean(),
        results_df["no_rag_semantic_match"].mean(),
        results_df["no_rag_semantic"].mean(),
        results_df["no_rag_semantic"].std(),
        results_df["no_rag_semantic"].quantile(0.25),
        results_df["no_rag_semantic"].quantile(0.50),
        results_df["no_rag_semantic"].quantile(0.75),
        results_df["no_rag_semantic"].min(),
        results_df["no_rag_semantic"].max()
    ]
})

# Round for clean display
master_table["RAG"] = master_table["RAG"].round(3)
master_table["No-RAG"] = master_table["No-RAG"].round(3)

master_table


Unnamed: 0,Metric,RAG,No-RAG
0,Exact Match,0.624,0.228
1,Substring Match,0.536,0.268
2,Semantic Match (≥ 0.75),0.512,0.242
3,Semantic Mean,0.571,0.414
4,Semantic Std,0.444,0.365
5,Semantic 25%,0.0,0.0
6,Semantic 50% (Median),0.788,0.444
7,Semantic 75%,1.0,0.739
8,Semantic Min,0.0,0.0
9,Semantic Max,1.0,1.0


In [85]:
import requests
from tqdm import tqdm

MODEL_NAME = "qwen2.5:7b"

# ----------------------------
# RAG Prompt
# ----------------------------
RAG_PROMPT = """
Extract the shortest exact answer from the context.

Strict Rules:
- Copy the answer exactly from the context.
- Use the minimum number of words possible.
- Maximum 5 words.
- Do NOT include surrounding phrases.
- Do NOT explain.
- Do NOT add punctuation.
- Output only the answer text.
- If the answer is not explicitly written in the context, output: NA

Context:
{retrieved_chunks}

Question:
{question}

Answer:
"""

# ----------------------------
# NO-RAG Prompt
# ----------------------------
NO_RAG_PROMPT = """
Answer the question in at most 5 words.

Strict Rules:
- Maximum 5 words.
- Do NOT explain.
- Do NOT add extra text.
- Output only the answer.
- If unsure, output: NA

Question:
{question}

Answer:
"""

# ----------------------------
# Ollama Call
# ----------------------------
def generate_with_ollama(prompt):
    url = "http://localhost:11434/api/generate"
    
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0,
            "top_p": 1,
            "num_predict": 8
        }
    }
    
    response = requests.post(url, json=payload)
    result = response.json()
    
    return result["response"].strip()

# ----------------------------
# Run Inference
# ----------------------------
qwen_rag = []
qwen_no_rag = []

for _, row in tqdm(results_df.iterrows(), total=len(results_df)):
    
    question = row["question"]
    
    # RAG
    chunks = retrieve_top_k(question, k=3)
    context = "\n".join(chunks)
    rag_prompt = RAG_PROMPT.format(
        retrieved_chunks=context,
        question=question
    )
    rag_answer = generate_with_ollama(rag_prompt)
    qwen_rag.append(rag_answer)
    
    # NO-RAG
    no_rag_prompt = NO_RAG_PROMPT.format(question=question)
    no_rag_answer = generate_with_ollama(no_rag_prompt)
    qwen_no_rag.append(no_rag_answer)

results_df["qwen_rag"] = qwen_rag
results_df["qwen_no_rag"] = qwen_no_rag

print("Qwen inference complete.")


100%|██████████| 500/500 [43:06<00:00,  5.17s/it]

Qwen inference complete.





In [86]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# ==============================
# LOAD CSV
# ==============================

df = pd.read_csv(r"E:\Project2\data\en\llama3_results_full.csv", encoding="utf-8")

# ==============================
# NORMALIZE TEXT
# ==============================

def normalize_text(text):
    if pd.isna(text):
        return "na"
    return str(text).strip().lower()

df["gold_norm"] = df["gold_answer"].apply(normalize_text)
df["rag_norm"] = df["model_answer"].apply(normalize_text)
df["no_rag_norm"] = df["llama_no_rag"].apply(normalize_text)

# ==============================
# SPLIT DATA
# ==============================

answerable_df = df[df["gold_norm"] != "na"].copy()
non_answerable_df = df[df["gold_norm"] == "na"].copy()

print("Total Questions:", len(df))
print("Answerable Questions:", len(answerable_df))
print("Non-Answerable Questions:", len(non_answerable_df))

# ==============================
# METRIC FUNCTIONS
# ==============================

def exact_match(pred, gold):
    return int(pred == gold)

def substring_match(pred, gold):
    return int(gold in pred or pred in gold)

def semantic_score(pred, gold):
    if gold == "na" or pred == "na":
        return 0.0
    
    g_vec = embed_texts([gold])
    p_vec = embed_texts([pred])
    return cosine_similarity(g_vec, p_vec)[0][0]

# ==============================
# EVALUATE ANSWERABLE
# ==============================

def evaluate_answerable(data, pred_column, threshold=0.75):
    exact_scores = []
    substring_scores = []
    semantic_scores_list = []

    for _, row in data.iterrows():
        pred = row[pred_column]
        gold = row["gold_norm"]

        exact_scores.append(exact_match(pred, gold))
        substring_scores.append(substring_match(pred, gold))
        semantic_scores_list.append(semantic_score(pred, gold))

    semantic_binary = [int(s >= threshold) for s in semantic_scores_list]

    return {
        "Exact Match": np.mean(exact_scores),
        "Substring Match": np.mean(substring_scores),
        "Semantic Match (0.75)": np.mean(semantic_binary),
        "Semantic Mean Score": np.mean(semantic_scores_list)
    }

print("\n===== ANSWERABLE QUESTIONS =====")

rag_answerable = evaluate_answerable(answerable_df, "rag_norm")
no_rag_answerable = evaluate_answerable(answerable_df, "no_rag_norm")

results_answerable = pd.DataFrame({
    "Metric": rag_answerable.keys(),
    "RAG": rag_answerable.values(),
    "No-RAG": no_rag_answerable.values()
}).round(3)

print(results_answerable)

# ==============================
# EVALUATE NON-ANSWERABLE
# ==============================

def evaluate_non_answerable(data, pred_column):
    correct_na = []
    for _, row in data.iterrows():
        pred = row[pred_column]
        correct_na.append(int(pred == "na"))
    return np.mean(correct_na)

print("\n===== NON-ANSWERABLE QUESTIONS =====")

rag_na_score = evaluate_non_answerable(non_answerable_df, "rag_norm")
no_rag_na_score = evaluate_non_answerable(non_answerable_df, "no_rag_norm")

results_na = pd.DataFrame({
    "Metric": ["Correct NA Prediction"],
    "RAG": [rag_na_score],
    "No-RAG": [no_rag_na_score]
}).round(3)

print(results_na)


Total Questions: 500
Answerable Questions: 350
Non-Answerable Questions: 150

===== ANSWERABLE QUESTIONS =====
                  Metric    RAG  No-RAG
0            Exact Match  0.540   0.089
1        Substring Match  0.771   0.386
2  Semantic Match (0.75)  0.731   0.346
3    Semantic Mean Score  0.815   0.591

===== NON-ANSWERABLE QUESTIONS =====
                  Metric   RAG  No-RAG
0  Correct NA Prediction  0.82   0.553


In [88]:
# Select only required columns
qwen_export_df = results_df[[
    "query_id",
    "doc_id",
    "question",
    "gold_answer",
    "qwen_rag",
    "qwen_no_rag"
]].copy()

# Save to CSV
output_path = r"E:\Project2\data\en\qwen_results_full.csv"

qwen_export_df.to_csv(output_path, index=False, encoding="utf-8")

print("Clean Qwen CSV saved successfully.")
print("Saved at:", output_path)


Clean Qwen CSV saved successfully.
Saved at: E:\Project2\data\en\qwen_results_full.csv


In [89]:
import pandas as pd
import numpy as np
import unicodedata
from sklearn.metrics.pairwise import cosine_similarity

# ==============================
# LOAD CSV (UTF-8)
# ==============================

df = pd.read_csv(
    r"E:\Project2\data\en\qwen_results_full.csv",
    encoding="utf-8"
)

# ==============================
# UNICODE NORMALIZATION
# ==============================

def normalize_text(text):
    if pd.isna(text):
        return "na"
    
    text = str(text)
    
    # Normalize Unicode (important for Tamil/Malayalam later)
    text = unicodedata.normalize("NFC", text)
    
    # Strip whitespace and lowercase
    text = text.strip().lower()
    
    if text == "":
        return "na"
    
    return text

df["gold_norm"] = df["gold_answer"].apply(normalize_text)
df["rag_norm"] = df["qwen_rag"].apply(normalize_text)
df["no_rag_norm"] = df["qwen_no_rag"].apply(normalize_text)

# ==============================
# SPLIT DATA
# ==============================

answerable_df = df[df["gold_norm"] != "na"].copy()
non_answerable_df = df[df["gold_norm"] == "na"].copy()

print("Total Questions:", len(df))
print("Answerable Questions:", len(answerable_df))
print("Non-Answerable Questions:", len(non_answerable_df))

# ==============================
# METRIC FUNCTIONS
# ==============================

def exact_match(pred, gold):
    return int(pred == gold)

def substring_match(pred, gold):
    return int(gold in pred or pred in gold)

def semantic_score(pred, gold):
    if gold == "na" or pred == "na":
        return 0.0
    
    g_vec = embed_texts([gold])
    p_vec = embed_texts([pred])
    return cosine_similarity(g_vec, p_vec)[0][0]

# ==============================
# EVALUATE ANSWERABLE
# ==============================

def evaluate_answerable(data, pred_column, threshold=0.75):
    exact_scores = []
    substring_scores = []
    semantic_scores_list = []

    for _, row in data.iterrows():
        pred = row[pred_column]
        gold = row["gold_norm"]

        exact_scores.append(exact_match(pred, gold))
        substring_scores.append(substring_match(pred, gold))
        semantic_scores_list.append(semantic_score(pred, gold))

    semantic_binary = [int(s >= threshold) for s in semantic_scores_list]

    return {
        "Exact Match": np.mean(exact_scores),
        "Substring Match": np.mean(substring_scores),
        "Semantic Match (0.75)": np.mean(semantic_binary),
        "Semantic Mean Score": np.mean(semantic_scores_list)
    }

print("\n===== ANSWERABLE QUESTIONS (QWEN) =====")

rag_answerable = evaluate_answerable(answerable_df, "rag_norm")
no_rag_answerable = evaluate_answerable(answerable_df, "no_rag_norm")

results_answerable = pd.DataFrame({
    "Metric": rag_answerable.keys(),
    "RAG": rag_answerable.values(),
    "No-RAG": no_rag_answerable.values()
}).round(3)

print(results_answerable)

# ==============================
# EVALUATE NON-ANSWERABLE
# ==============================

def evaluate_non_answerable(data, pred_column):
    correct_na = []
    for _, row in data.iterrows():
        pred = row[pred_column]
        correct_na.append(int(pred == "na"))
    return np.mean(correct_na)

print("\n===== NON-ANSWERABLE QUESTIONS (QWEN) =====")

rag_na_score = evaluate_non_answerable(non_answerable_df, "rag_norm")
no_rag_na_score = evaluate_non_answerable(non_answerable_df, "no_rag_norm")

results_na = pd.DataFrame({
    "Metric": ["Correct NA Prediction"],
    "RAG": [rag_na_score],
    "No-RAG": [no_rag_na_score]
}).round(3)

print(results_na)


Total Questions: 500
Answerable Questions: 350
Non-Answerable Questions: 150

===== ANSWERABLE QUESTIONS (QWEN) =====
                  Metric    RAG  No-RAG
0            Exact Match  0.609   0.211
1        Substring Match  0.774   0.331
2  Semantic Match (0.75)  0.786   0.363
3    Semantic Mean Score  0.850   0.489

===== NON-ANSWERABLE QUESTIONS (QWEN) =====
                  Metric    RAG  No-RAG
0  Correct NA Prediction  0.873    0.94


In [90]:
import requests
from tqdm import tqdm

MODEL_NAME = "mistral:7b"

# ----------------------------
# RAG Prompt
# ----------------------------
RAG_PROMPT = """
Extract the shortest exact answer from the context.

Strict Rules:
- Copy the answer exactly from the context.
- Use the minimum number of words possible.
- Maximum 5 words.
- Do NOT include surrounding phrases.
- Do NOT explain.
- Do NOT add punctuation.
- Output only the answer text.
- If the answer is not explicitly written in the context, output: NA

Context:
{retrieved_chunks}

Question:
{question}

Answer:
"""

# ----------------------------
# NO-RAG Prompt
# ----------------------------
NO_RAG_PROMPT = """
Answer the question in at most 5 words.

Strict Rules:
- Maximum 5 words.
- Do NOT explain.
- Do NOT add extra text.
- Output only the answer.
- If unsure, output: NA

Question:
{question}

Answer:
"""

def generate_with_ollama(prompt):
    url = "http://localhost:11434/api/generate"
    
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0,
            "top_p": 1,
            "num_predict": 8
        }
    }
    
    response = requests.post(url, json=payload)
    result = response.json()
    
    return result["response"].strip()

mistral_rag = []
mistral_no_rag = []

for _, row in tqdm(results_df.iterrows(), total=len(results_df)):
    
    question = row["question"]
    
    # RAG
    chunks = retrieve_top_k(question, k=3)
    context = "\n".join(chunks)
    rag_prompt = RAG_PROMPT.format(
        retrieved_chunks=context,
        question=question
    )
    rag_answer = generate_with_ollama(rag_prompt)
    mistral_rag.append(rag_answer)
    
    # NO-RAG
    no_rag_prompt = NO_RAG_PROMPT.format(question=question)
    no_rag_answer = generate_with_ollama(no_rag_prompt)
    mistral_no_rag.append(no_rag_answer)

results_df["mistral_rag"] = mistral_rag
results_df["mistral_no_rag"] = mistral_no_rag

print("Mistral inference complete.")


100%|██████████| 500/500 [42:58<00:00,  5.16s/it]

Mistral inference complete.





In [91]:
mistral_export_df = results_df[[
    "query_id",
    "doc_id",
    "question",
    "gold_answer",
    "mistral_rag",
    "mistral_no_rag"
]].copy()

output_path = r"E:\Project2\data\en\mistral_results_full.csv"

mistral_export_df.to_csv(output_path, index=False, encoding="utf-8")

print("Clean Mistral CSV saved successfully.")


Clean Mistral CSV saved successfully.


In [92]:
import pandas as pd
import numpy as np
import unicodedata
from sklearn.metrics.pairwise import cosine_similarity

# ==============================
# LOAD CSV (UTF-8)
# ==============================

df = pd.read_csv(
    r"E:\Project2\data\en\mistral_results_full.csv",
    encoding="utf-8"
)

# ==============================
# UNICODE NORMALIZATION
# ==============================

def normalize_text(text):
    if pd.isna(text):
        return "na"
    
    text = str(text)
    text = unicodedata.normalize("NFC", text)
    text = text.strip().lower()
    
    if text == "":
        return "na"
    
    return text

df["gold_norm"] = df["gold_answer"].apply(normalize_text)
df["rag_norm"] = df["mistral_rag"].apply(normalize_text)
df["no_rag_norm"] = df["mistral_no_rag"].apply(normalize_text)

# ==============================
# SPLIT DATA
# ==============================

answerable_df = df[df["gold_norm"] != "na"].copy()
non_answerable_df = df[df["gold_norm"] == "na"].copy()

print("Total Questions:", len(df))
print("Answerable Questions:", len(answerable_df))
print("Non-Answerable Questions:", len(non_answerable_df))

# ==============================
# METRIC FUNCTIONS
# ==============================

def exact_match(pred, gold):
    return int(pred == gold)

def substring_match(pred, gold):
    return int(gold in pred or pred in gold)

def semantic_score(pred, gold):
    if gold == "na" or pred == "na":
        return 0.0
    
    g_vec = embed_texts([gold])
    p_vec = embed_texts([pred])
    return cosine_similarity(g_vec, p_vec)[0][0]

# ==============================
# EVALUATE ANSWERABLE
# ==============================

def evaluate_answerable(data, pred_column, threshold=0.75):
    exact_scores = []
    substring_scores = []
    semantic_scores_list = []

    for _, row in data.iterrows():
        pred = row[pred_column]
        gold = row["gold_norm"]

        exact_scores.append(exact_match(pred, gold))
        substring_scores.append(substring_match(pred, gold))
        semantic_scores_list.append(semantic_score(pred, gold))

    semantic_binary = [int(s >= threshold) for s in semantic_scores_list]

    return {
        "Exact Match": np.mean(exact_scores),
        "Substring Match": np.mean(substring_scores),
        "Semantic Match (0.75)": np.mean(semantic_binary),
        "Semantic Mean Score": np.mean(semantic_scores_list)
    }

print("\n===== ANSWERABLE QUESTIONS (MISTRAL) =====")

rag_answerable = evaluate_answerable(answerable_df, "rag_norm")
no_rag_answerable = evaluate_answerable(answerable_df, "no_rag_norm")

results_answerable = pd.DataFrame({
    "Metric": rag_answerable.keys(),
    "RAG": rag_answerable.values(),
    "No-RAG": no_rag_answerable.values()
}).round(3)

print(results_answerable)

# ==============================
# EVALUATE NON-ANSWERABLE
# ==============================

def evaluate_non_answerable(data, pred_column):
    correct_na = []
    for _, row in data.iterrows():
        pred = row[pred_column]
        correct_na.append(int(pred == "na"))
    return np.mean(correct_na)

print("\n===== NON-ANSWERABLE QUESTIONS (MISTRAL) =====")

rag_na_score = evaluate_non_answerable(non_answerable_df, "rag_norm")
no_rag_na_score = evaluate_non_answerable(non_answerable_df, "no_rag_norm")

results_na = pd.DataFrame({
    "Metric": ["Correct NA Prediction"],
    "RAG": [rag_na_score],
    "No-RAG": [no_rag_na_score]
}).round(3)

print(results_na)


Total Questions: 500
Answerable Questions: 350
Non-Answerable Questions: 150

===== ANSWERABLE QUESTIONS (MISTRAL) =====
                  Metric    RAG  No-RAG
0            Exact Match  0.506   0.151
1        Substring Match  0.700   0.397
2  Semantic Match (0.75)  0.720   0.400
3    Semantic Mean Score  0.823   0.621

===== NON-ANSWERABLE QUESTIONS (MISTRAL) =====
                  Metric   RAG  No-RAG
0  Correct NA Prediction  0.08     0.0


In [94]:
def normalize_text(text):
    if pd.isna(text):
        return "na"
    return str(text).strip().lower()

results_df["gold_norm"] = results_df["gold_answer"].apply(normalize_text)

results_df["rag_norm"] = results_df["model_answer"].apply(normalize_text)
results_df["no_rag_norm"] = results_df["llama_no_rag"].apply(normalize_text)

results_df["qwen_rag_norm"] = results_df["qwen_rag"].apply(normalize_text)
results_df["qwen_no_rag_norm"] = results_df["qwen_no_rag"].apply(normalize_text)

results_df["mistral_rag_norm"] = results_df["mistral_rag"].apply(normalize_text)
results_df["mistral_no_rag_norm"] = results_df["mistral_no_rag"].apply(normalize_text)


In [95]:
from sklearn.metrics.pairwise import cosine_similarity

def exact_match(pred, gold):
    return int(pred == gold)

def substring_match(pred, gold):
    return int(gold in pred or pred in gold)

def semantic_score(pred, gold):
    if gold == "na" or pred == "na":
        return 0.0
    g_vec = embed_texts([gold])
    p_vec = embed_texts([pred])
    return cosine_similarity(g_vec, p_vec)[0][0]

THRESHOLD = 0.75


In [96]:
def compute_all_metrics(prefix):
    
    # Exact
    results_df[f"{prefix}_exact"] = results_df.apply(
        lambda x: exact_match(x[f"{prefix}_norm"], x["gold_norm"]),
        axis=1
    )
    
    # Substring
    results_df[f"{prefix}_substring"] = results_df.apply(
        lambda x: substring_match(x[f"{prefix}_norm"], x["gold_norm"]),
        axis=1
    )
    
    # Semantic score
    results_df[f"{prefix}_semantic"] = results_df.apply(
        lambda x: semantic_score(x[f"{prefix}_norm"], x["gold_norm"]),
        axis=1
    )
    
    # Semantic binary
    results_df[f"{prefix}_semantic_match"] = (
        results_df[f"{prefix}_semantic"] >= THRESHOLD
    ).astype(int)


In [97]:
# LLaMA
compute_all_metrics("rag")
compute_all_metrics("no_rag")

# Qwen
compute_all_metrics("qwen_rag")
compute_all_metrics("qwen_no_rag")

# Mistral
compute_all_metrics("mistral_rag")
compute_all_metrics("mistral_no_rag")

print("All evaluation columns created successfully.")


All evaluation columns created successfully.


In [98]:
import pandas as pd
import numpy as np

THRESHOLD = 0.75

# Split once
answerable_df = results_df[results_df["gold_norm"] != "na"]
non_answerable_df = results_df[results_df["gold_norm"] == "na"]

# Helper function
def compute_metrics(data, rag_exact, rag_sub, rag_sem_match, rag_sem,
                    no_rag_exact, no_rag_sub, no_rag_sem_match, no_rag_sem):

    return {
        "Exact Match (RAG)": data[rag_exact].mean(),
        "Exact Match (No-RAG)": data[no_rag_exact].mean(),

        "Substring Match (RAG)": data[rag_sub].mean(),
        "Substring Match (No-RAG)": data[no_rag_sub].mean(),

        f"Semantic Match ≥ {THRESHOLD} (RAG)": data[rag_sem_match].mean(),
        f"Semantic Match ≥ {THRESHOLD} (No-RAG)": data[no_rag_sem_match].mean(),

        "Semantic Mean (RAG)": data[rag_sem].mean(),
        "Semantic Mean (No-RAG)": data[no_rag_sem].mean(),
    }

def compute_na(data, rag_col, no_rag_col):
    return {
        "Correct NA (RAG)": (data[rag_col] == "na").mean(),
        "Correct NA (No-RAG)": (data[no_rag_col] == "na").mean()
    }

# -----------------------------
# LLaMA
# -----------------------------
llama_answerable = compute_metrics(
    answerable_df,
    "rag_exact", "rag_substring", "rag_semantic_match", "rag_semantic",
    "no_rag_exact", "no_rag_substring", "no_rag_semantic_match", "no_rag_semantic"
)

llama_na = compute_na(
    non_answerable_df,
    "rag_norm",
    "no_rag_norm"
)

# -----------------------------
# QWEN
# -----------------------------
qwen_answerable = compute_metrics(
    answerable_df,
    "qwen_rag_exact", "qwen_rag_substring", "qwen_rag_semantic_match", "qwen_rag_semantic",
    "qwen_no_rag_exact", "qwen_no_rag_substring", "qwen_no_rag_semantic_match", "qwen_no_rag_semantic"
)

qwen_na = compute_na(
    non_answerable_df,
    "qwen_rag_norm",
    "qwen_no_rag_norm"
)

# -----------------------------
# MISTRAL
# -----------------------------
mistral_answerable = compute_metrics(
    answerable_df,
    "mistral_rag_exact", "mistral_rag_substring", "mistral_rag_semantic_match", "mistral_rag_semantic",
    "mistral_no_rag_exact", "mistral_no_rag_substring", "mistral_no_rag_semantic_match", "mistral_no_rag_semantic"
)

mistral_na = compute_na(
    non_answerable_df,
    "mistral_rag_norm",
    "mistral_no_rag_norm"
)

# -----------------------------
# Combine Everything
# -----------------------------
comparison_table = pd.DataFrame({
    "Metric": list(llama_answerable.keys()) + list(llama_na.keys()),

    "LLaMA": list(llama_answerable.values()) + list(llama_na.values()),
    "Qwen": list(qwen_answerable.values()) + list(qwen_na.values()),
    "Mistral": list(mistral_answerable.values()) + list(mistral_na.values())
})

comparison_table = comparison_table.round(3)

comparison_table


Unnamed: 0,Metric,LLaMA,Qwen,Mistral
0,Exact Match (RAG),0.54,0.609,0.506
1,Exact Match (No-RAG),0.089,0.211,0.151
2,Substring Match (RAG),0.771,0.774,0.7
3,Substring Match (No-RAG),0.386,0.331,0.397
4,Semantic Match ≥ 0.75 (RAG),0.731,0.786,0.72
5,Semantic Match ≥ 0.75 (No-RAG),0.346,0.363,0.4
6,Semantic Mean (RAG),0.815,0.85,0.823
7,Semantic Mean (No-RAG),0.591,0.489,0.621
8,Correct NA (RAG),0.82,0.873,0.08
9,Correct NA (No-RAG),0.553,0.94,0.0


In [104]:
import unicodedata

# -----------------------------------
# Ensure normalization exists
# -----------------------------------

def normalize_text(text):
    if pd.isna(text):
        return "na"
    text = str(text)
    text = unicodedata.normalize("NFC", text)
    text = text.strip().lower()
    if text == "":
        return "na"
    return text

results_df["gold_norm"] = results_df["gold_answer"].apply(normalize_text)

# -----------------------------------
# Function to compute metrics
# -----------------------------------

from sklearn.metrics.pairwise import cosine_similarity

THRESHOLD = 0.75

def exact_match(pred, gold):
    return int(pred == gold)

def substring_match(pred, gold):
    return int(gold in pred or pred in gold)

def semantic_score(pred, gold):
    if gold == "na" or pred == "na":
        return 0.0
    g_vec = embed_texts([gold])
    p_vec = embed_texts([pred])
    return cosine_similarity(g_vec, p_vec)[0][0]

def compute_metrics(prefix):
    results_df[f"{prefix}_norm"] = results_df[prefix].apply(normalize_text)
    
    results_df[f"{prefix}_exact"] = results_df.apply(
        lambda x: exact_match(x[f"{prefix}_norm"], x["gold_norm"]),
        axis=1
    )
    
    results_df[f"{prefix}_substring"] = results_df.apply(
        lambda x: substring_match(x[f"{prefix}_norm"], x["gold_norm"]),
        axis=1
    )
    
    results_df[f"{prefix}_semantic"] = results_df.apply(
        lambda x: semantic_score(x[f"{prefix}_norm"], x["gold_norm"]),
        axis=1
    )
    
    results_df[f"{prefix}_semantic_match"] = (
        results_df[f"{prefix}_semantic"] >= THRESHOLD
    ).astype(int)

# -----------------------------------
# Compute for ALL models
# -----------------------------------

compute_metrics("model_answer")       # LLaMA RAG
compute_metrics("llama_no_rag")

compute_metrics("qwen_rag")
compute_metrics("qwen_no_rag")

compute_metrics("mistral_rag")
compute_metrics("mistral_no_rag")

print("All metrics computed.")

# -----------------------------------
# Create MASTER EXPORT DF
# -----------------------------------

master_export_df = results_df[[
    "query_id",
    "doc_id",
    "question",
    "gold_answer",

    # LLaMA
    "model_answer",
    "model_answer_exact",
    "model_answer_substring",
    "model_answer_semantic",
    "model_answer_semantic_match",

    "llama_no_rag",
    "llama_no_rag_exact",
    "llama_no_rag_substring",
    "llama_no_rag_semantic",
    "llama_no_rag_semantic_match",

    # Qwen
    "qwen_rag",
    "qwen_rag_exact",
    "qwen_rag_substring",
    "qwen_rag_semantic",
    "qwen_rag_semantic_match",

    "qwen_no_rag",
    "qwen_no_rag_exact",
    "qwen_no_rag_substring",
    "qwen_no_rag_semantic",
    "qwen_no_rag_semantic_match",

    # Mistral
    "mistral_rag",
    "mistral_rag_exact",
    "mistral_rag_substring",
    "mistral_rag_semantic",
    "mistral_rag_semantic_match",

    "mistral_no_rag",
    "mistral_no_rag_exact",
    "mistral_no_rag_substring",
    "mistral_no_rag_semantic",
    "mistral_no_rag_semantic_match"
]].copy()

# -----------------------------------
# Save MASTER CSV
# -----------------------------------

output_path = r"E:\Project2\data\en\MASTER_EVALUATION_ALL_MODELS.csv"

master_export_df.to_csv(output_path, index=False, encoding="utf-8")

print("MASTER CSV saved successfully.")
print("Saved at:", output_path)


All metrics computed.
MASTER CSV saved successfully.
Saved at: E:\Project2\data\en\MASTER_EVALUATION_ALL_MODELS.csv
